gcc/config/aarch64/aarch64.cc
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2024 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #define INCLUDE_VECTOR
26 #include "config.h"
27 #include "system.h"
28 #include "coretypes.h"
29 #include "backend.h"
30 #include "target.h"
31 #include "rtl.h"
32 #include "tree.h"
33 #include "memmodel.h"
34 #include "gimple.h"
35 #include "cfghooks.h"
36 #include "cfgloop.h"
37 #include "df.h"
38 #include "tm_p.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "optabs.h"
42 #include "regs.h"
43 #include "emit-rtl.h"
44 #include "recog.h"
45 #include "cgraph.h"
46 #include "diagnostic.h"
47 #include "insn-attr.h"
48 #include "alias.h"
49 #include "fold-const.h"
50 #include "stor-layout.h"
51 #include "calls.h"
52 #include "varasm.h"
53 #include "output.h"
54 #include "flags.h"
55 #include "explow.h"
56 #include "expr.h"
57 #include "reload.h"
58 #include "langhooks.h"
59 #include "opts.h"
60 #include "gimplify.h"
61 #include "dwarf2.h"
62 #include "gimple-iterator.h"
63 #include "tree-vectorizer.h"
64 #include "aarch64-cost-tables.h"
65 #include "dumpfile.h"
66 #include "builtins.h"
67 #include "rtl-iter.h"
68 #include "tm-constrs.h"
69 #include "sched-int.h"
70 #include "target-globals.h"
71 #include "common/common-target.h"
72 #include "cfgrtl.h"
73 #include "selftest.h"
74 #include "selftest-rtl.h"
75 #include "rtx-vector-builder.h"
76 #include "intl.h"
77 #include "expmed.h"
78 #include "function-abi.h"
79 #include "gimple-pretty-print.h"
80 #include "tree-ssa-loop-niter.h"
81 #include "fractional-cost.h"
82 #include "rtlanal.h"
83 #include "tree-dfa.h"
84 #include "asan.h"
85 #include "aarch64-feature-deps.h"
86 #include "config/arm/aarch-common.h"
87 #include "config/arm/aarch-common-protos.h"
88 #include "common/config/aarch64/cpuinfo.h"
89 #include "ssa.h"
90 #include "except.h"
91 #include "tree-pass.h"
92 #include "cfgbuild.h"
93 #include "symbol-summary.h"
94 #include "sreal.h"
95 #include "ipa-cp.h"
96 #include "ipa-prop.h"
97 #include "ipa-fnsummary.h"
98 #include "hash-map.h"
100 /* This file should be included last. */
101 #include "target-def.h"
103 /* Defined for convenience. */
104 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
106 /* Maximum bytes set for an inline memset expansion. With -Os use 3 STP
107 and 1 MOVI/DUP (same size as a call). */
108 #define MAX_SET_SIZE(speed) (speed ? 256 : 96)
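/* Worked example (illustrative only): MAX_SET_SIZE (true) evaluates to 256,
   so a speed-optimised memset of up to 256 bytes may be expanded inline,
   while MAX_SET_SIZE (false) evaluates to 96 for size-optimised (-Os) code.  */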
110 /* Flags that describe how a function shares certain architectural state
111 with its callers.
113 - AARCH64_STATE_SHARED indicates that the function does share the state
114 with callers.
116 - AARCH64_STATE_IN indicates that the function reads (or might read) the
117 incoming state. The converse is that the function ignores the incoming
118 state.
120 - AARCH64_STATE_OUT indicates that the function returns new state.
121 The converse is that the state on return is the same as it was on entry.
123 A function that partially modifies the state treats it as both IN
124 and OUT (because the value on return depends to some extent on the
125 value on input). */
126 constexpr auto AARCH64_STATE_SHARED = 1U << 0;
127 constexpr auto AARCH64_STATE_IN = 1U << 1;
128 constexpr auto AARCH64_STATE_OUT = 1U << 2;
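/* Illustrative mapping (see aarch64_attribute_shared_state_flags below):
     arm::in        -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
     arm::out       -> AARCH64_STATE_SHARED | AARCH64_STATE_OUT
     arm::inout     -> AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT
     arm::preserves -> AARCH64_STATE_SHARED  */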
130 /* Information about a legitimate vector immediate operand. */
131 struct simd_immediate_info
133 enum insn_type { MOV, MVN, INDEX, PTRUE };
134 enum modifier_type { LSL, MSL };
136 simd_immediate_info () {}
137 simd_immediate_info (scalar_float_mode, rtx);
138 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
139 insn_type = MOV, modifier_type = LSL,
140 unsigned int = 0);
141 simd_immediate_info (scalar_mode, rtx, rtx);
142 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
144 /* The mode of the elements. */
145 scalar_mode elt_mode;
147 /* The instruction to use to move the immediate into a vector. */
148 insn_type insn;
150 union
152 /* For MOV and MVN. */
153 struct
155 /* The value of each element. */
156 rtx value;
158 /* The kind of shift modifier to use, and the number of bits to shift.
159 This is (LSL, 0) if no shift is needed. */
160 modifier_type modifier;
161 unsigned int shift;
162 } mov;
164 /* For INDEX. */
165 struct
167 /* The value of the first element and the step to be added for each
168 subsequent element. */
169 rtx base, step;
170 } index;
172 /* For PTRUE. */
173 aarch64_svpattern pattern;
174 } u;
177 /* Construct a floating-point immediate in which each element has mode
178 ELT_MODE_IN and value VALUE_IN. */
179 inline simd_immediate_info
180 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
181 : elt_mode (elt_mode_in), insn (MOV)
183 u.mov.value = value_in;
184 u.mov.modifier = LSL;
185 u.mov.shift = 0;
188 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
189 and value VALUE_IN. The other parameters are as for the structure
190 fields. */
191 inline simd_immediate_info
192 ::simd_immediate_info (scalar_int_mode elt_mode_in,
193 unsigned HOST_WIDE_INT value_in,
194 insn_type insn_in, modifier_type modifier_in,
195 unsigned int shift_in)
196 : elt_mode (elt_mode_in), insn (insn_in)
198 u.mov.value = gen_int_mode (value_in, elt_mode_in);
199 u.mov.modifier = modifier_in;
200 u.mov.shift = shift_in;
203 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
204 and where element I is equal to BASE_IN + I * STEP_IN. */
205 inline simd_immediate_info
206 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
207 : elt_mode (elt_mode_in), insn (INDEX)
209 u.index.base = base_in;
210 u.index.step = step_in;
213 /* Construct a predicate that controls elements of mode ELT_MODE_IN
214 and has PTRUE pattern PATTERN_IN. */
215 inline simd_immediate_info
216 ::simd_immediate_info (scalar_int_mode elt_mode_in,
217 aarch64_svpattern pattern_in)
218 : elt_mode (elt_mode_in), insn (PTRUE)
220 u.pattern = pattern_in;
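/* Example uses of the constructors above (illustrative, not taken from the
   callers in this file): simd_immediate_info (SImode, 0x55, MOV, LSL, 8)
   records a MOV of value 0x55 with an LSL-by-8 modifier for 32-bit elements,
   and simd_immediate_info (QImode, AARCH64_SV_ALL) records a PTRUE over byte
   elements, assuming AARCH64_SV_ALL is the svpattern enumerator for the
   "all elements" pattern.  */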
223 namespace {
225 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
226 class pure_scalable_type_info
228 public:
229 /* Represents the result of analyzing a type. All values are nonzero,
230 in the possibly forlorn hope that accidental conversions to bool
231 trigger a warning. */
232 enum analysis_result
234 /* The type does not have an ABI identity; i.e. it doesn't contain
235 at least one object whose type is a Fundamental Data Type. */
236 NO_ABI_IDENTITY = 1,
238 /* The type is definitely a Pure Scalable Type. */
239 IS_PST,
241 /* The type is definitely not a Pure Scalable Type. */
242 ISNT_PST,
244 /* It doesn't matter for PCS purposes whether the type is a Pure
245 Scalable Type or not, since the type will be handled the same
246 way regardless.
248 Specifically, this means that if the type is a Pure Scalable Type,
249 there aren't enough argument registers to hold it, and so it will
250 need to be passed or returned in memory. If the type isn't a
251 Pure Scalable Type, it's too big to be passed or returned in core
252 or SIMD&FP registers, and so again will need to go in memory. */
253 DOESNT_MATTER
256 /* Aggregates of 17 bytes or more are normally passed and returned
257 in memory, so aggregates of that size can safely be analyzed as
258 DOESNT_MATTER. We need to be able to collect enough pieces to
259 represent a PST that is smaller than that. Since predicates are
260 2 bytes in size for -msve-vector-bits=128, that means we need to be
261 able to store at least 8 pieces.
263 We also need to be able to store enough pieces to represent
264 a single vector in each vector argument register and a single
265 predicate in each predicate argument register. This means that
266 we need at least 12 pieces. */
267 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
268 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
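/* With the current PCS register counts this works out as 8 vector argument
   registers (v0-v7) plus 4 predicate argument registers (p0-p3), i.e.
   MAX_PIECES == 12, matching the "at least 12 pieces" requirement above.  */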
270 /* Describes one piece of a PST. Each piece is one of:
272 - a single Scalable Vector Type (SVT)
273 - a single Scalable Predicate Type (SPT)
274 - a PST containing 2, 3 or 4 SVTs, with no padding
276 It either represents a single built-in type or a PST formed from
277 multiple homogeneous built-in types. */
278 struct piece
280 rtx get_rtx (unsigned int, unsigned int) const;
282 /* The number of vector and predicate registers that the piece
283 occupies. One of the two is always zero. */
284 unsigned int num_zr;
285 unsigned int num_pr;
287 /* The mode of the registers described above. */
288 machine_mode mode;
290 /* If this piece is formed from multiple homogeneous built-in types,
291 this is the mode of the built-in types, otherwise it is MODE. */
292 machine_mode orig_mode;
294 /* The offset in bytes of the piece from the start of the type. */
295 poly_uint64 offset;
298 /* Divides types analyzed as IS_PST into individual pieces. The pieces
299 are in memory order. */
300 auto_vec<piece, MAX_PIECES> pieces;
302 unsigned int num_zr () const;
303 unsigned int num_pr () const;
305 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
307 analysis_result analyze (const_tree);
308 bool analyze_registers (const_tree);
310 private:
311 analysis_result analyze_array (const_tree);
312 analysis_result analyze_record (const_tree);
313 void add_piece (const piece &);
317 /* The current code model. */
318 enum aarch64_code_model aarch64_cmodel;
320 enum aarch64_tp_reg aarch64_tpidr_register;
322 /* The number of 64-bit elements in an SVE vector. */
323 poly_uint16 aarch64_sve_vg;
325 #ifdef HAVE_AS_TLS
326 #undef TARGET_HAVE_TLS
327 #define TARGET_HAVE_TLS 1
328 #endif
330 static bool aarch64_composite_type_p (const_tree, machine_mode);
331 static bool aarch64_return_in_memory_1 (const_tree);
332 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
333 const_tree,
334 machine_mode *, int *,
335 bool *, bool);
336 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
337 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
338 static void aarch64_override_options_after_change (void);
339 static bool aarch64_vector_mode_supported_p (machine_mode);
340 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
341 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
342 const_tree type,
343 int misalignment,
344 bool is_packed);
345 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
346 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
347 aarch64_addr_query_type);
349 /* The processor for which instructions should be scheduled. */
350 enum aarch64_processor aarch64_tune = cortexa53;
352 /* Mask to specify which instruction scheduling options should be used. */
353 uint64_t aarch64_tune_flags = 0;
355 /* Global flag for PC relative loads. */
356 bool aarch64_pcrelative_literal_loads;
358 /* Global flag for whether frame pointer is enabled. */
359 bool aarch64_use_frame_pointer;
361 /* Support for command line parsing of boolean flags in the tuning
362 structures. */
363 struct aarch64_flag_desc
365 const char* name;
366 unsigned int flag;
369 #define AARCH64_FUSION_PAIR(name, internal_name) \
370 { name, AARCH64_FUSE_##internal_name },
371 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
373 { "none", AARCH64_FUSE_NOTHING },
374 #include "aarch64-fusion-pairs.def"
375 { "all", AARCH64_FUSE_ALL },
376 { NULL, AARCH64_FUSE_NOTHING }
379 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
380 { name, AARCH64_EXTRA_TUNE_##internal_name },
381 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
383 { "none", AARCH64_EXTRA_TUNE_NONE },
384 #include "aarch64-tuning-flags.def"
385 { "all", AARCH64_EXTRA_TUNE_ALL },
386 { NULL, AARCH64_EXTRA_TUNE_NONE }
389 /* Tuning parameters. */
390 #include "tuning_models/generic.h"
391 #include "tuning_models/generic_armv8_a.h"
392 #include "tuning_models/generic_armv9_a.h"
393 #include "tuning_models/cortexa35.h"
394 #include "tuning_models/cortexa53.h"
395 #include "tuning_models/cortexa57.h"
396 #include "tuning_models/cortexa72.h"
397 #include "tuning_models/cortexa73.h"
398 #include "tuning_models/exynosm1.h"
399 #include "tuning_models/thunderxt88.h"
400 #include "tuning_models/thunderx.h"
401 #include "tuning_models/tsv110.h"
402 #include "tuning_models/xgene1.h"
403 #include "tuning_models/emag.h"
404 #include "tuning_models/qdf24xx.h"
405 #include "tuning_models/saphira.h"
406 #include "tuning_models/thunderx2t99.h"
407 #include "tuning_models/thunderx3t110.h"
408 #include "tuning_models/neoversen1.h"
409 #include "tuning_models/ampere1.h"
410 #include "tuning_models/ampere1a.h"
411 #include "tuning_models/ampere1b.h"
412 #include "tuning_models/neoversev1.h"
413 #include "tuning_models/neoverse512tvb.h"
414 #include "tuning_models/neoversen2.h"
415 #include "tuning_models/neoversev2.h"
416 #include "tuning_models/a64fx.h"
418 /* Support for fine-grained override of the tuning structures. */
419 struct aarch64_tuning_override_function
421 const char* name;
422 void (*parse_override)(const char*, struct tune_params*);
425 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
426 static void aarch64_parse_tune_string (const char*, struct tune_params*);
427 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
429 static const struct aarch64_tuning_override_function
430 aarch64_tuning_override_functions[] =
432 { "fuse", aarch64_parse_fuse_string },
433 { "tune", aarch64_parse_tune_string },
434 { "sve_width", aarch64_parse_sve_width_string },
435 { NULL, NULL }
438 /* A processor implementing AArch64. */
439 struct processor
441 const char *name;
442 aarch64_processor ident;
443 aarch64_processor sched_core;
444 aarch64_arch arch;
445 aarch64_feature_flags flags;
446 const tune_params *tune;
449 /* Architectures implementing AArch64. */
450 static CONSTEXPR const processor all_architectures[] =
452 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
453 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
454 feature_deps::ARCH_IDENT ().enable, NULL},
455 #include "aarch64-arches.def"
456 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
459 /* Processor cores implementing AArch64. */
460 static const struct processor all_cores[] =
462 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
463 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
464 feature_deps::cpu_##IDENT, &COSTS##_tunings},
465 #include "aarch64-cores.def"
466 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
468 /* Internal representation of system registers. */
469 typedef struct {
470 const char *name;
471 /* Stringified sysreg encoding values, represented as
472 s<sn>_<op1>_c<cn>_c<cm>_<op2>. */
473 const char *encoding;
474 /* Flags affecting sysreg usage, such as read/write-only. */
475 unsigned properties;
476 /* Architectural features implied by sysreg. */
477 aarch64_feature_flags arch_reqs;
478 } sysreg_t;
480 /* An aarch64_feature_set initializer for a single feature,
481 AARCH64_FEATURE_<FEAT>. */
482 #define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
484 /* Used by AARCH64_FEATURES. */
485 #define AARCH64_OR_FEATURES_1(X, F1) \
486 AARCH64_FEATURE (F1)
487 #define AARCH64_OR_FEATURES_2(X, F1, F2) \
488 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
489 #define AARCH64_OR_FEATURES_3(X, F1, ...) \
490 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
492 /* An aarch64_feature_set initializer for the N features listed in "...". */
493 #define AARCH64_FEATURES(N, ...) \
494 AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
496 #define AARCH64_NO_FEATURES 0
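/* For example, AARCH64_FEATURES (2, FP, SIMD) expands via
   AARCH64_OR_FEATURES_2 to (AARCH64_FL_FP | AARCH64_FL_SIMD); the feature
   names here are only illustrative placeholders for whatever AARCH64_FL_*
   flags the .def files define.  */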
498 /* Flags associated with the properties of system registers. They mainly
499 serve to mark particular registers as read-only or write-only. */
500 #define F_DEPRECATED (1 << 1)
501 #define F_REG_READ (1 << 2)
502 #define F_REG_WRITE (1 << 3)
503 #define F_ARCHEXT (1 << 4)
504 /* Flag indicating register name is alias for another system register. */
505 #define F_REG_ALIAS (1 << 5)
506 /* Flag indicating registers which may be implemented with 128 bits. */
507 #define F_REG_128 (1 << 6)
509 /* Database of system registers, their encodings and architectural
510 requirements. */
511 const sysreg_t aarch64_sysregs[] =
513 #define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
514 #define SYSREG(NAME, ENC, FLAGS, ARCH) \
515 { NAME, ENC, FLAGS, ARCH },
516 #include "aarch64-sys-regs.def"
517 #undef CPENC
520 #undef AARCH64_NO_FEATURES
522 using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
523 static sysreg_map_t *sysreg_map = nullptr;
525 /* Map system register names to their hardware metadata: encoding,
526 feature flags and architectural feature requirements, all of which
527 are encoded in a sysreg_t struct. */
528 void
529 aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
531 bool dup = sysreg_map->put (name, metadata);
532 gcc_checking_assert (!dup);
535 /* Lazily initialize the hash table used for system register validation,
536 which checks the validity of a supplied register name and returns the
537 register's associated metadata. */
538 static void
539 aarch64_init_sysregs (void)
541 gcc_assert (!sysreg_map);
542 sysreg_map = new sysreg_map_t;
545 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
547 const sysreg_t *reg = aarch64_sysregs + i;
548 aarch64_register_sysreg (reg->name, reg);
552 /* No direct access to the sysreg hash map should be made: doing so
553 risks trying to access an uninitialized hash map, and dereferencing
554 the returned double pointer without due care risks dereferencing a
555 null pointer. */
556 const sysreg_t *
557 aarch64_lookup_sysreg_map (const char *regname)
559 if (!sysreg_map)
560 aarch64_init_sysregs ();
562 const sysreg_t **sysreg_entry = sysreg_map->get (regname);
563 if (sysreg_entry != NULL)
564 return *sysreg_entry;
565 return NULL;
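/* Typical (illustrative) use:
     const sysreg_t *reg = aarch64_lookup_sysreg_map ("tpidr_el0");
   A null result simply means the name is not listed in aarch64-sys-regs.def;
   "tpidr_el0" is assumed here to be one of the registers that .def file
   provides.  */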
568 /* The current tuning set. */
569 struct tune_params aarch64_tune_params = generic_tunings;
571 /* If NAME is the name of an arm:: attribute that describes shared state,
572 return its associated AARCH64_STATE_* flags, otherwise return 0. */
573 static unsigned int
574 aarch64_attribute_shared_state_flags (const char *name)
576 if (strcmp (name, "in") == 0)
577 return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
578 if (strcmp (name, "inout") == 0)
579 return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
580 if (strcmp (name, "out") == 0)
581 return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
582 if (strcmp (name, "preserves") == 0)
583 return AARCH64_STATE_SHARED;
584 return 0;
587 /* See whether attribute list ATTRS has any sharing information
588 for state STATE_NAME. Return the associated state flags if so,
589 otherwise return 0. */
590 static unsigned int
591 aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
593 for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
595 if (!cxx11_attribute_p (attr))
596 continue;
598 auto ns = IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr)));
599 if (strcmp (ns, "arm") != 0)
600 continue;
602 auto attr_name = IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr)));
603 auto flags = aarch64_attribute_shared_state_flags (attr_name);
604 if (!flags)
605 continue;
607 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
609 tree value = TREE_VALUE (arg);
610 if (TREE_CODE (value) == STRING_CST
611 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
612 return flags;
615 return 0;
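/* For instance, given a type carrying [[arm::inout("za")]], looking up "za"
   in its attribute list returns AARCH64_STATE_SHARED | AARCH64_STATE_IN
   | AARCH64_STATE_OUT, while looking up "zt0" returns 0.  */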
618 /* Return true if DECL creates a new scope for state STATE_STRING. */
619 static bool
620 aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
622 if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
623 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
625 tree value = TREE_VALUE (arg);
626 if (TREE_CODE (value) == STRING_CST
627 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
628 return true;
630 return false;
633 /* Return true if attribute argument VALUE is a recognized state string,
634 otherwise report an error. NAME is the name of the attribute to which
635 VALUE is being passed. */
636 static bool
637 aarch64_check_state_string (tree name, tree value)
639 if (TREE_CODE (value) != STRING_CST)
641 error ("the arguments to %qE must be constant strings", name);
642 return false;
645 const char *state_name = TREE_STRING_POINTER (value);
646 if (strcmp (state_name, "za") != 0
647 && strcmp (state_name, "zt0") != 0)
649 error ("unrecognized state string %qs", state_name);
650 return false;
653 return true;
656 /* qsort callback to compare two STRING_CSTs. */
657 static int
658 cmp_string_csts (const void *a, const void *b)
660 return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
661 TREE_STRING_POINTER (*(const_tree const *) b));
664 /* Canonicalize a list of state strings. ARGS contains the arguments to
665 a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
666 of the same type. If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
667 arguments and drop the new attribute. Otherwise, the new attribute must
668 be kept and ARGS must include the information in OLD_ATTR.
670 In both cases, the new arguments must be a sorted list of state strings
671 with duplicates removed.
673 Return true if new attribute should be kept, false if it should be
674 dropped. */
675 static bool
676 aarch64_merge_string_arguments (tree args, tree old_attr,
677 bool can_merge_in_place)
679 /* Get a sorted list of all state strings (including duplicates). */
680 auto add_args = [](vec<tree> &strings, const_tree args)
682 for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
683 if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
684 strings.safe_push (TREE_VALUE (arg));
686 auto_vec<tree, 16> strings;
687 add_args (strings, args);
688 if (old_attr)
689 add_args (strings, TREE_VALUE (old_attr));
690 strings.qsort (cmp_string_csts);
692 /* The list can be empty if there was no previous attribute and if all
693 the new arguments are erroneous. Drop the attribute in that case. */
694 if (strings.is_empty ())
695 return false;
697 /* Destructively modify one of the argument lists, removing duplicates
698 on the fly. */
699 bool use_old_attr = old_attr && can_merge_in_place;
700 tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
701 tree prev = NULL_TREE;
702 for (tree arg : strings)
704 if (prev && simple_cst_equal (arg, prev))
705 continue;
706 prev = arg;
707 if (!*end)
708 *end = tree_cons (NULL_TREE, arg, NULL_TREE);
709 else
710 TREE_VALUE (*end) = arg;
711 end = &TREE_CHAIN (*end);
713 *end = NULL_TREE;
714 return !use_old_attr;
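/* Example: merging a new arm::new("zt0", "za") with an existing
   arm::new("za") yields the sorted, duplicate-free argument list
   ("za", "zt0"); with CAN_MERGE_IN_PLACE the old attribute is updated
   in place and the new one dropped (the function returns false).  */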
717 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
719 static tree
720 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
721 int, bool *no_add_attrs)
723 /* Since we set fn_type_req to true, the caller should have checked
724 this for us. */
725 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
726 switch ((arm_pcs) fntype_abi (*node).id ())
728 case ARM_PCS_AAPCS64:
729 case ARM_PCS_SIMD:
730 return NULL_TREE;
732 case ARM_PCS_SVE:
733 error ("the %qE attribute cannot be applied to an SVE function type",
734 name);
735 *no_add_attrs = true;
736 return NULL_TREE;
738 case ARM_PCS_TLSDESC:
739 case ARM_PCS_UNKNOWN:
740 break;
742 gcc_unreachable ();
745 /* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
746 otherwise report an error. */
747 static bool
748 aarch64_check_arm_new_against_type (tree args, tree decl)
750 tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
751 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
753 tree value = TREE_VALUE (arg);
754 if (TREE_CODE (value) == STRING_CST)
756 const char *state_name = TREE_STRING_POINTER (value);
757 if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
759 error_at (DECL_SOURCE_LOCATION (decl),
760 "cannot create a new %qs scope since %qs is shared"
761 " with callers", state_name, state_name);
762 return false;
766 return true;
769 /* Callback for arm::new attributes. */
770 static tree
771 handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
773 tree decl = *node;
774 if (TREE_CODE (decl) != FUNCTION_DECL)
776 error ("%qE attribute applies only to function definitions", name);
777 *no_add_attrs = true;
778 return NULL_TREE;
780 if (TREE_TYPE (decl) == error_mark_node)
782 *no_add_attrs = true;
783 return NULL_TREE;
786 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
787 aarch64_check_state_string (name, TREE_VALUE (arg));
789 if (!aarch64_check_arm_new_against_type (args, decl))
791 *no_add_attrs = true;
792 return NULL_TREE;
795 /* If there is an old attribute, we should try to update it in-place,
796 so that there is only one (definitive) arm::new attribute on the decl. */
797 tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
798 if (!aarch64_merge_string_arguments (args, old_attr, true))
799 *no_add_attrs = true;
801 return NULL_TREE;
804 /* Callback for arm::{in,out,inout,preserves} attributes. */
805 static tree
806 handle_arm_shared (tree *node, tree name, tree args,
807 int, bool *no_add_attrs)
809 tree type = *node;
810 tree old_attrs = TYPE_ATTRIBUTES (type);
811 auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
812 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
814 tree value = TREE_VALUE (arg);
815 if (aarch64_check_state_string (name, value))
817 const char *state_name = TREE_STRING_POINTER (value);
818 auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
819 state_name);
820 if (old_flags && old_flags != flags)
822 error ("inconsistent attributes for state %qs", state_name);
823 *no_add_attrs = true;
824 return NULL_TREE;
829 /* We can't update an old attribute in-place, since types are shared.
830 Instead make sure that this new attribute contains all the
831 information, so that the old attribute becomes redundant. */
832 tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
833 old_attrs);
834 if (!aarch64_merge_string_arguments (args, old_attr, false))
835 *no_add_attrs = true;
837 return NULL_TREE;
840 /* Mutually-exclusive function type attributes for controlling PSTATE.SM. */
841 static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
843 /* Attribute name exclusion applies to:
844 function, type, variable */
845 { "streaming", false, true, false },
846 { "streaming_compatible", false, true, false },
847 { NULL, false, false, false }
850 /* Table of machine attributes. */
851 static const attribute_spec aarch64_gnu_attributes[] =
853 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
854 affects_type_identity, handler, exclude } */
855 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
856 handle_aarch64_vector_pcs_attribute, NULL },
857 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
858 aarch64_sve::handle_arm_sve_vector_bits_attribute,
859 NULL },
860 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
861 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
862 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
863 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
864 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
865 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
866 #endif
867 #ifdef SUBTARGET_ATTRIBUTE_TABLE
868 SUBTARGET_ATTRIBUTE_TABLE
869 #endif
872 static const scoped_attribute_specs aarch64_gnu_attribute_table =
874 "gnu", { aarch64_gnu_attributes }
877 static const attribute_spec aarch64_arm_attributes[] =
879 { "streaming", 0, 0, false, true, true, true,
880 NULL, attr_streaming_exclusions },
881 { "streaming_compatible", 0, 0, false, true, true, true,
882 NULL, attr_streaming_exclusions },
883 { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
884 { "new", 1, -1, true, false, false, false,
885 handle_arm_new, NULL },
886 { "preserves", 1, -1, false, true, true, true,
887 handle_arm_shared, NULL },
888 { "in", 1, -1, false, true, true, true,
889 handle_arm_shared, NULL },
890 { "out", 1, -1, false, true, true, true,
891 handle_arm_shared, NULL },
892 { "inout", 1, -1, false, true, true, true,
893 handle_arm_shared, NULL }
896 static const scoped_attribute_specs aarch64_arm_attribute_table =
898 "arm", { aarch64_arm_attributes }
901 static const scoped_attribute_specs *const aarch64_attribute_table[] =
903 &aarch64_gnu_attribute_table,
904 &aarch64_arm_attribute_table
907 typedef enum aarch64_cond_code
909 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
910 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
911 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
913 aarch64_cc;
915 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
918 /* The condition codes of the processor, and the inverse function. */
919 static const char * const aarch64_condition_codes[] =
921 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
922 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
925 /* The preferred condition codes for SVE conditions. */
926 static const char *const aarch64_sve_condition_codes[] =
928 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
929 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
932 /* Return the assembly token for svpattern value VALUE. */
934 static const char *
935 svpattern_token (enum aarch64_svpattern pattern)
937 switch (pattern)
939 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
940 AARCH64_FOR_SVPATTERN (CASE)
941 #undef CASE
942 case AARCH64_NUM_SVPATTERNS:
943 break;
945 gcc_unreachable ();
948 /* Return the location of a piece that is known to be passed or returned
949 in registers. FIRST_ZR is the first unused vector argument register
950 and FIRST_PR is the first unused predicate argument register. */
953 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
954 unsigned int first_pr) const
956 gcc_assert (VECTOR_MODE_P (mode)
957 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
958 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
960 if (num_zr > 0 && num_pr == 0)
961 return gen_rtx_REG (mode, first_zr);
963 if (num_zr == 0 && num_pr <= 2)
964 return gen_rtx_REG (mode, first_pr);
966 gcc_unreachable ();
969 /* Return the total number of vector registers required by the PST. */
971 unsigned int
972 pure_scalable_type_info::num_zr () const
974 unsigned int res = 0;
975 for (unsigned int i = 0; i < pieces.length (); ++i)
976 res += pieces[i].num_zr;
977 return res;
980 /* Return the total number of predicate registers required by the PST. */
982 unsigned int
983 pure_scalable_type_info::num_pr () const
985 unsigned int res = 0;
986 for (unsigned int i = 0; i < pieces.length (); ++i)
987 res += pieces[i].num_pr;
988 return res;
991 /* Return the location of a PST that is known to be passed or returned
992 in registers. FIRST_ZR is the first unused vector argument register
993 and FIRST_PR is the first unused predicate argument register. */
996 pure_scalable_type_info::get_rtx (machine_mode mode,
997 unsigned int first_zr,
998 unsigned int first_pr) const
1000 /* Try to return a single REG if possible. This leads to better
1001 code generation; it isn't required for correctness. */
1002 if (mode == pieces[0].mode)
1004 gcc_assert (pieces.length () == 1);
1005 return pieces[0].get_rtx (first_zr, first_pr);
1008 /* Build up a PARALLEL that contains the individual pieces. */
1009 rtvec rtxes = rtvec_alloc (pieces.length ());
1010 for (unsigned int i = 0; i < pieces.length (); ++i)
1012 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1013 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1014 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1015 first_zr += pieces[i].num_zr;
1016 first_pr += pieces[i].num_pr;
1018 return gen_rtx_PARALLEL (mode, rtxes);
1021 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1022 in the AAPCS64. */
1024 pure_scalable_type_info::analysis_result
1025 pure_scalable_type_info::analyze (const_tree type)
1027 /* Prevent accidental reuse. */
1028 gcc_assert (pieces.is_empty ());
1030 /* No code will be generated for erroneous types, so we won't establish
1031 an ABI mapping. */
1032 if (type == error_mark_node)
1033 return NO_ABI_IDENTITY;
1035 /* Zero-sized types disappear in the language->ABI mapping. */
1036 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1037 return NO_ABI_IDENTITY;
1039 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1040 piece p = {};
1041 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1043 machine_mode mode = TYPE_MODE_RAW (type);
1044 gcc_assert (VECTOR_MODE_P (mode)
1045 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1047 p.mode = p.orig_mode = mode;
1048 add_piece (p);
1049 return IS_PST;
1052 /* Check for user-defined PSTs. */
1053 if (TREE_CODE (type) == ARRAY_TYPE)
1054 return analyze_array (type);
1055 if (TREE_CODE (type) == RECORD_TYPE)
1056 return analyze_record (type);
1058 return ISNT_PST;
1061 /* Analyze a type that is known not to be passed or returned in memory.
1062 Return true if it has an ABI identity and is a Pure Scalable Type. */
1064 bool
1065 pure_scalable_type_info::analyze_registers (const_tree type)
1067 analysis_result result = analyze (type);
1068 gcc_assert (result != DOESNT_MATTER);
1069 return result == IS_PST;
1072 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1074 pure_scalable_type_info::analysis_result
1075 pure_scalable_type_info::analyze_array (const_tree type)
1077 /* Analyze the element type. */
1078 pure_scalable_type_info element_info;
1079 analysis_result result = element_info.analyze (TREE_TYPE (type));
1080 if (result != IS_PST)
1081 return result;
1083 /* An array of unknown, flexible or variable length will be passed and
1084 returned by reference whatever we do. */
1085 tree nelts_minus_one = array_type_nelts (type);
1086 if (!tree_fits_uhwi_p (nelts_minus_one))
1087 return DOESNT_MATTER;
1089 /* Likewise if the array is constant-sized but too big to be interesting.
1090 The double checks against MAX_PIECES are to protect against overflow. */
1091 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1092 if (count > MAX_PIECES)
1093 return DOESNT_MATTER;
1094 count += 1;
1095 if (count * element_info.pieces.length () > MAX_PIECES)
1096 return DOESNT_MATTER;
1098 /* The above checks should have weeded out elements of unknown size. */
1099 poly_uint64 element_bytes;
1100 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1101 gcc_unreachable ();
1103 /* Build up the list of individual vectors and predicates. */
1104 gcc_assert (!element_info.pieces.is_empty ());
1105 for (unsigned int i = 0; i < count; ++i)
1106 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1108 piece p = element_info.pieces[j];
1109 p.offset += i * element_bytes;
1110 add_piece (p);
1112 return IS_PST;
1115 /* Subroutine of analyze for handling RECORD_TYPEs. */
1117 pure_scalable_type_info::analysis_result
1118 pure_scalable_type_info::analyze_record (const_tree type)
1120 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1122 if (TREE_CODE (field) != FIELD_DECL)
1123 continue;
1125 /* Zero-sized fields disappear in the language->ABI mapping. */
1126 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1127 continue;
1129 /* All fields with an ABI identity must be PSTs for the record as
1130 a whole to be a PST. If any individual field is too big to be
1131 interesting then the record is too. */
1132 pure_scalable_type_info field_info;
1133 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1134 if (subresult == NO_ABI_IDENTITY)
1135 continue;
1136 if (subresult != IS_PST)
1137 return subresult;
1139 /* Since all previous fields are PSTs, we ought to be able to track
1140 the field offset using poly_ints. */
1141 tree bitpos = bit_position (field);
1142 gcc_assert (poly_int_tree_p (bitpos));
1144 /* For the same reason, it shouldn't be possible to create a PST field
1145 whose offset isn't byte-aligned. */
1146 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1147 BITS_PER_UNIT);
1149 /* Punt if the record is too big to be interesting. */
1150 poly_uint64 bytepos;
1151 if (!wide_bytepos.to_uhwi (&bytepos)
1152 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1153 return DOESNT_MATTER;
1155 /* Add the individual vectors and predicates in the field to the
1156 record's list. */
1157 gcc_assert (!field_info.pieces.is_empty ());
1158 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1160 piece p = field_info.pieces[i];
1161 p.offset += bytepos;
1162 add_piece (p);
1165 /* Empty structures disappear in the language->ABI mapping. */
1166 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1169 /* Add P to the list of pieces in the type. */
1171 void
1172 pure_scalable_type_info::add_piece (const piece &p)
1174 /* Try to fold the new piece into the previous one to form a
1175 single-mode PST. For example, if we see three consecutive vectors
1176 of the same mode, we can represent them using the corresponding
1177 3-tuple mode.
1179 This is purely an optimization. */
1180 if (!pieces.is_empty ())
1182 piece &prev = pieces.last ();
1183 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1184 unsigned int nelems1, nelems2;
1185 if (prev.orig_mode == p.orig_mode
1186 && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
1187 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1188 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1189 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1190 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1191 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1192 && targetm.array_mode (p.orig_mode,
1193 nelems1 + nelems2).exists (&prev.mode))
1195 prev.num_zr += p.num_zr;
1196 prev.num_pr += p.num_pr;
1197 return;
1200 pieces.quick_push (p);
1203 /* Return true if at least one possible value of type TYPE includes at
1204 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1206 This is a relatively expensive test for some types, so it should
1207 generally be made as late as possible. */
1209 static bool
1210 aarch64_some_values_include_pst_objects_p (const_tree type)
1212 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1213 return false;
1215 if (aarch64_sve::builtin_type_p (type))
1216 return true;
1218 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1219 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1221 if (RECORD_OR_UNION_TYPE_P (type))
1222 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1223 if (TREE_CODE (field) == FIELD_DECL
1224 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1225 return true;
1227 return false;
1230 /* Return the descriptor of the SIMD ABI. */
1232 static const predefined_function_abi &
1233 aarch64_simd_abi (void)
1235 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1236 if (!simd_abi.initialized_p ())
1238 HARD_REG_SET full_reg_clobbers
1239 = default_function_abi.full_reg_clobbers ();
1240 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1241 if (FP_SIMD_SAVED_REGNUM_P (regno))
1242 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1243 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1245 return simd_abi;
1248 /* Return the descriptor of the SVE PCS. */
1250 static const predefined_function_abi &
1251 aarch64_sve_abi (void)
1253 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1254 if (!sve_abi.initialized_p ())
1256 HARD_REG_SET full_reg_clobbers
1257 = default_function_abi.full_reg_clobbers ();
1258 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1259 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1260 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1261 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1262 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1264 return sve_abi;
1267 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
1268 wraps, otherwise return X itself. */
1270 static rtx
1271 strip_salt (rtx x)
1273 rtx search = x;
1274 if (GET_CODE (search) == CONST)
1275 search = XEXP (search, 0);
1276 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
1277 x = XVECEXP (search, 0, 0);
1278 return x;
1281 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
1282 expression. */
1284 static rtx
1285 strip_offset_and_salt (rtx addr, poly_int64 *offset)
1287 return strip_salt (strip_offset (addr, offset));
1290 /* Generate code to enable conditional branches in functions over 1 MiB. */
1291 const char *
1292 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1293 const char * branch_format)
1295 rtx_code_label * tmp_label = gen_label_rtx ();
1296 char label_buf[256];
1297 char buffer[128];
1298 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1299 CODE_LABEL_NUMBER (tmp_label));
1300 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1301 rtx dest_label = operands[pos_label];
1302 operands[pos_label] = tmp_label;
1304 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1305 output_asm_insn (buffer, operands);
1307 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1308 operands[pos_label] = dest_label;
1309 output_asm_insn (buffer, operands);
1310 return "";
1313 void
1314 aarch64_err_no_fpadvsimd (machine_mode mode)
1316 if (TARGET_GENERAL_REGS_ONLY)
1317 if (FLOAT_MODE_P (mode))
1318 error ("%qs is incompatible with the use of floating-point types",
1319 "-mgeneral-regs-only");
1320 else
1321 error ("%qs is incompatible with the use of vector types",
1322 "-mgeneral-regs-only");
1323 else
1324 if (FLOAT_MODE_P (mode))
1325 error ("%qs feature modifier is incompatible with the use of"
1326 " floating-point types", "+nofp");
1327 else
1328 error ("%qs feature modifier is incompatible with the use of"
1329 " vector types", "+nofp");
1332 /* Report when we try to do something that requires SVE when SVE is disabled.
1333 This is an error of last resort and isn't very high-quality. It usually
1334 involves attempts to measure the vector length in some way. */
1335 static void
1336 aarch64_report_sve_required (void)
1338 static bool reported_p = false;
1340 /* Avoid reporting a slew of messages for a single oversight. */
1341 if (reported_p)
1342 return;
1344 error ("this operation requires the SVE ISA extension");
1345 inform (input_location, "you can enable SVE using the command-line"
1346 " option %<-march%>, or by using the %<target%>"
1347 " attribute or pragma");
1348 reported_p = true;
1351 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1352 registers. */
1353 inline bool
1354 pr_or_ffr_regnum_p (unsigned int regno)
1356 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1359 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1360 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1361 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1362 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1363 and GENERAL_REGS is lower than the memory cost (in this case the best class
1364 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1365 cost results in bad allocations with many redundant int<->FP moves which
1366 are expensive on various cores.
1367 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1368 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1369 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1370 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1371 The result of this is that it is no longer inefficient to have a higher
1372 memory move cost than the register move cost.
1375 static reg_class_t
1376 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1377 reg_class_t best_class)
1379 machine_mode mode;
1381 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1382 || !reg_class_subset_p (FP_REGS, allocno_class))
1383 return allocno_class;
1385 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1386 || !reg_class_subset_p (FP_REGS, best_class))
1387 return best_class;
1389 mode = PSEUDO_REGNO_MODE (regno);
1390 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1393 static unsigned int
1394 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1396 if (GET_MODE_UNIT_SIZE (mode) == 4)
1397 return aarch64_tune_params.min_div_recip_mul_sf;
1398 return aarch64_tune_params.min_div_recip_mul_df;
1401 /* Return the reassociation width of treeop OPC with mode MODE. */
1402 static int
1403 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1405 if (VECTOR_MODE_P (mode))
1406 return aarch64_tune_params.vec_reassoc_width;
1407 if (INTEGRAL_MODE_P (mode))
1408 return aarch64_tune_params.int_reassoc_width;
1409 /* Reassociation reduces the number of FMAs which may result in worse
1410 performance. Use a per-CPU setting for FMA reassociation which allows
1411 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
1412 CPUs with many FP pipes to enable reassociation.
1413 Since the reassociation pass doesn't understand FMA at all, assume
1414 that any FP addition might turn into FMA. */
1415 if (FLOAT_MODE_P (mode))
1416 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
1417 : aarch64_tune_params.fp_reassoc_width;
1418 return 1;
1421 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1422 unsigned
1423 aarch64_debugger_regno (unsigned regno)
1425 if (GP_REGNUM_P (regno))
1426 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1427 else if (regno == SP_REGNUM)
1428 return AARCH64_DWARF_SP;
1429 else if (FP_REGNUM_P (regno))
1430 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1431 else if (PR_REGNUM_P (regno))
1432 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1433 else if (regno == VG_REGNUM)
1434 return AARCH64_DWARF_VG;
1436 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1437 equivalent DWARF register. */
1438 return DWARF_FRAME_REGISTERS;
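/* For example, general register x5 maps to AARCH64_DWARF_R0 + 5 and vector
   register v5 to AARCH64_DWARF_V0 + 5; any register not handled above is
   reported as DWARF_FRAME_REGISTERS, i.e. as having no DWARF equivalent.  */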
1441 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
1442 static machine_mode
1443 aarch64_dwarf_frame_reg_mode (int regno)
1445 /* Predicate registers are call-clobbered in the EH ABI (which is
1446 ARM_PCS_AAPCS64), so they should not be described by CFI.
1447 Their size changes as VL changes, so any values computed by
1448 __builtin_init_dwarf_reg_size_table might not be valid for
1449 all frames. */
1450 if (PR_REGNUM_P (regno))
1451 return VOIDmode;
1452 return default_dwarf_frame_reg_mode (regno);
1455 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1456 integer, otherwise return X unmodified. */
1457 static rtx
1458 aarch64_bit_representation (rtx x)
1460 if (CONST_DOUBLE_P (x))
1461 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1462 return x;
1465 /* Return an estimate for the number of quadwords in an SVE vector. This is
1466 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
1467 static unsigned int
1468 aarch64_estimated_sve_vq ()
1470 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
1473 /* Return true if MODE is an SVE predicate mode. */
1474 static bool
1475 aarch64_sve_pred_mode_p (machine_mode mode)
1477 return (TARGET_SVE
1478 && (mode == VNx16BImode
1479 || mode == VNx8BImode
1480 || mode == VNx4BImode
1481 || mode == VNx2BImode));
1484 /* Three mutually-exclusive flags describing a vector or predicate type. */
1485 const unsigned int VEC_ADVSIMD = 1;
1486 const unsigned int VEC_SVE_DATA = 2;
1487 const unsigned int VEC_SVE_PRED = 4;
1488 /* Indicates a structure of 2, 3 or 4 vectors or predicates. */
1489 const unsigned int VEC_STRUCT = 8;
1490 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1491 vector has fewer significant bytes than a full SVE vector. */
1492 const unsigned int VEC_PARTIAL = 16;
1493 /* Useful combinations of the above. */
1494 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1495 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1497 /* Return a set of flags describing the vector properties of mode MODE.
1498 If ANY_TARGET_P is false (the default), ignore modes that are not supported
1499 by the current target. Otherwise categorize the modes that can be used
1500 with the set of all targets supported by the port. */
1502 static unsigned int
1503 aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
1505 if (aarch64_sve_pred_mode_p (mode))
1506 return VEC_SVE_PRED;
1508 /* Make the decision based on the mode's enum value rather than its
1509 properties, so that we keep the correct classification regardless
1510 of -msve-vector-bits. */
1511 switch (mode)
1513 /* Partial SVE QI vectors. */
1514 case E_VNx2QImode:
1515 case E_VNx4QImode:
1516 case E_VNx8QImode:
1517 /* Partial SVE HI vectors. */
1518 case E_VNx2HImode:
1519 case E_VNx4HImode:
1520 /* Partial SVE SI vector. */
1521 case E_VNx2SImode:
1522 /* Partial SVE HF vectors. */
1523 case E_VNx2HFmode:
1524 case E_VNx4HFmode:
1525 /* Partial SVE BF vectors. */
1526 case E_VNx2BFmode:
1527 case E_VNx4BFmode:
1528 /* Partial SVE SF vector. */
1529 case E_VNx2SFmode:
1530 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1532 case E_VNx16QImode:
1533 case E_VNx8HImode:
1534 case E_VNx4SImode:
1535 case E_VNx2DImode:
1536 case E_VNx8BFmode:
1537 case E_VNx8HFmode:
1538 case E_VNx4SFmode:
1539 case E_VNx2DFmode:
1540 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;
1542 /* x2 SVE vectors. */
1543 case E_VNx32QImode:
1544 case E_VNx16HImode:
1545 case E_VNx8SImode:
1546 case E_VNx4DImode:
1547 case E_VNx16BFmode:
1548 case E_VNx16HFmode:
1549 case E_VNx8SFmode:
1550 case E_VNx4DFmode:
1551 /* x3 SVE vectors. */
1552 case E_VNx48QImode:
1553 case E_VNx24HImode:
1554 case E_VNx12SImode:
1555 case E_VNx6DImode:
1556 case E_VNx24BFmode:
1557 case E_VNx24HFmode:
1558 case E_VNx12SFmode:
1559 case E_VNx6DFmode:
1560 /* x4 SVE vectors. */
1561 case E_VNx64QImode:
1562 case E_VNx32HImode:
1563 case E_VNx16SImode:
1564 case E_VNx8DImode:
1565 case E_VNx32BFmode:
1566 case E_VNx32HFmode:
1567 case E_VNx16SFmode:
1568 case E_VNx8DFmode:
1569 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;
1571 case E_OImode:
1572 case E_CImode:
1573 case E_XImode:
1574 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1576 /* Structures of 64-bit Advanced SIMD vectors. */
1577 case E_V2x8QImode:
1578 case E_V2x4HImode:
1579 case E_V2x2SImode:
1580 case E_V2x1DImode:
1581 case E_V2x4BFmode:
1582 case E_V2x4HFmode:
1583 case E_V2x2SFmode:
1584 case E_V2x1DFmode:
1585 case E_V3x8QImode:
1586 case E_V3x4HImode:
1587 case E_V3x2SImode:
1588 case E_V3x1DImode:
1589 case E_V3x4BFmode:
1590 case E_V3x4HFmode:
1591 case E_V3x2SFmode:
1592 case E_V3x1DFmode:
1593 case E_V4x8QImode:
1594 case E_V4x4HImode:
1595 case E_V4x2SImode:
1596 case E_V4x1DImode:
1597 case E_V4x4BFmode:
1598 case E_V4x4HFmode:
1599 case E_V4x2SFmode:
1600 case E_V4x1DFmode:
1601 return (TARGET_FLOAT || any_target_p)
1602 ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
1604 /* Structures of 128-bit Advanced SIMD vectors. */
1605 case E_V2x16QImode:
1606 case E_V2x8HImode:
1607 case E_V2x4SImode:
1608 case E_V2x2DImode:
1609 case E_V2x8BFmode:
1610 case E_V2x8HFmode:
1611 case E_V2x4SFmode:
1612 case E_V2x2DFmode:
1613 case E_V3x16QImode:
1614 case E_V3x8HImode:
1615 case E_V3x4SImode:
1616 case E_V3x2DImode:
1617 case E_V3x8BFmode:
1618 case E_V3x8HFmode:
1619 case E_V3x4SFmode:
1620 case E_V3x2DFmode:
1621 case E_V4x16QImode:
1622 case E_V4x8HImode:
1623 case E_V4x4SImode:
1624 case E_V4x2DImode:
1625 case E_V4x8BFmode:
1626 case E_V4x8HFmode:
1627 case E_V4x4SFmode:
1628 case E_V4x2DFmode:
1629 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1631 /* 64-bit Advanced SIMD vectors. */
1632 case E_V8QImode:
1633 case E_V4HImode:
1634 case E_V2SImode:
1635 case E_V1DImode:
1636 case E_V4HFmode:
1637 case E_V4BFmode:
1638 case E_V2SFmode:
1639 case E_V1DFmode:
1640 /* 128-bit Advanced SIMD vectors. */
1641 case E_V16QImode:
1642 case E_V8HImode:
1643 case E_V4SImode:
1644 case E_V2DImode:
1645 case E_V8HFmode:
1646 case E_V8BFmode:
1647 case E_V4SFmode:
1648 case E_V2DFmode:
1649 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;
1651 case E_VNx32BImode:
1652 return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;
1654 default:
1655 return 0;
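/* A few illustrative classifications, assuming the relevant target features
   are enabled: V4SImode -> VEC_ADVSIMD, VNx4SImode -> VEC_SVE_DATA,
   VNx2SImode -> VEC_SVE_DATA | VEC_PARTIAL,
   VNx8SImode -> VEC_SVE_DATA | VEC_STRUCT,
   V2x4SImode -> VEC_ADVSIMD | VEC_STRUCT,
   and unsupported or scalar modes classify as 0.  */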
1659 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1660 bool
1661 aarch64_advsimd_struct_mode_p (machine_mode mode)
1663 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1664 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
1667 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1668 static bool
1669 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
1671 return (aarch64_classify_vector_mode (mode)
1672 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
1675 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1676 static bool
1677 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
1679 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
1682 /* Return true if MODE is any of the data vector modes, including
1683 structure modes. */
1684 static bool
1685 aarch64_vector_data_mode_p (machine_mode mode)
1687 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1690 /* Return true if MODE is any form of SVE mode, including predicates,
1691 vectors and structures. */
1692 bool
1693 aarch64_sve_mode_p (machine_mode mode)
1695 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1698 /* Return true if MODE is an SVE data vector mode; either a single vector
1699 or a structure of vectors. */
1700 static bool
1701 aarch64_sve_data_mode_p (machine_mode mode)
1703 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1706 /* Return the number of defined bytes in one constituent vector of
1707 SVE mode MODE, which has vector flags VEC_FLAGS. */
1708 static poly_int64
1709 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1711 if (vec_flags & VEC_PARTIAL)
1712 /* A single partial vector. */
1713 return GET_MODE_SIZE (mode);
1715 if (vec_flags & VEC_SVE_DATA)
1716 /* A single vector or a tuple. */
1717 return BYTES_PER_SVE_VECTOR;
1719 /* A single predicate. */
1720 gcc_assert (vec_flags & VEC_SVE_PRED);
1721 return BYTES_PER_SVE_PRED;
1724 /* If MODE holds an array of vectors, return the number of vectors
1725 in the array, otherwise return 1. */
1727 static unsigned int
1728 aarch64_ldn_stn_vectors (machine_mode mode)
1730 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1731 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
1732 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
1733 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
1734 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
1735 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
1736 return exact_div (GET_MODE_SIZE (mode),
1737 BYTES_PER_SVE_VECTOR).to_constant ();
1738 return 1;
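/* For example, V2x8QImode (two 8-byte Advanced SIMD vectors) gives 2,
   V3x16QImode (three 16-byte vectors) gives 3, VNx8SImode (two SVE
   vectors) gives 2, and any single-vector mode gives 1.  */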
1741 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1742 corresponding vector structure mode. */
1743 static opt_machine_mode
1744 aarch64_advsimd_vector_array_mode (machine_mode mode,
1745 unsigned HOST_WIDE_INT nelems)
1747 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
1748 if (known_eq (GET_MODE_SIZE (mode), 8))
1749 flags |= VEC_PARTIAL;
1751 machine_mode struct_mode;
1752 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
1753 if (aarch64_classify_vector_mode (struct_mode) == flags
1754 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
1755 && known_eq (GET_MODE_NUNITS (struct_mode),
1756 GET_MODE_NUNITS (mode) * nelems))
1757 return struct_mode;
1758 return opt_machine_mode ();
1761 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1763 opt_machine_mode
1764 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1766 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1767 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1768 machine_mode mode;
1769 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1770 if (inner_mode == GET_MODE_INNER (mode)
1771 && known_eq (nunits, GET_MODE_NUNITS (mode))
1772 && aarch64_sve_data_mode_p (mode))
1773 return mode;
1774 return opt_machine_mode ();
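/* A worked example for the function above (illustrative): with SImode
   elements, asking for the full number of 32-bit lanes should give
   VNx4SImode, while asking for half that number should give the partial
   vector mode VNx2SImode, in which each 32-bit value occupies a 64-bit
   container.  */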
1777 /* Implement target hook TARGET_ARRAY_MODE. */
1778 static opt_machine_mode
1779 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1781 if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1783 /* Use VNx32BI for pairs of predicates, but explicitly reject giving
1784 a mode to other array sizes. Using integer modes requires a round
1785 trip through memory and generates terrible code. */
1786 if (nelems == 1)
1787 return mode;
1788 if (mode == VNx16BImode && nelems == 2)
1789 return VNx32BImode;
1790 return BLKmode;
1793 auto flags = aarch64_classify_vector_mode (mode);
1794 if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
1795 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
1796 GET_MODE_NUNITS (mode) * nelems);
1798 if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
1799 return aarch64_advsimd_vector_array_mode (mode, nelems);
1801 return opt_machine_mode ();
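/* Illustrative examples: an array of two VNx4SImode vectors maps to the
   SVE tuple mode VNx8SImode, an array of two V4SImode vectors maps to
   V2x4SImode, and a pair of VNx16BImode predicates maps to VNx32BImode,
   as handled explicitly above.  */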
1804 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1805 static bool
1806 aarch64_array_mode_supported_p (machine_mode mode,
1807 unsigned HOST_WIDE_INT nelems)
1809 if (TARGET_BASE_SIMD
1810 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1811 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1812 && (nelems >= 2 && nelems <= 4))
1813 return true;
1815 return false;
1818 /* MODE is some form of SVE vector mode. For data modes, return the number
1819 of vector register bits that each element of MODE occupies, such as 64
1820 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1821 in a 64-bit container). For predicate modes, return the number of
1822 data bits controlled by each significant predicate bit. */
1824 static unsigned int
1825 aarch64_sve_container_bits (machine_mode mode)
1827 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1828 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1829 ? BITS_PER_SVE_VECTOR
1830 : GET_MODE_BITSIZE (mode));
1831 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
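/* Worked examples (illustrative): VNx4SImode gives 32, the partial
   vector mode VNx2SImode gives 64 (each 32-bit value lives in a 64-bit
   container), and the predicate mode VNx2BImode gives 64 (each
   significant predicate bit controls 64 data bits).  */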
1834 /* Return the SVE predicate mode to use for elements that have
1835 ELEM_NBYTES bytes, if such a mode exists. */
1837 opt_machine_mode
1838 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1840 if (TARGET_SVE)
1842 if (elem_nbytes == 1)
1843 return VNx16BImode;
1844 if (elem_nbytes == 2)
1845 return VNx8BImode;
1846 if (elem_nbytes == 4)
1847 return VNx4BImode;
1848 if (elem_nbytes == 8)
1849 return VNx2BImode;
1851 return opt_machine_mode ();
1854 /* Return the SVE predicate mode that should be used to control
1855 SVE mode MODE. */
1857 machine_mode
1858 aarch64_sve_pred_mode (machine_mode mode)
1860 unsigned int bits = aarch64_sve_container_bits (mode);
1861 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1864 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1866 static opt_machine_mode
1867 aarch64_get_mask_mode (machine_mode mode)
1869 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1870 if (vec_flags & VEC_SVE_DATA)
1871 return aarch64_sve_pred_mode (mode);
1873 return default_get_mask_mode (mode);
1876 /* Return the integer element mode associated with SVE mode MODE. */
1878 static scalar_int_mode
1879 aarch64_sve_element_int_mode (machine_mode mode)
1881 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1882 ? BITS_PER_SVE_VECTOR
1883 : GET_MODE_BITSIZE (mode));
1884 unsigned int elt_bits = vector_element_size (vector_bits,
1885 GET_MODE_NUNITS (mode));
1886 return int_mode_for_size (elt_bits, 0).require ();
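/* Illustrative examples: VNx4SFmode maps to SImode, VNx8HFmode maps to
   HImode, and the predicate mode VNx8BImode also maps to HImode, since
   each predicate element stands for 16 data bits.  */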
1889 /* Return an integer element mode that contains exactly
1890 aarch64_sve_container_bits (MODE) bits. This is wider than
1891 aarch64_sve_element_int_mode if MODE is a partial vector,
1892 otherwise it's the same. */
1894 static scalar_int_mode
1895 aarch64_sve_container_int_mode (machine_mode mode)
1897 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1900 /* Return the integer vector mode associated with SVE mode MODE.
1901 Unlike related_int_vector_mode, this can handle the case in which
1902 MODE is a predicate (and thus has a different total size). */
1904 machine_mode
1905 aarch64_sve_int_mode (machine_mode mode)
1907 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1908 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1911 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1913 static opt_machine_mode
1914 aarch64_vectorize_related_mode (machine_mode vector_mode,
1915 scalar_mode element_mode,
1916 poly_uint64 nunits)
1918 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1920 /* If we're operating on SVE vectors, try to return an SVE mode. */
1921 poly_uint64 sve_nunits;
1922 if ((vec_flags & VEC_SVE_DATA)
1923 && multiple_p (BYTES_PER_SVE_VECTOR,
1924 GET_MODE_SIZE (element_mode), &sve_nunits))
1926 machine_mode sve_mode;
1927 if (maybe_ne (nunits, 0U))
1929 /* Try to find a full or partial SVE mode with exactly
1930 NUNITS units. */
1931 if (multiple_p (sve_nunits, nunits)
1932 && aarch64_sve_data_mode (element_mode,
1933 nunits).exists (&sve_mode))
1934 return sve_mode;
1936 else
1938 /* Take the preferred number of units from the number of bytes
1939 that fit in VECTOR_MODE. We always start by "autodetecting"
1940 a full vector mode with preferred_simd_mode, so vectors
1941 chosen here will also be full vector modes. Then
1942 autovectorize_vector_modes tries smaller starting modes
1943 and thus smaller preferred numbers of units. */
1944 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1945 if (aarch64_sve_data_mode (element_mode,
1946 sve_nunits).exists (&sve_mode))
1947 return sve_mode;
1951 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1952 if (TARGET_SIMD
1953 && (vec_flags & VEC_ADVSIMD)
1954 && known_eq (nunits, 0U)
1955 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1956 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1957 * GET_MODE_NUNITS (vector_mode), 128U))
1959 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1960 if (VECTOR_MODE_P (res))
1961 return res;
1964 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1967 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
1969 static bool
1970 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
1972 machine_mode mode = TYPE_MODE (type);
1973 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1974 bool sve_p = (vec_flags & VEC_ANY_SVE);
1975 bool simd_p = (vec_flags & VEC_ADVSIMD);
1977 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
1980 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1981 prefer to use the first arithmetic operand as the else value if
1982 the else value doesn't matter, since that exactly matches the SVE
1983 destructive merging form. For ternary operations we could either
1984 pick the first operand and use FMAD-like instructions or the last
1985 operand and use FMLA-like instructions; the latter seems more
1986 natural. */
1988 static tree
1989 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1991 return nops == 3 ? ops[2] : ops[0];
1994 /* Implement TARGET_HARD_REGNO_NREGS. */
1996 static unsigned int
1997 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1999 /* ??? Logically we should only need to provide a value when
2000 HARD_REGNO_MODE_OK says that the combination is valid,
2001 but at the moment we need to handle all modes. Just ignore
2002 any runtime parts for registers that can't store them. */
2003 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2004 switch (aarch64_regno_regclass (regno))
2006 case FP_REGS:
2007 case FP_LO_REGS:
2008 case FP_LO8_REGS:
2010 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2011 if (vec_flags & VEC_SVE_DATA)
2012 return exact_div (GET_MODE_SIZE (mode),
2013 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2014 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
2015 return GET_MODE_SIZE (mode).to_constant () / 8;
2016 return CEIL (lowest_size, UNITS_PER_VREG);
2019 case PR_REGS:
2020 case PR_LO_REGS:
2021 case PR_HI_REGS:
2022 return mode == VNx32BImode ? 2 : 1;
2024 case FFR_REGS:
2025 case PR_AND_FFR_REGS:
2026 case FAKE_REGS:
2027 return 1;
2029 default:
2030 return CEIL (lowest_size, UNITS_PER_WORD);
2032 gcc_unreachable ();
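/* Worked examples for the function above (illustrative): TImode in
   general registers needs CEIL (16, UNITS_PER_WORD) == 2 registers,
   V16QImode in an FP register needs 1, the SVE tuple mode VNx32QImode
   needs 2 vector registers, and VNx32BImode needs 2 predicate
   registers.  */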
2035 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2037 static bool
2038 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2040 if (mode == V8DImode)
2041 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
2042 && multiple_p (regno - R0_REGNUM, 2);
2044 if (GET_MODE_CLASS (mode) == MODE_CC)
2045 return regno == CC_REGNUM;
2047 if (regno == VG_REGNUM)
2048 /* This must have the same size as _Unwind_Word. */
2049 return mode == DImode;
2051 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2052 if (vec_flags == VEC_SVE_PRED)
2053 return pr_or_ffr_regnum_p (regno);
2055 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
2056 return PR_REGNUM_P (regno);
2058 if (pr_or_ffr_regnum_p (regno))
2059 return false;
2061 /* These registers are abstract; their modes don't matter. */
2062 if (FAKE_REGNUM_P (regno))
2063 return true;
2065 if (regno == SP_REGNUM)
2066 /* The purpose of comparing with ptr_mode is to support the
2067 global register variable associated with the stack pointer
2068 register via the syntax of asm ("wsp") in ILP32. */
2069 return mode == Pmode || mode == ptr_mode;
2071 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2072 return mode == Pmode;
2074 if (GP_REGNUM_P (regno))
2076 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
2077 return false;
2078 if (known_le (GET_MODE_SIZE (mode), 8))
2079 return true;
2080 if (known_le (GET_MODE_SIZE (mode), 16))
2081 return (regno & 1) == 0;
2083 else if (FP_REGNUM_P (regno))
2085 if (vec_flags & VEC_STRUCT)
2086 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2087 else
2088 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2091 return false;
2094 /* Return true if a function with type FNTYPE returns its value in
2095 SVE vector or predicate registers. */
2097 static bool
2098 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2100 tree return_type = TREE_TYPE (fntype);
2102 pure_scalable_type_info pst_info;
2103 switch (pst_info.analyze (return_type))
2105 case pure_scalable_type_info::IS_PST:
2106 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2107 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2109 case pure_scalable_type_info::DOESNT_MATTER:
2110 gcc_assert (aarch64_return_in_memory_1 (return_type));
2111 return false;
2113 case pure_scalable_type_info::NO_ABI_IDENTITY:
2114 case pure_scalable_type_info::ISNT_PST:
2115 return false;
2117 gcc_unreachable ();
2120 /* Return true if a function with type FNTYPE takes arguments in
2121 SVE vector or predicate registers. */
2123 static bool
2124 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2126 CUMULATIVE_ARGS args_so_far_v;
2127 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2128 NULL_TREE, 0, true);
2129 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2131 for (tree chain = TYPE_ARG_TYPES (fntype);
2132 chain && chain != void_list_node;
2133 chain = TREE_CHAIN (chain))
2135 tree arg_type = TREE_VALUE (chain);
2136 if (arg_type == error_mark_node)
2137 return false;
2139 function_arg_info arg (arg_type, /*named=*/true);
2140 apply_pass_by_reference_rules (&args_so_far_v, arg);
2141 pure_scalable_type_info pst_info;
2142 if (pst_info.analyze_registers (arg.type))
2144 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2145 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2146 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2147 return true;
2150 targetm.calls.function_arg_advance (args_so_far, arg);
2152 return false;
2155 /* Implement TARGET_FNTYPE_ABI. */
2157 static const predefined_function_abi &
2158 aarch64_fntype_abi (const_tree fntype)
2160 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2161 return aarch64_simd_abi ();
2163 if (aarch64_returns_value_in_sve_regs_p (fntype)
2164 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2165 return aarch64_sve_abi ();
2167 return default_function_abi;
2170 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2172 static aarch64_feature_flags
2173 aarch64_fntype_pstate_sm (const_tree fntype)
2175 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
2176 return AARCH64_FL_SM_ON;
2178 if (lookup_attribute ("arm", "streaming_compatible",
2179 TYPE_ATTRIBUTES (fntype)))
2180 return 0;
2182 return AARCH64_FL_SM_OFF;
2185 /* Return state flags that describe whether and how functions of type
2186 FNTYPE share state STATE_NAME with their callers. */
2188 static unsigned int
2189 aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
2191 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
2192 state_name);
2195 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2197 static aarch64_feature_flags
2198 aarch64_fntype_pstate_za (const_tree fntype)
2200 if (aarch64_fntype_shared_flags (fntype, "za")
2201 || aarch64_fntype_shared_flags (fntype, "zt0"))
2202 return AARCH64_FL_ZA_ON;
2204 return 0;
2207 /* Return the ISA mode on entry to functions of type FNTYPE. */
2209 static aarch64_feature_flags
2210 aarch64_fntype_isa_mode (const_tree fntype)
2212 return (aarch64_fntype_pstate_sm (fntype)
2213 | aarch64_fntype_pstate_za (fntype));
2216 /* Return true if FNDECL uses streaming mode internally, as an
2217 implementation choice. */
2219 static bool
2220 aarch64_fndecl_is_locally_streaming (const_tree fndecl)
2222 return lookup_attribute ("arm", "locally_streaming",
2223 DECL_ATTRIBUTES (fndecl));
2226 /* Return the state of PSTATE.SM when compiling the body of
2227 function FNDECL. This might be different from the state of
2228 PSTATE.SM on entry. */
2230 static aarch64_feature_flags
2231 aarch64_fndecl_pstate_sm (const_tree fndecl)
2233 if (aarch64_fndecl_is_locally_streaming (fndecl))
2234 return AARCH64_FL_SM_ON;
2236 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
2239 /* Return true if function FNDECL has state STATE_NAME, either by creating
2240 new state itself or by sharing state with callers. */
2242 static bool
2243 aarch64_fndecl_has_state (tree fndecl, const char *state_name)
2245 return (aarch64_fndecl_has_new_state (fndecl, state_name)
2246 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
2247 state_name) != 0);
2250 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2251 This might be different from the state of PSTATE.ZA on entry. */
2253 static aarch64_feature_flags
2254 aarch64_fndecl_pstate_za (const_tree fndecl)
2256 if (aarch64_fndecl_has_new_state (fndecl, "za")
2257 || aarch64_fndecl_has_new_state (fndecl, "zt0"))
2258 return AARCH64_FL_ZA_ON;
2260 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
2263 /* Return the ISA mode that should be used to compile the body of
2264 function FNDECL. */
2266 static aarch64_feature_flags
2267 aarch64_fndecl_isa_mode (const_tree fndecl)
2269 return (aarch64_fndecl_pstate_sm (fndecl)
2270 | aarch64_fndecl_pstate_za (fndecl));
2273 /* Return the state of PSTATE.SM on entry to the current function.
2274 This might be different from the state of PSTATE.SM in the function
2275 body. */
2277 static aarch64_feature_flags
2278 aarch64_cfun_incoming_pstate_sm ()
2280 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
2283 /* Return the state of PSTATE.ZA on entry to the current function.
2284 This might be different from the state of PSTATE.ZA in the function
2285 body. */
2287 static aarch64_feature_flags
2288 aarch64_cfun_incoming_pstate_za ()
2290 return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
2293 /* Return state flags that describe whether and how the current function shares
2294 state STATE_NAME with callers. */
2296 static unsigned int
2297 aarch64_cfun_shared_flags (const char *state_name)
2299 return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
2302 /* Return true if the current function creates new state of type STATE_NAME
2303 (as opposed to sharing the state with its callers or ignoring the state
2304 altogether). */
2306 static bool
2307 aarch64_cfun_has_new_state (const char *state_name)
2309 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
2312 /* Return true if PSTATE.SM is 1 in the body of the current function,
2313 but is not guaranteed to be 1 on entry. */
2315 static bool
2316 aarch64_cfun_enables_pstate_sm ()
2318 return (aarch64_fndecl_is_locally_streaming (cfun->decl)
2319 && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
2322 /* Return true if the current function has state STATE_NAME, either by
2323 creating new state itself or by sharing state with callers. */
2325 static bool
2326 aarch64_cfun_has_state (const char *state_name)
2328 return aarch64_fndecl_has_state (cfun->decl, state_name);
2331 /* Return true if a call from the current function to a function with
2332 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2333 the BL instruction. */
2335 static bool
2336 aarch64_call_switches_pstate_sm (aarch64_feature_flags callee_mode)
2338 return (callee_mode & ~AARCH64_ISA_MODE & AARCH64_FL_SM_STATE) != 0;
2341 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2343 static bool
2344 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2346 return (aarch64_sve::builtin_type_p (type1)
2347 == aarch64_sve::builtin_type_p (type2));
2350 /* Return true if we should emit CFI for register REGNO. */
2352 static bool
2353 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2355 return (GP_REGNUM_P (regno)
2356 || !default_function_abi.clobbers_full_reg_p (regno));
2359 /* Return the mode we should use to save and restore register REGNO. */
2361 static machine_mode
2362 aarch64_reg_save_mode (unsigned int regno)
2364 if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
2365 return DImode;
2367 if (FP_REGNUM_P (regno))
2368 switch (crtl->abi->id ())
2370 case ARM_PCS_AAPCS64:
2371 /* Only the low 64 bits are saved by the base PCS. */
2372 return DFmode;
2374 case ARM_PCS_SIMD:
2375 /* The vector PCS saves the low 128 bits (which is the full
2376 register on non-SVE targets). */
2377 return V16QImode;
2379 case ARM_PCS_SVE:
2380 /* Use vectors of DImode for registers that need frame
2381 information, so that the first 64 bits of the save slot
2382 are always the equivalent of what storing D<n> would give. */
2383 if (aarch64_emit_cfi_for_reg_p (regno))
2384 return VNx2DImode;
2386 /* Use vectors of bytes otherwise, so that the layout is
2387 endian-agnostic, and so that we can use LDR and STR for
2388 big-endian targets. */
2389 return VNx16QImode;
2391 case ARM_PCS_TLSDESC:
2392 case ARM_PCS_UNKNOWN:
2393 break;
2396 if (PR_REGNUM_P (regno))
2397 /* Save the full predicate register. */
2398 return VNx16BImode;
2400 gcc_unreachable ();
2403 /* Given the ISA mode on entry to a callee and the ABI of the callee,
2404 return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx. */
2407 aarch64_gen_callee_cookie (aarch64_feature_flags isa_mode, arm_pcs pcs_variant)
2409 return gen_int_mode ((unsigned int) isa_mode
2410 | (unsigned int) pcs_variant << AARCH64_NUM_ISA_MODES,
2411 DImode);
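/* Illustrative sketch of the cookie layout built above: the low
   AARCH64_NUM_ISA_MODES bits hold the callee's ISA mode and the
   remaining bits hold the arm_pcs value, so aarch64_callee_abi and
   aarch64_callee_isa_mode below recover their results by shifting and
   masking the same CONST_INT.  */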
2414 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2415 callee's ABI. */
2417 static const predefined_function_abi &
2418 aarch64_callee_abi (rtx cookie)
2420 return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
2423 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2424 required ISA mode on entry to the callee, which is also the ISA
2425 mode on return from the callee. */
2427 static aarch64_feature_flags
2428 aarch64_callee_isa_mode (rtx cookie)
2430 return UINTVAL (cookie) & AARCH64_FL_ISA_MODES;
2433 /* INSN is a call instruction. Return the CONST_INT stored in its
2434 UNSPEC_CALLEE_ABI rtx. */
2436 static rtx
2437 aarch64_insn_callee_cookie (const rtx_insn *insn)
2439 rtx pat = PATTERN (insn);
2440 gcc_assert (GET_CODE (pat) == PARALLEL);
2441 rtx unspec = XVECEXP (pat, 0, 1);
2442 gcc_assert (GET_CODE (unspec) == UNSPEC
2443 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2444 return XVECEXP (unspec, 0, 0);
2447 /* Implement TARGET_INSN_CALLEE_ABI. */
2449 const predefined_function_abi &
2450 aarch64_insn_callee_abi (const rtx_insn *insn)
2452 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
2455 /* INSN is a call instruction. Return the required ISA mode on entry to
2456 the callee, which is also the ISA mode on return from the callee. */
2458 static aarch64_feature_flags
2459 aarch64_insn_callee_isa_mode (const rtx_insn *insn)
2461 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
2464 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2465 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2466 clobbers the top 64 bits when restoring the bottom 64 bits. */
2468 static bool
2469 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2470 unsigned int regno,
2471 machine_mode mode)
2473 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2475 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2476 unsigned int nregs = hard_regno_nregs (regno, mode);
2477 if (nregs > 1)
2478 per_register_size = exact_div (per_register_size, nregs);
2479 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2480 return maybe_gt (per_register_size, 16);
2481 return maybe_gt (per_register_size, 8);
2483 return false;
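/* For example (illustrative): under the base PCS, V16QImode in a vector
   register is partially clobbered because only the low 64 bits are
   preserved, whereas under ARM_PCS_SIMD the low 128 bits are preserved
   and the function above returns false for the same mode.  */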
2486 /* Implement REGMODE_NATURAL_SIZE. */
2487 poly_uint64
2488 aarch64_regmode_natural_size (machine_mode mode)
2490 /* The natural size for SVE data modes is one SVE data vector,
2491 and similarly for predicates. We can't independently modify
2492 anything smaller than that. */
2493 /* ??? For now, only do this for variable-width SVE registers.
2494 Doing it for constant-sized registers breaks lower-subreg.cc. */
2495 /* ??? And once that's fixed, we should probably have similar
2496 code for Advanced SIMD. */
2497 if (!aarch64_sve_vg.is_constant ())
2499 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2500 if (vec_flags & VEC_SVE_PRED)
2501 return BYTES_PER_SVE_PRED;
2502 if (vec_flags & VEC_SVE_DATA)
2503 return BYTES_PER_SVE_VECTOR;
2505 return UNITS_PER_WORD;
2508 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2509 machine_mode
2510 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2511 machine_mode mode)
2513 /* The predicate mode determines which bits are significant and
2514 which are "don't care". Decreasing the number of lanes would
2515 lose data while increasing the number of lanes would make bits
2516 unnecessarily significant. */
2517 if (PR_REGNUM_P (regno))
2518 return mode;
2519 if (known_ge (GET_MODE_SIZE (mode), 4))
2520 return mode;
2521 else
2522 return SImode;
2525 /* Return true if I's bits are consecutive ones from the MSB. */
2526 bool
2527 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2529 return exact_log2 (-i) != HOST_WIDE_INT_M1;
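/* A worked example (illustrative): for I == 0xffff000000000000, -I is
   0x0001000000000000, a power of two, so the function above returns
   true; for I == 0xff0f000000000000, -I is not a power of two and the
   result is false.  */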
2532 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2533 that strcpy from constants will be faster. */
2535 static HOST_WIDE_INT
2536 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2538 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2539 return MAX (align, BITS_PER_WORD);
2540 return align;
2543 /* Return true if calls to DECL should be treated as
2544 long-calls (i.e. called via a register). */
2545 static bool
2546 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2548 return false;
2551 /* Return true if calls to symbol-ref SYM should be treated as
2552 long-calls (i.e. called via a register). */
2553 bool
2554 aarch64_is_long_call_p (rtx sym)
2556 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2559 /* Return true if calls to symbol-ref SYM should not go through
2560 plt stubs. */
2562 bool
2563 aarch64_is_noplt_call_p (rtx sym)
2565 const_tree decl = SYMBOL_REF_DECL (sym);
2567 if (flag_pic
2568 && decl
2569 && (!flag_plt
2570 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2571 && !targetm.binds_local_p (decl))
2572 return true;
2574 return false;
2577 /* Emit an insn that's a simple single-set. Both the operands must be
2578 known to be valid. */
2579 inline static rtx_insn *
2580 emit_set_insn (rtx x, rtx y)
2582 return emit_insn (gen_rtx_SET (x, y));
2585 /* X and Y are two things to compare using CODE. Emit the compare insn and
2586 return the rtx for the CC register in the proper mode. */
2588 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2590 machine_mode cmp_mode = GET_MODE (x);
2591 machine_mode cc_mode;
2592 rtx cc_reg;
2594 if (cmp_mode == TImode)
2596 gcc_assert (code == NE);
2598 cc_mode = CCmode;
2599 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2601 rtx x_lo = operand_subword (x, 0, 0, TImode);
2602 rtx y_lo = operand_subword (y, 0, 0, TImode);
2603 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2605 rtx x_hi = operand_subword (x, 1, 0, TImode);
2606 rtx y_hi = operand_subword (y, 1, 0, TImode);
2607 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2608 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2609 GEN_INT (AARCH64_EQ)));
2611 else
2613 cc_mode = SELECT_CC_MODE (code, x, y);
2614 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2615 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2617 return cc_reg;
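/* Illustrative note on the TImode path above: an NE comparison of two
   TImode values is expanded as a CMP of the low halves followed by a
   CCMP of the high halves, leaving the combined result in the condition
   flags rather than using a wider compare.  */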
2620 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2622 static rtx
2623 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2624 machine_mode y_mode)
2626 if (y_mode == E_QImode || y_mode == E_HImode)
2628 if (CONST_INT_P (y))
2630 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2631 y_mode = SImode;
2633 else
2635 rtx t, cc_reg;
2636 machine_mode cc_mode;
2638 t = gen_rtx_ZERO_EXTEND (SImode, y);
2639 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2640 cc_mode = CC_SWPmode;
2641 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2642 emit_set_insn (cc_reg, t);
2643 return cc_reg;
2647 if (!aarch64_plus_operand (y, y_mode))
2648 y = force_reg (y_mode, y);
2650 return aarch64_gen_compare_reg (code, x, y);
2653 /* Generate conditional branch to LABEL, comparing X to 0 using CODE.
2654 Return the jump instruction. */
2656 static rtx
2657 aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
2658 rtx_code_label *label)
2660 if (aarch64_track_speculation)
2662 /* Emit an explicit compare instruction, so that we can correctly
2663 track the condition codes. */
2664 rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
2665 x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
2667 else
2668 x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);
2670 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
2671 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
2672 return gen_rtx_SET (pc_rtx, x);
2675 /* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
2676 If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
2677 it branches to LABEL when the bit is clear. */
2679 static rtx
2680 aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
2681 rtx_code_label *label)
2683 auto mode = GET_MODE (x);
2684 if (aarch64_track_speculation)
2686 auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
2687 emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
2688 rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
2689 rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
2690 return gen_condjump (x, cc_reg, label);
2692 return gen_aarch64_tb (code, mode, mode,
2693 x, gen_int_mode (bitnum, mode), label);
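/* For example (illustrative): without -mtrack-speculation this would
   typically emit a single TBZ/TBNZ on bit BITNUM, whereas with
   speculation tracking enabled it emits an explicit TST-style AND that
   sets the flags, followed by a conditional branch.  */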
2696 /* Consider the operation:
2698 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2700 where:
2702 - CODE is [SU]MAX or [SU]MIN
2703 - OPERANDS[2] and OPERANDS[3] are constant integers
2704 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2705 - all operands have mode MODE
2707 Decide whether it is possible to implement the operation using:
2709 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2711 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2713 followed by:
2715 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2717 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2718 If GENERATE_P is true, also update OPERANDS as follows:
2720 OPERANDS[4] = -OPERANDS[3]
2721 OPERANDS[5] = the rtl condition representing <cond>
2722 OPERANDS[6] = <tmp>
2723 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2724 bool
2725 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
2727 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
2728 rtx dst = operands[0];
2729 rtx maxmin_op = operands[2];
2730 rtx add_op = operands[3];
2731 machine_mode mode = GET_MODE (dst);
2733 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2734 == (x >= y ? x : y) - z
2735 == (x > y ? x : y) - z
2736 == (x > y - 1 ? x : y) - z
2738 min (x, y) - z == (x <= y - 1 ? x : y) - z
2739 == (x <= y ? x : y) - z
2740 == (x < y ? x : y) - z
2741 == (x < y + 1 ? x : y) - z
2743 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2744 which x is compared with z. Set DIFF to y - z. Thus the supported
2745 combinations are as follows, with DIFF being the value after the ":":
2747 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2748 == x >= y ? x - y : 0 [z == y]
2749 == x > y ? x - y : 0 [z == y]
2750 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2752 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2753 == x <= y ? x - y : 0 [z == y]
2754 == x < y ? x - y : 0 [z == y]
2755 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2756 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
2757 auto add_val = rtx_mode_t (add_op, mode);
2758 auto sub_val = wi::neg (add_val);
2759 auto diff = wi::sub (maxmin_val, sub_val);
2760 if (!(diff == 0
2761 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
2762 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
2763 return false;
2765 if (!generate_p)
2766 return true;
2768 rtx_code cmp;
2769 switch (code)
2771 case SMAX:
2772 cmp = diff == 1 ? GT : GE;
2773 break;
2774 case UMAX:
2775 cmp = diff == 1 ? GTU : GEU;
2776 break;
2777 case SMIN:
2778 cmp = diff == -1 ? LT : LE;
2779 break;
2780 case UMIN:
2781 cmp = diff == -1 ? LTU : LEU;
2782 break;
2783 default:
2784 gcc_unreachable ();
2786 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
2788 operands[4] = immed_wide_int_const (sub_val, mode);
2789 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
2790 if (can_create_pseudo_p ())
2791 operands[6] = gen_reg_rtx (mode);
2792 else
2793 operands[6] = dst;
2794 operands[7] = immed_wide_int_const (diff, mode);
2796 return true;
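/* A worked example (illustrative): for SMAX (x, 3) - 4 we have
   z == y + 1, so DIFF is -1 and the expected sequence is:

	subs	<tmp>, x, #4
	csinv	<dest>, <tmp>, xzr, ge

   giving x - 4 when x >= 4 and -1 otherwise, matching the table
   above.  */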
2800 /* Build the SYMBOL_REF for __tls_get_addr. */
2802 static GTY(()) rtx tls_get_addr_libfunc;
2805 aarch64_tls_get_addr (void)
2807 if (!tls_get_addr_libfunc)
2808 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2809 return tls_get_addr_libfunc;
2812 /* Return the TLS model to use for ADDR. */
2814 static enum tls_model
2815 tls_symbolic_operand_type (rtx addr)
2817 enum tls_model tls_kind = TLS_MODEL_NONE;
2818 poly_int64 offset;
2819 addr = strip_offset_and_salt (addr, &offset);
2820 if (SYMBOL_REF_P (addr))
2821 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2823 return tls_kind;
2826 /* We allow LO_SUMs in our legitimate addresses so that combine can
2827 take care of combining addresses where necessary, but for generation
2828 purposes, we generate the address as:
2830 RTL Absolute
2831 tmp = hi (symbol_ref); adrp x1, foo
2832 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
2835 PIC TLS
2836 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2837 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2838 bl __tls_get_addr
2841 Load TLS symbol, depending on TLS mechanism and TLS access model.
2843 Global Dynamic - Traditional TLS:
2844 adrp tmp, :tlsgd:imm
2845 add dest, tmp, #:tlsgd_lo12:imm
2846 bl __tls_get_addr
2848 Global Dynamic - TLS Descriptors:
2849 adrp dest, :tlsdesc:imm
2850 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2851 add dest, dest, #:tlsdesc_lo12:imm
2852 blr tmp
2853 mrs tp, tpidr_el0
2854 add dest, dest, tp
2856 Initial Exec:
2857 mrs tp, tpidr_el0
2858 adrp tmp, :gottprel:imm
2859 ldr dest, [tmp, #:gottprel_lo12:imm]
2860 add dest, dest, tp
2862 Local Exec:
2863 mrs tp, tpidr_el0
2864 add t0, tp, #:tprel_hi12:imm, lsl #12
2865 add t0, t0, #:tprel_lo12_nc:imm
2868 static void
2869 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2870 enum aarch64_symbol_type type)
2872 #if TARGET_PECOFF
2873 rtx tmp = legitimize_pe_coff_symbol (imm, true);
2874 if (tmp)
2876 emit_insn (gen_rtx_SET (dest, tmp));
2877 return;
2879 #endif
2881 switch (type)
2883 case SYMBOL_SMALL_ABSOLUTE:
2885 /* In ILP32, the mode of dest can be either SImode or DImode. */
2886 rtx tmp_reg = dest;
2887 machine_mode mode = GET_MODE (dest);
2889 gcc_assert (mode == Pmode || mode == ptr_mode);
2891 if (can_create_pseudo_p ())
2892 tmp_reg = gen_reg_rtx (mode);
2894 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
2895 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2896 return;
2899 case SYMBOL_TINY_ABSOLUTE:
2900 emit_insn (gen_rtx_SET (dest, imm));
2901 return;
2903 case SYMBOL_SMALL_GOT_28K:
2905 machine_mode mode = GET_MODE (dest);
2906 rtx gp_rtx = pic_offset_table_rtx;
2907 rtx insn;
2908 rtx mem;
2910 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2911 here before RTL expansion. Tree IVOPTS will generate an rtl pattern
2912 to decide rtx costs, in which case pic_offset_table_rtx is not
2913 initialized. In that case there is no need to generate the first
2914 adrp instruction, as the final cost for a global variable access
2915 is one instruction. */
2916 if (gp_rtx != NULL)
2918 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since
2919 we use the page base as the GOT base, the first page may be wasted;
2920 in the worst case there is only 28K of space for the GOT).
2922 The generated instruction sequence for accessing a global variable is:
2925 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2927 Only one instruction is needed. But we must initialize
2928 pic_offset_table_rtx properly. We generate an initialization insn
2929 for every global access, and allow CSE to remove all redundant copies.
2931 The final instruction sequence will look like the following
2932 for multiple global variable accesses.
2934 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2936 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2937 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2938 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2939 ... */
2941 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2942 crtl->uses_pic_offset_table = 1;
2943 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2945 if (mode != GET_MODE (gp_rtx))
2946 gp_rtx = gen_lowpart (mode, gp_rtx);
2950 if (mode == ptr_mode)
2952 if (mode == DImode)
2953 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2954 else
2955 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2957 mem = XVECEXP (SET_SRC (insn), 0, 0);
2959 else
2961 gcc_assert (mode == Pmode);
2963 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2964 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2967 /* The operand is expected to be a MEM. Whenever the related insn
2968 pattern changes, the above code which calculates MEM should be
2969 updated. */
2970 gcc_assert (MEM_P (mem));
2971 MEM_READONLY_P (mem) = 1;
2972 MEM_NOTRAP_P (mem) = 1;
2973 emit_insn (insn);
2974 return;
2977 case SYMBOL_SMALL_GOT_4G:
2978 emit_insn (gen_rtx_SET (dest, imm));
2979 return;
2981 case SYMBOL_SMALL_TLSGD:
2983 rtx_insn *insns;
2984 /* The return type of __tls_get_addr is the C pointer type
2985 so use ptr_mode. */
2986 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
2987 rtx tmp_reg = dest;
2989 if (GET_MODE (dest) != ptr_mode)
2990 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
2992 start_sequence ();
2993 if (ptr_mode == SImode)
2994 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2995 else
2996 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2997 insns = get_insns ();
2998 end_sequence ();
3000 RTL_CONST_CALL_P (insns) = 1;
3001 emit_libcall_block (insns, tmp_reg, result, imm);
3002 /* Convert back to the mode of the dest, adding a zero_extend
3003 from SImode (ptr_mode) to DImode (Pmode). */
3004 if (dest != tmp_reg)
3005 convert_move (dest, tmp_reg, true);
3006 return;
3009 case SYMBOL_SMALL_TLSDESC:
3011 machine_mode mode = GET_MODE (dest);
3012 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3013 rtx tp;
3015 gcc_assert (mode == Pmode || mode == ptr_mode);
3017 /* In ILP32, the got entry is always of SImode size. Unlike
3018 small GOT, the dest is fixed at reg 0. */
3019 if (TARGET_ILP32)
3020 emit_insn (gen_tlsdesc_small_si (imm));
3021 else
3022 emit_insn (gen_tlsdesc_small_di (imm));
3023 tp = aarch64_load_tp (NULL);
3025 if (mode != Pmode)
3026 tp = gen_lowpart (mode, tp);
3028 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3029 if (REG_P (dest))
3030 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3031 return;
3034 case SYMBOL_SMALL_TLSIE:
3036 /* In ILP32, the mode of dest can be either SImode or DImode,
3037 while the got entry is always of SImode size. The mode of
3038 dest depends on how dest is used: if dest is assigned to a
3039 pointer (e.g. stored in memory), it has SImode; it may have
3040 DImode if dest is dereferenced to access the memory.
3041 This is why we have to handle three different tlsie_small
3042 patterns here (two patterns for ILP32). */
3043 machine_mode mode = GET_MODE (dest);
3044 rtx tmp_reg = gen_reg_rtx (mode);
3045 rtx tp = aarch64_load_tp (NULL);
3047 if (mode == ptr_mode)
3049 if (mode == DImode)
3050 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3051 else
3053 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3054 tp = gen_lowpart (mode, tp);
3057 else
3059 gcc_assert (mode == Pmode);
3060 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3063 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3064 if (REG_P (dest))
3065 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3066 return;
3069 case SYMBOL_TLSLE12:
3070 case SYMBOL_TLSLE24:
3071 case SYMBOL_TLSLE32:
3072 case SYMBOL_TLSLE48:
3074 machine_mode mode = GET_MODE (dest);
3075 rtx tp = aarch64_load_tp (NULL);
3077 if (mode != Pmode)
3078 tp = gen_lowpart (mode, tp);
3080 switch (type)
3082 case SYMBOL_TLSLE12:
3083 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3084 (dest, tp, imm));
3085 break;
3086 case SYMBOL_TLSLE24:
3087 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3088 (dest, tp, imm));
3089 break;
3090 case SYMBOL_TLSLE32:
3091 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3092 (dest, imm));
3093 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3094 (dest, dest, tp));
3095 break;
3096 case SYMBOL_TLSLE48:
3097 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3098 (dest, imm));
3099 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3100 (dest, dest, tp));
3101 break;
3102 default:
3103 gcc_unreachable ();
3106 if (REG_P (dest))
3107 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3108 return;
3111 case SYMBOL_TINY_GOT:
3113 rtx insn;
3114 machine_mode mode = GET_MODE (dest);
3116 if (mode == ptr_mode)
3117 insn = gen_ldr_got_tiny (mode, dest, imm);
3118 else
3120 gcc_assert (mode == Pmode);
3121 insn = gen_ldr_got_tiny_sidi (dest, imm);
3124 emit_insn (insn);
3125 return;
3128 case SYMBOL_TINY_TLSIE:
3130 machine_mode mode = GET_MODE (dest);
3131 rtx tp = aarch64_load_tp (NULL);
3133 if (mode == ptr_mode)
3135 if (mode == DImode)
3136 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3137 else
3139 tp = gen_lowpart (mode, tp);
3140 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3143 else
3145 gcc_assert (mode == Pmode);
3146 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3149 if (REG_P (dest))
3150 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3151 return;
3154 default:
3155 gcc_unreachable ();
3159 /* Emit a move from SRC to DEST. Assume that the move expanders can
3160 handle all moves if !can_create_pseudo_p (). The distinction is
3161 important because, unlike emit_move_insn, the move expanders know
3162 how to force Pmode objects into the constant pool even when the
3163 constant pool address is not itself legitimate. */
3164 static rtx
3165 aarch64_emit_move (rtx dest, rtx src)
3167 return (can_create_pseudo_p ()
3168 ? emit_move_insn (dest, src)
3169 : emit_move_insn_1 (dest, src));
3172 /* Apply UNOPTAB to OP and store the result in DEST. */
3174 static void
3175 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3177 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3178 if (dest != tmp)
3179 emit_move_insn (dest, tmp);
3182 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3184 static void
3185 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3187 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3188 OPTAB_DIRECT);
3189 if (dest != tmp)
3190 emit_move_insn (dest, tmp);
3193 /* Split a move from SRC to DST into two moves of mode SINGLE_MODE. */
3195 void
3196 aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
3198 machine_mode mode = GET_MODE (dst);
3200 rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
3201 rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
3202 GET_MODE_SIZE (single_mode));
3203 rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
3204 rtx src1 = simplify_gen_subreg (single_mode, src, mode,
3205 GET_MODE_SIZE (single_mode));
3207 /* At most one pairing may overlap. */
3208 if (reg_overlap_mentioned_p (dst0, src1))
3210 aarch64_emit_move (dst1, src1);
3211 aarch64_emit_move (dst0, src0);
3213 else
3215 aarch64_emit_move (dst0, src0);
3216 aarch64_emit_move (dst1, src1);
3220 /* Split a 128-bit move operation into two 64-bit move operations,
3221 taking care to handle partial overlap of register to register
3222 copies. Special cases are needed when moving between GP regs and
3223 FP regs. SRC can be a register, constant or memory; DST a register
3224 or memory. If either operand is memory it must not have any side
3225 effects. */
3226 void
3227 aarch64_split_128bit_move (rtx dst, rtx src)
3229 machine_mode mode = GET_MODE (dst);
3231 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
3232 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3233 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3235 if (REG_P (dst) && REG_P (src))
3237 int src_regno = REGNO (src);
3238 int dst_regno = REGNO (dst);
3240 /* Handle FP <-> GP regs. */
3241 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3243 rtx src_lo = gen_lowpart (word_mode, src);
3244 rtx src_hi = gen_highpart (word_mode, src);
3246 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3247 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3248 return;
3250 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3252 rtx dst_lo = gen_lowpart (word_mode, dst);
3253 rtx dst_hi = gen_highpart (word_mode, dst);
3255 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3256 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3257 return;
3261 aarch64_split_double_move (dst, src, word_mode);
3264 /* Return true if we should split a move from 128-bit value SRC
3265 to 128-bit register DEST. */
3267 bool
3268 aarch64_split_128bit_move_p (rtx dst, rtx src)
3270 if (FP_REGNUM_P (REGNO (dst)))
3271 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3272 /* All moves to GPRs need to be split. */
3273 return true;
3276 /* Split a complex SIMD move. */
3278 void
3279 aarch64_split_simd_move (rtx dst, rtx src)
3281 machine_mode src_mode = GET_MODE (src);
3282 machine_mode dst_mode = GET_MODE (dst);
3284 gcc_assert (VECTOR_MODE_P (dst_mode));
3286 if (REG_P (dst) && REG_P (src))
3288 gcc_assert (VECTOR_MODE_P (src_mode));
3289 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3293 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3294 The semantics are those of svreinterpret rather than those of subregs;
3295 see the comment at the head of aarch64-sve.md for details about the
3296 difference. */
3299 aarch64_sve_reinterpret (machine_mode mode, rtx x)
3301 if (GET_MODE (x) == mode)
3302 return x;
3304 /* can_change_mode_class must only return true if subregs and svreinterprets
3305 have the same semantics. */
3306 if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
3307 return force_lowpart_subreg (mode, x, GET_MODE (x));
3309 rtx res = gen_reg_rtx (mode);
3310 x = force_reg (GET_MODE (x), x);
3311 emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
3312 return res;
3315 bool
3316 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3317 machine_mode ymode, rtx y)
3319 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3320 gcc_assert (r != NULL);
3321 return rtx_equal_p (x, r);
3324 /* Return TARGET if it is nonnull and a register of mode MODE.
3325 Otherwise, return a fresh register of mode MODE if we can,
3326 or TARGET reinterpreted as MODE if we can't. */
3328 static rtx
3329 aarch64_target_reg (rtx target, machine_mode mode)
3331 if (target && REG_P (target) && GET_MODE (target) == mode)
3332 return target;
3333 if (!can_create_pseudo_p ())
3335 gcc_assert (target);
3336 return gen_lowpart (mode, target);
3338 return gen_reg_rtx (mode);
3341 /* Return a register that contains the constant in BUILDER, given that
3342 the constant is a legitimate move operand. Use TARGET as the register
3343 if it is nonnull and convenient. */
3345 static rtx
3346 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3348 rtx src = builder.build ();
3349 target = aarch64_target_reg (target, GET_MODE (src));
3350 emit_insn (gen_rtx_SET (target, src));
3351 return target;
3354 static rtx
3355 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3357 if (can_create_pseudo_p ())
3358 return force_reg (mode, value);
3359 else
3361 gcc_assert (x);
3362 aarch64_emit_move (x, value);
3363 return x;
3367 /* Return true if predicate value X is a constant in which every element
3368 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3369 value, i.e. as a predicate in which all bits are significant. */
3371 static bool
3372 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3374 if (!CONST_VECTOR_P (x))
3375 return false;
3377 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3378 GET_MODE_NUNITS (GET_MODE (x)));
3379 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3380 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3381 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3383 unsigned int nelts = const_vector_encoded_nelts (x);
3384 for (unsigned int i = 0; i < nelts; ++i)
3386 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3387 if (!CONST_INT_P (elt))
3388 return false;
3390 builder.quick_push (elt);
3391 for (unsigned int j = 1; j < factor; ++j)
3392 builder.quick_push (const0_rtx);
3394 builder.finalize ();
3395 return true;
3398 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3399 widest predicate element size it can have (that is, the largest size
3400 for which each element would still be 0 or 1). */
3402 unsigned int
3403 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3405 /* Start with the most optimistic assumption: that we only need
3406 one bit per pattern. This is what we will use if only the first
3407 bit in each pattern is ever set. */
3408 unsigned int mask = GET_MODE_SIZE (DImode);
3409 mask |= builder.npatterns ();
3411 /* Look for set bits. */
3412 unsigned int nelts = builder.encoded_nelts ();
3413 for (unsigned int i = 1; i < nelts; ++i)
3414 if (INTVAL (builder.elt (i)) != 0)
3416 if (i & 1)
3417 return 1;
3418 mask |= i;
3420 return mask & -mask;
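/* A worked example (illustrative): for the constant { 1, 0, 0, 0,
   1, 0, 0, 0, ... } (four patterns of one element each, with only the
   first element set), MASK becomes 8 | 4 and the function above
   returns 4, i.e. the predicate can be viewed as having 4-byte
   elements.  */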
3423 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3424 return that predicate mode, otherwise return opt_machine_mode (). */
3426 opt_machine_mode
3427 aarch64_ptrue_all_mode (rtx x)
3429 gcc_assert (GET_MODE (x) == VNx16BImode);
3430 if (!CONST_VECTOR_P (x)
3431 || !CONST_VECTOR_DUPLICATE_P (x)
3432 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3433 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3434 return opt_machine_mode ();
3436 unsigned int nelts = const_vector_encoded_nelts (x);
3437 for (unsigned int i = 1; i < nelts; ++i)
3438 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3439 return opt_machine_mode ();
3441 return aarch64_sve_pred_mode (nelts);
3444 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3445 that the constant would have with predicate element size ELT_SIZE
3446 (ignoring the upper bits in each element) and return:
3448 * -1 if all bits are set
3449 * N if the predicate has N leading set bits followed by all clear bits
3450 * 0 if the predicate does not have any of these forms. */
3453 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3454 unsigned int elt_size)
3456 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3457 followed by set bits. */
3458 if (builder.nelts_per_pattern () == 3)
3459 return 0;
3461 /* Skip over leading set bits. */
3462 unsigned int nelts = builder.encoded_nelts ();
3463 unsigned int i = 0;
3464 for (; i < nelts; i += elt_size)
3465 if (INTVAL (builder.elt (i)) == 0)
3466 break;
3467 unsigned int vl = i / elt_size;
3469 /* Check for the all-true case. */
3470 if (i == nelts)
3471 return -1;
3473 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3474 repeating pattern of set bits followed by clear bits. */
3475 if (builder.nelts_per_pattern () != 2)
3476 return 0;
3478 /* We have a "foreground" value and a duplicated "background" value.
3479 If the background might repeat and the last set bit belongs to it,
3480 we might have set bits followed by clear bits followed by set bits. */
3481 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3482 return 0;
3484 /* Make sure that the rest are all clear. */
3485 for (; i < nelts; i += elt_size)
3486 if (INTVAL (builder.elt (i)) != 0)
3487 return 0;
3489 return vl;
3492 /* See if there is an svpattern that encodes an SVE predicate of mode
3493 PRED_MODE in which the first VL bits are set and the rest are clear.
3494 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3495 A VL of -1 indicates an all-true vector. */
3497 aarch64_svpattern
3498 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3500 if (vl < 0)
3501 return AARCH64_SV_ALL;
3503 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3504 return AARCH64_NUM_SVPATTERNS;
3506 if (vl >= 1 && vl <= 8)
3507 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3509 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3510 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3512 int max_vl;
3513 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3515 if (vl == (max_vl / 3) * 3)
3516 return AARCH64_SV_MUL3;
3517 /* These would only trigger for non-power-of-2 lengths. */
3518 if (vl == (max_vl & -4))
3519 return AARCH64_SV_MUL4;
3520 if (vl == (1 << floor_log2 (max_vl)))
3521 return AARCH64_SV_POW2;
3522 if (vl == max_vl)
3523 return AARCH64_SV_ALL;
3525 return AARCH64_NUM_SVPATTERNS;
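/* Illustrative examples: VL == 7 gives AARCH64_SV_VL7, VL == 32 gives
   AARCH64_SV_VL32, VL == -1 gives AARCH64_SV_ALL, and a VL larger than
   the number of elements in PRED_MODE gives AARCH64_NUM_SVPATTERNS.  */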
3528 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3529 bits has the lowest bit set and the upper bits clear. This is the
3530 VNx16BImode equivalent of a PTRUE for controlling elements of
3531 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3532 all bits are significant, even the upper zeros. */
3535 aarch64_ptrue_all (unsigned int elt_size)
3537 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3538 builder.quick_push (const1_rtx);
3539 for (unsigned int i = 1; i < elt_size; ++i)
3540 builder.quick_push (const0_rtx);
3541 return builder.build ();
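/* For example (illustrative): aarch64_ptrue_all (4) builds the
   VNx16BImode constant { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the
   all-true predicate for 4-byte elements with the upper bits of each
   element explicitly zero.  */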
3544 /* Return an all-true predicate register of mode MODE. */
3547 aarch64_ptrue_reg (machine_mode mode)
3549 gcc_assert (aarch64_sve_pred_mode_p (mode));
3550 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3551 return gen_lowpart (mode, reg);
3554 /* Return an all-false predicate register of mode MODE. */
3557 aarch64_pfalse_reg (machine_mode mode)
3559 gcc_assert (aarch64_sve_pred_mode_p (mode));
3560 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3561 return gen_lowpart (mode, reg);
3564 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3565 for it. PRED2[0] is the predicate for the instruction whose result
3566 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3567 for it. Return true if we can prove that the two predicates are
3568 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3569 with PRED1[0] without changing behavior. */
3571 bool
3572 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3574 machine_mode mode = GET_MODE (pred1[0]);
3575 gcc_assert (aarch64_sve_pred_mode_p (mode)
3576 && mode == GET_MODE (pred2[0])
3577 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3578 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3580 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3581 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3582 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3583 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3584 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3587 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3588 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3589 Use TARGET as the target register if nonnull and convenient. */
3591 static rtx
3592 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3593 machine_mode data_mode, rtx op1, rtx op2)
3595 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3596 expand_operand ops[5];
3597 create_output_operand (&ops[0], target, pred_mode);
3598 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3599 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3600 create_input_operand (&ops[3], op1, data_mode);
3601 create_input_operand (&ops[4], op2, data_mode);
3602 expand_insn (icode, 5, ops);
3603 return ops[0].value;
3606 /* Use a comparison to convert integer vector SRC into MODE, which is
3607 the corresponding SVE predicate mode. Use TARGET for the result
3608 if it's nonnull and convenient. */
3611 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3613 machine_mode src_mode = GET_MODE (src);
3614 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3615 src, CONST0_RTX (src_mode));
3618 /* Return the assembly token for svprfop value PRFOP. */
3620 static const char *
3621 svprfop_token (enum aarch64_svprfop prfop)
3623 switch (prfop)
3625 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3626 AARCH64_FOR_SVPRFOP (CASE)
3627 #undef CASE
3628 case AARCH64_NUM_SVPRFOPS:
3629 break;
3631 gcc_unreachable ();
3634 /* Return the assembly string for an SVE prefetch operation with
3635 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3636 and that SUFFIX is the format for the remaining operands. */
3638 char *
3639 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3640 const char *suffix)
3642 static char buffer[128];
3643 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3644 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3645 mnemonic, svprfop_token (prfop), suffix);
3646 gcc_assert (written < sizeof (buffer));
3647 return buffer;
3650 /* Check whether we can calculate the number of elements in PATTERN
3651 at compile time, given that there are NELTS_PER_VQ elements per
3652 128-bit block. Return the value if so, otherwise return -1. */
3654 HOST_WIDE_INT
3655 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3657 unsigned int vl, const_vg;
3658 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3659 vl = 1 + (pattern - AARCH64_SV_VL1);
3660 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3661 vl = 16 << (pattern - AARCH64_SV_VL16);
3662 else if (aarch64_sve_vg.is_constant (&const_vg))
3664 /* There are two vector granules per quadword. */
3665 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3666 switch (pattern)
3668 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3669 case AARCH64_SV_MUL4: return nelts & -4;
3670 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3671 case AARCH64_SV_ALL: return nelts;
3672 default: gcc_unreachable ();
3675 else
3676 return -1;
3678 /* There are two vector granules per quadword. */
3679 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3680 if (known_le (vl, nelts_all))
3681 return vl;
3683 /* Requesting more elements than are available results in a PFALSE. */
3684 if (known_gt (vl, nelts_all))
3685 return 0;
3687 return -1;
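/* As an illustration, assuming a fixed 256-bit vector length (two
   quadwords per vector) and NELTS_PER_VQ == 4 (32-bit elements), there
   are 8 elements in total and:

     AARCH64_SV_POW2 -> 8     AARCH64_SV_MUL3 -> 6
     AARCH64_SV_MUL4 -> 8     AARCH64_SV_ALL  -> 8
     AARCH64_SV_VL7  -> 7     AARCH64_SV_VL16 -> 0 (more than available)

   With a variable vector length, the VL* patterns still fold when the
   requested count is known to fit within the minimum vector size.  */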
3690 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3691 by the number of 128-bit quadwords in an SVE vector. */
3693 static bool
3694 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
3696 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3697 return (IN_RANGE (factor, 2, 16 * 16)
3698 && (factor & 1) == 0
3699 && factor <= 16 * (factor & -factor));
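/* For example, a factor of 6 is fine (CNTD, mul #3), as is 96
   (CNTB, mul #6), but 34 is rejected because it would need a
   multiplier of 17, which is out of range.  */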
3702 /* Return true if we can move VALUE into a register using a single
3703 CNT[BHWD] instruction. */
3705 static bool
3706 aarch64_sve_cnt_immediate_p (poly_int64 value)
3708 HOST_WIDE_INT factor = value.coeffs[0];
3709 return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
3712 /* Likewise for rtx X. */
3714 bool
3715 aarch64_sve_cnt_immediate_p (rtx x)
3717 poly_int64 value;
3718 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3721 /* Return the asm string for an instruction with a CNT-like vector size
3722 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3723 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3724 first part of the operands template (the part that comes before the
3725 vector size itself). PATTERN is the pattern to use. FACTOR is the
3726 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3727 in each quadword. If it is zero, we can use any element size. */
3729 static char *
3730 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3731 aarch64_svpattern pattern,
3732 unsigned int factor,
3733 unsigned int nelts_per_vq)
3735 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3737 if (nelts_per_vq == 0)
3738 /* There is some overlap in the ranges of the four CNT instructions.
3739 Here we always use the smallest possible element size, so that the
3740 multiplier is 1 wherever possible. */
3741 nelts_per_vq = factor & -factor;
3742 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3743 gcc_assert (IN_RANGE (shift, 1, 4));
3744 char suffix = "dwhb"[shift - 1];
3746 factor >>= shift;
3747 unsigned int written;
3748 if (pattern == AARCH64_SV_ALL && factor == 1)
3749 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3750 prefix, suffix, operands);
3751 else if (factor == 1)
3752 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3753 prefix, suffix, operands, svpattern_token (pattern));
3754 else
3755 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3756 prefix, suffix, operands, svpattern_token (pattern),
3757 factor);
3758 gcc_assert (written < sizeof (buffer));
3759 return buffer;
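/* For instance, with PREFIX "inc", OPERANDS "%x0", PATTERN
   AARCH64_SV_ALL and NELTS_PER_VQ 0, a FACTOR of 16 is printed as
   "incb\t%x0", while a FACTOR of 32 is printed as
   "incb\t%x0, all, mul #2"; choosing the byte form keeps the
   multiplier as small as possible.  */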
3762 /* Return the asm string for an instruction with a CNT-like vector size
3763 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3764 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3765 first part of the operands template (the part that comes before the
3766 vector size itself). X is the value of the vector size operand,
3767 as a polynomial integer rtx; we need to convert this into an "all"
3768 pattern with a multiplier. */
3770 char *
3771 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3772 rtx x)
3774 poly_int64 value = rtx_to_poly_int64 (x);
3775 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3776 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3777 value.coeffs[1], 0);
3780 /* Return the asm string for an instruction with a CNT-like vector size
3781 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3782 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3783 first part of the operands template (the part that comes before the
3784 vector size itself). CNT_PAT[0..2] are the operands of the
3785 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3787 char *
3788 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3789 const char *operands, rtx *cnt_pat)
3791 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3792 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3793 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3794 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3795 factor, nelts_per_vq);
3798 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3800 bool
3801 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3803 poly_int64 value;
3804 return (poly_int_rtx_p (x, &value)
3805 && (aarch64_sve_cnt_immediate_p (value)
3806 || aarch64_sve_cnt_immediate_p (-value)));
3809 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3810 operand 0. */
3812 char *
3813 aarch64_output_sve_scalar_inc_dec (rtx offset)
3815 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3816 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3817 if (offset_value.coeffs[1] > 0)
3818 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3819 offset_value.coeffs[1], 0);
3820 else
3821 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3822 -offset_value.coeffs[1], 0);
3825 /* Return true if a single RDVL instruction can multiply FACTOR by the
3826 number of 128-bit quadwords in an SVE vector. This is also the
3827 range of ADDVL. */
3829 static bool
3830 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
3832 return (multiple_p (factor, 16)
3833 && IN_RANGE (factor, -32 * 16, 31 * 16));
3836 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
3837 of quadwords in an SVE vector. */
3839 static bool
3840 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
3842 return (multiple_p (factor, 2)
3843 && IN_RANGE (factor, -32 * 2, 31 * 2));
3846 /* Return true if we can move VALUE into a register using a single
3847 RDVL instruction. */
3849 static bool
3850 aarch64_sve_rdvl_immediate_p (poly_int64 value)
3852 HOST_WIDE_INT factor = value.coeffs[0];
3853 return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
3856 /* Likewise for rtx X. */
3858 bool
3859 aarch64_sve_rdvl_immediate_p (rtx x)
3861 poly_int64 value;
3862 return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
3865 /* Return the asm string for moving RDVL immediate OFFSET into register
3866 operand 0. */
3868 char *
3869 aarch64_output_sve_rdvl (rtx offset)
3871 static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
3872 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3873 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
3875 int factor = offset_value.coeffs[1];
3876 snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
3877 return buffer;
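/* For example, an OFFSET equal to twice the vector length in bytes
   (a factor of 32) is printed as "rdvl\t%x0, #2".  */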
3880 /* Return true if we can add VALUE to a register using a single ADDVL
3881 or ADDPL instruction. */
3883 static bool
3884 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3886 HOST_WIDE_INT factor = value.coeffs[0];
3887 if (factor == 0 || value.coeffs[1] != factor)
3888 return false;
3889 return (aarch64_sve_rdvl_addvl_factor_p (factor)
3890 || aarch64_sve_addpl_factor_p (factor));
3893 /* Likewise for rtx X. */
3895 bool
3896 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3898 poly_int64 value;
3899 return (poly_int_rtx_p (x, &value)
3900 && aarch64_sve_addvl_addpl_immediate_p (value));
3903 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3904 to operand 1 and storing the result in operand 0. */
3906 char *
3907 aarch64_output_sve_addvl_addpl (rtx offset)
3909 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3910 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3911 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3913 int factor = offset_value.coeffs[1];
3914 if ((factor & 15) == 0)
3915 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3916 else
3917 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3918 return buffer;
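/* For example, a factor of 32 (two vector lengths in bytes) is printed
   as "addvl\t%x0, %x1, #2", while a factor of 6 (three predicate
   lengths) is printed as "addpl\t%x0, %x1, #3".  */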
3921 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3922 instruction. If it is, store the number of elements in each vector
3923 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3924 factor in *FACTOR_OUT (if nonnull). */
3926 bool
3927 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3928 unsigned int *nelts_per_vq_out)
3930 rtx elt;
3931 poly_int64 value;
3933 if (!const_vec_duplicate_p (x, &elt)
3934 || !poly_int_rtx_p (elt, &value))
3935 return false;
3937 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3938 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3939 /* There's no vector INCB. */
3940 return false;
3942 HOST_WIDE_INT factor = value.coeffs[0];
3943 if (value.coeffs[1] != factor)
3944 return false;
3946 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3947 if ((factor % nelts_per_vq) != 0
3948 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3949 return false;
3951 if (factor_out)
3952 *factor_out = factor;
3953 if (nelts_per_vq_out)
3954 *nelts_per_vq_out = nelts_per_vq;
3955 return true;
3958 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3959 instruction. */
3961 bool
3962 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3964 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3967 /* Return the asm template for an SVE vector INC or DEC instruction.
3968 OPERANDS gives the operands before the vector count and X is the
3969 value of the vector count operand itself. */
3971 char *
3972 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3974 int factor;
3975 unsigned int nelts_per_vq;
3976 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3977 gcc_unreachable ();
3978 if (factor < 0)
3979 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3980 -factor, nelts_per_vq);
3981 else
3982 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3983 factor, nelts_per_vq);
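/* For example, a VNx4SI constant in which each element equals eight
   times the number of quadwords has FACTOR 8 and NELTS_PER_VQ 4, and
   is printed as "incw\t<operands>, all, mul #2".  */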
3986 /* Return a constant that represents FACTOR multiplied by the
3987 number of 128-bit quadwords in an SME vector. ISA_MODE is the
3988 ISA mode in which the calculation is being performed. */
3991 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
3992 aarch64_feature_flags isa_mode)
3994 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
3995 if (isa_mode & AARCH64_FL_SM_ON)
3996 /* We're in streaming mode, so we can use normal poly-int values. */
3997 return gen_int_mode ({ factor, factor }, mode);
3999 rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
4000 rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
4001 return gen_rtx_CONST (mode, unspec);
4004 /* Return true if X is a constant that represents some number Y
4005 multiplied by the number of quadwords in an SME vector. Store this Y
4006 in *FACTOR if so. */
4008 static bool
4009 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
4011 if (!TARGET_SME || GET_CODE (x) != CONST)
4012 return false;
4014 x = XEXP (x, 0);
4015 if (GET_CODE (x) != UNSPEC
4016 || XINT (x, 1) != UNSPEC_SME_VQ
4017 || XVECLEN (x, 0) != 1)
4018 return false;
4020 x = XVECEXP (x, 0, 0);
4021 if (!CONST_INT_P (x))
4022 return false;
4024 *factor = INTVAL (x);
4025 return true;
4028 /* Return true if X is a constant that represents some number Y
4029 multiplied by the number of quadwords in an SME vector, and if
4030 that Y is in the range of RDSVL. */
4032 bool
4033 aarch64_rdsvl_immediate_p (const_rtx x)
4035 HOST_WIDE_INT factor;
4036 return (aarch64_sme_vq_unspec_p (x, &factor)
4037 && aarch64_sve_rdvl_addvl_factor_p (factor));
4040 /* Return the asm string for an RDSVL instruction that calculates X,
4041 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
4043 char *
4044 aarch64_output_rdsvl (const_rtx x)
4046 gcc_assert (aarch64_rdsvl_immediate_p (x));
4047 static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4048 x = XVECEXP (XEXP (x, 0), 0, 0);
4049 snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
4050 (int) INTVAL (x) / 16);
4051 return buffer;
4054 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
4056 bool
4057 aarch64_addsvl_addspl_immediate_p (const_rtx x)
4059 HOST_WIDE_INT factor;
4060 return (aarch64_sme_vq_unspec_p (x, &factor)
4061 && (aarch64_sve_rdvl_addvl_factor_p (factor)
4062 || aarch64_sve_addpl_factor_p (factor)));
4065 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4066 Return the asm string for the associated instruction. */
4068 char *
4069 aarch64_output_addsvl_addspl (rtx x)
4071 static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4072 HOST_WIDE_INT factor;
4073 if (!aarch64_sme_vq_unspec_p (x, &factor))
4074 gcc_unreachable ();
4075 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4076 snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4077 (int) factor / 16);
4078 else if (aarch64_sve_addpl_factor_p (factor))
4079 snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4080 (int) factor / 2);
4081 else
4082 gcc_unreachable ();
4083 return buffer;
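/* For example, a factor of 32 is printed as "addsvl\t%x0, %x1, #2" and
   a factor of 6 as "addspl\t%x0, %x1, #3", mirroring ADDVL and ADDPL
   above but measured against the streaming vector length.  */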
4086 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4088 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4090 0x0000000100000001ull,
4091 0x0001000100010001ull,
4092 0x0101010101010101ull,
4093 0x1111111111111111ull,
4094 0x5555555555555555ull,
4099 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4100 static bool
4101 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4103 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4104 int bits;
4106 /* Check for a single sequence of one bits and return quickly if so.
4107 The special cases of all ones and all zeroes return false. */
4108 tmp = val + (val & -val);
4110 if (tmp == (tmp & -tmp))
4111 return (val + 1) > 1;
4113 /* Invert if the immediate doesn't start with a zero bit - this means we
4114 only need to search for sequences of one bits. */
4115 if (val & 1)
4116 val = ~val;
4118 /* Find the first set bit and set tmp to val with the first sequence of one
4119 bits removed. Return success if there is a single sequence of ones. */
4120 first_one = val & -val;
4121 tmp = val & (val + first_one);
4123 if (tmp == 0)
4124 return true;
4126 /* Find the next set bit and compute the difference in bit position. */
4127 next_one = tmp & -tmp;
4128 bits = clz_hwi (first_one) - clz_hwi (next_one);
4129 mask = val ^ tmp;
4131 /* Check the bit position difference is a power of 2, and that the first
4132 sequence of one bits fits within 'bits' bits. */
4133 if ((mask >> bits) != 0 || bits != (bits & -bits))
4134 return false;
4136 /* Check the sequence of one bits is repeated 64/bits times. */
4137 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
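/* For example, 0x5555555555555555 (a single set bit repeated every
   2 bits) and 0x00ff00ff00ff00ff (an 8-bit run repeated every 16 bits)
   are valid bitmask immediates, whereas 0, ~0 and 0x0000000000012345
   (several separate runs of ones) are not.  */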
4141 /* Return true if VAL is a valid bitmask immediate for MODE. */
4142 bool
4143 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4145 if (mode == DImode)
4146 return aarch64_bitmask_imm (val);
4148 if (mode == SImode)
4149 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4151 /* Replicate small immediates to fit 64 bits. */
4152 int size = GET_MODE_UNIT_PRECISION (mode);
4153 val &= (HOST_WIDE_INT_1U << size) - 1;
4154 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4156 return aarch64_bitmask_imm (val);
4160 /* Return true if the immediate VAL can be a bitmask immediate
4161 by changing the given MASK bits in VAL to zeroes, ones or bits
4162 from the other half of VAL. Return the new immediate in VAL2. */
4163 static inline bool
4164 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4165 unsigned HOST_WIDE_INT &val2,
4166 unsigned HOST_WIDE_INT mask)
4168 val2 = val & ~mask;
4169 if (val2 != val && aarch64_bitmask_imm (val2))
4170 return true;
4171 val2 = val | mask;
4172 if (val2 != val && aarch64_bitmask_imm (val2))
4173 return true;
4174 val = val & ~mask;
4175 val2 = val | (((val >> 32) | (val << 32)) & mask);
4176 if (val2 != val && aarch64_bitmask_imm (val2))
4177 return true;
4178 val2 = val | (((val >> 16) | (val << 48)) & mask);
4179 if (val2 != val && aarch64_bitmask_imm (val2))
4180 return true;
4181 return false;
4185 /* Return true if VAL is a valid MOVZ immediate. */
4186 static inline bool
4187 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4189 return (val >> (ctz_hwi (val) & 48)) < 65536;
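/* The "& 48" masks the trailing-zero count down to one of 0, 16, 32 or
   48, so the test succeeds exactly when the nonzero bits of VAL fit in
   a single 16-bit field at a 16-bit-aligned position, e.g. 0x12340000.  */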
4193 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4194 bool
4195 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4197 return aarch64_is_movz (val) || aarch64_is_movz (~val)
4198 || aarch64_bitmask_imm (val);
4202 /* Return true if VAL is an immediate that can be created by a single
4203 MOV instruction. */
4204 bool
4205 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4207 gcc_assert (mode == SImode || mode == DImode);
4209 if (val < 65536)
4210 return true;
4212 unsigned HOST_WIDE_INT mask =
4213 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4215 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4216 return true;
4218 val = (val & mask) | ((val << 32) & ~mask);
4219 return aarch64_bitmask_imm (val);
4223 static int
4224 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4225 machine_mode mode)
4227 int i;
4228 unsigned HOST_WIDE_INT val, val2, val3, mask;
4229 int one_match, zero_match;
4230 int num_insns;
4232 gcc_assert (mode == SImode || mode == DImode);
4234 val = INTVAL (imm);
4236 if (aarch64_move_imm (val, mode))
4238 if (generate)
4239 emit_insn (gen_rtx_SET (dest, imm));
4240 return 1;
4243 if ((val >> 32) == 0 || mode == SImode)
4245 if (generate)
4247 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4248 if (mode == SImode)
4249 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4250 GEN_INT ((val >> 16) & 0xffff)));
4251 else
4252 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4253 GEN_INT ((val >> 16) & 0xffff)));
4255 return 2;
4258 /* Remaining cases are all for DImode. */
4260 mask = 0xffff;
4261 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4262 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4263 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4264 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4266 /* Try a bitmask immediate and a movk to generate the immediate
4267 in 2 instructions. */
4269 if (zero_match < 2 && one_match < 2)
4271 for (i = 0; i < 64; i += 16)
4273 if (aarch64_check_bitmask (val, val2, mask << i))
4274 break;
4276 val2 = val & ~(mask << i);
4277 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4278 break;
4281 if (i != 64)
4283 if (generate)
4285 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4286 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4287 GEN_INT ((val >> i) & 0xffff)));
4289 return 2;
4292 /* Try 2 bitmask immediates which are xor'd together. */
4293 for (i = 0; i < 64; i += 16)
4295 val2 = (val >> i) & mask;
4296 val2 |= val2 << 16;
4297 val2 |= val2 << 32;
4298 if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4299 break;
4302 if (i != 64)
4304 if (generate)
4306 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4307 emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4309 return 2;
4313 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4314 if (zero_match + one_match == 0)
4316 for (i = 0; i < 48; i += 16)
4317 for (int j = i + 16; j < 64; j += 16)
4318 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4320 if (generate)
4322 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4323 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4324 GEN_INT ((val >> i) & 0xffff)));
4325 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4326 GEN_INT ((val >> j) & 0xffff)));
4328 return 3;
4331 /* Try shifting and inserting the bottom 32 bits into the top bits. */
4332 val2 = val & 0xffffffff;
4333 val3 = 0xffffffff;
4334 val3 = val2 | (val3 << 32);
4335 for (i = 17; i < 48; i++)
4336 if ((val2 | (val2 << i)) == val)
4338 if (generate)
4340 emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4341 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4342 GEN_INT (val2 >> 16)));
4343 emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4345 return 3;
4347 else if ((val3 & ~(val3 << i)) == val)
4349 if (generate)
4351 emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4352 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4353 GEN_INT (val2 >> 16)));
4354 emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4355 dest));
4357 return 3;
4361 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4362 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4363 otherwise skip zero bits. */
4365 num_insns = 1;
4366 mask = 0xffff;
4367 val2 = one_match > zero_match ? ~val : val;
4368 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4370 if (generate)
4371 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4372 ? (val | ~(mask << i))
4373 : (val & (mask << i)))));
4374 for (i += 16; i < 64; i += 16)
4376 if ((val2 & (mask << i)) == 0)
4377 continue;
4378 if (generate)
4379 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4380 GEN_INT ((val >> i) & 0xffff)));
4381 num_insns ++;
4384 return num_insns;
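/* A few illustrative cases: 0x1234 needs a single MOVZ and
   0xffffffffffff1234 a single MOVN; 0x0000ffff0000ffff is a bitmask
   immediate and so also takes one instruction; 0x12345678 needs a MOVZ
   plus a MOVK, so the function returns 2 for it.  */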
4387 /* Return whether IMM is a 128-bit immediate that is simple enough to
4388 expand inline. */
4389 bool
4390 aarch64_mov128_immediate (rtx imm)
4392 if (CONST_INT_P (imm))
4393 return true;
4395 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4397 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4398 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4400 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4401 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4405 /* Return true if VAL can be encoded as a 12-bit unsigned immediate with
4406 a left shift of 0 or 12 bits. */
4407 bool
4408 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4410 return val < 4096 || (val & 0xfff000) == val;
4413 /* Return the largest value no greater than VAL that can be encoded as a
4414 12-bit unsigned immediate with a left shift of 0 or 12. */
4415 static HOST_WIDE_INT
4416 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4418 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4419 handle correctly. */
4420 gcc_assert (val < 0x1000000);
4422 if (val < 4096)
4423 return val;
4425 return val & 0xfff000;
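/* For example, 4095 is returned unchanged, while 0x123456 is clamped
   down to 0x123000 (0x123 shifted left by 12).  */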
4429 /* Test whether:
4431 X = (X & AND_VAL) | IOR_VAL;
4433 can be implemented using:
4435 MOVK X, #(IOR_VAL >> shift), LSL #shift
4437 Return the shift if so, otherwise return -1. */
4439 aarch64_movk_shift (const wide_int_ref &and_val,
4440 const wide_int_ref &ior_val)
4442 unsigned int precision = and_val.get_precision ();
4443 unsigned HOST_WIDE_INT mask = 0xffff;
4444 for (unsigned int shift = 0; shift < precision; shift += 16)
4446 if (and_val == ~mask && (ior_val & mask) == ior_val)
4447 return shift;
4448 mask <<= 16;
4450 return -1;
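/* For example, AND_VAL 0xffffffff0000ffff with IOR_VAL 0x12340000
   returns 16, since the combination is equivalent to
   MOVK X, #0x1234, LSL #16.  */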
4453 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
4454 Assumed precondition: VAL_IN is not zero. */
4456 unsigned HOST_WIDE_INT
4457 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4459 int lowest_bit_set = ctz_hwi (val_in);
4460 int highest_bit_set = floor_log2 (val_in);
4461 gcc_assert (val_in != 0);
4463 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4464 (HOST_WIDE_INT_1U << lowest_bit_set));
4467 /* Create a constant in which the bits outside the range from the lowest to the
4468 highest bit set in VAL_IN are 1, and the bits inside that range come from VAL_IN. */
4470 unsigned HOST_WIDE_INT
4471 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4473 return val_in | ~aarch64_and_split_imm1 (val_in);
4476 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4478 bool
4479 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4481 scalar_int_mode int_mode;
4482 if (!is_a <scalar_int_mode> (mode, &int_mode))
4483 return false;
4485 if (aarch64_bitmask_imm (val_in, int_mode))
4486 return false;
4488 if (aarch64_move_imm (val_in, int_mode))
4489 return false;
4491 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4493 return aarch64_bitmask_imm (imm2, int_mode);
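/* For example, 0x0000ff00ffff0000 is neither a bitmask immediate nor a
   MOV immediate, but aarch64_and_split_imm1 gives 0x0000ffffffff0000
   and aarch64_and_split_imm2 gives 0xffffff00ffffffff, both of which
   are valid bitmask immediates, so an AND with the original value can
   be split into two AND-immediate instructions.  */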
4496 /* Return the number of temporary registers that aarch64_add_offset_1
4497 would need to add OFFSET to a register. */
4499 static unsigned int
4500 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4502 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4505 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4506 a non-polynomial OFFSET. MODE is the mode of the addition.
4507 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4508 be set and CFA adjustments added to the generated instructions.
4510 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4511 temporary if register allocation is already complete. This temporary
4512 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4513 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4514 the immediate again.
4516 Since this function may be used to adjust the stack pointer, we must
4517 ensure that it cannot cause transient stack deallocation (for example
4518 by first incrementing SP and then decrementing when adjusting by a
4519 large immediate). */
4521 static void
4522 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4523 rtx src, HOST_WIDE_INT offset, rtx temp1,
4524 bool frame_related_p, bool emit_move_imm)
4526 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4527 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4529 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4530 rtx_insn *insn;
4532 if (!moffset)
4534 if (!rtx_equal_p (dest, src))
4536 insn = emit_insn (gen_rtx_SET (dest, src));
4537 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4539 return;
4542 /* Single instruction adjustment. */
4543 if (aarch64_uimm12_shift (moffset))
4545 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4546 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4547 return;
4550 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4551 and either:
4553 a) the offset cannot be loaded by a 16-bit move or
4554 b) there is no spare register into which we can move it. */
4555 if (moffset < 0x1000000
4556 && ((!temp1 && !can_create_pseudo_p ())
4557 || !aarch64_move_imm (moffset, mode)))
4559 HOST_WIDE_INT low_off = moffset & 0xfff;
4561 low_off = offset < 0 ? -low_off : low_off;
4562 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4563 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4564 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4565 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4566 return;
4569 /* Emit a move immediate if required and an addition/subtraction. */
4570 if (emit_move_imm)
4572 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4573 temp1 = aarch64_force_temporary (mode, temp1,
4574 gen_int_mode (moffset, mode));
4576 insn = emit_insn (offset < 0
4577 ? gen_sub3_insn (dest, src, temp1)
4578 : gen_add3_insn (dest, src, temp1));
4579 if (frame_related_p)
4581 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4582 rtx adj = plus_constant (mode, src, offset);
4583 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4587 /* Return the number of temporary registers that aarch64_add_offset
4588 would need to move OFFSET into a register or add OFFSET to a register;
4589 ADD_P is true if we want the latter rather than the former. */
4591 static unsigned int
4592 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4594 /* This follows the same structure as aarch64_add_offset. */
4595 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4596 return 0;
4598 unsigned int count = 0;
4599 HOST_WIDE_INT factor = offset.coeffs[1];
4600 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4601 poly_int64 poly_offset (factor, factor);
4602 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4603 /* Need one register for the ADDVL/ADDPL result. */
4604 count += 1;
4605 else if (factor != 0)
4607 factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4608 if (!IN_RANGE (factor, -32, 31))
4609 /* Need one register for the CNT or RDVL result and one for the
4610 multiplication factor. If necessary, the second temporary
4611 can be reused for the constant part of the offset. */
4612 return 2;
4613 /* Need one register for the CNT or RDVL result (which might then
4614 be shifted). */
4615 count += 1;
4617 return count + aarch64_add_offset_1_temporaries (constant);
4620 /* If X can be represented as a poly_int64, return the number
4621 of temporaries that are required to add it to a register.
4622 Return -1 otherwise. */
4625 aarch64_add_offset_temporaries (rtx x)
4627 poly_int64 offset;
4628 if (!poly_int_rtx_p (x, &offset))
4629 return -1;
4630 return aarch64_offset_temporaries (true, offset);
4633 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4634 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4635 be set and CFA adjustments added to the generated instructions.
4637 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4638 temporary if register allocation is already complete. This temporary
4639 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4640 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4641 false to avoid emitting the immediate again.
4643 TEMP2, if nonnull, is a second temporary register that doesn't
4644 overlap either DEST or SRC.
4646 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
4647 is measured relative to the SME vector length instead of the current
4648 prevailing vector length. It is 0 otherwise.
4650 Since this function may be used to adjust the stack pointer, we must
4651 ensure that it cannot cause transient stack deallocation (for example
4652 by first incrementing SP and then decrementing when adjusting by a
4653 large immediate). */
4655 static void
4656 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4657 poly_int64 offset, rtx temp1, rtx temp2,
4658 aarch64_feature_flags force_isa_mode,
4659 bool frame_related_p, bool emit_move_imm = true)
4661 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4662 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4663 gcc_assert (temp1 == NULL_RTX
4664 || !frame_related_p
4665 || !reg_overlap_mentioned_p (temp1, dest));
4666 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4668 /* Try using ADDVL or ADDPL to add the whole value. */
4669 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4671 gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
4672 rtx offset_rtx;
4673 if (force_isa_mode == 0)
4674 offset_rtx = gen_int_mode (offset, mode);
4675 else
4676 offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
4677 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4678 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4679 if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
4680 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4681 gen_rtx_SET (dest, plus_constant (Pmode, src,
4682 offset)));
4683 return;
4686 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4687 SVE vector register, over and above the minimum size of 128 bits.
4688 This is equivalent to half the value returned by CNTD with a
4689 vector shape of ALL. */
4690 HOST_WIDE_INT factor = offset.coeffs[1];
4691 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4693 /* Try using ADDVL or ADDPL to add the VG-based part. */
4694 poly_int64 poly_offset (factor, factor);
4695 if (src != const0_rtx
4696 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4698 rtx offset_rtx;
4699 if (force_isa_mode == 0)
4700 offset_rtx = gen_int_mode (poly_offset, mode);
4701 else
4702 offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
4703 if (frame_related_p)
4705 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4706 RTX_FRAME_RELATED_P (insn) = true;
4707 if (force_isa_mode & AARCH64_FL_SM_ON)
4708 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4709 gen_rtx_SET (dest, plus_constant (Pmode, src,
4710 poly_offset)));
4711 src = dest;
4713 else
4715 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4716 src = aarch64_force_temporary (mode, temp1, addr);
4717 temp1 = temp2;
4718 temp2 = NULL_RTX;
4721 /* Otherwise use a CNT-based sequence. */
4722 else if (factor != 0)
4724 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4725 with negative shifts indicating a shift right. */
4726 HOST_WIDE_INT low_bit = least_bit_hwi (factor);
4727 HOST_WIDE_INT rel_factor = factor / low_bit;
4728 int shift = exact_log2 (low_bit) - 4;
4729 gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
4731 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4732 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4734 We can avoid a multiplication if REL_FACTOR is in the range
4735 of RDVL, although there are then various optimizations that
4736 we can try on top. */
4737 rtx_code code = PLUS;
4738 rtx val;
4739 if (IN_RANGE (rel_factor, -32, 31))
4741 if (force_isa_mode & AARCH64_FL_SM_ON)
4743 /* Try to use an unshifted RDSVL, otherwise fall back on
4744 a shifted RDSVL #1. */
4745 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4746 shift = 0;
4747 else
4748 factor = rel_factor * 16;
4749 val = aarch64_sme_vq_immediate (mode, factor, 0);
4751 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4752 else if (aarch64_sve_cnt_factor_p (factor)
4753 || aarch64_sve_rdvl_addvl_factor_p (factor))
4755 val = gen_int_mode (poly_int64 (factor, factor), mode);
4756 shift = 0;
4758 /* Try to subtract an unshifted CNT[BHWD]. */
4759 else if (aarch64_sve_cnt_factor_p (-factor))
4761 code = MINUS;
4762 val = gen_int_mode (poly_int64 (-factor, -factor), mode);
4763 shift = 0;
4765 /* If subtraction is free, prefer to load a positive constant.
4766 In the best case this will fit a shifted CNTB. */
4767 else if (src != const0_rtx && rel_factor < 0)
4769 code = MINUS;
4770 val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
4772 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4773 else
4774 val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
4776 else
4778 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4779 since it should increase the chances of being able to use
4780 a shift and add sequence for the multiplication.
4781 If CNTB << SHIFT is out of range, stick with the current
4782 shift factor. */
4783 if (force_isa_mode == 0
4784 && IN_RANGE (low_bit, 2, 16 * 16))
4786 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
4787 shift = 0;
4789 else if ((force_isa_mode & AARCH64_FL_SM_ON)
4790 && aarch64_sve_rdvl_addvl_factor_p (low_bit))
4792 val = aarch64_sme_vq_immediate (mode, low_bit, 0);
4793 shift = 0;
4795 else
4796 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
4798 val = aarch64_force_temporary (mode, temp1, val);
4800 /* Prefer to multiply by a positive factor and subtract rather
4801 than multiply by a negative factor and add, since positive
4802 values are usually easier to move. */
4803 if (rel_factor < 0 && src != const0_rtx)
4805 rel_factor = -rel_factor;
4806 code = MINUS;
4809 if (can_create_pseudo_p ())
4811 rtx coeff1 = gen_int_mode (rel_factor, mode);
4812 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
4814 else
4816 rtx coeff1 = gen_int_mode (rel_factor, mode);
4817 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4818 val = gen_rtx_MULT (mode, val, coeff1);
4822 /* Multiply by 2 ** SHIFT. */
4823 if (shift > 0)
4825 val = aarch64_force_temporary (mode, temp1, val);
4826 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4828 else if (shift < 0)
4830 val = aarch64_force_temporary (mode, temp1, val);
4831 val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
4834 /* Add the result to SRC or subtract the result from SRC. */
4835 if (src != const0_rtx)
4837 val = aarch64_force_temporary (mode, temp1, val);
4838 val = gen_rtx_fmt_ee (code, mode, src, val);
4840 else if (code == MINUS)
4842 val = aarch64_force_temporary (mode, temp1, val);
4843 val = gen_rtx_NEG (mode, val);
4846 if (constant == 0 || frame_related_p)
4848 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4849 if (frame_related_p)
4851 RTX_FRAME_RELATED_P (insn) = true;
4852 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4853 gen_rtx_SET (dest, plus_constant (Pmode, src,
4854 poly_offset)));
4856 src = dest;
4857 if (constant == 0)
4858 return;
4860 else
4862 src = aarch64_force_temporary (mode, temp1, val);
4863 temp1 = temp2;
4864 temp2 = NULL_RTX;
4867 emit_move_imm = true;
4870 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4871 frame_related_p, emit_move_imm);
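/* For example, an OFFSET of two vector lengths plus 16 bytes
   (poly_int64 (48, 32)) adds the VG-based part first, typically as
   "addvl\tx0, x1, #2", and then the remaining constant 16 with a
   single "add\tx0, x0, #16".  */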
4874 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4875 than a poly_int64. */
4877 void
4878 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4879 rtx offset_rtx, rtx temp1, rtx temp2)
4881 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4882 temp1, temp2, 0, false);
4885 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4886 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4887 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4888 contains abs (DELTA). */
4890 static inline void
4891 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
4892 aarch64_feature_flags force_isa_mode, bool emit_move_imm)
4894 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4895 temp1, temp2, force_isa_mode, true, emit_move_imm);
4898 /* Subtract DELTA from the stack pointer, marking the instructions
4899 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4900 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4902 static inline void
4903 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
4904 aarch64_feature_flags force_isa_mode,
4905 bool frame_related_p, bool emit_move_imm = true)
4907 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4908 temp1, temp2, force_isa_mode, frame_related_p,
4909 emit_move_imm);
4912 /* A streaming-compatible function needs to switch temporarily to the known
4913 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4914 the runtime state of PSTATE.SM in the streaming-compatible code, before
4915 the start of the switch to LOCAL_MODE.
4917 Emit instructions to branch around the mode switch if PSTATE.SM already
4918 matches LOCAL_MODE. Return the label that the branch jumps to. */
4920 static rtx_insn *
4921 aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
4923 local_mode &= AARCH64_FL_SM_STATE;
4924 gcc_assert (local_mode != 0);
4925 auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
4926 auto *label = gen_label_rtx ();
4927 auto branch = aarch64_gen_test_and_branch (already_ok_cond, old_svcr, 0,
4928 label);
4929 auto *jump = emit_jump_insn (branch);
4930 JUMP_LABEL (jump) = label;
4931 return label;
4934 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
4935 state in NEW_MODE. This is known to involve either an SMSTART SM or
4936 an SMSTOP SM. */
4938 static void
4939 aarch64_switch_pstate_sm (aarch64_feature_flags old_mode,
4940 aarch64_feature_flags new_mode)
4942 old_mode &= AARCH64_FL_SM_STATE;
4943 new_mode &= AARCH64_FL_SM_STATE;
4944 gcc_assert (old_mode != new_mode);
4946 if ((new_mode & AARCH64_FL_SM_ON)
4947 || (new_mode == 0 && (old_mode & AARCH64_FL_SM_OFF)))
4948 emit_insn (gen_aarch64_smstart_sm ());
4949 else
4950 emit_insn (gen_aarch64_smstop_sm ());
4953 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
4954 FP and predicate registers. This class emits code to preserve any
4955 necessary registers around the mode switch.
4957 The class uses four approaches to saving and restoring contents, enumerated
4958 by group_type:
4960 - GPR: save and restore the contents of FP registers using GPRs.
4961 This is used if the FP register contains no more than 64 significant
4962 bits. The registers used are FIRST_GPR onwards.
4964 - MEM_128: save and restore 128-bit SIMD registers using memory.
4966 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
4968 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
4970 The save slots within each memory group are consecutive, with the
4971 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
4973 There will only be two mode switches for each use of SME, so they should
4974 not be particularly performance-sensitive. It's also rare for SIMD, SVE
4975 or predicate registers to be live across mode switches. We therefore
4976 don't preallocate the save slots but instead allocate them locally on
4977 demand. This makes the code emitted by the class self-contained. */
4979 class aarch64_sme_mode_switch_regs
4981 public:
4982 static const unsigned int FIRST_GPR = R10_REGNUM;
4984 void add_reg (machine_mode, unsigned int);
4985 void add_call_args (rtx_call_insn *);
4986 void add_call_result (rtx_call_insn *);
4987 void add_call_preserved_reg (unsigned int);
4988 void add_call_preserved_regs (bitmap);
4990 void emit_prologue ();
4991 void emit_epilogue ();
4993 /* The number of GPRs needed to save FP registers, starting from
4994 FIRST_GPR. */
4995 unsigned int num_gprs () { return m_group_count[GPR]; }
4997 private:
4998 enum sequence { PROLOGUE, EPILOGUE };
4999 enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
5001 /* Information about the save location for one FP, SIMD, SVE data, or
5002 SVE predicate register. */
5003 struct save_location {
5004 /* The register to be saved. */
5005 rtx reg;
5007 /* Which group the save location belongs to. */
5008 group_type group;
5010 /* A zero-based index of the register within the group. */
5011 unsigned int index;
5014 unsigned int sve_data_headroom ();
5015 rtx get_slot_mem (machine_mode, poly_int64);
5016 void emit_stack_adjust (sequence, poly_int64);
5017 void emit_mem_move (sequence, const save_location &, poly_int64);
5019 void emit_gpr_moves (sequence);
5020 void emit_mem_128_moves (sequence);
5021 void emit_sve_sp_adjust (sequence);
5022 void emit_sve_pred_moves (sequence);
5023 void emit_sve_data_moves (sequence);
5025 /* All save locations, in no particular order. */
5026 auto_vec<save_location, 12> m_save_locations;
5028 /* The number of registers in each group. */
5029 unsigned int m_group_count[NUM_GROUPS] = {};
5032 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5033 switch. */
5035 void
5036 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
5038 if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
5039 return;
5041 unsigned int end_regno = end_hard_regno (mode, regno);
5042 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5043 gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
5044 for (; regno < end_regno; regno++)
5046 /* Force the mode of SVE saves and restores even for single registers.
5047 This is necessary because big-endian targets only allow LDR Z and
5048 STR Z to be used with byte modes. */
5049 machine_mode submode = mode;
5050 if (vec_flags & VEC_SVE_PRED)
5051 submode = VNx16BImode;
5052 else if (vec_flags & VEC_SVE_DATA)
5053 submode = SVE_BYTE_MODE;
5054 else if (vec_flags & VEC_STRUCT)
5056 if (vec_flags & VEC_PARTIAL)
5057 submode = V8QImode;
5058 else
5059 submode = V16QImode;
5061 save_location loc;
5062 loc.reg = gen_rtx_REG (submode, regno);
5063 if (vec_flags & VEC_SVE_PRED)
5065 gcc_assert (PR_REGNUM_P (regno));
5066 loc.group = MEM_SVE_PRED;
5068 else
5070 gcc_assert (FP_REGNUM_P (regno));
5071 if (known_le (GET_MODE_SIZE (submode), 8))
5072 loc.group = GPR;
5073 else if (known_eq (GET_MODE_SIZE (submode), 16))
5074 loc.group = MEM_128;
5075 else
5076 loc.group = MEM_SVE_DATA;
5078 loc.index = m_group_count[loc.group]++;
5079 m_save_locations.quick_push (loc);
5083 /* Record that the arguments to CALL_INSN need to be preserved around
5084 the mode switch. */
5086 void
5087 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
5089 for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
5090 node; node = XEXP (node, 1))
5092 rtx item = XEXP (node, 0);
5093 if (GET_CODE (item) != USE)
5094 continue;
5095 item = XEXP (item, 0);
5096 if (!REG_P (item))
5097 continue;
5098 add_reg (GET_MODE (item), REGNO (item));
5102 /* Record that the return value from CALL_INSN (if any) needs to be
5103 preserved around the mode switch. */
5105 void
5106 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
5108 rtx pat = PATTERN (call_insn);
5109 gcc_assert (GET_CODE (pat) == PARALLEL);
5110 pat = XVECEXP (pat, 0, 0);
5111 if (GET_CODE (pat) == CALL)
5112 return;
5113 rtx dest = SET_DEST (pat);
5114 if (GET_CODE (dest) == PARALLEL)
5115 for (int i = 0; i < XVECLEN (dest, 0); ++i)
5117 rtx x = XVECEXP (dest, 0, i);
5118 gcc_assert (GET_CODE (x) == EXPR_LIST);
5119 rtx reg = XEXP (x, 0);
5120 add_reg (GET_MODE (reg), REGNO (reg));
5122 else
5123 add_reg (GET_MODE (dest), REGNO (dest));
5126 /* REGNO is a register that is call-preserved under the current function's ABI.
5127 Record that it must be preserved around the mode switch. */
5129 void
5130 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno)
5132 if (FP_REGNUM_P (regno))
5133 switch (crtl->abi->id ())
5135 case ARM_PCS_SVE:
5136 add_reg (VNx16QImode, regno);
5137 break;
5138 case ARM_PCS_SIMD:
5139 add_reg (V16QImode, regno);
5140 break;
5141 case ARM_PCS_AAPCS64:
5142 add_reg (DImode, regno);
5143 break;
5144 default:
5145 gcc_unreachable ();
5147 else if (PR_REGNUM_P (regno))
5148 add_reg (VNx16BImode, regno);
5151 /* The hard registers in REGS are call-preserved under the current function's
5152 ABI. Record that they must be preserved around the mode switch. */
5154 void
5155 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs)
5157 bitmap_iterator bi;
5158 unsigned int regno;
5159 EXECUTE_IF_SET_IN_BITMAP (regs, 0, regno, bi)
5160 if (HARD_REGISTER_NUM_P (regno))
5161 add_call_preserved_reg (regno);
5162 else
5163 break;
5166 /* Emit code to save registers before the mode switch. */
5168 void
5169 aarch64_sme_mode_switch_regs::emit_prologue ()
5171 emit_sve_sp_adjust (PROLOGUE);
5172 emit_sve_pred_moves (PROLOGUE);
5173 emit_sve_data_moves (PROLOGUE);
5174 emit_mem_128_moves (PROLOGUE);
5175 emit_gpr_moves (PROLOGUE);
5178 /* Emit code to restore registers after the mode switch. */
5180 void
5181 aarch64_sme_mode_switch_regs::emit_epilogue ()
5183 emit_gpr_moves (EPILOGUE);
5184 emit_mem_128_moves (EPILOGUE);
5185 emit_sve_pred_moves (EPILOGUE);
5186 emit_sve_data_moves (EPILOGUE);
5187 emit_sve_sp_adjust (EPILOGUE);
5190 /* The SVE predicate registers are stored below the SVE data registers,
5191 with the predicate save area being padded to a data-register-sized
5192 boundary. Return the size of this padded area as a whole number
5193 of data register slots. */
5195 unsigned int
5196 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5198 return CEIL (m_group_count[MEM_SVE_PRED], 8);
5201 /* Return a memory reference of mode MODE to OFFSET bytes from the
5202 stack pointer. */
5205 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
5206 poly_int64 offset)
5208 rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
5209 return gen_rtx_MEM (mode, addr);
5212 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5214 void
5215 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
5216 poly_int64 size)
5218 if (seq == PROLOGUE)
5219 size = -size;
5220 emit_insn (gen_rtx_SET (stack_pointer_rtx,
5221 plus_constant (Pmode, stack_pointer_rtx, size)));
5224 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5225 the stack pointer. SEQ chooses between saving and restoring. */
5227 void
5228 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
5229 const save_location &loc,
5230 poly_int64 offset)
5232 rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
5233 if (seq == PROLOGUE)
5234 emit_move_insn (mem, loc.reg);
5235 else
5236 emit_move_insn (loc.reg, mem);
5239 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5240 saving and restoring. */
5242 void
5243 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
5245 for (auto &loc : m_save_locations)
5246 if (loc.group == GPR)
5248 gcc_assert (loc.index < 8);
5249 rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
5250 if (seq == PROLOGUE)
5251 emit_move_insn (gpr, loc.reg);
5252 else
5253 emit_move_insn (loc.reg, gpr);
5257 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5258 between saving and restoring. */
5260 void
5261 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
5263 HOST_WIDE_INT count = m_group_count[MEM_128];
5264 if (count == 0)
5265 return;
5267 auto sp = stack_pointer_rtx;
5268 auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
5270 /* Pick a common mode that supports LDR & STR with pre/post-modification
5271 and LDP & STP with pre/post-modification. */
5272 auto mode = TFmode;
5274 /* An instruction pattern that should be emitted at the end. */
5275 rtx last_pat = NULL_RTX;
5277 /* A previous MEM_128 location that hasn't been handled yet. */
5278 save_location *prev_loc = nullptr;
5280 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5281 for (auto &loc : m_save_locations)
5282 if (loc.group == MEM_128)
5284 if (!prev_loc)
5286 prev_loc = &loc;
5287 continue;
5289 gcc_assert (loc.index == prev_loc->index + 1);
5291 /* The offset of the base of the save area from the current
5292 stack pointer. */
5293 HOST_WIDE_INT bias = 0;
5294 if (prev_loc->index == 0 && seq == PROLOGUE)
5295 bias = sp_adjust;
5297 /* Get the two sets in the LDP/STP. */
5298 rtx ops[] = {
5299 gen_rtx_REG (mode, REGNO (prev_loc->reg)),
5300 get_slot_mem (mode, prev_loc->index * 16 + bias),
5301 gen_rtx_REG (mode, REGNO (loc.reg)),
5302 get_slot_mem (mode, loc.index * 16 + bias)
5304 unsigned int lhs = (seq == PROLOGUE);
5305 rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
5306 rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
5308 /* Combine the sets with any stack allocation/deallocation. */
5309 rtx pat;
5310 if (prev_loc->index == 0)
5312 rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
5313 rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
5314 pat = gen_rtx_PARALLEL (VOIDmode, vec);
5316 else if (seq == PROLOGUE)
5317 pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
5318 else
5319 pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
5321 /* Queue a deallocation to the end, otherwise emit the
5322 instruction now. */
5323 if (seq == EPILOGUE && prev_loc->index == 0)
5324 last_pat = pat;
5325 else
5326 emit_insn (pat);
5327 prev_loc = nullptr;
5330 /* Handle any leftover LDR/STR. */
5331 if (prev_loc)
5333 rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
5334 rtx addr;
5335 if (prev_loc->index != 0)
5336 addr = plus_constant (Pmode, sp, prev_loc->index * 16);
5337 else if (seq == PROLOGUE)
5339 rtx allocate = plus_constant (Pmode, sp, -count * 16);
5340 addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
5342 else
5344 rtx deallocate = plus_constant (Pmode, sp, count * 16);
5345 addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
5347 rtx mem = gen_rtx_MEM (mode, addr);
5348 if (seq == PROLOGUE)
5349 emit_move_insn (mem, reg);
5350 else
5351 emit_move_insn (reg, mem);
5354 if (last_pat)
5355 emit_insn (last_pat);
5358 /* Allocate or deallocate the stack space needed by the SVE groups.
5359 SEQ chooses between allocating and deallocating. */
5361 void
5362 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
5364 if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
5365 emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
5368 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5369 and restoring. */
5371 void
5372 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
5374 for (auto &loc : m_save_locations)
5375 if (loc.group == MEM_SVE_DATA)
5377 auto index = loc.index + sve_data_headroom ();
5378 emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
5382 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5383 and restoring. */
5385 void
5386 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
5388 for (auto &loc : m_save_locations)
5389 if (loc.group == MEM_SVE_PRED)
5390 emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
5393 /* Set DEST to (vec_series BASE STEP). */
5395 static void
5396 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5398 machine_mode mode = GET_MODE (dest);
5399 scalar_mode inner = GET_MODE_INNER (mode);
5401 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5402 if (!aarch64_sve_index_immediate_p (base))
5403 base = force_reg (inner, base);
5404 if (!aarch64_sve_index_immediate_p (step))
5405 step = force_reg (inner, step);
5407 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
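/* On SVE this typically becomes a single INDEX instruction, whose
   immediate forms accept the [-16, 15] range mentioned above, e.g.
   "index\tz0.s, #0, #1" for base 0 and step 1; out-of-range operands
   are first forced into scalar registers by the code above.  */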
5410 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5411 register of mode MODE. Use TARGET for the result if it's nonnull
5412 and convenient.
5414 The two vector modes must have the same element mode. The behavior
5415 is to duplicate architectural lane N of SRC into architectural lanes
5416 N + I * STEP of the result. On big-endian targets, architectural
5417 lane 0 of an Advanced SIMD vector is the last element of the vector
5418 in memory layout, so for big-endian targets this operation has the
5419 effect of reversing SRC before duplicating it. Callers need to
5420 account for this. */
5423 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5425 machine_mode src_mode = GET_MODE (src);
5426 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5427 insn_code icode = (BYTES_BIG_ENDIAN
5428 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5429 : code_for_aarch64_vec_duplicate_vq_le (mode));
5431 unsigned int i = 0;
5432 expand_operand ops[3];
5433 create_output_operand (&ops[i++], target, mode);
5434 create_output_operand (&ops[i++], src, src_mode);
5435 if (BYTES_BIG_ENDIAN)
5437 /* Create a PARALLEL describing the reversal of SRC. */
5438 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5439 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5440 nelts_per_vq - 1, -1);
5441 create_fixed_operand (&ops[i++], sel);
5443 expand_insn (icode, i, ops);
5444 return ops[0].value;
5447 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5448 the memory image into DEST. Return true on success. */
5450 static bool
5451 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5453 src = force_const_mem (GET_MODE (src), src);
5454 if (!src)
5455 return false;
5457 /* Make sure that the address is legitimate. */
5458 if (!aarch64_sve_ld1rq_operand_p (src))
5460 rtx addr = force_reg (Pmode, XEXP (src, 0));
5461 src = replace_equiv_address (src, addr);
5464 machine_mode mode = GET_MODE (dest);
5465 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5466 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5467 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5468 return true;
5471 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5472 by N "background" values. Try to move it into TARGET using:
5474 PTRUE PRED.<T>, VL<N>
5475 MOV TRUE.<T>, #<foreground>
5476 MOV FALSE.<T>, #<background>
5477 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5479 The PTRUE is always a single instruction but the MOVs might need a
5480 longer sequence. If the background value is zero (as it often is),
5481 the sequence can sometimes collapse to a PTRUE followed by a
5482 zero-predicated move.
5484 Return the target on success, otherwise return null. */
5486 static rtx
5487 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5489 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5491 /* Make sure that the PTRUE is valid. */
5492 machine_mode mode = GET_MODE (src);
5493 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5494 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5495 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5496 == AARCH64_NUM_SVPATTERNS)
5497 return NULL_RTX;
5499 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5500 rtx_vector_builder true_builder (mode, npatterns, 1);
5501 rtx_vector_builder false_builder (mode, npatterns, 1);
5502 for (unsigned int i = 0; i < npatterns; ++i)
5504 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5505 pred_builder.quick_push (CONST1_RTX (BImode));
5507 for (unsigned int i = 0; i < npatterns; ++i)
5509 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5510 pred_builder.quick_push (CONST0_RTX (BImode));
5512 expand_operand ops[4];
5513 create_output_operand (&ops[0], target, mode);
5514 create_input_operand (&ops[1], true_builder.build (), mode);
5515 create_input_operand (&ops[2], false_builder.build (), mode);
5516 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5517 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5518 return target;
5521 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5522 SVE data mode and isn't a legitimate constant. Use TARGET for the
5523 result if convenient.
5525 The returned register can have whatever mode seems most natural
5526 given the contents of SRC. */
5528 static rtx
5529 aarch64_expand_sve_const_vector (rtx target, rtx src)
5531 machine_mode mode = GET_MODE (src);
5532 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5533 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5534 scalar_mode elt_mode = GET_MODE_INNER (mode);
5535 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5536 unsigned int container_bits = aarch64_sve_container_bits (mode);
5537 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5539 if (nelts_per_pattern == 1
5540 && encoded_bits <= 128
5541 && container_bits != elt_bits)
5543 /* We have a partial vector mode and a constant whose full-vector
5544 equivalent would occupy a repeating 128-bit sequence. Build that
5545 full-vector equivalent instead, so that we have the option of
5546 using LD1RQ and Advanced SIMD operations. */
5547 unsigned int repeat = container_bits / elt_bits;
5548 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5549 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5550 for (unsigned int i = 0; i < npatterns; ++i)
5551 for (unsigned int j = 0; j < repeat; ++j)
5552 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5553 target = aarch64_target_reg (target, full_mode);
5554 return aarch64_expand_sve_const_vector (target, builder.build ());
5557 if (nelts_per_pattern == 1 && encoded_bits == 128)
5559 /* The constant is a duplicated quadword but can't be narrowed
5560 beyond a quadword. Get the memory image of the first quadword
5561 as a 128-bit vector and try using LD1RQ to load it from memory.
5563 The effect for both endiannesses is to load memory lane N into
5564 architectural lanes N + I * STEP of the result. On big-endian
5565 targets, the layout of the 128-bit vector in an Advanced SIMD
5566 register would be different from its layout in an SVE register,
5567 but this 128-bit vector is a memory value only. */
5568 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5569 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5570 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5571 return target;
5574 if (nelts_per_pattern == 1 && encoded_bits < 128)
5576 /* The vector is a repeating sequence of 64 bits or fewer.
5577      See if we can load it using an Advanced SIMD move and then
5578 duplicate it to fill a vector. This is better than using a GPR
5579 move because it keeps everything in the same register file. */
5580 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5581 rtx_vector_builder builder (vq_mode, npatterns, 1);
5582 for (unsigned int i = 0; i < npatterns; ++i)
5584 /* We want memory lane N to go into architectural lane N,
5585 so reverse for big-endian targets. The DUP .Q pattern
5586 has a compensating reverse built-in. */
5587 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5588 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5590 rtx vq_src = builder.build ();
5591 if (aarch64_simd_valid_immediate (vq_src, NULL))
5593 vq_src = force_reg (vq_mode, vq_src);
5594 return aarch64_expand_sve_dupq (target, mode, vq_src);
5597 /* Get an integer representation of the repeating part of Advanced
5598 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5599 which for big-endian targets is lane-swapped wrt a normal
5600 Advanced SIMD vector. This means that for both endiannesses,
5601 memory lane N of SVE vector SRC corresponds to architectural
5602 lane N of a register holding VQ_SRC. This in turn means that
5603 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5604 as a single 128-bit value) and thus that memory lane 0 of SRC is
5605 in the lsb of the integer. Duplicating the integer therefore
5606 ensures that memory lane N of SRC goes into architectural lane
5607 N + I * INDEX of the SVE register. */
5608 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5609 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5610 if (elt_value)
5612 /* Pretend that we had a vector of INT_MODE to start with. */
5613 elt_mode = int_mode;
5614 mode = aarch64_full_sve_mode (int_mode).require ();
5616 /* If the integer can be moved into a general register by a
5617 single instruction, do that and duplicate the result. */
5618 if (CONST_INT_P (elt_value)
5619 && aarch64_move_imm (INTVAL (elt_value),
5620 encoded_bits <= 32 ? SImode : DImode))
5622 elt_value = force_reg (elt_mode, elt_value);
5623 return expand_vector_broadcast (mode, elt_value);
5626 else if (npatterns == 1)
5627 /* We're duplicating a single value, but can't do better than
5628 force it to memory and load from there. This handles things
5629 like symbolic constants. */
5630 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5632 if (elt_value)
5634 /* Load the element from memory if we can, otherwise move it into
5635 a register and use a DUP. */
5636 rtx op = force_const_mem (elt_mode, elt_value);
5637 if (!op)
5638 op = force_reg (elt_mode, elt_value);
5639 return expand_vector_broadcast (mode, op);
5643 /* Try using INDEX. */
5644 rtx base, step;
5645 if (const_vec_series_p (src, &base, &step))
5647 aarch64_expand_vec_series (target, base, step);
5648 return target;
5651 /* From here on, it's better to force the whole constant to memory
5652 if we can. */
5653 if (GET_MODE_NUNITS (mode).is_constant ())
5654 return NULL_RTX;
5656 if (nelts_per_pattern == 2)
5657 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5658 return res;
5660 /* Expand each pattern individually. */
5661 gcc_assert (npatterns > 1);
5662 rtx_vector_builder builder;
5663 auto_vec<rtx, 16> vectors (npatterns);
5664 for (unsigned int i = 0; i < npatterns; ++i)
5666 builder.new_vector (mode, 1, nelts_per_pattern);
5667 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5668 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5669 vectors.quick_push (force_reg (mode, builder.build ()));
5672 /* Use permutes to interleave the separate vectors. */
5673 while (npatterns > 1)
5675 npatterns /= 2;
5676 for (unsigned int i = 0; i < npatterns; ++i)
5678 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5679 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5680 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5681 vectors[i] = tmp;
5684 gcc_assert (vectors[0] == target);
5685 return target;
5688 /* Use WHILE to set a predicate register of mode MODE in which the first
5689 VL bits are set and the rest are clear. Use TARGET for the register
5690 if it's nonnull and convenient. */
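/* As an illustrative sketch, for MODE == VNx8BImode and VL == 3 the code
   below emits roughly:

	mov	x1, 3
	whilelo	p0.h, xzr, x1

   which leaves the first three .H predicate lanes active; register
   numbers here are arbitrary.  */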
5692 static rtx
5693 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5694 unsigned int vl)
5696 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5697 target = aarch64_target_reg (target, mode);
5698 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5699 target, const0_rtx, limit));
5700 return target;
5703 static rtx
5704 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5706 /* BUILDER is a constant predicate in which the index of every set bit
5707 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5708 by inverting every element at a multiple of ELT_SIZE and EORing the
5709 result with an ELT_SIZE PTRUE.
5711 Return a register that contains the constant on success, otherwise
5712 return null. Use TARGET as the register if it is nonnull and
5713 convenient. */
5715 static rtx
5716 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5717 unsigned int elt_size)
5719 /* Invert every element at a multiple of ELT_SIZE, keeping the
5720 other bits zero. */
5721 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5722 builder.nelts_per_pattern ());
5723 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5724 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5725 inv_builder.quick_push (const1_rtx);
5726 else
5727 inv_builder.quick_push (const0_rtx);
5728 inv_builder.finalize ();
5730 /* See if we can load the constant cheaply. */
5731 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5732 if (!inv)
5733 return NULL_RTX;
5735 /* EOR the result with an ELT_SIZE PTRUE. */
5736 rtx mask = aarch64_ptrue_all (elt_size);
5737 mask = force_reg (VNx16BImode, mask);
5738 inv = gen_lowpart (VNx16BImode, inv);
5739 target = aarch64_target_reg (target, VNx16BImode);
5740 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5741 return target;
5744 /* BUILDER is a constant predicate in which the index of every set bit
5745 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5746 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5747 register on success, otherwise return null. Use TARGET as the register
5748 if nonnull and convenient. */
5750 static rtx
5751 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5752 unsigned int elt_size,
5753 unsigned int permute_size)
5755 /* We're going to split the constant into two new constants A and B,
5756 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5757 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5759 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5760 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5762 where _ indicates elements that will be discarded by the permute.
5764 First calculate the ELT_SIZEs for A and B. */
5765 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5766 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5767 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5768 if (INTVAL (builder.elt (i)) != 0)
5770 if (i & permute_size)
5771 b_elt_size |= i - permute_size;
5772 else
5773 a_elt_size |= i;
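  /* Each x &= -x below isolates the lowest set bit of x, for example
     (0b1100 & -0b1100) == 0b0100.  The result is the largest power of
     two that divides every index ORed in above (and the initial
     DImode size).  */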
5775 a_elt_size &= -a_elt_size;
5776 b_elt_size &= -b_elt_size;
5778 /* Now construct the vectors themselves. */
5779 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5780 builder.nelts_per_pattern ());
5781 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5782 builder.nelts_per_pattern ());
5783 unsigned int nelts = builder.encoded_nelts ();
5784 for (unsigned int i = 0; i < nelts; ++i)
5785 if (i & (elt_size - 1))
5787 a_builder.quick_push (const0_rtx);
5788 b_builder.quick_push (const0_rtx);
5790 else if ((i & permute_size) == 0)
5792 /* The A and B elements are significant. */
5793 a_builder.quick_push (builder.elt (i));
5794 b_builder.quick_push (builder.elt (i + permute_size));
5796 else
5798 /* The A and B elements are going to be discarded, so pick whatever
5799 is likely to give a nice constant. We are targeting element
5800 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5801 with the aim of each being a sequence of ones followed by
5802 a sequence of zeros. So:
5804 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5805 duplicate the last X_ELT_SIZE element, to extend the
5806 current sequence of ones or zeros.
5808 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5809 zero, so that the constant really does have X_ELT_SIZE and
5810 not a smaller size. */
5811 if (a_elt_size > permute_size)
5812 a_builder.quick_push (const0_rtx);
5813 else
5814 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5815 if (b_elt_size > permute_size)
5816 b_builder.quick_push (const0_rtx);
5817 else
5818 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5820 a_builder.finalize ();
5821 b_builder.finalize ();
5823 /* Try loading A into a register. */
5824 rtx_insn *last = get_last_insn ();
5825 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5826 if (!a)
5827 return NULL_RTX;
5829 /* Try loading B into a register. */
5830 rtx b = a;
5831 if (a_builder != b_builder)
5833 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5834 if (!b)
5836 delete_insns_since (last);
5837 return NULL_RTX;
5841 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5842 operands but permutes them as though they had mode MODE. */
5843 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5844 target = aarch64_target_reg (target, GET_MODE (a));
5845 rtx type_reg = CONST0_RTX (mode);
5846 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5847 return target;
5850 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5851 constant in BUILDER into an SVE predicate register. Return the register
5852 on success, otherwise return null. Use TARGET for the register if
5853 nonnull and convenient.
5855 ALLOW_RECURSE_P is true if we can use methods that would call this
5856 function recursively. */
5858 static rtx
5859 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5860 bool allow_recurse_p)
5862 if (builder.encoded_nelts () == 1)
5863 /* A PFALSE or a PTRUE .B ALL. */
5864 return aarch64_emit_set_immediate (target, builder);
5866 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5867 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5869 /* If we can load the constant using PTRUE, use it as-is. */
5870 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5871 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5872 return aarch64_emit_set_immediate (target, builder);
5874 /* Otherwise use WHILE to set the first VL bits. */
5875 return aarch64_sve_move_pred_via_while (target, mode, vl);
5878 if (!allow_recurse_p)
5879 return NULL_RTX;
5881 /* Try inverting the vector in element size ELT_SIZE and then EORing
5882 the result with an ELT_SIZE PTRUE. */
5883 if (INTVAL (builder.elt (0)) == 0)
5884 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5885 elt_size))
5886 return res;
5888 /* Try using TRN1 to permute two simpler constants. */
5889 for (unsigned int i = elt_size; i <= 8; i *= 2)
5890 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5891 elt_size, i))
5892 return res;
5894 return NULL_RTX;
5897 /* Return an SVE predicate register that contains the VNx16BImode
5898 constant in BUILDER, without going through the move expanders.
5900 The returned register can have whatever mode seems most natural
5901 given the contents of BUILDER. Use TARGET for the result if
5902 convenient. */
5904 static rtx
5905 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5907 /* Try loading the constant using pure predicate operations. */
5908 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5909 return res;
5911 /* Try forcing the constant to memory. */
5912 if (builder.full_nelts ().is_constant ())
5913 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5915 target = aarch64_target_reg (target, VNx16BImode);
5916 emit_move_insn (target, mem);
5917 return target;
5920 /* The last resort is to load the constant as an integer and then
5921 compare it against zero. Use -1 for set bits in order to increase
5922    the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
5923 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5924 builder.nelts_per_pattern ());
5925 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5926 int_builder.quick_push (INTVAL (builder.elt (i))
5927 ? constm1_rtx : const0_rtx);
5928 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5929 int_builder.build ());
5932 /* Set DEST to immediate IMM. */
5934 void
5935 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5937 machine_mode mode = GET_MODE (dest);
5939 /* Check on what type of symbol it is. */
5940 scalar_int_mode int_mode;
5941 if ((SYMBOL_REF_P (imm)
5942 || LABEL_REF_P (imm)
5943 || GET_CODE (imm) == CONST
5944 || GET_CODE (imm) == CONST_POLY_INT)
5945 && is_a <scalar_int_mode> (mode, &int_mode))
5947 rtx mem;
5948 poly_int64 offset;
5949 HOST_WIDE_INT const_offset;
5950 enum aarch64_symbol_type sty;
5952 /* If we have (const (plus symbol offset)), separate out the offset
5953 before we start classifying the symbol. */
5954 rtx base = strip_offset (imm, &offset);
5956 /* We must always add an offset involving VL separately, rather than
5957 folding it into the relocation. */
5958 if (!offset.is_constant (&const_offset))
5960 if (!TARGET_SVE)
5962 aarch64_report_sve_required ();
5963 return;
5965 if (base == const0_rtx
5966 && (aarch64_sve_cnt_immediate_p (offset)
5967 || aarch64_sve_rdvl_immediate_p (offset)))
5968 emit_insn (gen_rtx_SET (dest, imm));
5969 else
5971 /* Do arithmetic on 32-bit values if the result is smaller
5972 than that. */
5973 if (partial_subreg_p (int_mode, SImode))
5975 /* It is invalid to do symbol calculations in modes
5976 narrower than SImode. */
5977 gcc_assert (base == const0_rtx);
5978 dest = gen_lowpart (SImode, dest);
5979 int_mode = SImode;
5981 if (base != const0_rtx)
5983 base = aarch64_force_temporary (int_mode, dest, base);
5984 aarch64_add_offset (int_mode, dest, base, offset,
5985 NULL_RTX, NULL_RTX, 0, false);
5987 else
5988 aarch64_add_offset (int_mode, dest, base, offset,
5989 dest, NULL_RTX, 0, false);
5991 return;
5994 if (aarch64_rdsvl_immediate_p (base))
5996 /* We could handle non-constant offsets if they are ever
5997 generated. */
5998 gcc_assert (const_offset == 0);
5999 emit_insn (gen_rtx_SET (dest, imm));
6000 return;
6003 sty = aarch64_classify_symbol (base, const_offset);
6004 switch (sty)
6006 case SYMBOL_FORCE_TO_MEM:
6007 if (int_mode != ptr_mode)
6008 imm = convert_memory_address (ptr_mode, imm);
6010 if (const_offset != 0
6011 && targetm.cannot_force_const_mem (ptr_mode, imm))
6013 gcc_assert (can_create_pseudo_p ());
6014 base = aarch64_force_temporary (int_mode, dest, base);
6015 aarch64_add_offset (int_mode, dest, base, const_offset,
6016 NULL_RTX, NULL_RTX, 0, false);
6017 return;
6020 mem = force_const_mem (ptr_mode, imm);
6021 gcc_assert (mem);
6023 /* If we aren't generating PC relative literals, then
6024 we need to expand the literal pool access carefully.
6025 This is something that needs to be done in a number
6026 of places, so could well live as a separate function. */
6027 if (!aarch64_pcrelative_literal_loads)
6029 gcc_assert (can_create_pseudo_p ());
6030 base = gen_reg_rtx (ptr_mode);
6031 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6032 if (ptr_mode != Pmode)
6033 base = convert_memory_address (Pmode, base);
6034 mem = gen_rtx_MEM (ptr_mode, base);
6037 if (int_mode != ptr_mode)
6038 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6040 emit_insn (gen_rtx_SET (dest, mem));
6042 return;
6044 case SYMBOL_SMALL_TLSGD:
6045 case SYMBOL_SMALL_TLSDESC:
6046 case SYMBOL_SMALL_TLSIE:
6047 case SYMBOL_SMALL_GOT_28K:
6048 case SYMBOL_SMALL_GOT_4G:
6049 case SYMBOL_TINY_GOT:
6050 case SYMBOL_TINY_TLSIE:
6051 if (const_offset != 0)
6053      gcc_assert (can_create_pseudo_p ());
6054 base = aarch64_force_temporary (int_mode, dest, base);
6055 aarch64_add_offset (int_mode, dest, base, const_offset,
6056 NULL_RTX, NULL_RTX, 0, false);
6057 return;
6059 /* FALLTHRU */
6061 case SYMBOL_SMALL_ABSOLUTE:
6062 case SYMBOL_TINY_ABSOLUTE:
6063 case SYMBOL_TLSLE12:
6064 case SYMBOL_TLSLE24:
6065 case SYMBOL_TLSLE32:
6066 case SYMBOL_TLSLE48:
6067 aarch64_load_symref_appropriately (dest, imm, sty);
6068 return;
6070 default:
6071 gcc_unreachable ();
6075 if (!CONST_INT_P (imm))
6077 if (aarch64_sve_pred_mode_p (mode))
6079 /* Only the low bit of each .H, .S and .D element is defined,
6080 so we can set the upper bits to whatever we like. If the
6081 predicate is all-true in MODE, prefer to set all the undefined
6082 bits as well, so that we can share a single .B predicate for
6083 all modes. */
6084 if (imm == CONSTM1_RTX (mode))
6085 imm = CONSTM1_RTX (VNx16BImode);
6087 /* All methods for constructing predicate modes wider than VNx16BI
6088 will set the upper bits of each element to zero. Expose this
6089 by moving such constants as a VNx16BI, so that all bits are
6090 significant and so that constants for different modes can be
6091 shared. The wider constant will still be available as a
6092 REG_EQUAL note. */
6093 rtx_vector_builder builder;
6094 if (aarch64_get_sve_pred_bits (builder, imm))
6096 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6097 if (dest != res)
6098 emit_move_insn (dest, gen_lowpart (mode, res));
6099 return;
6103 if (GET_CODE (imm) == HIGH
6104 || aarch64_simd_valid_immediate (imm, NULL))
6106 emit_insn (gen_rtx_SET (dest, imm));
6107 return;
6110 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6111 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6113 if (dest != res)
6114 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6115 return;
6118 rtx mem = force_const_mem (mode, imm);
6119 gcc_assert (mem);
6120 emit_move_insn (dest, mem);
6121 return;
6124 aarch64_internal_mov_immediate (dest, imm, true, mode);
6127 /* Return the MEM rtx that provides the canary value that should be used
6128 for stack-smashing protection. MODE is the mode of the memory.
6129 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6130 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6131 indicates whether the caller is performing a SET or a TEST operation. */
6133 rtx
6134 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6135 aarch64_salt_type salt_type)
6137 rtx addr;
6138 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6140 gcc_assert (MEM_P (decl_rtl));
6141 addr = XEXP (decl_rtl, 0);
6142 poly_int64 offset;
6143 rtx base = strip_offset_and_salt (addr, &offset);
6144 if (!SYMBOL_REF_P (base))
6145 return decl_rtl;
6147 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6148 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6149 addr = gen_rtx_CONST (Pmode, addr);
6150 addr = plus_constant (Pmode, addr, offset);
6152 else
6154 /* Calculate the address from the system register. */
6155 rtx salt = GEN_INT (salt_type);
6156 addr = gen_reg_rtx (mode);
6157 if (mode == DImode)
6158 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6159 else
6161 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6162 addr = convert_memory_address (Pmode, addr);
6164 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6166 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6169 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6170 that is known to contain PTRUE. */
6172 void
6173 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6175 expand_operand ops[3];
6176 machine_mode mode = GET_MODE (dest);
6177 create_output_operand (&ops[0], dest, mode);
6178   create_input_operand (&ops[1], pred, GET_MODE (pred));
6179 create_input_operand (&ops[2], src, mode);
6180 temporary_volatile_ok v (true);
6181 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6184 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6185 operand is in memory. In this case we need to use the predicated LD1
6186 and ST1 instead of LDR and STR, both for correctness on big-endian
6187 targets and because LD1 and ST1 support a wider range of addressing modes.
6188 PRED_MODE is the mode of the predicate.
6190 See the comment at the head of aarch64-sve.md for details about the
6191 big-endian handling. */
6193 void
6194 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6196 machine_mode mode = GET_MODE (dest);
6197 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6198 if (!register_operand (src, mode)
6199 && !register_operand (dest, mode))
6201 rtx tmp = gen_reg_rtx (mode);
6202 if (MEM_P (src))
6203 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6204 else
6205 emit_move_insn (tmp, src);
6206 src = tmp;
6208 aarch64_emit_sve_pred_move (dest, ptrue, src);
6211 /* Called only on big-endian targets. See whether an SVE vector move
6212 from SRC to DEST is effectively a REV[BHW] instruction, because at
6213 least one operand is a subreg of an SVE vector that has wider or
6214 narrower elements. Return true and emit the instruction if so.
6216 For example:
6218 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6220 represents a VIEW_CONVERT between the following vectors, viewed
6221 in memory order:
6223 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6224 R1: { [0], [1], [2], [3], ... }
6226 The high part of lane X in R2 should therefore correspond to lane X*2
6227 of R1, but the register representations are:
6229 msb lsb
6230 R2: ...... [1].high [1].low [0].high [0].low
6231 R1: ...... [3] [2] [1] [0]
6233 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6234 We therefore need a reverse operation to swap the high and low values
6235 around.
6237 This is purely an optimization. Without it we would spill the
6238 subreg operand to the stack in one mode and reload it in the
6239 other mode, which has the same effect as the REV. */
6241 bool
6242 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6244 gcc_assert (BYTES_BIG_ENDIAN);
6246 /* Do not try to optimize subregs that LRA has created for matched
6247 reloads. These subregs only exist as a temporary measure to make
6248 the RTL well-formed, but they are exempt from the usual
6249 TARGET_CAN_CHANGE_MODE_CLASS rules.
6251 For example, if we have:
6253 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6255 and the constraints require R1 and R2 to be in the same register,
6256 LRA may need to create RTL such as:
6258 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6259 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6260 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6262 which forces both the input and output of the original instruction
6263 to use the same hard register. But for this to work, the normal
6264 rules have to be suppressed on the subreg input, otherwise LRA
6265 would need to reload that input too, meaning that the process
6266 would never terminate. To compensate for this, the normal rules
6267 are also suppressed for the subreg output of the first move.
6268 Ignoring the special case and handling the first move normally
6269 would therefore generate wrong code: we would reverse the elements
6270 for the first subreg but not reverse them back for the second subreg. */
6271 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6272 dest = SUBREG_REG (dest);
6273 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6274 src = SUBREG_REG (src);
6276 /* The optimization handles two single SVE REGs with different element
6277 sizes. */
6278 if (!REG_P (dest)
6279 || !REG_P (src)
6280 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6281 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6282 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6283 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6284 return false;
6286 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6287 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6288 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6289 UNSPEC_REV_SUBREG);
6290 emit_insn (gen_rtx_SET (dest, unspec));
6291 return true;
6294 /* Return a copy of X with mode MODE, without changing its other
6295 attributes. Unlike gen_lowpart, this doesn't care whether the
6296 mode change is valid. */
6298 rtx
6299 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6301 if (GET_MODE (x) == mode)
6302 return x;
6304 x = shallow_copy_rtx (x);
6305 set_mode_and_regno (x, mode, REGNO (x));
6306 return x;
6309 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6310 stored in wider integer containers. */
6312 static unsigned int
6313 aarch64_sve_rev_unspec (machine_mode mode)
6315 switch (GET_MODE_UNIT_SIZE (mode))
6317 case 1: return UNSPEC_REVB;
6318 case 2: return UNSPEC_REVH;
6319 case 4: return UNSPEC_REVW;
6321 gcc_unreachable ();
6324 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6325 operands. */
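/* For example, a big-endian move between a VNx16QI register and a
   VNx8HI view of it is split into a byte reverse within .H containers,
   roughly:  revb  z0.h, p0/m, z1.h  (register numbers illustrative).  */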
6327 void
6328 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6330 /* Decide which REV operation we need. The mode with wider elements
6331 determines the mode of the operands and the mode with the narrower
6332 elements determines the reverse width. */
6333 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6334 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6335 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6336 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6337 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6339 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6340 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6342 /* Get the operands in the appropriate modes and emit the instruction. */
6343 ptrue = gen_lowpart (pred_mode, ptrue);
6344 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6345 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6346 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6347 dest, ptrue, src));
6350 static bool
6351 aarch64_function_ok_for_sibcall (tree, tree exp)
6353 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6354 return false;
6356 tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
6357 if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
6358 return false;
6359 for (auto state : { "za", "zt0" })
6360 if (bool (aarch64_cfun_shared_flags (state))
6361 != bool (aarch64_fntype_shared_flags (fntype, state)))
6362 return false;
6363 return true;
6366 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6367 passed in SVE registers. */
6369 static bool
6370 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6371 const function_arg_info &arg)
6373 HOST_WIDE_INT size;
6374 machine_mode dummymode;
6375 int nregs;
6377 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6378 if (arg.mode == BLKmode && arg.type)
6379 size = int_size_in_bytes (arg.type);
6380 else
6381 /* No frontends can create types with variable-sized modes, so we
6382 shouldn't be asked to pass or return them. */
6383 size = GET_MODE_SIZE (arg.mode).to_constant ();
6385 /* Aggregates are passed by reference based on their size. */
6386 if (arg.aggregate_type_p ())
6387 size = int_size_in_bytes (arg.type);
6389   /* Variable-sized arguments are always passed by reference.  */
6390 if (size < 0)
6391 return true;
6393 /* Can this be a candidate to be passed in fp/simd register(s)? */
6394 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6395 &dummymode, &nregs, NULL,
6396 !pcum || pcum->silent_p))
6397 return false;
6399   /* Arguments that are variable-sized or larger than 2 registers are
6400      passed by reference unless they are a homogeneous floating-point
6401      aggregate.  */
6402 return size > 2 * UNITS_PER_WORD;
6405 /* Implement TARGET_PASS_BY_REFERENCE. */
6407 static bool
6408 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6409 const function_arg_info &arg)
6411 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6413 if (!arg.type)
6414 return aarch64_pass_by_reference_1 (pcum, arg);
6416 pure_scalable_type_info pst_info;
6417 switch (pst_info.analyze (arg.type))
6419 case pure_scalable_type_info::IS_PST:
6420 if (pcum && !pcum->silent_p && !TARGET_SVE)
6421 /* We can't gracefully recover at this point, so make this a
6422 fatal error. */
6423 fatal_error (input_location, "arguments of type %qT require"
6424 " the SVE ISA extension", arg.type);
6426 /* Variadic SVE types are passed by reference. Normal non-variadic
6427 arguments are too if we've run out of registers. */
6428 return (!arg.named
6429 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6430 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6432 case pure_scalable_type_info::DOESNT_MATTER:
6433 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6434 return true;
6436 case pure_scalable_type_info::NO_ABI_IDENTITY:
6437 case pure_scalable_type_info::ISNT_PST:
6438 return aarch64_pass_by_reference_1 (pcum, arg);
6440 gcc_unreachable ();
6443 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6444 static bool
6445 aarch64_return_in_msb (const_tree valtype)
6447 machine_mode dummy_mode;
6448 int dummy_int;
6450 /* Never happens in little-endian mode. */
6451 if (!BYTES_BIG_ENDIAN)
6452 return false;
6454 /* Only composite types smaller than or equal to 16 bytes can
6455 be potentially returned in registers. */
6456 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6457 || int_size_in_bytes (valtype) <= 0
6458 || int_size_in_bytes (valtype) > 16)
6459 return false;
6461 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6462 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6463 is always passed/returned in the least significant bits of fp/simd
6464 register(s). */
6465 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6466 &dummy_mode, &dummy_int, NULL,
6467 false))
6468 return false;
6470 /* Likewise pure scalable types for SVE vector and predicate registers. */
6471 pure_scalable_type_info pst_info;
6472 if (pst_info.analyze_registers (valtype))
6473 return false;
6475 return true;
6478 /* Implement TARGET_FUNCTION_VALUE.
6479 Define how to find the value returned by a function. */
6481 static rtx
6482 aarch64_function_value (const_tree type, const_tree func,
6483 bool outgoing ATTRIBUTE_UNUSED)
6485 machine_mode mode;
6486 int unsignedp;
6488 mode = TYPE_MODE (type);
6489 if (INTEGRAL_TYPE_P (type))
6490 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6492 pure_scalable_type_info pst_info;
6493 if (type && pst_info.analyze_registers (type))
6494 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6496 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6497 are returned in memory, not by value. */
6498 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6499 bool sve_p = (vec_flags & VEC_ANY_SVE);
6501 if (aarch64_return_in_msb (type))
6503 HOST_WIDE_INT size = int_size_in_bytes (type);
6505 if (size % UNITS_PER_WORD != 0)
6507 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6508 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6512 int count;
6513 machine_mode ag_mode;
6514 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6515 NULL, false))
6517 gcc_assert (!sve_p);
6518 if (!aarch64_composite_type_p (type, mode))
6520 gcc_assert (count == 1 && mode == ag_mode);
6521 return gen_rtx_REG (mode, V0_REGNUM);
6523 else if (aarch64_advsimd_full_struct_mode_p (mode)
6524 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6525 return gen_rtx_REG (mode, V0_REGNUM);
6526 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6527 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6528 return gen_rtx_REG (mode, V0_REGNUM);
6529 else
6531 int i;
6532 rtx par;
6534 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6535 for (i = 0; i < count; i++)
6537 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6538 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6539 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6540 XVECEXP (par, 0, i) = tmp;
6542 return par;
6545 else
6547 if (sve_p)
6549 /* Vector types can acquire a partial SVE mode using things like
6550 __attribute__((vector_size(N))), and this is potentially useful.
6551 However, the choice of mode doesn't affect the type's ABI
6552 identity, so we should treat the types as though they had
6553 the associated integer mode, just like they did before SVE
6554 was introduced.
6556 We know that the vector must be 128 bits or smaller,
6557 otherwise we'd have returned it in memory instead. */
6558 gcc_assert (type
6559 && (aarch64_some_values_include_pst_objects_p (type)
6560 || (vec_flags & VEC_PARTIAL)));
6562 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6563 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6564 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6565 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6567 return gen_rtx_REG (mode, R0_REGNUM);
6571 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6572 Return true if REGNO is the number of a hard register in which the values
6573 of called function may come back. */
6575 static bool
6576 aarch64_function_value_regno_p (const unsigned int regno)
6578   /* A maximum of 16 bytes can be returned in the general registers.  Examples
6579 of 16-byte return values are: 128-bit integers and 16-byte small
6580 structures (excluding homogeneous floating-point aggregates). */
6581 if (regno == R0_REGNUM || regno == R1_REGNUM)
6582 return true;
6584 /* Up to four fp/simd registers can return a function value, e.g. a
6585 homogeneous floating-point aggregate having four members. */
6586 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6587 return TARGET_FLOAT;
6589 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
6590 return TARGET_SVE;
6592 return false;
6595 /* Subroutine for aarch64_return_in_memory for types that are not returned
6596 in SVE registers. */
6598 static bool
6599 aarch64_return_in_memory_1 (const_tree type)
6601 HOST_WIDE_INT size;
6602 machine_mode ag_mode;
6603 int count;
6605 if (!AGGREGATE_TYPE_P (type)
6606 && TREE_CODE (type) != BITINT_TYPE
6607 && TREE_CODE (type) != COMPLEX_TYPE
6608 && TREE_CODE (type) != VECTOR_TYPE)
6609     /* Simple scalar types are always returned in registers.  */
6610 return false;
6612 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6613 &ag_mode, &count, NULL, false))
6614 return false;
6616   /* Types larger than 2 registers are returned in memory.  */
6617 size = int_size_in_bytes (type);
6618 return (size < 0 || size > 2 * UNITS_PER_WORD);
6621 /* Implement TARGET_RETURN_IN_MEMORY.
6623 If the type T of the result of a function is such that
6624 void func (T arg)
6625 would require that arg be passed as a value in a register (or set of
6626 registers) according to the parameter passing rules, then the result
6627 is returned in the same registers as would be used for such an
6628 argument. */
6630 static bool
6631 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6633 pure_scalable_type_info pst_info;
6634 switch (pst_info.analyze (type))
6636 case pure_scalable_type_info::IS_PST:
6637 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6638 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6640 case pure_scalable_type_info::DOESNT_MATTER:
6641 gcc_assert (aarch64_return_in_memory_1 (type));
6642 return true;
6644 case pure_scalable_type_info::NO_ABI_IDENTITY:
6645 case pure_scalable_type_info::ISNT_PST:
6646 return aarch64_return_in_memory_1 (type);
6648 gcc_unreachable ();
6651 static bool
6652 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6653 const_tree type, int *nregs)
6655 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6656 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6657 &pcum->aapcs_vfp_rmode,
6658 nregs, NULL, pcum->silent_p);
6661 /* Given MODE and TYPE of a function argument, return the alignment in
6662 bits. The idea is to suppress any stronger alignment requested by
6663 the user and opt for the natural alignment (specified in AAPCS64 \S
6664 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6665 was incorrectly calculated in versions of GCC prior to GCC 9.
6666 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6667 calculated in versions between GCC 9 and GCC 13. If the alignment
6668 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6669 is the old GCC 13 alignment, otherwise it is zero.
6671 This is a helper function for local use only. */
6673 static unsigned int
6674 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6675 unsigned int *abi_break_gcc_9,
6676 unsigned int *abi_break_gcc_13,
6677 unsigned int *abi_break_gcc_14)
6679 *abi_break_gcc_9 = 0;
6680 *abi_break_gcc_13 = 0;
6681 *abi_break_gcc_14 = 0;
6682 if (!type)
6683 return GET_MODE_ALIGNMENT (mode);
6685 if (integer_zerop (TYPE_SIZE (type)))
6686 return 0;
6688 gcc_assert (TYPE_MODE (type) == mode);
6690 if (!AGGREGATE_TYPE_P (type))
6692 /* The ABI alignment is the natural alignment of the type, without
6693 any attributes applied. Normally this is the alignment of the
6694 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6695 For now we just handle the known exceptions explicitly. */
6696 type = TYPE_MAIN_VARIANT (type);
6697 if (POINTER_TYPE_P (type))
6699 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
6700 return POINTER_SIZE;
6702 if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
6704 *abi_break_gcc_14 = TYPE_ALIGN (type);
6705 type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
6707 gcc_assert (!TYPE_USER_ALIGN (type));
6708 return TYPE_ALIGN (type);
6711 if (TREE_CODE (type) == ARRAY_TYPE)
6712 return TYPE_ALIGN (TREE_TYPE (type));
6714 unsigned int alignment = 0;
6715 unsigned int bitfield_alignment_with_packed = 0;
6716 unsigned int bitfield_alignment = 0;
6717 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6718 if (TREE_CODE (field) == FIELD_DECL)
6720 /* Note that we explicitly consider zero-sized fields here,
6721 even though they don't map to AAPCS64 machine types.
6722 For example, in:
6724 struct __attribute__((aligned(8))) empty {};
6726 struct s {
6727 [[no_unique_address]] empty e;
6728 int x;
6731 "s" contains only one Fundamental Data Type (the int field)
6732 but gains 8-byte alignment and size thanks to "e". */
6733 alignment = std::max (alignment, DECL_ALIGN (field));
6734 if (DECL_BIT_FIELD_TYPE (field))
6736 /* Take the bit-field type's alignment into account only
6737 if the user didn't reduce this field's alignment with
6738 the packed attribute. */
6739 if (!DECL_PACKED (field))
6740 bitfield_alignment
6741 = std::max (bitfield_alignment,
6742 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6744 /* Compute the alignment even if the bit-field is
6745 packed, so that we can emit a warning in case the
6746 alignment changed between GCC versions. */
6747 bitfield_alignment_with_packed
6748 = std::max (bitfield_alignment_with_packed,
6749 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6753 /* Emit a warning if the alignment is different when taking the
6754 'packed' attribute into account. */
6755 if (bitfield_alignment != bitfield_alignment_with_packed
6756 && bitfield_alignment_with_packed > alignment)
6757 *abi_break_gcc_13 = bitfield_alignment_with_packed;
6759 if (bitfield_alignment > alignment)
6761 *abi_break_gcc_9 = alignment;
6762 return bitfield_alignment;
6765 return alignment;
6768 /* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
6769 _BitInt(N) type. These include ARRAY_TYPE's with an element that is a
6770 _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
6771 with a field member that is a _BitInt(N) or an aggregate that uses it.
6772 Return false otherwise. */
6774 static bool
6775 bitint_or_aggr_of_bitint_p (tree type)
6777 if (!type)
6778 return false;
6780 if (TREE_CODE (type) == BITINT_TYPE)
6781 return true;
6783   /* If ARRAY_TYPE, check its element type.  */
6784 if (TREE_CODE (type) == ARRAY_TYPE)
6785 return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));
6787 /* If RECORD_TYPE or UNION_TYPE, check the fields' types. */
6788 if (RECORD_OR_UNION_TYPE_P (type))
6789 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
6791 if (TREE_CODE (field) != FIELD_DECL)
6792 continue;
6793 if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
6794 return true;
6796 return false;
6799 /* Layout a function argument according to the AAPCS64 rules. The rule
6800 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6801 mode that was originally given to us by the target hook, whereas the
6802 mode in ARG might be the result of replacing partial SVE modes with
6803 the equivalent integer mode. */
6805 static void
6806 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6808 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6809 tree type = arg.type;
6810 machine_mode mode = arg.mode;
6811 int ncrn, nvrn, nregs;
6812 bool allocate_ncrn, allocate_nvrn;
6813 HOST_WIDE_INT size;
6814 unsigned int abi_break_gcc_9;
6815 unsigned int abi_break_gcc_13;
6816 unsigned int abi_break_gcc_14;
6818 /* We need to do this once per argument. */
6819 if (pcum->aapcs_arg_processed)
6820 return;
6822 bool warn_pcs_change
6823 = (warn_psabi
6824 && !pcum->silent_p
6825 && (currently_expanding_function_start
6826 || currently_expanding_gimple_stmt));
6828 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6830 typedef struct foo {
6831 __Int8x16_t foo[2] __attribute__((aligned(32)));
6832 } foo;
6834 is still a HVA despite its larger-than-normal alignment.
6835 However, such over-aligned HFAs and HVAs are guaranteed to have
6836 no padding.
6838 If we exclude HFAs and HVAs from the discussion below, then there
6839 are several things to note:
6841 - Both the C and AAPCS64 interpretations of a type's alignment should
6842 give a value that is no greater than the type's size.
6844 - Types bigger than 16 bytes are passed indirectly.
6846 - If an argument of type T is passed indirectly, TYPE and MODE describe
6847        a pointer to T rather than T itself.
6849 It follows that the AAPCS64 alignment of TYPE must be no greater
6850 than 16 bytes.
6852 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6853 and so could calculate an alignment that was too small. If this
6854 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6856 Although GCC 9.1 fixed that bug, it introduced a different one:
6857 it would consider the alignment of a bitfield's underlying type even
6858 if the field was packed (which should have the effect of overriding
6859 the alignment of the underlying type). This was fixed in GCC 13.1.
6861 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6862 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6863 this older, too-big alignment.
6865 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6866 alignments meant they could calculate type alignments that were
6867 bigger than the type's size, contrary to the assumption above.
6868 The handling of register arguments was nevertheless (and justifiably)
6869 written to follow the assumption that the alignment can never be
6870 greater than the size. The same was not true for stack arguments;
6871 their alignment was instead handled by MIN bounds in
6872 aarch64_function_arg_boundary.
6874 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6875 an alignment of more than 16 bytes for TYPE then:
6877 - If the argument was passed in registers, these GCC versions
6878 would treat the alignment as though it was *less than* 16 bytes.
6880 - If the argument was passed on the stack, these GCC versions
6881 would treat the alignment as though it was *equal to* 16 bytes.
6883 Both behaviors were wrong, but in different cases. */
6885 pcum->aapcs_arg_processed = true;
6887 pure_scalable_type_info pst_info;
6888 if (type && pst_info.analyze_registers (type))
6890 /* aarch64_function_arg_alignment has never had an effect on
6891 this case. */
6893 /* The PCS says that it is invalid to pass an SVE value to an
6894 unprototyped function. There is no ABI-defined location we
6895 can return in this case, so we have no real choice but to raise
6896 an error immediately, even though this is only a query function. */
6897 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6899 gcc_assert (!pcum->silent_p);
6900 error ("SVE type %qT cannot be passed to an unprototyped function",
6901 arg.type);
6902 /* Avoid repeating the message, and avoid tripping the assert
6903 below. */
6904 pcum->pcs_variant = ARM_PCS_SVE;
6907 /* We would have converted the argument into pass-by-reference
6908 form if it didn't fit in registers. */
6909 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6910 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6911 gcc_assert (arg.named
6912 && pcum->pcs_variant == ARM_PCS_SVE
6913 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6914 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6915 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6916 P0_REGNUM + pcum->aapcs_nprn);
6917 return;
6920 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6921 are passed by reference, not by value. */
6922 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6923 bool sve_p = (vec_flags & VEC_ANY_SVE);
6924 if (sve_p)
6925 /* Vector types can acquire a partial SVE mode using things like
6926 __attribute__((vector_size(N))), and this is potentially useful.
6927 However, the choice of mode doesn't affect the type's ABI
6928 identity, so we should treat the types as though they had
6929 the associated integer mode, just like they did before SVE
6930 was introduced.
6932 We know that the vector must be 128 bits or smaller,
6933 otherwise we'd have passed it in memory instead. */
6934 gcc_assert (type
6935 && (aarch64_some_values_include_pst_objects_p (type)
6936 || (vec_flags & VEC_PARTIAL)));
6938   /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
6939 if (type)
6940 size = int_size_in_bytes (type);
6941 else
6942 /* No frontends can create types with variable-sized modes, so we
6943 shouldn't be asked to pass or return them. */
6944 size = GET_MODE_SIZE (mode).to_constant ();
6945 size = ROUND_UP (size, UNITS_PER_WORD);
6947 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6948 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6949 mode,
6950 type,
6951 &nregs);
6952 gcc_assert (!sve_p || !allocate_nvrn);
6954 unsigned int alignment
6955 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
6956 &abi_break_gcc_13, &abi_break_gcc_14);
6958 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
6959 && (!alignment || abi_break_gcc_9 < alignment)
6960 && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
6962 /* _BitInt(N) was only added in GCC 14. */
6963 bool warn_pcs_change_le_gcc14
6964 = warn_pcs_change && !bitint_or_aggr_of_bitint_p (type);
6966   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6967 The following code thus handles passing by SIMD/FP registers first. */
6969 nvrn = pcum->aapcs_nvrn;
6971   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
6972      and homogeneous short-vector aggregates (HVA).  */
6973 if (allocate_nvrn)
6975 /* aarch64_function_arg_alignment has never had an effect on
6976 this case. */
6977 if (!pcum->silent_p && !TARGET_FLOAT)
6978 aarch64_err_no_fpadvsimd (mode);
6980 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6982 pcum->aapcs_nextnvrn = nvrn + nregs;
6983 if (!aarch64_composite_type_p (type, mode))
6985 gcc_assert (nregs == 1);
6986 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6988 else if (aarch64_advsimd_full_struct_mode_p (mode)
6989 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
6990 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6991 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6992 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
6993 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6994 else
6996 rtx par;
6997 int i;
6998 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6999 for (i = 0; i < nregs; i++)
7001 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7002 V0_REGNUM + nvrn + i);
7003 rtx offset = gen_int_mode
7004 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7005 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7006 XVECEXP (par, 0, i) = tmp;
7008 pcum->aapcs_reg = par;
7010 return;
7012 else
7014 /* C.3 NSRN is set to 8. */
7015 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7016 goto on_stack;
7020 ncrn = pcum->aapcs_ncrn;
7021 nregs = size / UNITS_PER_WORD;
7023   /* C6 - C9, though the sign and zero extension semantics are
7024      handled elsewhere.  This is the case where the argument fits
7025      entirely in general registers.  */
7026 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7028 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7030 /* C.8 if the argument has an alignment of 16 then the NGRN is
7031 rounded up to the next even number. */
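      /* For example, a 16-byte-aligned two-register argument that
	 arrives when NGRN is 1 is passed in x2/x3 instead, and x1 is
	 left unused.  */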
7032 if (nregs == 2
7033 && ncrn % 2)
7035 /* Emit a warning if the alignment changed when taking the
7036 'packed' attribute into account. */
7037 if (warn_pcs_change_le_gcc14
7038 && abi_break_gcc_13
7039 && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
7040 != (alignment == 16 * BITS_PER_UNIT)))
7041 inform (input_location, "parameter passing for argument of type "
7042 "%qT changed in GCC 13.1", type);
7044 if (warn_pcs_change_le_gcc14
7045 && abi_break_gcc_14
7046 && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
7047 != (alignment == 16 * BITS_PER_UNIT)))
7048 inform (input_location, "parameter passing for argument of type "
7049 "%qT changed in GCC 14.1", type);
7051 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7052 comparison is there because for > 16 * BITS_PER_UNIT
7053 alignment nregs should be > 2 and therefore it should be
7054 passed by reference rather than value. */
7055 if (alignment == 16 * BITS_PER_UNIT)
7057 if (warn_pcs_change_le_gcc14
7058 && abi_break_gcc_9)
7059 inform (input_location, "parameter passing for argument of type "
7060 "%qT changed in GCC 9.1", type);
7061 ++ncrn;
7062 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7066 /* If an argument with an SVE mode needs to be shifted up to the
7067 high part of the register, treat it as though it had an integer mode.
7068 Using the normal (parallel [...]) would suppress the shifting. */
7069 if (sve_p
7070 && BYTES_BIG_ENDIAN
7071 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7072 && aarch64_pad_reg_upward (mode, type, false))
7074 mode = int_mode_for_mode (mode).require ();
7075 sve_p = false;
7078 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7079 A reg is still generated for it, but the caller should be smart
7080 enough not to use it. */
7081 if (nregs == 0
7082 || (nregs == 1 && !sve_p)
7083 || GET_MODE_CLASS (mode) == MODE_INT)
7084 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7085 else
7087 rtx par;
7088 int i;
7090 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7091 for (i = 0; i < nregs; i++)
7093 scalar_int_mode reg_mode = word_mode;
7094 if (nregs == 1)
7095 reg_mode = int_mode_for_mode (mode).require ();
7096 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7097 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7098 GEN_INT (i * UNITS_PER_WORD));
7099 XVECEXP (par, 0, i) = tmp;
7101 pcum->aapcs_reg = par;
7104 pcum->aapcs_nextncrn = ncrn + nregs;
7105 return;
7108 /* C.11 */
7109 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7111   /* The argument is passed on the stack; record the needed number of words for
7112 this argument and align the total size if necessary. */
7113 on_stack:
7114 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7116 if (warn_pcs_change_le_gcc14
7117 && abi_break_gcc_13
7118 && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
7119 != (alignment >= 16 * BITS_PER_UNIT)))
7120 inform (input_location, "parameter passing for argument of type "
7121 "%qT changed in GCC 13.1", type);
7123 if (warn_pcs_change_le_gcc14
7124 && abi_break_gcc_14
7125 && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
7126 != (alignment >= 16 * BITS_PER_UNIT)))
7127 inform (input_location, "parameter passing for argument of type "
7128 "%qT changed in GCC 14.1", type);
7130 if (alignment == 16 * BITS_PER_UNIT)
7132 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7133 if (pcum->aapcs_stack_size != new_size)
7135 if (warn_pcs_change_le_gcc14
7136 && abi_break_gcc_9)
7137 inform (input_location, "parameter passing for argument of type "
7138 "%qT changed in GCC 9.1", type);
7139 pcum->aapcs_stack_size = new_size;
7142 return;
7145 /* Add the current argument register to the set of those that need
7146 to be saved and restored around a change to PSTATE.SM. */
7148 static void
7149 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7151 subrtx_var_iterator::array_type array;
7152 FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
7154 rtx x = *iter;
7155 if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
7157 unsigned int i = pcum->num_sme_mode_switch_args++;
7158 gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
7159 pcum->sme_mode_switch_args[i] = x;
7164 /* Return a parallel that contains all the registers that need to be
7165 saved around a change to PSTATE.SM. Return const0_rtx if there is
7166 no such mode switch, or if no registers need to be saved. */
7168 static rtx
7169 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7171 if (!pcum->num_sme_mode_switch_args)
7172 return const0_rtx;
7174 auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
7175 pcum->sme_mode_switch_args);
7176 return gen_rtx_PARALLEL (VOIDmode, argvec);
7179 /* Implement TARGET_FUNCTION_ARG. */
7181 static rtx
7182 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7184 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7185 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7186 || pcum->pcs_variant == ARM_PCS_SIMD
7187 || pcum->pcs_variant == ARM_PCS_SVE);
7189 if (arg.end_marker_p ())
7191 rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
7192 pcum->pcs_variant);
7193 rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
7194 rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
7195 rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
7196 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
7197 sme_mode_switch_args,
7198 shared_za_flags,
7199 shared_zt0_flags));
7202 aarch64_layout_arg (pcum_v, arg);
7203 return pcum->aapcs_reg;
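/* Initialize PCUM for a call to a function of type FNTYPE (or to FNDECL,
   or to the libcall LIBNAME): reset the argument-register counters,
   take the PCS variant and ISA mode from FNTYPE where available, and,
   unless SILENT_P, diagnose FP/SIMD or SVE argument passing when the
   required ISA support is not enabled.  */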
7206 void
7207 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7208 const_tree fntype,
7209 rtx libname ATTRIBUTE_UNUSED,
7210 const_tree fndecl,
7211 unsigned n_named ATTRIBUTE_UNUSED,
7212 bool silent_p)
7214 pcum->aapcs_ncrn = 0;
7215 pcum->aapcs_nvrn = 0;
7216 pcum->aapcs_nprn = 0;
7217 pcum->aapcs_nextncrn = 0;
7218 pcum->aapcs_nextnvrn = 0;
7219 pcum->aapcs_nextnprn = 0;
7220 if (fntype)
7222 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7223 pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
7225 else
7227 pcum->pcs_variant = ARM_PCS_AAPCS64;
7228 pcum->isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
7230 pcum->aapcs_reg = NULL_RTX;
7231 pcum->aapcs_arg_processed = false;
7232 pcum->aapcs_stack_words = 0;
7233 pcum->aapcs_stack_size = 0;
7234 pcum->silent_p = silent_p;
7235 pcum->shared_za_flags
7236 = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
7237 pcum->shared_zt0_flags
7238 = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
7239 pcum->num_sme_mode_switch_args = 0;
7241 if (!silent_p
7242 && !TARGET_FLOAT
7243 && fntype && fntype != error_mark_node)
7245 const_tree type = TREE_TYPE (fntype);
7246 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7247 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7248 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7249 &mode, &nregs, NULL, false))
7250 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7253 if (!silent_p
7254 && !TARGET_SVE
7255 && pcum->pcs_variant == ARM_PCS_SVE)
7257 /* We can't gracefully recover at this point, so make this a
7258 fatal error. */
7259 if (fndecl)
7260 fatal_error (input_location, "%qE requires the SVE ISA extension",
7261 fndecl);
7262 else
7263 fatal_error (input_location, "calls to functions of type %qT require"
7264 " the SVE ISA extension", fntype);
7268 static void
7269 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7270 const function_arg_info &arg)
7272 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7273 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7274 || pcum->pcs_variant == ARM_PCS_SIMD
7275 || pcum->pcs_variant == ARM_PCS_SVE)
7277 aarch64_layout_arg (pcum_v, arg);
7278 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7279 != (pcum->aapcs_stack_words != 0));
7280 if (pcum->aapcs_reg
7281 && aarch64_call_switches_pstate_sm (pcum->isa_mode))
7282 aarch64_record_sme_mode_switch_args (pcum);
7284 pcum->aapcs_arg_processed = false;
7285 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7286 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7287 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7288 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7289 pcum->aapcs_stack_words = 0;
7290 pcum->aapcs_reg = NULL_RTX;
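/* Return true if REGNO is used for passing arguments: one of the first
   NUM_ARG_REGS general registers, the first NUM_FP_ARG_REGS FP/SIMD
   registers, or the first NUM_PR_ARG_REGS predicate registers.  */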
7294 bool
7295 aarch64_function_arg_regno_p (unsigned regno)
7297 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7298 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7299 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7302 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7303 PARM_BOUNDARY bits of alignment, but will be given anything up
7304 to STACK_BOUNDARY bits if the type requires it. This makes sure
7305 that both before and after the layout of each argument, the Next
7306 Stacked Argument Address (NSAA) will have a minimum alignment of
7307 8 bytes. */
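/* For example, assuming the usual AArch64 values of PARM_BOUNDARY (64)
   and STACK_BOUNDARY (128): a 4-byte-aligned int is raised to 64 bits,
   while a 32-byte over-aligned struct is capped at 128 bits, so no
   stack argument ever needs more than 16-byte alignment.  */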
7309 static unsigned int
7310 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7312 unsigned int abi_break_gcc_9;
7313 unsigned int abi_break_gcc_13;
7314 unsigned int abi_break_gcc_14;
7315 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7316 &abi_break_gcc_9,
7317 &abi_break_gcc_13,
7318 &abi_break_gcc_14);
7319 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7320 to emit warnings about ABI incompatibility. */
7321 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7322 return alignment;
7325 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7327 static fixed_size_mode
7328 aarch64_get_reg_raw_mode (int regno)
7330 /* Don't use any non-GP registers for __builtin_apply and
7331 __builtin_return if general registers only mode is requested. */
7332 if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
7333 return as_a <fixed_size_mode> (VOIDmode);
7334 if (TARGET_SVE && FP_REGNUM_P (regno))
7335 /* Don't use the SVE part of the register for __builtin_apply and
7336 __builtin_return. The SVE registers aren't used by the normal PCS,
7337 so using them there would be a waste of time. The PCS extensions
7338 for SVE types are fundamentally incompatible with the
7339 __builtin_return/__builtin_apply interface. */
7340 return as_a <fixed_size_mode> (V16QImode);
7341 if (PR_REGNUM_P (regno))
7342 /* For SVE PR regs, indicate that they should be ignored for
7343 __builtin_apply/__builtin_return. */
7344 return as_a <fixed_size_mode> (VOIDmode);
7345 return default_get_reg_raw_mode (regno);
7348 /* Implement TARGET_FUNCTION_ARG_PADDING.
7350 Small aggregate types are placed at the lowest memory address.
7352 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7354 static pad_direction
7355 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7357 /* On little-endian targets, the least significant byte of every stack
7358 argument is passed at the lowest byte address of the stack slot. */
7359 if (!BYTES_BIG_ENDIAN)
7360 return PAD_UPWARD;
7362 /* Otherwise, integral, floating-point and pointer types are padded downward:
7363 the least significant byte of a stack argument is passed at the highest
7364 byte address of the stack slot. */
7365 if (type
7366 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7367 || POINTER_TYPE_P (type))
7368 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7369 return PAD_DOWNWARD;
7371 /* Everything else padded upward, i.e. data in first byte of stack slot. */
7372 return PAD_UPWARD;
7375 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7377 It specifies padding for the last (may also be the only)
7378 element of a block move between registers and memory. Assuming
7379 the block is in memory, padding upward means that the last
7380 element is padded after its most significant byte, while with
7381 downward padding the last element is padded on its least
7382 significant byte side.
7384 Small aggregates and small complex types are always padded
7385 upwards.
7387 We don't need to worry about homogeneous floating-point or
7388 short-vector aggregates; their move is not affected by the
7389 padding direction determined here. Regardless of endianness,
7390 each element of such an aggregate is put in the least
7391 significant bits of an fp/simd register.
7393 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7394 register has useful data, and return the opposite if the most
7395 significant byte does. */
7397 bool
7398 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7399 bool first ATTRIBUTE_UNUSED)
7402 /* Aside from pure scalable types, small composite types are always
7403 padded upward. */
7404 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7406 HOST_WIDE_INT size;
7407 if (type)
7408 size = int_size_in_bytes (type);
7409 else
7410 /* No frontends can create types with variable-sized modes, so we
7411 shouldn't be asked to pass or return them. */
7412 size = GET_MODE_SIZE (mode).to_constant ();
7413 if (size < 2 * UNITS_PER_WORD)
7415 pure_scalable_type_info pst_info;
7416 if (pst_info.analyze_registers (type))
7417 return false;
7418 return true;
7422 /* Otherwise, use the default padding. */
7423 return !BYTES_BIG_ENDIAN;
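/* Implement TARGET_LIBGCC_CMP_RETURN_MODE: libgcc comparison routines
   return their result in SImode.  */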
7426 static scalar_int_mode
7427 aarch64_libgcc_cmp_return_mode (void)
7429 return SImode;
7432 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7434 /* We use the 12-bit shifted immediate arithmetic instructions so values
7435 must be a multiple of (1 << 12), i.e. 4096. */
7436 #define ARITH_FACTOR 4096
7438 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7439 #error Cannot use simple address calculation for stack probing
7440 #endif
7442 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7443 inclusive. These are offsets from the current stack pointer. */
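/* An illustrative summary of the strategies below, assuming the default
   PROBE_INTERVAL of 4096 bytes (the exact registers and offsets are
   chosen by the code itself):

     - SIZE <= 4096: a single probe at SP - (FIRST + SIZE).
     - SIZE <= 16384: an unrolled sequence of one probe per 4096-byte
       interval, plus a probe for any residual.
     - larger constant sizes: the probe_stack_range loop, which walks a
       temporary pointer down in 4096-byte steps.

   Variable (SVE-sized) allocations are not handled here and trigger a
   sorry () instead.  */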
7445 static void
7446 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7448 HOST_WIDE_INT size;
7449 if (!poly_size.is_constant (&size))
7451 sorry ("stack probes for SVE frames");
7452 return;
7455 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7457 /* See the same assertion on PROBE_INTERVAL above. */
7458 gcc_assert ((first % ARITH_FACTOR) == 0);
7460 /* See if we have a constant small number of probes to generate. If so,
7461 that's the easy case. */
7462 if (size <= PROBE_INTERVAL)
7464 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7466 emit_set_insn (reg1,
7467 plus_constant (Pmode,
7468 stack_pointer_rtx, -(first + base)));
7469 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7472 /* The run-time loop is made up of 8 insns in the generic case while the
7473 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
7474 else if (size <= 4 * PROBE_INTERVAL)
7476 HOST_WIDE_INT i, rem;
7478 emit_set_insn (reg1,
7479 plus_constant (Pmode,
7480 stack_pointer_rtx,
7481 -(first + PROBE_INTERVAL)));
7482 emit_stack_probe (reg1);
7484 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7485 it exceeds SIZE. If only two probes are needed, this will not
7486 generate any code. Then probe at FIRST + SIZE. */
7487 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7489 emit_set_insn (reg1,
7490 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7491 emit_stack_probe (reg1);
7494 rem = size - (i - PROBE_INTERVAL);
7495 if (rem > 256)
7497 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7499 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7500 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7502 else
7503 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7506 /* Otherwise, do the same as above, but in a loop. Note that we must be
7507 extra careful with variables wrapping around because we might be at
7508 the very top (or the very bottom) of the address space and we have
7509 to be able to handle this case properly; in particular, we use an
7510 equality test for the loop condition. */
7511 else
7513 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7515 /* Step 1: round SIZE to the previous multiple of the interval. */
7517 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7520 /* Step 2: compute initial and final value of the loop counter. */
7522 /* TEST_ADDR = SP + FIRST. */
7523 emit_set_insn (reg1,
7524 plus_constant (Pmode, stack_pointer_rtx, -first));
7526 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7527 HOST_WIDE_INT adjustment = - (first + rounded_size);
7528 if (! aarch64_uimm12_shift (adjustment))
7530 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7531 true, Pmode);
7532 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7534 else
7535 emit_set_insn (reg2,
7536 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7538 /* Step 3: the loop
7540 do
7541 {
7542 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7543 probe at TEST_ADDR
7544 }
7545 while (TEST_ADDR != LAST_ADDR)
7547 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7548 until it is equal to ROUNDED_SIZE. */
7550 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7553 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7554 that SIZE is equal to ROUNDED_SIZE. */
7556 if (size != rounded_size)
7558 HOST_WIDE_INT rem = size - rounded_size;
7560 if (rem > 256)
7562 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7564 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7565 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7567 else
7568 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7572 /* Make sure nothing is scheduled before we are done. */
7573 emit_insn (gen_blockage ());
7576 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7577 absolute addresses. */
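/* The emitted loop looks roughly like this (register numbers and the
   probe offset are illustrative only):

	.LPSRL0:
	sub	x9, x9, 4096		// TEST_ADDR -= PROBE_INTERVAL
	str	xzr, [x9, 0]		// probe
	cmp	x9, x10			// reached LAST_ADDR?
	b.ne	.LPSRL0

   With -fstack-clash-protection the step is the guard size and the
   probe is written STACK_CLASH_CALLER_GUARD bytes above TEST_ADDR
   rather than at offset 0.  */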
7579 const char *
7580 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7582 static int labelno = 0;
7583 char loop_lab[32];
7584 rtx xops[2];
7586 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7588 /* Loop. */
7589 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7591 HOST_WIDE_INT stack_clash_probe_interval
7592 = 1 << param_stack_clash_protection_guard_size;
7594 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7595 xops[0] = reg1;
7596 HOST_WIDE_INT interval;
7597 if (flag_stack_clash_protection)
7598 interval = stack_clash_probe_interval;
7599 else
7600 interval = PROBE_INTERVAL;
7602 gcc_assert (aarch64_uimm12_shift (interval));
7603 xops[1] = GEN_INT (interval);
7605 output_asm_insn ("sub\t%0, %0, %1", xops);
7607 /* If doing stack clash protection then we probe up by the ABI-specified
7608 amount. We do this because we're dropping full pages at a time in the
7609 loop. But for non-stack-clash probing, probe at offset 0 from SP. */
7610 if (flag_stack_clash_protection)
7611 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7612 else
7613 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7615 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7616 by this amount for each iteration. */
7617 output_asm_insn ("str\txzr, [%0, %1]", xops);
7619 /* Test if TEST_ADDR == LAST_ADDR. */
7620 xops[1] = reg2;
7621 output_asm_insn ("cmp\t%0, %1", xops);
7623 /* Branch. */
7624 fputs ("\tb.ne\t", asm_out_file);
7625 assemble_name_raw (asm_out_file, loop_lab);
7626 fputc ('\n', asm_out_file);
7628 return "";
7631 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7632 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7633 of GUARD_SIZE. When a probe is emitted it is done at most
7634 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7635 at most MIN_PROBE_THRESHOLD. By the end of this function
7636 BASE = BASE - ADJUSTMENT. */
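/* A sketch of the sequence emitted below, with illustrative register
   names standing in for BASE and ADJUSTMENT:

	.SVLPSPL0:
	cmp	x11, RESIDUAL_PROBE_GUARD
	b.lt	.SVLPEND0
	sub	x10, x10, RESIDUAL_PROBE_GUARD
	str	xzr, [x10, 0]
	sub	x11, x11, RESIDUAL_PROBE_GUARD
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11	// BASE -= remaining ADJUSTMENT  */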
7638 const char *
7639 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7640 rtx min_probe_threshold, rtx guard_size)
7642 /* This function is not allowed to use any instruction generation function
7643 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7644 so instead emit the code you want using output_asm_insn. */
7645 gcc_assert (flag_stack_clash_protection);
7646 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7647 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7649 /* The minimum required allocation before the residual requires probing. */
7650 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7652 /* Clamp the value down to the nearest value that can be used with a cmp. */
7653 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7654 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7656 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7657 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7659 static int labelno = 0;
7660 char loop_start_lab[32];
7661 char loop_end_lab[32];
7662 rtx xops[2];
7664 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7665 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7667 /* Emit loop start label. */
7668 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7670 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7671 xops[0] = adjustment;
7672 xops[1] = probe_offset_value_rtx;
7673 output_asm_insn ("cmp\t%0, %1", xops);
7675 /* Branch to end if not enough adjustment to probe. */
7676 fputs ("\tb.lt\t", asm_out_file);
7677 assemble_name_raw (asm_out_file, loop_end_lab);
7678 fputc ('\n', asm_out_file);
7680 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7681 xops[0] = base;
7682 xops[1] = probe_offset_value_rtx;
7683 output_asm_insn ("sub\t%0, %0, %1", xops);
7685 /* Probe at BASE. */
7686 xops[1] = const0_rtx;
7687 output_asm_insn ("str\txzr, [%0, %1]", xops);
7689 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7690 xops[0] = adjustment;
7691 xops[1] = probe_offset_value_rtx;
7692 output_asm_insn ("sub\t%0, %0, %1", xops);
7694 /* Branch to start if still more bytes to allocate. */
7695 fputs ("\tb\t", asm_out_file);
7696 assemble_name_raw (asm_out_file, loop_start_lab);
7697 fputc ('\n', asm_out_file);
7699 /* No probe needed for the remaining adjustment; leave the loop. */
7700 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7702 /* BASE = BASE - ADJUSTMENT. */
7703 xops[0] = base;
7704 xops[1] = adjustment;
7705 output_asm_insn ("sub\t%0, %0, %1", xops);
7706 return "";
7709 /* Determine whether a frame chain needs to be generated. */
7710 static bool
7711 aarch64_needs_frame_chain (void)
7713 if (frame_pointer_needed)
7714 return true;
7716 /* A leaf function cannot have calls or write LR. */
7717 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7719 /* Don't use a frame chain in leaf functions if leaf frame pointers
7720 are disabled. */
7721 if (flag_omit_leaf_frame_pointer && is_leaf)
7722 return false;
7724 return aarch64_use_frame_pointer;
7727 /* Return true if the current function should save registers above
7728 the locals area, rather than below it. */
7730 static bool
7731 aarch64_save_regs_above_locals_p ()
7733 /* When using stack smashing protection, make sure that the canary slot
7734 comes between the locals and the saved registers. Otherwise,
7735 it would be possible for a carefully sized smash attack to change
7736 the saved registers (particularly LR and FP) without reaching the
7737 canary. */
7738 return crtl->stack_protect_guard;
7741 /* Return true if the current function needs to record the incoming
7742 value of PSTATE.SM. */
7743 static bool
7744 aarch64_need_old_pstate_sm ()
7746 /* Exit early if the incoming value of PSTATE.SM is known at
7747 compile time. */
7748 if (aarch64_cfun_incoming_pstate_sm () != 0)
7749 return false;
7751 if (aarch64_cfun_enables_pstate_sm ())
7752 return true;
7754 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
7755 but the function needs to return with PSTATE.SM unchanged. */
7756 if (nonlocal_goto_handler_labels)
7757 return true;
7759 /* Likewise for exception handlers. */
7760 eh_landing_pad lp;
7761 for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
7762 if (lp && lp->post_landing_pad)
7763 return true;
7765 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
7766 streaming-compatible functions without SME being available, so PSTATE.SM
7767 should only be changed if it is currently set to one. */
7768 if (crtl->has_nonlocal_goto)
7769 return true;
7771 if (cfun->machine->call_switches_pstate_sm)
7772 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
7773 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
7774 if (!SIBLING_CALL_P (call))
7776 /* Return true if there is a call to a non-streaming-compatible
7777 function. */
7778 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
7779 if (aarch64_call_switches_pstate_sm (callee_isa_mode))
7780 return true;
7782 return false;
7785 /* Mark the registers that need to be saved by the callee and calculate
7786 the size of the callee-saved registers area and frame record (both FP
7787 and LR may be omitted). */
7788 static void
7789 aarch64_layout_frame (void)
7791 unsigned regno, last_fp_reg = INVALID_REGNUM;
7792 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7793 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7794 bool frame_related_fp_reg_p = false;
7795 aarch64_frame &frame = cfun->machine->frame;
7796 poly_int64 top_of_locals = -1;
7797 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
7799 vec_safe_truncate (frame.saved_gprs, 0);
7800 vec_safe_truncate (frame.saved_fprs, 0);
7801 vec_safe_truncate (frame.saved_prs, 0);
7803 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7805 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7806 the mid-end is doing. */
7807 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7809 #define SLOT_NOT_REQUIRED (-2)
7810 #define SLOT_REQUIRED (-1)
7812 frame.wb_push_candidate1 = INVALID_REGNUM;
7813 frame.wb_push_candidate2 = INVALID_REGNUM;
7814 frame.spare_pred_reg = INVALID_REGNUM;
7816 /* First mark all the registers that really need to be saved... */
7817 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7818 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
7819 frame.old_svcr_offset = SLOT_NOT_REQUIRED;
7821 /* ... that includes the eh data registers (if needed)... */
7822 if (crtl->calls_eh_return)
7823 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
7824 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
7826 /* ... and any callee saved register that dataflow says is live. */
7827 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7828 if (df_regs_ever_live_p (regno)
7829 && !fixed_regs[regno]
7830 && (regno == R30_REGNUM
7831 || !crtl->abi->clobbers_full_reg_p (regno)))
7832 frame.reg_offset[regno] = SLOT_REQUIRED;
7834 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7835 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7836 && !fixed_regs[regno]
7837 && !crtl->abi->clobbers_full_reg_p (regno))
7839 frame.reg_offset[regno] = SLOT_REQUIRED;
7840 last_fp_reg = regno;
7841 if (aarch64_emit_cfi_for_reg_p (regno))
7842 frame_related_fp_reg_p = true;
7845 /* Big-endian SVE frames need a spare predicate register in order
7846 to save Z8-Z15. Decide which register they should use. Prefer
7847 an unused argument register if possible, so that we don't force P4
7848 to be saved unnecessarily. */
7849 if (frame_related_fp_reg_p
7850 && crtl->abi->id () == ARM_PCS_SVE
7851 && BYTES_BIG_ENDIAN)
7853 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7854 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7855 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7856 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7857 break;
7858 gcc_assert (regno <= P7_REGNUM);
7859 frame.spare_pred_reg = regno;
7860 df_set_regs_ever_live (regno, true);
7863 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7864 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7865 && !fixed_regs[regno]
7866 && !crtl->abi->clobbers_full_reg_p (regno))
7867 frame.reg_offset[regno] = SLOT_REQUIRED;
7869 bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
7871 poly_int64 offset = crtl->outgoing_args_size;
7872 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
7873 if (regs_at_top_p)
7875 offset += get_frame_size ();
7876 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7877 top_of_locals = offset;
7879 frame.bytes_below_saved_regs = offset;
7880 frame.sve_save_and_probe = INVALID_REGNUM;
7882 /* Now assign stack slots for the registers. Start with the predicate
7883 registers, since predicate LDR and STR have a relatively small
7884 offset range. These saves happen below the hard frame pointer. */
7885 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7886 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7888 vec_safe_push (frame.saved_prs, regno);
7889 if (frame.sve_save_and_probe == INVALID_REGNUM)
7890 frame.sve_save_and_probe = regno;
7891 frame.reg_offset[regno] = offset;
7892 offset += BYTES_PER_SVE_PRED;
7895 poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
7896 if (maybe_ne (saved_prs_size, 0))
7898 /* If we have any vector registers to save above the predicate registers,
7899 the offset of the vector register save slots needs to be a multiple
7900 of the vector size. This lets us use the immediate forms of LDR/STR
7901 (or LD1/ST1 for big-endian).
7903 A vector register is 8 times the size of a predicate register,
7904 and we need to save a maximum of 12 predicate registers, so the
7905 first vector register will be at either #1, MUL VL or #2, MUL VL.
7907 If we don't have any vector registers to save, and we know how
7908 big the predicate save area is, we can just round it up to the
7909 next 16-byte boundary. */
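/* A worked example: a predicate save slot is one eighth of a vector in
   size, so saving all 12 callee-saved predicate registers gives
   saved_prs_size == 1.5 * vector_save_size.  That is more than one but
   no more than two vector lengths, so the first vector save slot lands
   at #2, MUL VL from the bottom of the save area.  */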
7910 if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
7911 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7912 else
7914 if (known_le (saved_prs_size, vector_save_size))
7915 offset = frame.bytes_below_saved_regs + vector_save_size;
7916 else if (known_le (saved_prs_size, vector_save_size * 2))
7917 offset = frame.bytes_below_saved_regs + vector_save_size * 2;
7918 else
7919 gcc_unreachable ();
7923 /* If we need to save any SVE vector registers, add them next. */
7924 if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7925 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7926 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7928 vec_safe_push (frame.saved_fprs, regno);
7929 if (frame.sve_save_and_probe == INVALID_REGNUM)
7930 frame.sve_save_and_probe = regno;
7931 frame.reg_offset[regno] = offset;
7932 offset += vector_save_size;
7935 /* OFFSET is now the offset of the hard frame pointer from the bottom
7936 of the callee save area. */
7937 auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
7938 bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
7939 gcc_assert (!saves_below_hard_fp_p
7940 || (frame.sve_save_and_probe != INVALID_REGNUM
7941 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
7942 frame.bytes_below_saved_regs)));
7944 frame.bytes_below_hard_fp = offset;
7945 frame.hard_fp_save_and_probe = INVALID_REGNUM;
7947 auto allocate_gpr_slot = [&](unsigned int regno)
7949 vec_safe_push (frame.saved_gprs, regno);
7950 frame.reg_offset[regno] = offset;
7951 offset += UNITS_PER_WORD;
7954 if (frame.emit_frame_chain)
7956 /* FP and LR are placed in the linkage record. */
7957 allocate_gpr_slot (R29_REGNUM);
7958 allocate_gpr_slot (R30_REGNUM);
7960 else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
7961 && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
7962 /* Put the LR save slot first, since it makes a good choice of probe
7963 for stack clash purposes. The idea is that the link register usually
7964 has to be saved before a call anyway, and so we lose little by
7965 stopping it from being individually shrink-wrapped. */
7966 allocate_gpr_slot (R30_REGNUM);
7968 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7969 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7970 allocate_gpr_slot (regno);
7972 if (aarch64_need_old_pstate_sm ())
7974 frame.old_svcr_offset = offset;
7975 offset += UNITS_PER_WORD;
7978 /* If the current function changes the SVE vector length, ensure that the
7979 old value of the DWARF VG register is saved and available in the CFI,
7980 so that outer frames with VL-sized offsets can be processed correctly. */
7981 if (cfun->machine->call_switches_pstate_sm
7982 || aarch64_cfun_enables_pstate_sm ())
7984 frame.reg_offset[VG_REGNUM] = offset;
7985 offset += UNITS_PER_WORD;
7988 poly_int64 max_int_offset = offset;
7989 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7990 bool has_align_gap = maybe_ne (offset, max_int_offset);
7992 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7993 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7995 vec_safe_push (frame.saved_fprs, regno);
7996 /* If there is an alignment gap between integer and fp callee-saves,
7997 allocate the last fp register to it if possible. */
7998 if (regno == last_fp_reg
7999 && has_align_gap
8000 && known_eq (vector_save_size, 8)
8001 && multiple_p (offset, 16))
8003 frame.reg_offset[regno] = max_int_offset;
8004 break;
8007 frame.reg_offset[regno] = offset;
8008 offset += vector_save_size;
8011 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8012 auto saved_regs_size = offset - frame.bytes_below_saved_regs;
8014 array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
8015 ? frame.saved_gprs
8016 : frame.saved_fprs);
8017 if (!push_regs.empty ()
8018 && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
8020 frame.hard_fp_save_and_probe = push_regs[0];
8021 frame.wb_push_candidate1 = push_regs[0];
8022 if (push_regs.size () > 1)
8023 frame.wb_push_candidate2 = push_regs[1];
8026 /* With stack-clash, a register must be saved in non-leaf functions.
8027 The saving of the bottommost register counts as an implicit probe,
8028 which allows us to maintain the invariant described in the comment
8029 at expand_prologue. */
8030 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
8032 if (!regs_at_top_p)
8034 offset += get_frame_size ();
8035 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8036 top_of_locals = offset;
8038 offset += frame.saved_varargs_size;
8039 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8040 frame.frame_size = offset;
8042 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
8043 gcc_assert (known_ge (top_of_locals, 0));
8044 frame.bytes_above_locals = frame.frame_size - top_of_locals;
8046 frame.initial_adjust = 0;
8047 frame.final_adjust = 0;
8048 frame.callee_adjust = 0;
8049 frame.sve_callee_adjust = 0;
8051 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8052 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8054 /* The shadow call stack is only used for functions that push LR onto
8055 the stack and that do not specify the "no_sanitize" attribute with
8056 the argument "shadow-call-stack". */
8057 frame.is_scs_enabled
8058 = (!crtl->calls_eh_return
8059 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8060 && known_ge (frame.reg_offset[LR_REGNUM], 0));
8062 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8063 restore x30, and we don't need to pop x30 again in the traditional
8064 way. Pop candidates record the registers that need to be popped
8065 eventually. */
8066 if (frame.is_scs_enabled)
8068 if (frame.wb_pop_candidate2 == R30_REGNUM)
8069 frame.wb_pop_candidate2 = INVALID_REGNUM;
8070 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8071 frame.wb_pop_candidate1 = INVALID_REGNUM;
8074 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8075 256 to ensure that the offset meets the requirements of emit_move_insn.
8076 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8077 max_push_offset to 0, because no registers are popped at this time,
8078 so callee_adjust cannot be adjusted. */
8079 HOST_WIDE_INT max_push_offset = 0;
8080 if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8082 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8083 max_push_offset = 512;
8084 else
8085 max_push_offset = 256;
8088 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
8089 HOST_WIDE_INT const_saved_regs_size;
8090 if (known_eq (saved_regs_size, 0))
8091 frame.initial_adjust = frame.frame_size;
8092 else if (frame.frame_size.is_constant (&const_size)
8093 && const_size < max_push_offset
8094 && known_eq (frame.bytes_above_hard_fp, const_size))
8096 /* Simple, small frame with no data below the saved registers.
8098 stp reg1, reg2, [sp, -frame_size]!
8099 stp reg3, reg4, [sp, 16] */
8100 frame.callee_adjust = const_size;
8102 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
8103 && saved_regs_size.is_constant (&const_saved_regs_size)
8104 && const_below_saved_regs + const_saved_regs_size < 512
8105 /* We could handle this case even with data below the saved
8106 registers, provided that that data left us with valid offsets
8107 for all predicate and vector save slots. It's such a rare
8108 case that it hardly seems worth the effort though. */
8109 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8110 && !(cfun->calls_alloca
8111 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8112 && const_above_fp < max_push_offset))
8114 /* Frame with small area below the saved registers:
8116 sub sp, sp, frame_size
8117 stp reg1, reg2, [sp, bytes_below_saved_regs]
8118 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8119 frame.initial_adjust = frame.frame_size;
8121 else if (saves_below_hard_fp_p
8122 && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8124 /* Frame in which all saves are SVE saves:
8126 sub sp, sp, frame_size - bytes_below_saved_regs
8127 save SVE registers relative to SP
8128 sub sp, sp, bytes_below_saved_regs */
8129 frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8130 frame.final_adjust = frame.bytes_below_saved_regs;
8132 else if (frame.wb_push_candidate1 != INVALID_REGNUM
8133 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8134 && const_above_fp < max_push_offset)
8136 /* Frame with large area below the saved registers, or with SVE saves,
8137 but with a small area above:
8139 stp reg1, reg2, [sp, -hard_fp_offset]!
8140 stp reg3, reg4, [sp, 16]
8141 [sub sp, sp, below_hard_fp_saved_regs_size]
8142 [save SVE registers relative to SP]
8143 sub sp, sp, bytes_below_saved_regs */
8144 frame.callee_adjust = const_above_fp;
8145 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8146 frame.final_adjust = frame.bytes_below_saved_regs;
8148 else
8150 /* General case:
8152 sub sp, sp, hard_fp_offset
8153 stp x29, x30, [sp, 0]
8154 add x29, sp, 0
8155 stp reg3, reg4, [sp, 16]
8156 [sub sp, sp, below_hard_fp_saved_regs_size]
8157 [save SVE registers relative to SP]
8158 sub sp, sp, bytes_below_saved_regs */
8159 frame.initial_adjust = frame.bytes_above_hard_fp;
8160 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8161 frame.final_adjust = frame.bytes_below_saved_regs;
8164 /* The frame is allocated in pieces, with each non-final piece
8165 including a register save at offset 0 that acts as a probe for
8166 the following piece. In addition, the save of the bottommost register
8167 acts as a probe for callees and allocas. Roll back any probes that
8168 aren't needed.
8170 A probe isn't needed if it is associated with the final allocation
8171 (including callees and allocas) that happens before the epilogue is
8172 executed. */
8173 if (crtl->is_leaf
8174 && !cfun->calls_alloca
8175 && known_eq (frame.final_adjust, 0))
8177 if (maybe_ne (frame.sve_callee_adjust, 0))
8178 frame.sve_save_and_probe = INVALID_REGNUM;
8179 else
8180 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8183 /* Make sure the individual adjustments add up to the full frame size. */
8184 gcc_assert (known_eq (frame.initial_adjust
8185 + frame.callee_adjust
8186 + frame.sve_callee_adjust
8187 + frame.final_adjust, frame.frame_size));
8189 if (frame.callee_adjust == 0)
8191 /* We've decided not to do a "real" push and pop. However,
8192 setting up the frame chain is treated as being essentially
8193 a multi-instruction push. */
8194 frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
8195 if (!frame.emit_frame_chain)
8196 frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
8199 frame.laid_out = true;
8202 /* Return true if the register REGNO is saved on entry to
8203 the current function. */
8205 static bool
8206 aarch64_register_saved_on_entry (int regno)
8208 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8211 /* Push the register number REGNO of mode MODE to the stack with write-back
8212 adjusting the stack by ADJUSTMENT. */
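/* For example, pushing x30 with an ADJUSTMENT of 16 corresponds to the
   single pre-indexed store  str x30, [sp, -16]!  (illustrative).  */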
8214 static void
8215 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8216 HOST_WIDE_INT adjustment)
8218 rtx base_rtx = stack_pointer_rtx;
8219 rtx insn, reg, mem;
8221 reg = gen_rtx_REG (mode, regno);
8222 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8223 plus_constant (Pmode, base_rtx, -adjustment));
8224 mem = gen_frame_mem (mode, mem);
8226 insn = emit_move_insn (mem, reg);
8227 RTX_FRAME_RELATED_P (insn) = 1;
8230 /* Generate and return an instruction to store the pair of registers
8231 REG and REG2 of mode MODE to location BASE with write-back adjusting
8232 the stack location BASE by ADJUSTMENT. */
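/* The parallel built here corresponds to a pre-indexed store pair such
   as  stp x29, x30, [sp, -ADJUSTMENT]!  (registers illustrative): the
   first SET updates BASE and the other two store REG and REG2.  */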
8234 static rtx
8235 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8236 HOST_WIDE_INT adjustment)
8238 rtx new_base = plus_constant (Pmode, base, -adjustment);
8239 rtx mem = gen_frame_mem (mode, new_base);
8240 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8242 return gen_rtx_PARALLEL (VOIDmode,
8243 gen_rtvec (3,
8244 gen_rtx_SET (base, new_base),
8245 gen_rtx_SET (mem, reg),
8246 gen_rtx_SET (mem2, reg2)));
8249 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8250 stack pointer by ADJUSTMENT. */
8252 static void
8253 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8255 rtx_insn *insn;
8256 machine_mode mode = aarch64_reg_save_mode (regno1);
8258 if (regno2 == INVALID_REGNUM)
8259 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8261 rtx reg1 = gen_rtx_REG (mode, regno1);
8262 rtx reg2 = gen_rtx_REG (mode, regno2);
8264 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8265 reg2, adjustment));
8266 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8267 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8268 RTX_FRAME_RELATED_P (insn) = 1;
8271 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8272 adjusting it by ADJUSTMENT afterwards. */
8274 static rtx
8275 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8276 HOST_WIDE_INT adjustment)
8278 rtx mem = gen_frame_mem (mode, base);
8279 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8280 rtx new_base = plus_constant (Pmode, base, adjustment);
8282 return gen_rtx_PARALLEL (VOIDmode,
8283 gen_rtvec (3,
8284 gen_rtx_SET (base, new_base),
8285 gen_rtx_SET (reg, mem),
8286 gen_rtx_SET (reg2, mem2)));
8289 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8290 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8291 into CFI_OPS. */
8293 static void
8294 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8295 rtx *cfi_ops)
8297 machine_mode mode = aarch64_reg_save_mode (regno1);
8298 rtx reg1 = gen_rtx_REG (mode, regno1);
8300 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8302 if (regno2 == INVALID_REGNUM)
8304 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8305 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8306 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8308 else
8310 rtx reg2 = gen_rtx_REG (mode, regno2);
8311 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8312 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8313 reg2, adjustment));
8317 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8318 for a mem rtx representing the entire pair. */
8320 static machine_mode
8321 aarch64_pair_mode_for_mode (machine_mode mode)
8323 if (known_eq (GET_MODE_SIZE (mode), 4))
8324 return V2x4QImode;
8325 else if (known_eq (GET_MODE_SIZE (mode), 8))
8326 return V2x8QImode;
8327 else if (known_eq (GET_MODE_SIZE (mode), 16))
8328 return V2x16QImode;
8329 else
8330 gcc_unreachable ();
8333 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8334 operand, return an rtx like MEM which instead represents the entire pair. */
8336 static rtx
8337 aarch64_pair_mem_from_base (rtx mem)
8339 auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
8340 mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
8341 gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
8342 return mem;
8345 /* Generate and return a store pair instruction to store REG1 and REG2
8346 into memory starting at BASE_MEM. All three rtxes should have modes of the
8347 same size. */
8349 rtx
8350 aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
8352 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8354 return gen_rtx_SET (pair_mem,
8355 gen_rtx_UNSPEC (GET_MODE (pair_mem),
8356 gen_rtvec (2, reg1, reg2),
8357 UNSPEC_STP));
8360 /* Generate and return a load pair instruction to load a pair of
8361 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8362 UNKNOWN, all three rtxes should have modes of the same size.
8363 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8364 and REG{1,2} should be in DImode. */
8366 rtx
8367 aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
8369 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8371 const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
8372 if (any_extend_p)
8373 gcc_checking_assert (GET_MODE (base_mem) == SImode
8374 && GET_MODE (reg1) == DImode
8375 && GET_MODE (reg2) == DImode);
8376 else
8377 gcc_assert (code == UNKNOWN);
8379 rtx unspecs[2] = {
8380 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
8381 gen_rtvec (1, pair_mem),
8382 UNSPEC_LDP_FST),
8383 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
8384 gen_rtvec (1, copy_rtx (pair_mem)),
8385 UNSPEC_LDP_SND)
8388 if (any_extend_p)
8389 for (int i = 0; i < 2; i++)
8390 unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
8392 return gen_rtx_PARALLEL (VOIDmode,
8393 gen_rtvec (2,
8394 gen_rtx_SET (reg1, unspecs[0]),
8395 gen_rtx_SET (reg2, unspecs[1])));
8398 /* Return TRUE if return address signing should be enabled for the current
8399 function, otherwise return FALSE. */
8401 bool
8402 aarch64_return_address_signing_enabled (void)
8404 /* This function should only be called after the frame is laid out. */
8405 gcc_assert (cfun->machine->frame.laid_out);
8407 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8408 if its LR is pushed onto the stack. */
8409 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8410 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8411 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8414 /* Only used by the arm backend. */
8415 void aarch_bti_arch_check (void)
8418 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8419 bool
8420 aarch_bti_enabled (void)
8422 return (aarch_enable_bti == 1);
8425 /* Check if INSN is a BTI J insn. */
8426 bool
8427 aarch_bti_j_insn_p (rtx_insn *insn)
8429 if (!insn || !INSN_P (insn))
8430 return false;
8432 rtx pat = PATTERN (insn);
8433 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8436 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8437 bool
8438 aarch_pac_insn_p (rtx x)
8440 if (!INSN_P (x))
8441 return false;
8443 subrtx_var_iterator::array_type array;
8444 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8446 rtx sub = *iter;
8447 if (sub && GET_CODE (sub) == UNSPEC)
8449 int unspec_val = XINT (sub, 1);
8450 switch (unspec_val)
8452 case UNSPEC_PACIASP:
8453 case UNSPEC_PACIBSP:
8454 return true;
8456 default:
8457 return false;
8459 iter.skip_subrtxes ();
8462 return false;
8465 rtx aarch_gen_bti_c (void)
8467 return gen_bti_c ();
8470 rtx aarch_gen_bti_j (void)
8472 return gen_bti_j ();
8475 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8476 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8477 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8479 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8480 or LD1D address
8482 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
8483 if the variable isn't already nonnull
8485 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8486 Handle this case using a temporary base register that is suitable for
8487 all offsets in that range. Use ANCHOR_REG as this base register if it
8488 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8490 static inline void
8491 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8492 rtx &anchor_reg, poly_int64 &offset,
8493 rtx &ptrue)
8495 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8497 /* This is the maximum valid offset of the anchor from the base.
8498 Lower values would be valid too. */
8499 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8500 if (!anchor_reg)
8502 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8503 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8504 gen_int_mode (anchor_offset, Pmode)));
8506 base_rtx = anchor_reg;
8507 offset -= anchor_offset;
8509 if (!ptrue)
8511 int pred_reg = cfun->machine->frame.spare_pred_reg;
8512 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8513 CONSTM1_RTX (VNx16BImode));
8514 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
8518 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8519 is saved at BASE + OFFSET. */
8521 static void
8522 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8523 rtx base, poly_int64 offset)
8525 rtx mem = gen_frame_mem (GET_MODE (reg),
8526 plus_constant (Pmode, base, offset));
8527 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8530 /* Emit code to save the callee-saved registers in REGS. Skip any
8531 write-back candidates if SKIP_WB is true, otherwise consider only
8532 write-back candidates.
8534 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8535 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8536 has been set up. */
8538 static void
8539 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8540 array_slice<unsigned int> regs, bool skip_wb,
8541 bool hard_fp_valid_p)
8543 aarch64_frame &frame = cfun->machine->frame;
8544 rtx_insn *insn;
8545 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8547 auto skip_save_p = [&](unsigned int regno)
8549 if (cfun->machine->reg_is_wrapped_separately[regno])
8550 return true;
8552 if (skip_wb == (regno == frame.wb_push_candidate1
8553 || regno == frame.wb_push_candidate2))
8554 return true;
8556 return false;
8559 for (unsigned int i = 0; i < regs.size (); ++i)
8561 unsigned int regno = regs[i];
8562 poly_int64 offset;
8563 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8565 if (skip_save_p (regno))
8566 continue;
8568 machine_mode mode = aarch64_reg_save_mode (regno);
8569 rtx reg = gen_rtx_REG (mode, regno);
8570 rtx move_src = reg;
8571 offset = frame.reg_offset[regno] - bytes_below_sp;
8572 if (regno == VG_REGNUM)
8574 move_src = gen_rtx_REG (DImode, IP0_REGNUM);
8575 emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
8577 rtx base_rtx = stack_pointer_rtx;
8578 poly_int64 sp_offset = offset;
8580 HOST_WIDE_INT const_offset;
8581 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8582 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8583 offset, ptrue);
8584 else if (GP_REGNUM_P (REGNO (reg))
8585 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8587 poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8588 if (hard_fp_valid_p)
8589 base_rtx = hard_frame_pointer_rtx;
8590 else
8592 if (!anchor_reg)
8594 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8595 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8596 gen_int_mode (fp_offset, Pmode)));
8598 base_rtx = anchor_reg;
8600 offset -= fp_offset;
8602 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8603 rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
8604 stack_pointer_rtx,
8605 sp_offset));
8606 rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
8607 bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);
8609 unsigned int regno2;
8610 if (!aarch64_sve_mode_p (mode)
8611 && reg == move_src
8612 && i + 1 < regs.size ()
8613 && (regno2 = regs[i + 1], !skip_save_p (regno2))
8614 && known_eq (GET_MODE_SIZE (mode),
8615 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8617 rtx reg2 = gen_rtx_REG (mode, regno2);
8619 offset += GET_MODE_SIZE (mode);
8620 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8622 rtx cfi_mem2
8623 = gen_frame_mem (mode,
8624 plus_constant (Pmode,
8625 stack_pointer_rtx,
8626 sp_offset + GET_MODE_SIZE (mode)));
8627 rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);
8629 /* The first part of a frame-related parallel insn is always
8630 assumed to be relevant to the frame calculations;
8631 subsequent parts are only frame-related if
8632 explicitly marked. */
8633 if (aarch64_emit_cfi_for_reg_p (regno2))
8634 RTX_FRAME_RELATED_P (cfi_set2) = 1;
8636 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8637 representation of stp cannot be understood directly by
8638 dwarf2cfi. */
8639 rtx par = gen_rtx_PARALLEL (VOIDmode,
8640 gen_rtvec (2, cfi_set, cfi_set2));
8641 add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);
8643 regno = regno2;
8644 ++i;
8646 else
8648 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8650 insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
8651 ptrue, move_src));
8652 need_cfi_note_p = true;
8654 else if (aarch64_sve_mode_p (mode))
8655 insn = emit_insn (gen_rtx_SET (mem, move_src));
8656 else
8657 insn = emit_move_insn (mem, move_src);
8659 if (frame_related_p && (need_cfi_note_p || move_src != reg))
8660 add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
8663 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8665 /* Emit a fake instruction to indicate that the VG save slot has
8666 been initialized. */
8667 if (regno == VG_REGNUM)
8668 emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
8672 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8673 and any other registers that are handled separately. Write the appropriate
8674 REG_CFA_RESTORE notes into CFI_OPS.
8676 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8677 of the static frame. */
8679 static void
8680 aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
8681 array_slice<unsigned int> regs, rtx *cfi_ops)
8683 aarch64_frame &frame = cfun->machine->frame;
8684 poly_int64 offset;
8685 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8687 auto skip_restore_p = [&](unsigned int regno)
8689 if (cfun->machine->reg_is_wrapped_separately[regno])
8690 return true;
8692 if (regno == frame.wb_pop_candidate1
8693 || regno == frame.wb_pop_candidate2)
8694 return true;
8696 /* The shadow call stack code restores LR separately. */
8697 if (frame.is_scs_enabled && regno == LR_REGNUM)
8698 return true;
8700 return false;
8703 for (unsigned int i = 0; i < regs.size (); ++i)
8705 unsigned int regno = regs[i];
8706 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8707 if (skip_restore_p (regno))
8708 continue;
8710 machine_mode mode = aarch64_reg_save_mode (regno);
8711 rtx reg = gen_rtx_REG (mode, regno);
8712 offset = frame.reg_offset[regno] - bytes_below_sp;
8713 rtx base_rtx = stack_pointer_rtx;
8714 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8715 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8716 offset, ptrue);
8717 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8719 unsigned int regno2;
8720 if (!aarch64_sve_mode_p (mode)
8721 && i + 1 < regs.size ()
8722 && (regno2 = regs[i + 1], !skip_restore_p (regno2))
8723 && known_eq (GET_MODE_SIZE (mode),
8724 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8726 rtx reg2 = gen_rtx_REG (mode, regno2);
8728 offset += GET_MODE_SIZE (mode);
8729 emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
8731 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8732 regno = regno2;
8733 ++i;
8735 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8736 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8737 else if (aarch64_sve_mode_p (mode))
8738 emit_insn (gen_rtx_SET (reg, mem));
8739 else
8740 emit_move_insn (reg, mem);
8741 if (frame_related_p)
8742 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8746 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8747 of MODE. */
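/* For example, for MODE == DImode (8 bytes) this accepts any multiple
   of 8 in the range [-64, 56].  */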
8749 static inline bool
8750 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8752 HOST_WIDE_INT multiple;
8753 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8754 && IN_RANGE (multiple, -8, 7));
8757 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8758 of MODE. */
8760 static inline bool
8761 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8763 HOST_WIDE_INT multiple;
8764 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8765 && IN_RANGE (multiple, -32, 31));
8768 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8769 of MODE. */
8771 static inline bool
8772 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8774 HOST_WIDE_INT multiple;
8775 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8776 && IN_RANGE (multiple, 0, 63));
8779 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8780 of MODE. */
8782 bool
8783 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8785 HOST_WIDE_INT multiple;
8786 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8787 && IN_RANGE (multiple, -64, 63));
8790 /* Return true if OFFSET is a signed 9-bit value. */
8792 bool
8793 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8794 poly_int64 offset)
8796 HOST_WIDE_INT const_offset;
8797 return (offset.is_constant (&const_offset)
8798 && IN_RANGE (const_offset, -256, 255));
8801 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8802 of MODE. */
8804 static inline bool
8805 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8807 HOST_WIDE_INT multiple;
8808 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8809 && IN_RANGE (multiple, -256, 255));
8812 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8813 of MODE. */
8815 static inline bool
8816 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8818 HOST_WIDE_INT multiple;
8819 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8820 && IN_RANGE (multiple, 0, 4095));
8823 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8825 static sbitmap
8826 aarch64_get_separate_components (void)
8828 aarch64_frame &frame = cfun->machine->frame;
8829 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8830 bitmap_clear (components);
8832 /* The registers we need saved to the frame. */
8833 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
8834 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8835 if (aarch64_register_saved_on_entry (regno))
8837 /* Disallow shrink wrapping for registers that will be clobbered
8838 by an SMSTART SM in the prologue. */
8839 if (enables_pstate_sm
8840 && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
8841 continue;
8843 /* Punt on saves and restores that use ST1D and LD1D. We could
8844 try to be smarter, but it would involve making sure that the
8845 spare predicate register itself is safe to use at the save
8846 and restore points. Also, when a frame pointer is being used,
8847 the slots are often out of reach of ST1D and LD1D anyway. */
8848 machine_mode mode = aarch64_reg_save_mode (regno);
8849 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8850 continue;
8852 poly_int64 offset = frame.reg_offset[regno];
8854 /* Get the offset relative to the register we'll use. */
8855 if (frame_pointer_needed)
8856 offset -= frame.bytes_below_hard_fp;
8858 /* Check that we can access the stack slot of the register with one
8859 direct load with no adjustments needed. */
8860 if (aarch64_sve_mode_p (mode)
8861 ? offset_9bit_signed_scaled_p (mode, offset)
8862 : offset_12bit_unsigned_scaled_p (mode, offset))
8863 bitmap_set_bit (components, regno);
8866 /* Don't mess with the hard frame pointer. */
8867 if (frame_pointer_needed)
8868 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8870 /* If the spare predicate register used by big-endian SVE code
8871 is call-preserved, it must be saved in the main prologue
8872 before any saves that use it. */
8873 if (frame.spare_pred_reg != INVALID_REGNUM)
8874 bitmap_clear_bit (components, frame.spare_pred_reg);
8876 unsigned reg1 = frame.wb_push_candidate1;
8877 unsigned reg2 = frame.wb_push_candidate2;
8878 /* If registers have been chosen to be stored/restored with
8879 writeback, don't interfere with them, to avoid having to output explicit
8880 stack adjustment instructions. */
8881 if (reg2 != INVALID_REGNUM)
8882 bitmap_clear_bit (components, reg2);
8883 if (reg1 != INVALID_REGNUM)
8884 bitmap_clear_bit (components, reg1);
8886 bitmap_clear_bit (components, LR_REGNUM);
8887 bitmap_clear_bit (components, SP_REGNUM);
8888 if (flag_stack_clash_protection)
8890 if (frame.sve_save_and_probe != INVALID_REGNUM)
8891 bitmap_clear_bit (components, frame.sve_save_and_probe);
8892 if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
8893 bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
8896 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8897 to find one. */
8898 bitmap_clear_bit (components, VG_REGNUM);
8900 return components;
8903 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8905 static sbitmap
8906 aarch64_components_for_bb (basic_block bb)
8908 bitmap in = DF_LIVE_IN (bb);
8909 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8910 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8912 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8913 bitmap_clear (components);
8915 /* Clobbered registers don't generate values in any meaningful sense,
8916 since nothing after the clobber can rely on their value. And we can't
8917 say that partially-clobbered registers are unconditionally killed,
8918 because whether they're killed or not depends on the mode of the
8919 value they're holding. Thus partially call-clobbered registers
8920 appear in neither the kill set nor the gen set.
8922 Check manually for any calls that clobber more of a register than the
8923 current function can. */
8924 function_abi_aggregator callee_abis;
8925 rtx_insn *insn;
8926 FOR_BB_INSNS (bb, insn)
8927 if (CALL_P (insn))
8928 callee_abis.note_callee_abi (insn_callee_abi (insn));
8929 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8931 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8932 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8933 if (!fixed_regs[regno]
8934 && !crtl->abi->clobbers_full_reg_p (regno)
8935 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8936 || bitmap_bit_p (in, regno)
8937 || bitmap_bit_p (gen, regno)
8938 || bitmap_bit_p (kill, regno)))
8940 bitmap_set_bit (components, regno);
8942 /* If there is a callee-save at an adjacent offset, add it as well
8943 to increase the use of LDP/STP. */
8944 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8945 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
8947 if (regno2 <= LAST_SAVED_REGNUM)
8949 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8950 if (regno < regno2
8951 ? known_eq (offset + 8, offset2)
8952 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
8953 bitmap_set_bit (components, regno2);
8957 return components;
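/* For example: if x24 is live in the block and its save slot is at an
   offset that is a multiple of 16, and x25 is saved 8 bytes above it,
   then x25 is added to the set as well, so that the component
   prologue/epilogue code below can save and restore the pair with a
   single STP/LDP.  */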
8960 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8961 Nothing to do for aarch64. */
8963 static void
8964 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8968 /* Return the next set bit in BMP from START onwards. Return the total number
8969 of bits in BMP if no set bit is found at or after START. */
8971 static unsigned int
8972 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8974 unsigned int nbits = SBITMAP_SIZE (bmp);
8975 if (start == nbits)
8976 return start;
8978 gcc_assert (start < nbits);
8979 for (unsigned int i = start; i < nbits; i++)
8980 if (bitmap_bit_p (bmp, i))
8981 return i;
8983 return nbits;
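/* A sketch of the intended usage pattern (this is how
   aarch64_process_components below walks its bitmap):

     for (unsigned int i = aarch64_get_next_set_bit (bmp, 0);
	  i != SBITMAP_SIZE (bmp);
	  i = aarch64_get_next_set_bit (bmp, i + 1))
       ...handle component I...
*/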
8986 /* Do the work for aarch64_emit_prologue_components and
8987 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8988 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
8989 for these components or the epilogue sequence. That is, it determines
8990 whether we should emit stores or loads and what kind of CFA notes to attach
8991 to the insns. Otherwise the logic for the two sequences is very
8992 similar. */
8994 static void
8995 aarch64_process_components (sbitmap components, bool prologue_p)
8997 aarch64_frame &frame = cfun->machine->frame;
8998 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8999 ? HARD_FRAME_POINTER_REGNUM
9000 : STACK_POINTER_REGNUM);
9002 unsigned last_regno = SBITMAP_SIZE (components);
9003 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9004 rtx_insn *insn = NULL;
9006 while (regno != last_regno)
9008 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9009 machine_mode mode = aarch64_reg_save_mode (regno);
9011 rtx reg = gen_rtx_REG (mode, regno);
9012 poly_int64 offset = frame.reg_offset[regno];
9013 if (frame_pointer_needed)
9014 offset -= frame.bytes_below_hard_fp;
9016 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9017 rtx mem = gen_frame_mem (mode, addr);
9019 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9020 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9021 /* No more registers to handle after REGNO.
9022 Emit a single save/restore and exit. */
9023 if (regno2 == last_regno)
9025 insn = emit_insn (set);
9026 if (frame_related_p)
9028 RTX_FRAME_RELATED_P (insn) = 1;
9029 if (prologue_p)
9030 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9031 else
9032 add_reg_note (insn, REG_CFA_RESTORE, reg);
9034 break;
9037 poly_int64 offset2 = frame.reg_offset[regno2];
9038 /* The next register is not of the same class or its offset is not
9039 mergeable with the current one into a pair. */
9040 if (aarch64_sve_mode_p (mode)
9041 || !satisfies_constraint_Ump (mem)
9042 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9043 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9044 || maybe_ne ((offset2 - frame.reg_offset[regno]),
9045 GET_MODE_SIZE (mode)))
9047 insn = emit_insn (set);
9048 if (frame_related_p)
9050 RTX_FRAME_RELATED_P (insn) = 1;
9051 if (prologue_p)
9052 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9053 else
9054 add_reg_note (insn, REG_CFA_RESTORE, reg);
9057 regno = regno2;
9058 continue;
9061 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9063 /* REGNO2 can be saved/restored in a pair with REGNO. */
9064 rtx reg2 = gen_rtx_REG (mode, regno2);
9065 if (frame_pointer_needed)
9066 offset2 -= frame.bytes_below_hard_fp;
9067 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9068 rtx mem2 = gen_frame_mem (mode, addr2);
9069 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9070 : gen_rtx_SET (reg2, mem2);
9072 if (prologue_p)
9073 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
9074 else
9075 insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
9077 if (frame_related_p || frame_related2_p)
9079 RTX_FRAME_RELATED_P (insn) = 1;
9080 if (prologue_p)
9082 if (frame_related_p)
9083 add_reg_note (insn, REG_CFA_OFFSET, set);
9084 if (frame_related2_p)
9085 add_reg_note (insn, REG_CFA_OFFSET, set2);
9087 else
9089 if (frame_related_p)
9090 add_reg_note (insn, REG_CFA_RESTORE, reg);
9091 if (frame_related2_p)
9092 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9096 regno = aarch64_get_next_set_bit (components, regno2 + 1);
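/* As an example of the code emitted above: if COMPONENTS contains x23 and
   x24, their slots are adjacent (say at sp + 32 and sp + 40) and no frame
   pointer is in use, the prologue path emits roughly

     stp x23, x24, [sp, #32]

   with a REG_CFA_OFFSET note for each saved register, and the epilogue
   path emits the matching

     ldp x23, x24, [sp, #32]

   with REG_CFA_RESTORE notes.  */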
9100 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9102 static void
9103 aarch64_emit_prologue_components (sbitmap components)
9105 aarch64_process_components (components, true);
9108 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9110 static void
9111 aarch64_emit_epilogue_components (sbitmap components)
9113 aarch64_process_components (components, false);
9116 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9118 static void
9119 aarch64_set_handled_components (sbitmap components)
9121 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9122 if (bitmap_bit_p (components, regno))
9123 cfun->machine->reg_is_wrapped_separately[regno] = true;
9126 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9127 determine the probe offset for alloca. */
9129 static HOST_WIDE_INT
9130 aarch64_stack_clash_protection_alloca_probe_range (void)
9132 return STACK_CLASH_CALLER_GUARD;
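/* STACK_CLASH_CALLER_GUARD is 1024 on AArch64, i.e. the 1KB ABI-defined
   buffer described above.  */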
9135 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9136 subsequent memory accesses and that requires the stack pointer and REG
9137 to have their current values. REG can be stack_pointer_rtx if no
9138 other register's value needs to be fixed. */
9140 static void
9141 aarch64_emit_stack_tie (rtx reg)
9143 emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
9146 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9147 registers. If POLY_SIZE is not large enough to require a probe this function
9148 will only adjust the stack. When allocating the stack space
9149 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9150 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9151 the saved registers. If we are then we ensure that any allocation
9152 larger than the ABI defined buffer needs a probe so that the
9153 invariant of having a 1KB buffer is maintained.
9155 We emit barriers after each stack adjustment to prevent optimizations from
9156 breaking the invariant that we never drop the stack more than a page. This
9157 invariant is needed to make it easier to correctly handle asynchronous
9158 events, e.g. if we were to allow the stack to be dropped by more than a page
9159 and then have multiple probes up, and we take a signal somewhere in between,
9160 then the signal handler doesn't know the state of the stack and can make no
9161 assumptions about which pages have been probed.
9163 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
9164 is measured relative to the SME vector length instead of the current
9165 prevailing vector length. It is 0 otherwise. */
9167 static void
9168 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9169 poly_int64 poly_size,
9170 aarch64_feature_flags force_isa_mode,
9171 bool frame_related_p,
9172 bool final_adjustment_p)
9174 aarch64_frame &frame = cfun->machine->frame;
9175 HOST_WIDE_INT guard_size
9176 = 1 << param_stack_clash_protection_guard_size;
9177 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9178 HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9179 gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9180 HOST_WIDE_INT min_probe_threshold
9181 = (final_adjustment_p
9182 ? guard_used_by_caller + byte_sp_alignment
9183 : guard_size - guard_used_by_caller);
9184 poly_int64 frame_size = frame.frame_size;
9186 /* We should always have a positive probe threshold. */
9187 gcc_assert (min_probe_threshold > 0);
9189 if (flag_stack_clash_protection && !final_adjustment_p)
9191 poly_int64 initial_adjust = frame.initial_adjust;
9192 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9193 poly_int64 final_adjust = frame.final_adjust;
9195 if (known_eq (frame_size, 0))
9197 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9199 else if (known_lt (initial_adjust + sve_callee_adjust,
9200 guard_size - guard_used_by_caller)
9201 && known_lt (final_adjust, guard_used_by_caller))
9203 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9207 /* If SIZE is not large enough to require probing, just adjust the stack and
9208 exit. */
9209 if (known_lt (poly_size, min_probe_threshold)
9210 || !flag_stack_clash_protection)
9212 aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
9213 frame_related_p);
9214 return;
9217 HOST_WIDE_INT size;
9218 /* Handle the SVE non-constant case first. */
9219 if (!poly_size.is_constant (&size))
9221 if (dump_file)
9223 fprintf (dump_file, "Stack clash SVE prologue: ");
9224 print_dec (poly_size, dump_file);
9225 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9228 /* First calculate the amount of bytes we're actually spilling. */
9229 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9230 poly_size, temp1, temp2, force_isa_mode,
9231 false, true);
9233 rtx_insn *insn = get_last_insn ();
9235 if (frame_related_p)
9237 /* This is done to provide unwinding information for the stack
9238 adjustments we're about to do; however, to prevent the optimizers
9239 from removing the R11 move and leaving the CFA note (which would be
9240 very wrong) we tie the old and new stack pointer together.
9241 The tie will expand to nothing but the optimizers will not touch
9242 the instruction. */
9243 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9244 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9245 aarch64_emit_stack_tie (stack_ptr_copy);
9247 /* We want the CFA independent of the stack pointer for the
9248 duration of the loop. */
9249 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9250 RTX_FRAME_RELATED_P (insn) = 1;
9253 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9254 rtx guard_const = gen_int_mode (guard_size, Pmode);
9256 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9257 stack_pointer_rtx, temp1,
9258 probe_const, guard_const));
9260 /* Now reset the CFA register if needed. */
9261 if (frame_related_p)
9263 add_reg_note (insn, REG_CFA_DEF_CFA,
9264 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9265 gen_int_mode (poly_size, Pmode)));
9266 RTX_FRAME_RELATED_P (insn) = 1;
9269 return;
9272 if (dump_file)
9273 fprintf (dump_file,
9274 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9275 " bytes, probing will be required.\n", size);
9277 /* Round size to the nearest multiple of guard_size, and calculate the
9278 residual as the difference between the original size and the rounded
9279 size. */
9280 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9281 HOST_WIDE_INT residual = size - rounded_size;
9283 /* We can handle a small number of allocations/probes inline. Otherwise
9284 punt to a loop. */
9285 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9287 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9289 aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
9290 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9291 guard_used_by_caller));
9292 emit_insn (gen_blockage ());
9294 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9296 else
9298 /* Compute the ending address. */
9299 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9300 temp1, NULL, force_isa_mode, false, true);
9301 rtx_insn *insn = get_last_insn ();
9303 /* For the initial allocation, we don't have a frame pointer
9304 set up, so we always need CFI notes. If we're doing the
9305 final allocation, then we may have a frame pointer, in which
9306 case it is the CFA, otherwise we need CFI notes.
9308 We can determine which allocation we are doing by looking at
9309 the value of FRAME_RELATED_P since the final allocations are not
9310 frame related. */
9311 if (frame_related_p)
9313 /* We want the CFA independent of the stack pointer for the
9314 duration of the loop. */
9315 add_reg_note (insn, REG_CFA_DEF_CFA,
9316 plus_constant (Pmode, temp1, rounded_size));
9317 RTX_FRAME_RELATED_P (insn) = 1;
9320 /* This allocates and probes the stack. Note that this re-uses some of
9321 the existing Ada stack protection code. However we are guaranteed not
9322 to enter the non-loop or residual branches of that code.
9324 The non-loop part won't be entered because if our allocation amount
9325 doesn't require a loop, the case above would handle it.
9327 The residual amount won't be entered because TEMP1 is a multiple of
9328 the allocation size. The residual will always be 0. As such, the only
9329 part we are actually using from that code is the loop setup. The
9330 actual probing is done in aarch64_output_probe_stack_range. */
9331 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9332 stack_pointer_rtx, temp1));
9334 /* Now reset the CFA register if needed. */
9335 if (frame_related_p)
9337 add_reg_note (insn, REG_CFA_DEF_CFA,
9338 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9339 RTX_FRAME_RELATED_P (insn) = 1;
9342 emit_insn (gen_blockage ());
9343 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9346 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9347 be probed. This maintains the requirement that each page is probed at
9348 least once. For initial probing we probe only if the allocation is
9349 more than GUARD_SIZE - buffer, and below the saved registers we probe
9350 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9351 GUARD_SIZE. This ensures that for any allocation that is large enough to
9352 trigger a probe here, we'll have at least one, and if an allocation is not
9353 large enough for this code to emit anything for it, the page would have been
9354 probed by the saving of FP/LR, either by this function or by any callees. If
9355 we don't have any callees then we won't have more stack adjustments and so
9356 are still safe. */
9357 if (residual)
9359 gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9361 /* If we're doing final adjustments, and we've done any full page
9362 allocations then any residual needs to be probed. */
9363 if (final_adjustment_p && rounded_size != 0)
9364 min_probe_threshold = 0;
9366 aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
9367 if (residual >= min_probe_threshold)
9369 if (dump_file)
9370 fprintf (dump_file,
9371 "Stack clash AArch64 prologue residuals: "
9372 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9373 "\n", residual);
9375 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9376 guard_used_by_caller));
9377 emit_insn (gen_blockage ());
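/* A worked example of the constant-size path, assuming the default 64KB
   guard: for an initial allocation of 132112 bytes, rounded_size is 131072
   (two pages) and the residual is 1040.  The unrolled form above emits
   roughly

     sub sp, sp, #65536
     str xzr, [sp, #1024]
     sub sp, sp, #65536
     str xzr, [sp, #1024]
     sub sp, sp, #1040

   with no probe for the residual, since 1040 is below the initial-probe
   threshold of guard_size - guard_used_by_caller (64512 bytes here).  */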
9382 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9384 void
9385 aarch64_extra_live_on_entry (bitmap regs)
9387 if (TARGET_ZA)
9389 bitmap_set_bit (regs, LOWERING_REGNUM);
9390 bitmap_set_bit (regs, SME_STATE_REGNUM);
9391 bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
9392 bitmap_set_bit (regs, ZA_FREE_REGNUM);
9393 bitmap_set_bit (regs, ZA_SAVED_REGNUM);
9395 /* The only time ZA can't have live contents on entry is when
9396 the function explicitly treats it as a pure output. */
9397 auto za_flags = aarch64_cfun_shared_flags ("za");
9398 if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9399 bitmap_set_bit (regs, ZA_REGNUM);
9401 /* Since ZT0 is call-clobbered, it is only live on input if
9402 it is explicitly shared, and is not a pure output. */
9403 auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
9404 if (zt0_flags != 0
9405 && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9406 bitmap_set_bit (regs, ZT0_REGNUM);
9410 /* Return 1 if the register is used by the epilogue. We need to say the
9411 return register is used, but only after epilogue generation is complete.
9412 Note that in the case of sibcalls, the values "used by the epilogue" are
9413 considered live at the start of the called function. */
9415 int
9416 aarch64_epilogue_uses (int regno)
9418 if (epilogue_completed)
9420 if (regno == LR_REGNUM)
9421 return 1;
9423 if (regno == LOWERING_REGNUM && TARGET_ZA)
9424 return 1;
9425 if (regno == SME_STATE_REGNUM && TARGET_ZA)
9426 return 1;
9427 if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
9428 return 1;
9429 /* If the function shares SME state with its caller, ensure that that
9430 data is not in the lazy save buffer on exit. */
9431 if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
9432 return 1;
9433 if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
9434 return 1;
9435 if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
9436 return 1;
9437 return 0;
9440 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9442 static bool
9443 aarch64_use_late_prologue_epilogue ()
9445 return aarch64_cfun_enables_pstate_sm ();
9448 /* The current function's frame has a save slot for the incoming state
9449 of SVCR. Return a legitimate memory for the slot, based on the hard
9450 frame pointer. */
9452 static rtx
9453 aarch64_old_svcr_mem ()
9455 gcc_assert (frame_pointer_needed
9456 && known_ge (cfun->machine->frame.old_svcr_offset, 0));
9457 rtx base = hard_frame_pointer_rtx;
9458 poly_int64 offset = (0
9459 /* hard fp -> bottom of frame. */
9460 - cfun->machine->frame.bytes_below_hard_fp
9461 /* bottom of frame -> save slot. */
9462 + cfun->machine->frame.old_svcr_offset);
9463 return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
9466 /* The current function's frame has a save slot for the incoming state
9467 of SVCR. Load the slot into register REGNO and return the register. */
9469 static rtx
9470 aarch64_read_old_svcr (unsigned int regno)
9472 rtx svcr = gen_rtx_REG (DImode, regno);
9473 emit_move_insn (svcr, aarch64_old_svcr_mem ());
9474 return svcr;
9477 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9478 load the incoming value of SVCR from its save slot into temporary
9479 register REGNO. */
9481 static rtx_insn *
9482 aarch64_guard_switch_pstate_sm (unsigned int regno,
9483 aarch64_feature_flags local_mode)
9485 rtx old_svcr = aarch64_read_old_svcr (regno);
9486 return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
9489 /* AArch64 stack frames generated by this compiler look like:
9491 +-------------------------------+
9493 | incoming stack arguments |
9495 +-------------------------------+
9496 | | <-- incoming stack pointer (aligned)
9497 | callee-allocated save area |
9498 | for register varargs |
9500 +-------------------------------+
9501 | local variables (1) | <-- frame_pointer_rtx
9503 +-------------------------------+
9504 | padding (1) |
9505 +-------------------------------+
9506 | callee-saved registers |
9507 +-------------------------------+
9508 | LR' |
9509 +-------------------------------+
9510 | FP' |
9511 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9512 | SVE vector registers |
9513 +-------------------------------+
9514 | SVE predicate registers |
9515 +-------------------------------+
9516 | local variables (2) |
9517 +-------------------------------+
9518 | padding (2) |
9519 +-------------------------------+
9520 | dynamic allocation |
9521 +-------------------------------+
9522 | padding |
9523 +-------------------------------+
9524 | outgoing stack arguments | <-- arg_pointer
9526 +-------------------------------+
9527 | | <-- stack_pointer_rtx (aligned)
9529 The regions marked (1) and (2) are mutually exclusive. (2) is used
9530 when aarch64_save_regs_above_locals_p is true.
9532 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9533 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9534 unchanged.
9536 By default for stack-clash we assume the guard is at least 64KB, but this
9537 value is configurable to either 4KB or 64KB. We also force the guard size to
9538 be the same as the probing interval and both values are kept in sync.
9540 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9541 on the guard size) of stack space without probing.
9543 When probing is needed, we emit a probe at the start of the prologue
9544 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9546 We can also use register saves as probes. These are stored in
9547 sve_save_and_probe and hard_fp_save_and_probe.
9549 For outgoing arguments we probe if the size is larger than 1KB, such that
9550 the ABI specified buffer is maintained for the next callee.
9552 The following registers are reserved during frame layout and should not be
9553 used for any other purpose:
9555 - r11: Used by stack clash protection when SVE is enabled, and also
9556 as an anchor register when saving and restoring registers
9557 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9558 - r14 and r15: Used for speculation tracking.
9559 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9560 - r30(LR), r29(FP): Used by standard frame layout.
9562 These registers must be avoided in frame layout related code unless the
9563 explicit intention is to interact with one of the features listed above. */
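/* As a concrete illustration, a small non-SVE frame in which all saves fit
   into the initial STP writeback typically begins and ends with something
   like

     stp x29, x30, [sp, #-32]!   // callee_adjust: push FP/LR, allocate frame
     mov x29, sp                 // establish the frame chain
     ...
     ldp x29, x30, [sp], #32     // pop FP/LR, deallocate frame
     ret

   Larger or variable-sized frames add separate initial/final adjustments
   and, where needed, the stack-clash probes described above.  */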
9565 /* Generate the prologue instructions for entry into a function.
9566 Establish the stack frame by decreasing the stack pointer with a
9567 properly calculated size and, if necessary, create a frame record
9568 filled with the values of LR and previous frame pointer. The
9569 current FP is also set up if it is in use. */
9571 void
9572 aarch64_expand_prologue (void)
9574 aarch64_frame &frame = cfun->machine->frame;
9575 poly_int64 frame_size = frame.frame_size;
9576 poly_int64 initial_adjust = frame.initial_adjust;
9577 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9578 poly_int64 final_adjust = frame.final_adjust;
9579 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9580 unsigned reg1 = frame.wb_push_candidate1;
9581 unsigned reg2 = frame.wb_push_candidate2;
9582 bool emit_frame_chain = frame.emit_frame_chain;
9583 rtx_insn *insn;
9584 aarch64_feature_flags force_isa_mode = 0;
9585 if (aarch64_cfun_enables_pstate_sm ())
9586 force_isa_mode = AARCH64_FL_SM_ON;
9588 if (flag_stack_clash_protection
9589 && known_eq (callee_adjust, 0)
9590 && known_lt (frame.reg_offset[VG_REGNUM], 0))
9592 /* Fold the SVE allocation into the initial allocation.
9593 We don't do this in aarch64_layout_frame to avoid pessimizing
9594 the epilogue code. */
9595 initial_adjust += sve_callee_adjust;
9596 sve_callee_adjust = 0;
9599 /* Sign return address for functions. */
9600 if (aarch64_return_address_signing_enabled ())
9602 switch (aarch64_ra_sign_key)
9604 case AARCH64_KEY_A:
9605 insn = emit_insn (gen_paciasp ());
9606 break;
9607 case AARCH64_KEY_B:
9608 insn = emit_insn (gen_pacibsp ());
9609 break;
9610 default:
9611 gcc_unreachable ();
9613 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9614 RTX_FRAME_RELATED_P (insn) = 1;
9617 /* Push return address to shadow call stack. */
9618 if (frame.is_scs_enabled)
9619 emit_insn (gen_scs_push ());
9621 if (flag_stack_usage_info)
9622 current_function_static_stack_size = constant_lower_bound (frame_size);
9624 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9626 if (crtl->is_leaf && !cfun->calls_alloca)
9628 if (maybe_gt (frame_size, PROBE_INTERVAL)
9629 && maybe_gt (frame_size, get_stack_check_protect ()))
9630 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9631 (frame_size
9632 - get_stack_check_protect ()));
9634 else if (maybe_gt (frame_size, 0))
9635 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9638 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9639 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9641 /* In theory we should never have both an initial adjustment
9642 and a callee save adjustment. Verify that is the case since the
9643 code below does not handle it for -fstack-clash-protection. */
9644 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9646 /* Will only probe if the initial adjustment is larger than the guard
9647 less the amount of the guard reserved for use by the caller's
9648 outgoing args. */
9649 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9650 force_isa_mode, true, false);
9652 if (callee_adjust != 0)
9653 aarch64_push_regs (reg1, reg2, callee_adjust);
9655 /* The offset of the current SP from the bottom of the static frame. */
9656 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9658 if (emit_frame_chain)
9660 /* The offset of the frame chain record (if any) from the current SP. */
9661 poly_int64 chain_offset = (initial_adjust + callee_adjust
9662 - frame.bytes_above_hard_fp);
9663 gcc_assert (known_ge (chain_offset, 0));
9665 gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
9666 if (callee_adjust == 0)
9667 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
9668 false, false);
9669 else
9670 gcc_assert (known_eq (chain_offset, 0));
9671 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9672 stack_pointer_rtx, chain_offset,
9673 tmp1_rtx, tmp0_rtx, force_isa_mode,
9674 frame_pointer_needed);
9675 if (frame_pointer_needed && !frame_size.is_constant ())
9677 /* Variable-sized frames need to describe the save slot
9678 address using DW_CFA_expression rather than DW_CFA_offset.
9679 This means that, without taking further action, the
9680 locations of the registers that we've already saved would
9681 remain based on the stack pointer even after we redefine
9682 the CFA based on the frame pointer. We therefore need new
9683 DW_CFA_expressions to re-express the save slots with addresses
9684 based on the frame pointer. */
9685 rtx_insn *insn = get_last_insn ();
9686 gcc_assert (RTX_FRAME_RELATED_P (insn));
9688 /* Add an explicit CFA definition if this was previously
9689 implicit. */
9690 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9692 rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9693 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9694 gen_rtx_SET (hard_frame_pointer_rtx, src));
9697 /* Change the save slot expressions for the registers that
9698 we've already saved. */
9699 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9700 hard_frame_pointer_rtx, UNITS_PER_WORD);
9701 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9702 hard_frame_pointer_rtx, 0);
9704 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9707 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
9708 emit_frame_chain);
9709 if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
9711 unsigned int saved_regs[] = { VG_REGNUM };
9712 aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
9713 emit_frame_chain);
9715 if (maybe_ne (sve_callee_adjust, 0))
9717 gcc_assert (!flag_stack_clash_protection
9718 || known_eq (initial_adjust, 0)
9719 /* The VG save isn't shrink-wrapped and so serves as
9720 a probe of the initial allocation. */
9721 || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
9722 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9723 sve_callee_adjust,
9724 force_isa_mode,
9725 !frame_pointer_needed, false);
9726 bytes_below_sp -= sve_callee_adjust;
9728 aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
9729 emit_frame_chain);
9730 aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
9731 emit_frame_chain);
9733 /* We may need to probe the final adjustment if it is larger than the guard
9734 that is assumed by the callee. */
9735 gcc_assert (known_eq (bytes_below_sp, final_adjust));
9736 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9737 force_isa_mode,
9738 !frame_pointer_needed, true);
9739 if (emit_frame_chain && maybe_ne (final_adjust, 0))
9740 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9742 /* Save the incoming value of PSTATE.SM, if required. Code further
9743 down does this for locally-streaming functions. */
9744 if (known_ge (frame.old_svcr_offset, 0)
9745 && !aarch64_cfun_enables_pstate_sm ())
9747 rtx mem = aarch64_old_svcr_mem ();
9748 MEM_VOLATILE_P (mem) = 1;
9749 if (TARGET_SME)
9751 rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
9752 emit_insn (gen_aarch64_read_svcr (reg));
9753 emit_move_insn (mem, reg);
9755 else
9757 rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
9758 auto &args = crtl->args.info;
9759 if (args.aapcs_ncrn > 0)
9761 old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
9762 emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
9764 if (args.aapcs_ncrn > 1)
9766 old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
9767 emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
9769 emit_insn (gen_aarch64_get_sme_state ());
9770 emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
9771 if (old_r0)
9772 emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
9773 if (old_r1)
9774 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
9778 /* Enable PSTATE.SM, if required. */
9779 if (aarch64_cfun_enables_pstate_sm ())
9781 rtx_insn *guard_label = nullptr;
9782 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9784 /* The current function is streaming-compatible. Save the
9785 original state of PSTATE.SM. */
9786 rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
9787 emit_insn (gen_aarch64_read_svcr (svcr));
9788 emit_move_insn (aarch64_old_svcr_mem (), svcr);
9789 guard_label = aarch64_guard_switch_pstate_sm (svcr,
9790 aarch64_isa_flags);
9792 aarch64_sme_mode_switch_regs args_switch;
9793 auto &args = crtl->args.info;
9794 for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
9796 rtx x = args.sme_mode_switch_args[i];
9797 args_switch.add_reg (GET_MODE (x), REGNO (x));
9799 args_switch.emit_prologue ();
9800 emit_insn (gen_aarch64_smstart_sm ());
9801 args_switch.emit_epilogue ();
9802 if (guard_label)
9803 emit_label (guard_label);
9807 /* Return TRUE if we can use a simple_return insn.
9809 This function checks whether the callee-saved stack is empty, which
9810 means no restore actions are needed. The pro_and_epilogue pass will use
9811 this to check whether the shrink-wrapping optimization is feasible.
9813 bool
9814 aarch64_use_return_insn_p (void)
9816 if (!reload_completed)
9817 return false;
9819 if (crtl->profile)
9820 return false;
9822 return known_eq (cfun->machine->frame.frame_size, 0);
9825 /* Generate the epilogue instructions for returning from a function.
9826 This is almost exactly the reverse of the prolog sequence, except
9827 that we need to insert barriers to avoid scheduling loads that read
9828 from a deallocated stack, and we optimize the unwind records by
9829 emitting them all together if possible. */
9830 void
9831 aarch64_expand_epilogue (rtx_call_insn *sibcall)
9833 aarch64_frame &frame = cfun->machine->frame;
9834 poly_int64 initial_adjust = frame.initial_adjust;
9835 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9836 poly_int64 final_adjust = frame.final_adjust;
9837 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9838 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
9839 unsigned reg1 = frame.wb_pop_candidate1;
9840 unsigned reg2 = frame.wb_pop_candidate2;
9841 rtx cfi_ops = NULL;
9842 rtx_insn *insn;
9843 /* A stack clash protection prologue may not have left EP0_REGNUM or
9844 EP1_REGNUM in a usable state. The same is true for allocations
9845 with an SVE component, since we then need both temporary registers
9846 for each allocation. For stack clash we are in a usable state if
9847 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9848 HOST_WIDE_INT guard_size
9849 = 1 << param_stack_clash_protection_guard_size;
9850 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9851 aarch64_feature_flags force_isa_mode = 0;
9852 if (aarch64_cfun_enables_pstate_sm ())
9853 force_isa_mode = AARCH64_FL_SM_ON;
9855 /* We can re-use the registers when:
9857 (a) the deallocation amount is the same as the corresponding
9858 allocation amount (which is false if we combine the initial
9859 and SVE callee save allocations in the prologue); and
9861 (b) the allocation amount doesn't need a probe (which is false
9862 if the amount is guard_size - guard_used_by_caller or greater).
9864 In such situations the register should remain live with the correct
9865 value. */
9866 bool can_inherit_p = (initial_adjust.is_constant ()
9867 && final_adjust.is_constant ()
9868 && (!flag_stack_clash_protection
9869 || (known_lt (initial_adjust,
9870 guard_size - guard_used_by_caller)
9871 && known_eq (sve_callee_adjust, 0))));
9873 /* We need to add memory barrier to prevent read from deallocated stack. */
9874 bool need_barrier_p
9875 = maybe_ne (get_frame_size ()
9876 + frame.saved_varargs_size, 0);
9878 /* Reset PSTATE.SM, if required. */
9879 if (aarch64_cfun_enables_pstate_sm ())
9881 rtx_insn *guard_label = nullptr;
9882 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9883 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
9884 aarch64_isa_flags);
9885 aarch64_sme_mode_switch_regs return_switch;
9886 if (sibcall)
9887 return_switch.add_call_args (sibcall);
9888 else if (crtl->return_rtx && REG_P (crtl->return_rtx))
9889 return_switch.add_reg (GET_MODE (crtl->return_rtx),
9890 REGNO (crtl->return_rtx));
9891 return_switch.emit_prologue ();
9892 emit_insn (gen_aarch64_smstop_sm ());
9893 return_switch.emit_epilogue ();
9894 if (guard_label)
9895 emit_label (guard_label);
9898 /* Emit a barrier to prevent loads from a deallocated stack. */
9899 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9900 || cfun->calls_alloca
9901 || crtl->calls_eh_return)
9903 aarch64_emit_stack_tie (stack_pointer_rtx);
9904 need_barrier_p = false;
9907 /* Restore the stack pointer from the frame pointer if it may not
9908 be the same as the stack pointer. */
9909 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9910 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9911 if (frame_pointer_needed
9912 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9913 /* If writeback is used when restoring callee-saves, the CFA
9914 is restored on the instruction doing the writeback. */
9915 aarch64_add_offset (Pmode, stack_pointer_rtx,
9916 hard_frame_pointer_rtx,
9917 -bytes_below_hard_fp + final_adjust,
9918 tmp1_rtx, tmp0_rtx, force_isa_mode,
9919 callee_adjust == 0);
9920 else
9921 /* The case where we need to re-use the register here is very rare, so
9922 avoid the complicated condition and just always emit a move if the
9923 immediate doesn't fit. */
9924 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
9926 /* Restore the vector registers before the predicate registers,
9927 so that we can use P4 as a temporary for big-endian SVE frames. */
9928 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
9929 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
9930 if (maybe_ne (sve_callee_adjust, 0))
9931 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
9932 force_isa_mode, true);
9934 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9935 restore x30, so we don't need to restore x30 again in the traditional
9936 way. */
9937 aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
9938 frame.saved_gprs, &cfi_ops);
9940 if (need_barrier_p)
9941 aarch64_emit_stack_tie (stack_pointer_rtx);
9943 if (callee_adjust != 0)
9944 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9946 /* If we have no register restore information, the CFA must have been
9947 defined in terms of the stack pointer since the end of the prologue. */
9948 gcc_assert (cfi_ops || !frame_pointer_needed);
9950 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9952 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9953 insn = get_last_insn ();
9954 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9955 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9956 RTX_FRAME_RELATED_P (insn) = 1;
9957 cfi_ops = NULL;
9960 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
9961 restrict the emit_move optimization to leaf functions. */
9962 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
9963 (!can_inherit_p || !crtl->is_leaf
9964 || df_regs_ever_live_p (EP0_REGNUM)));
9966 if (cfi_ops)
9968 /* Emit delayed restores and reset the CFA to be SP. */
9969 insn = get_last_insn ();
9970 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9971 REG_NOTES (insn) = cfi_ops;
9972 RTX_FRAME_RELATED_P (insn) = 1;
9975 /* Pop return address from shadow call stack. */
9976 if (frame.is_scs_enabled)
9978 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
9979 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
9981 insn = emit_insn (gen_scs_pop ());
9982 add_reg_note (insn, REG_CFA_RESTORE, reg);
9983 RTX_FRAME_RELATED_P (insn) = 1;
9986 /* Stack adjustment for exception handler. */
9987 if (crtl->calls_eh_return && !sibcall)
9989 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
9990 to unwind the stack and jump to the handler, otherwise
9991 skip this eh_return logic and continue with normal
9992 return after the label. We have already reset the CFA
9993 to be SP; letting the CFA move during this adjustment
9994 is just as correct as retaining the CFA from the body
9995 of the function. Therefore, do nothing special. */
9996 rtx_code_label *label = gen_label_rtx ();
9997 rtx x = aarch64_gen_compare_zero_and_branch (EQ, EH_RETURN_TAKEN_RTX,
9998 label);
9999 rtx jump = emit_jump_insn (x);
10000 JUMP_LABEL (jump) = label;
10001 LABEL_NUSES (label)++;
10002 emit_insn (gen_add2_insn (stack_pointer_rtx,
10003 EH_RETURN_STACKADJ_RTX));
10004 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
10005 emit_barrier ();
10006 emit_label (label);
10009 /* We prefer to emit the combined return/authenticate instruction RETAA,
10010 however there are two cases in which we must instead emit an explicit
10011 authentication instruction.
10013 1) Sibcalls don't return in a normal way, so if we're about to call one
10014 we must authenticate.
10016 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10017 generating code for !TARGET_ARMV8_3 we can't use it and must
10018 explicitly authenticate.
10020 if (aarch64_return_address_signing_enabled ()
10021 && (sibcall || !TARGET_ARMV8_3))
10023 switch (aarch64_ra_sign_key)
10025 case AARCH64_KEY_A:
10026 insn = emit_insn (gen_autiasp ());
10027 break;
10028 case AARCH64_KEY_B:
10029 insn = emit_insn (gen_autibsp ());
10030 break;
10031 default:
10032 gcc_unreachable ();
10034 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10035 RTX_FRAME_RELATED_P (insn) = 1;
10038 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10039 if (!sibcall)
10040 emit_jump_insn (ret_rtx);
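/* For example, with -mbranch-protection=pac-ret the normal return path on
   an Armv8.3-A target ends in a single

     retaa

   whereas for earlier architectures, or before a sibcall, the code above
   emits an explicit

     autiasp
     ret                          // or the tail-call branch

   (the B-key variants being retab/autibsp).  */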
10043 /* Output code to add DELTA to the first argument, and then jump
10044 to FUNCTION. Used for C++ multiple inheritance. */
10045 static void
10046 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10047 HOST_WIDE_INT delta,
10048 HOST_WIDE_INT vcall_offset,
10049 tree function)
10051 /* The this pointer is always in x0. Note that this differs from
10052 Arm where the this pointer may be bumped to r1 if r0 is required
10053 to return a pointer to an aggregate. On AArch64 a result value
10054 pointer will be in x8. */
10055 int this_regno = R0_REGNUM;
10056 rtx this_rtx, temp0, temp1, addr, funexp;
10057 rtx_insn *insn;
10058 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10060 if (aarch_bti_enabled ())
10061 emit_insn (gen_bti_c());
10063 reload_completed = 1;
10064 emit_note (NOTE_INSN_PROLOGUE_END);
10066 this_rtx = gen_rtx_REG (Pmode, this_regno);
10067 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10068 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10070 if (vcall_offset == 0)
10071 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
10072 0, false);
10073 else
10075 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10077 addr = this_rtx;
10078 if (delta != 0)
10080 if (delta >= -256 && delta < 256)
10081 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10082 plus_constant (Pmode, this_rtx, delta));
10083 else
10084 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10085 temp1, temp0, 0, false);
10088 if (Pmode == ptr_mode)
10089 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10090 else
10091 aarch64_emit_move (temp0,
10092 gen_rtx_ZERO_EXTEND (Pmode,
10093 gen_rtx_MEM (ptr_mode, addr)));
10095 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10096 addr = plus_constant (Pmode, temp0, vcall_offset);
10097 else
10099 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10100 Pmode);
10101 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10104 if (Pmode == ptr_mode)
10105 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10106 else
10107 aarch64_emit_move (temp1,
10108 gen_rtx_SIGN_EXTEND (Pmode,
10109 gen_rtx_MEM (ptr_mode, addr)));
10111 emit_insn (gen_add2_insn (this_rtx, temp1));
10114 /* Generate a tail call to the target function. */
10115 if (!TREE_USED (function))
10117 assemble_external (function);
10118 TREE_USED (function) = 1;
10120 funexp = XEXP (DECL_RTL (function), 0);
10121 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10122 auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
10123 auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
10124 rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant);
10125 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10126 SIBLING_CALL_P (insn) = 1;
10128 insn = get_insns ();
10129 shorten_branches (insn);
10131 assemble_start_function (thunk, fnname);
10132 final_start_function (insn, file, 1);
10133 final (insn, file, 1);
10134 final_end_function ();
10135 assemble_end_function (thunk, fnname);
10137 /* Stop pretending to be a post-reload pass. */
10138 reload_completed = 0;
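/* As an illustration, a thunk with DELTA == 8 and VCALL_OFFSET == 0 is
   emitted as just

     add x0, x0, 8
     b   <function>

   When VCALL_OFFSET is nonzero, the code above additionally loads the
   vtable pointer and the adjustment through the EP0/EP1 temporaries
   before the tail call.  */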
10141 static bool
10142 aarch64_tls_referenced_p (rtx x)
10144 if (!TARGET_HAVE_TLS)
10145 return false;
10146 subrtx_iterator::array_type array;
10147 FOR_EACH_SUBRTX (iter, array, x, ALL)
10149 const_rtx x = *iter;
10150 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10151 return true;
10152 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10153 TLS offsets, not real symbol references. */
10154 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10155 iter.skip_subrtxes ();
10157 return false;
10161 static bool
10162 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10164 if (GET_CODE (x) == HIGH)
10165 return true;
10167 /* There's no way to calculate VL-based values using relocations. */
10168 subrtx_iterator::array_type array;
10169 HOST_WIDE_INT factor;
10170 FOR_EACH_SUBRTX (iter, array, x, ALL)
10171 if (GET_CODE (*iter) == CONST_POLY_INT
10172 || aarch64_sme_vq_unspec_p (x, &factor))
10173 return true;
10175 poly_int64 offset;
10176 rtx base = strip_offset_and_salt (x, &offset);
10177 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10179 /* We checked for POLY_INT_CST offsets above. */
10180 if (aarch64_classify_symbol (base, offset.to_constant ())
10181 != SYMBOL_FORCE_TO_MEM)
10182 return true;
10183 else
10184 /* Avoid generating a 64-bit relocation in ILP32; leave
10185 to aarch64_expand_mov_immediate to handle it properly. */
10186 return mode != ptr_mode;
10189 return aarch64_tls_referenced_p (x);
10192 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10193 The expansion for a table switch is quite expensive due to the number
10194 of instructions, the table lookup and the hard-to-predict indirect jump.
10195 When optimizing for speed with -O3 enabled, use the per-core tuning if
10196 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10197 performance. When optimizing for size, use 8 for smallest codesize. */
10199 static unsigned int
10200 aarch64_case_values_threshold (void)
10202 /* Use the specified limit for the number of cases before using jump
10203 tables at higher optimization levels. */
10204 if (optimize > 2
10205 && aarch64_tune_params.max_case_values != 0)
10206 return aarch64_tune_params.max_case_values;
10207 else
10208 return optimize_size ? 8 : 11;
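/* For example, with the default tuning a dense switch with 11 or more case
   labels is a candidate for a jump table at -O2/-O3, while at -Os the
   threshold drops to 8.  */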
10211 /* Return true if register REGNO is a valid index register.
10212 STRICT_P is true if REG_OK_STRICT is in effect. */
10214 bool
10215 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10217 if (!HARD_REGISTER_NUM_P (regno))
10219 if (!strict_p)
10220 return true;
10222 if (!reg_renumber)
10223 return false;
10225 regno = reg_renumber[regno];
10227 return GP_REGNUM_P (regno);
10230 /* Return true if register REGNO is a valid base register for mode MODE.
10231 STRICT_P is true if REG_OK_STRICT is in effect. */
10233 bool
10234 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10236 if (!HARD_REGISTER_NUM_P (regno))
10238 if (!strict_p)
10239 return true;
10241 if (!reg_renumber)
10242 return false;
10244 regno = reg_renumber[regno];
10247 /* The fake registers will be eliminated to either the stack or
10248 hard frame pointer, both of which are usually valid base registers.
10249 Reload deals with the cases where the eliminated form isn't valid. */
10250 return (GP_REGNUM_P (regno)
10251 || regno == SP_REGNUM
10252 || regno == FRAME_POINTER_REGNUM
10253 || regno == ARG_POINTER_REGNUM);
10256 /* Return true if X is a valid base register for mode MODE.
10257 STRICT_P is true if REG_OK_STRICT is in effect. */
10259 static bool
10260 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10262 if (!strict_p
10263 && SUBREG_P (x)
10264 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10265 x = SUBREG_REG (x);
10267 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10270 /* Return true if address offset is a valid index. If it is, fill in INFO
10271 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10273 static bool
10274 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10275 machine_mode mode, bool strict_p)
10277 enum aarch64_address_type type;
10278 rtx index;
10279 int shift;
10281 /* (reg:P) */
10282 if ((REG_P (x) || SUBREG_P (x))
10283 && GET_MODE (x) == Pmode)
10285 type = ADDRESS_REG_REG;
10286 index = x;
10287 shift = 0;
10289 /* (sign_extend:DI (reg:SI)) */
10290 else if ((GET_CODE (x) == SIGN_EXTEND
10291 || GET_CODE (x) == ZERO_EXTEND)
10292 && GET_MODE (x) == DImode
10293 && GET_MODE (XEXP (x, 0)) == SImode)
10295 type = (GET_CODE (x) == SIGN_EXTEND)
10296 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10297 index = XEXP (x, 0);
10298 shift = 0;
10300 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10301 else if (GET_CODE (x) == MULT
10302 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10303 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10304 && GET_MODE (XEXP (x, 0)) == DImode
10305 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10306 && CONST_INT_P (XEXP (x, 1)))
10308 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10309 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10310 index = XEXP (XEXP (x, 0), 0);
10311 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10313 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10314 else if (GET_CODE (x) == ASHIFT
10315 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10316 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10317 && GET_MODE (XEXP (x, 0)) == DImode
10318 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10319 && CONST_INT_P (XEXP (x, 1)))
10321 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10322 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10323 index = XEXP (XEXP (x, 0), 0);
10324 shift = INTVAL (XEXP (x, 1));
10326 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10327 (const_int 0xffffffff<<shift)) */
10328 else if (GET_CODE (x) == AND
10329 && GET_MODE (x) == DImode
10330 && GET_CODE (XEXP (x, 0)) == MULT
10331 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10332 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10333 && CONST_INT_P (XEXP (x, 1)))
10335 type = ADDRESS_REG_UXTW;
10336 index = XEXP (XEXP (x, 0), 0);
10337 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10338 /* Avoid undefined code dealing with shift being -1. */
10339 if (shift != -1
10340 && INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10341 shift = -1;
10343 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10344 (const_int 0xffffffff<<shift)) */
10345 else if (GET_CODE (x) == AND
10346 && GET_MODE (x) == DImode
10347 && GET_CODE (XEXP (x, 0)) == ASHIFT
10348 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10349 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10350 && CONST_INT_P (XEXP (x, 1)))
10352 type = ADDRESS_REG_UXTW;
10353 index = XEXP (XEXP (x, 0), 0);
10354 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10355 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10356 shift = -1;
10358 /* (mult:P (reg:P) (const_int scale)) */
10359 else if (GET_CODE (x) == MULT
10360 && GET_MODE (x) == Pmode
10361 && GET_MODE (XEXP (x, 0)) == Pmode
10362 && CONST_INT_P (XEXP (x, 1)))
10364 type = ADDRESS_REG_REG;
10365 index = XEXP (x, 0);
10366 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10368 /* (ashift:P (reg:P) (const_int shift)) */
10369 else if (GET_CODE (x) == ASHIFT
10370 && GET_MODE (x) == Pmode
10371 && GET_MODE (XEXP (x, 0)) == Pmode
10372 && CONST_INT_P (XEXP (x, 1)))
10374 type = ADDRESS_REG_REG;
10375 index = XEXP (x, 0);
10376 shift = INTVAL (XEXP (x, 1));
10378 else
10379 return false;
10381 if (!strict_p
10382 && SUBREG_P (index)
10383 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10384 index = SUBREG_REG (index);
10386 if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
10388 if (type != ADDRESS_REG_REG
10389 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10390 return false;
10392 else
10394 if (shift != 0
10395 && !(IN_RANGE (shift, 1, 3)
10396 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10397 return false;
10400 if (REG_P (index)
10401 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10403 info->type = type;
10404 info->offset = index;
10405 info->shift = shift;
10406 return true;
10409 return false;
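/* Some examples of index forms accepted above, shown with the resulting
   addressing mode for a load whose base register is x0:

     (reg:DI x1)                                  [x0, x1]
     (ashift:DI (reg:DI x1) (const_int 3))        [x0, x1, lsl #3]   (8-byte modes)
     (sign_extend:DI (reg:SI w1))                 [x0, w1, sxtw]
     (mult:DI (sign_extend:DI (reg:SI w1))
	      (const_int 4))                      [x0, w1, sxtw #2]  (4-byte modes)
*/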
10412 /* Return true if MODE is one of the modes for which we
10413 support LDP/STP operations. */
10415 static bool
10416 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10418 return mode == SImode || mode == DImode
10419 || mode == SFmode || mode == DFmode
10420 || mode == SDmode || mode == DDmode
10421 || (aarch64_vector_mode_supported_p (mode)
10422 && (known_eq (GET_MODE_SIZE (mode), 8)
10423 || known_eq (GET_MODE_SIZE (mode), 16)));
10426 /* Return true if REGNO is a virtual pointer register, or an eliminable
10427 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10428 include stack_pointer or hard_frame_pointer. */
10429 static bool
10430 virt_or_elim_regno_p (unsigned regno)
10432 return ((regno >= FIRST_VIRTUAL_REGISTER
10433 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10434 || regno == FRAME_POINTER_REGNUM
10435 || regno == ARG_POINTER_REGNUM);
10438 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10439 If it is, fill in INFO appropriately. STRICT_P is true if
10440 REG_OK_STRICT is in effect. */
10442 bool
10443 aarch64_classify_address (struct aarch64_address_info *info,
10444 rtx x, machine_mode mode, bool strict_p,
10445 aarch64_addr_query_type type)
10447 enum rtx_code code = GET_CODE (x);
10448 rtx op0, op1;
10449 poly_int64 offset;
10451 HOST_WIDE_INT const_size;
10453 /* Whether a vector mode is partial doesn't affect address legitimacy.
10454 Partial vectors like VNx8QImode allow the same indexed addressing
10455 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10456 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10457 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10458 vec_flags &= ~VEC_PARTIAL;
10460 /* On BE, we use load/store pair for all large int mode load/stores.
10461 TI/TF/TDmode may also use a load/store pair. */
10462 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10463 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10464 || type == ADDR_QUERY_LDP_STP_N
10465 || mode == TImode
10466 || mode == TFmode
10467 || mode == TDmode
10468 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10469 && advsimd_struct_p));
10470 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10471 corresponds to the actual size of the memory being loaded/stored and the
10472 mode used to check the address is half of that. */
10473 if (type == ADDR_QUERY_LDP_STP_N)
10475 if (known_eq (GET_MODE_SIZE (mode), 32))
10476 mode = V16QImode;
10477 else if (known_eq (GET_MODE_SIZE (mode), 16))
10478 mode = DFmode;
10479 else if (known_eq (GET_MODE_SIZE (mode), 8))
10480 mode = SFmode;
10481 else
10482 return false;
10484 /* This isn't really an Advanced SIMD struct mode, but a mode
10485 used to represent the complete mem in a load/store pair. */
10486 advsimd_struct_p = false;
10489 bool allow_reg_index_p = (!load_store_pair_p
10490 && ((vec_flags == 0
10491 && known_lt (GET_MODE_SIZE (mode), 16))
10492 || vec_flags == VEC_ADVSIMD
10493 || vec_flags & VEC_SVE_DATA
10494 || mode == VNx1TImode));
10496 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10497 The latter is not valid for SVE predicates, and that's rejected through
10498 allow_reg_index_p above. */
10499 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10500 && (code != REG && code != PLUS))
10501 return false;
10503 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10504 REG addressing. */
10505 if (advsimd_struct_p
10506 && TARGET_SIMD
10507 && !BYTES_BIG_ENDIAN
10508 && (code != POST_INC && code != REG))
10509 return false;
10511 gcc_checking_assert (GET_MODE (x) == VOIDmode
10512 || SCALAR_INT_MODE_P (GET_MODE (x)));
10514 switch (code)
10516 case REG:
10517 case SUBREG:
10518 info->type = ADDRESS_REG_IMM;
10519 info->base = x;
10520 info->offset = const0_rtx;
10521 info->const_offset = 0;
10522 return aarch64_base_register_rtx_p (x, strict_p);
10524 case PLUS:
10525 op0 = XEXP (x, 0);
10526 op1 = XEXP (x, 1);
10528 if (! strict_p
10529 && REG_P (op0)
10530 && virt_or_elim_regno_p (REGNO (op0))
10531 && poly_int_rtx_p (op1, &offset))
10533 info->type = ADDRESS_REG_IMM;
10534 info->base = op0;
10535 info->offset = op1;
10536 info->const_offset = offset;
10538 return true;
10541 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10542 && aarch64_base_register_rtx_p (op0, strict_p)
10543 && poly_int_rtx_p (op1, &offset))
10545 info->type = ADDRESS_REG_IMM;
10546 info->base = op0;
10547 info->offset = op1;
10548 info->const_offset = offset;
10550 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10551 registers and individual Q registers. The available
10552 address modes are:
10553 X,X: 7-bit signed scaled offset
10554 Q: 9-bit signed offset
10555 We conservatively require an offset representable in either mode.
10556 When performing the check for pairs of X registers, i.e. LDP/STP,
10557 pass down DImode since that is the natural size of the LDP/STP
10558 instruction memory accesses. */
10559 if (mode == TImode || mode == TFmode || mode == TDmode)
10560 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10561 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10562 || offset_12bit_unsigned_scaled_p (mode, offset)));
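/* A worked example of the conservative check above, assuming the helper
   ranges (scaled 7-bit: multiples of 8 in [-512, 504]; signed 9-bit:
   [-256, 255]; unsigned scaled 12-bit for TImode: multiples of 16 in
   [0, 65520]): a TImode offset of 256 is accepted by both the LDP/STP
   check and the single-Q check, whereas -264 passes only the scaled
   7-bit check and is therefore rejected.  */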
10564 if (mode == V8DImode)
10565 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10566 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10568 /* A 7-bit offset check because OImode will emit an ldp/stp
10569 instruction (only !TARGET_SIMD or big endian will get here).
10570 For ldp/stp instructions, the offset is scaled for the size of a
10571 single element of the pair. */
10572 if (aarch64_advsimd_partial_struct_mode_p (mode)
10573 && known_eq (GET_MODE_SIZE (mode), 16))
10574 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10575 if (aarch64_advsimd_full_struct_mode_p (mode)
10576 && known_eq (GET_MODE_SIZE (mode), 32))
10577 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10579 /* Three 9/12-bit offset checks because CImode will emit three
10580 ldr/str instructions (only !TARGET_SIMD or big endian will
10581 get here). */
10582 if (aarch64_advsimd_partial_struct_mode_p (mode)
10583 && known_eq (GET_MODE_SIZE (mode), 24))
10584 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10585 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10586 offset + 16)
10587 || offset_12bit_unsigned_scaled_p (DImode,
10588 offset + 16)));
10589 if (aarch64_advsimd_full_struct_mode_p (mode)
10590 && known_eq (GET_MODE_SIZE (mode), 48))
10591 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10592 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10593 offset + 32)
10594 || offset_12bit_unsigned_scaled_p (TImode,
10595 offset + 32)));
10597 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10598 instructions (only big endian will get here). */
10599 if (aarch64_advsimd_partial_struct_mode_p (mode)
10600 && known_eq (GET_MODE_SIZE (mode), 32))
10601 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10602 && aarch64_offset_7bit_signed_scaled_p (DImode,
10603 offset + 16));
10604 if (aarch64_advsimd_full_struct_mode_p (mode)
10605 && known_eq (GET_MODE_SIZE (mode), 64))
10606 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10607 && aarch64_offset_7bit_signed_scaled_p (TImode,
10608 offset + 32));
10610 /* Make "m" use the LD1 offset range for SVE data modes, so
10611 that pre-RTL optimizers like ivopts will work to that range
10612 instead of the wider LDR/STR range. */
10613 if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
10614 return (type == ADDR_QUERY_M
10615 ? offset_4bit_signed_scaled_p (mode, offset)
10616 : offset_9bit_signed_scaled_p (mode, offset));
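/* A sketch of the effect above, assuming the usual SVE immediate ranges:
   ADDR_QUERY_M limits "m" constraints to the LD1/ST1 form
   [Xn, #imm, MUL VL] with imm in [-8, 7], while other queries accept the
   LDR/STR (vector) form with imm in [-256, 255]; both immediates count
   multiples of the vector length.  */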
10618 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10620 poly_int64 end_offset = (offset
10621 + GET_MODE_SIZE (mode)
10622 - BYTES_PER_SVE_VECTOR);
10623 return (type == ADDR_QUERY_M
10624 ? offset_4bit_signed_scaled_p (mode, offset)
10625 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10626 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10627 end_offset)));
10630 if (vec_flags == VEC_SVE_PRED)
10631 return offset_9bit_signed_scaled_p (mode, offset);
10633 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
10635 poly_int64 end_offset = (offset
10636 + GET_MODE_SIZE (mode)
10637 - BYTES_PER_SVE_PRED);
10638 return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
10639 && offset_9bit_signed_scaled_p (VNx16BImode, offset));
10642 if (load_store_pair_p)
10643 return ((known_eq (GET_MODE_SIZE (mode), 4)
10644 || known_eq (GET_MODE_SIZE (mode), 8)
10645 || known_eq (GET_MODE_SIZE (mode), 16))
10646 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10647 else
10648 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10649 || offset_12bit_unsigned_scaled_p (mode, offset));
10652 if (allow_reg_index_p)
10654 /* Look for base + (scaled/extended) index register. */
10655 if (aarch64_base_register_rtx_p (op0, strict_p)
10656 && aarch64_classify_index (info, op1, mode, strict_p))
10658 info->base = op0;
10659 return true;
10661 if (aarch64_base_register_rtx_p (op1, strict_p)
10662 && aarch64_classify_index (info, op0, mode, strict_p))
10664 info->base = op1;
10665 return true;
10669 return false;
10671 case POST_INC:
10672 case POST_DEC:
10673 case PRE_INC:
10674 case PRE_DEC:
10675 info->type = ADDRESS_REG_WB;
10676 info->base = XEXP (x, 0);
10677 info->offset = NULL_RTX;
10678 return aarch64_base_register_rtx_p (info->base, strict_p);
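/* For example, (mem:DI (post_inc:DI (reg:DI Xn))) corresponds to the
   post-indexed form "ldr Xt, [Xn], #8", with the writeback amount implied
   by the mode size; POST_MODIFY/PRE_MODIFY below carry an explicit
   offset instead.  */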
10680 case POST_MODIFY:
10681 case PRE_MODIFY:
10682 info->type = ADDRESS_REG_WB;
10683 info->base = XEXP (x, 0);
10684 if (GET_CODE (XEXP (x, 1)) == PLUS
10685 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10686 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10687 && aarch64_base_register_rtx_p (info->base, strict_p))
10689 info->offset = XEXP (XEXP (x, 1), 1);
10690 info->const_offset = offset;
10692 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10693 registers and individual Q registers. The available
10694 address modes are:
10695 X,X: 7-bit signed scaled offset
10696 Q: 9-bit signed offset
10697 We conservatively require an offset representable in either mode.
10699 if (mode == TImode || mode == TFmode || mode == TDmode)
10700 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10701 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10703 if (load_store_pair_p)
10704 return ((known_eq (GET_MODE_SIZE (mode), 4)
10705 || known_eq (GET_MODE_SIZE (mode), 8)
10706 || known_eq (GET_MODE_SIZE (mode), 16))
10707 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10708 else
10709 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10711 return false;
10713 case CONST:
10714 case SYMBOL_REF:
10715 case LABEL_REF:
10716 /* load literal: pc-relative constant pool entry. Only supported
10717 for SI mode or larger. */
10718 info->type = ADDRESS_SYMBOLIC;
10720 if (!load_store_pair_p
10721 && GET_MODE_SIZE (mode).is_constant (&const_size)
10722 && const_size >= 4)
10724 poly_int64 offset;
10725 rtx sym = strip_offset_and_salt (x, &offset);
10726 return ((LABEL_REF_P (sym)
10727 || (SYMBOL_REF_P (sym)
10728 && CONSTANT_POOL_ADDRESS_P (sym)
10729 && aarch64_pcrelative_literal_loads)));
10731 return false;
10733 case LO_SUM:
10734 info->type = ADDRESS_LO_SUM;
10735 info->base = XEXP (x, 0);
10736 info->offset = XEXP (x, 1);
10737 if (allow_reg_index_p
10738 && aarch64_base_register_rtx_p (info->base, strict_p))
10740 poly_int64 offset;
10741 HOST_WIDE_INT const_offset;
10742 rtx sym = strip_offset_and_salt (info->offset, &offset);
10743 if (SYMBOL_REF_P (sym)
10744 && offset.is_constant (&const_offset)
10745 && (aarch64_classify_symbol (sym, const_offset)
10746 == SYMBOL_SMALL_ABSOLUTE))
10748 /* The symbol and offset must be aligned to the access size. */
10749 unsigned int align;
10751 if (CONSTANT_POOL_ADDRESS_P (sym))
10752 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10753 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10755 tree exp = SYMBOL_REF_DECL (sym);
10756 align = TYPE_ALIGN (TREE_TYPE (exp));
10757 align = aarch64_constant_alignment (exp, align);
10759 else if (SYMBOL_REF_DECL (sym))
10760 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10761 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10762 && SYMBOL_REF_BLOCK (sym) != NULL)
10763 align = SYMBOL_REF_BLOCK (sym)->alignment;
10764 else
10765 align = BITS_PER_UNIT;
10767 poly_int64 ref_size = GET_MODE_SIZE (mode);
10768 if (known_eq (ref_size, 0))
10769 ref_size = GET_MODE_SIZE (DImode);
10771 return (multiple_p (const_offset, ref_size)
10772 && multiple_p (align / BITS_PER_UNIT, ref_size));
10775 return false;
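/* The LO_SUM form handled above is the low-part half of an ADRP-based
   access under the small absolute model, e.g.
       adrp x1, sym
       ldr  x0, [x1, #:lo12:sym]
   The alignment check is needed because the :lo12: relocation used by
   LDR/STR is scaled by the access size.  */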
10777 default:
10778 return false;
10782 /* Return true if the address X is valid for a PRFM instruction.
10783 STRICT_P is true if we should do strict checking with
10784 aarch64_classify_address. */
10786 bool
10787 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10789 struct aarch64_address_info addr;
10791 /* PRFM accepts the same addresses as DImode... */
10792 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10793 if (!res)
10794 return false;
10796 /* ... except writeback forms. */
10797 return addr.type != ADDRESS_REG_WB;
10800 bool
10801 aarch64_symbolic_address_p (rtx x)
10803 poly_int64 offset;
10804 x = strip_offset_and_salt (x, &offset);
10805 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10808 /* Classify the base of symbolic expression X. */
10810 enum aarch64_symbol_type
10811 aarch64_classify_symbolic_expression (rtx x)
10813 rtx offset;
10815 split_const (x, &x, &offset);
10816 return aarch64_classify_symbol (x, INTVAL (offset));
10820 /* Return TRUE if X is a legitimate address for accessing memory in
10821 mode MODE. */
10822 static bool
10823 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
10824 code_helper = ERROR_MARK)
10826 struct aarch64_address_info addr;
10828 return aarch64_classify_address (&addr, x, mode, strict_p);
10831 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10832 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10833 bool
10834 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10835 aarch64_addr_query_type type)
10837 struct aarch64_address_info addr;
10839 return aarch64_classify_address (&addr, x, mode, strict_p, type);
10842 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10844 static bool
10845 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10846 poly_int64 orig_offset,
10847 machine_mode mode)
10849 HOST_WIDE_INT size;
10850 if (GET_MODE_SIZE (mode).is_constant (&size))
10852 HOST_WIDE_INT const_offset, second_offset;
10854 /* A general SVE offset is A * VQ + B. Remove the A component from
10855 coefficient 0 in order to get the constant B. */
10856 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10858 /* Split an out-of-range address displacement into a base and
10859 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10860 range otherwise to increase opportunities for sharing the base
10861 address of different sizes. Unaligned accesses use the signed
10862 9-bit range, TImode/TFmode/TDmode use the intersection of signed
10863 scaled 7-bit and signed 9-bit offset. */
10864 if (mode == TImode || mode == TFmode || mode == TDmode)
10865 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10866 else if ((const_offset & (size - 1)) != 0)
10867 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10868 else
10869 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10871 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10872 return false;
10874 /* Split the offset into second_offset and the rest. */
10875 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10876 *offset2 = gen_int_mode (second_offset, Pmode);
10877 return true;
10879 else
10881 /* Get the mode we should use as the basis of the range. For structure
10882 modes this is the mode of one vector. */
10883 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10884 machine_mode step_mode
10885 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10887 /* Get the "mul vl" multiplier we'd like to use. */
10888 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10889 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10890 if (vec_flags & VEC_SVE_DATA)
10891 /* LDR supports a 9-bit range, but the move patterns for
10892 structure modes require all vectors to be in range of the
10893 same base. The simplest way of accommodating that while still
10894 promoting reuse of anchor points between different modes is
10895 to use an 8-bit range unconditionally. */
10896 vnum = ((vnum + 128) & 255) - 128;
10897 else
10898 /* Predicates are only handled singly, so we might as well use
10899 the full range. */
10900 vnum = ((vnum + 256) & 511) - 256;
10901 if (vnum == 0)
10902 return false;
10904 /* Convert the "mul vl" multiplier into a byte offset. */
10905 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10906 if (known_eq (second_offset, orig_offset))
10907 return false;
10909 /* Split the offset into second_offset and the rest. */
10910 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10911 *offset2 = gen_int_mode (second_offset, Pmode);
10912 return true;
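/* Worked example for the constant-size path above: a DImode reference at
   offset 0x12340 is aligned, so second_offset = 0x12340 & 0x3ffc = 0x2340,
   giving *offset1 = 0x10000 and *offset2 = 0x2340; other nearby accesses
   can then share the base + 0x10000 anchor while keeping their own
   in-range scaled offsets.  */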
10916 /* Return the binary representation of floating point constant VALUE in INTVAL.
10917 If the value cannot be converted, return false without setting INTVAL.
10918 The conversion is done in the mode of VALUE. */
10919 bool
10920 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10923 /* We make a general exception for 0. */
10924 if (aarch64_float_const_zero_rtx_p (value))
10926 *intval = 0;
10927 return true;
10930 scalar_float_mode mode;
10931 if (!CONST_DOUBLE_P (value)
10932 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
10933 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10934 /* Only support up to DF mode. */
10935 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10936 return false;
10938 unsigned HOST_WIDE_INT ival = 0;
10940 long res[2];
10941 real_to_target (res,
10942 CONST_DOUBLE_REAL_VALUE (value),
10943 REAL_MODE_FORMAT (mode));
10945 if (mode == DFmode || mode == DDmode)
10947 int order = BYTES_BIG_ENDIAN ? 1 : 0;
10948 ival = zext_hwi (res[order], 32);
10949 ival |= (zext_hwi (res[1 - order], 32) << 32);
10951 else
10952 ival = zext_hwi (res[0], 32);
10954 *intval = ival;
10955 return true;
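/* For example, the DFmode constant 1.0 has the IEEE double encoding
   0x3ff0000000000000, so *intval is set to that value; for SFmode 1.0f
   the result is 0x3f800000.  */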
10958 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10959 single MOV(+MOVK) followed by an FMOV. */
10960 bool
10961 aarch64_float_const_rtx_p (rtx x)
10963 machine_mode mode = GET_MODE (x);
10964 if (mode == VOIDmode)
10965 return false;
10967 /* Determine whether it's cheaper to write float constants as
10968 mov/movk pairs over ldr/adrp pairs. */
10969 unsigned HOST_WIDE_INT ival;
10971 if (CONST_DOUBLE_P (x)
10972 && SCALAR_FLOAT_MODE_P (mode)
10973 && aarch64_reinterpret_float_as_int (x, &ival))
10975 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
10976 int num_instr = aarch64_internal_mov_immediate
10977 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10978 return num_instr < 3;
10981 return false;
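/* Continuing the example above, 1.0 reinterpreted as 0x3ff0000000000000
   is 0x3ff0 << 48 and therefore needs only a single MOV, so MOV+FMOV
   (num_instr == 1) is preferred over an ADRP/LDR literal load.  */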
10984 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
10985 Floating Point). */
10986 bool
10987 aarch64_float_const_zero_rtx_p (rtx x)
10989 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
10990 zr as our callers expect, so no need to check the actual
10991 value if X is of Decimal Floating Point type. */
10992 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
10993 return false;
10995 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
10996 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
10997 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11000 /* Return true if X is any kind of constant zero rtx. */
11002 bool
11003 aarch64_const_zero_rtx_p (rtx x)
11005 return (x == CONST0_RTX (GET_MODE (x))
11006 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
11009 /* Return TRUE if rtx X is an immediate constant that fits in a single
11010 MOVI immediate operation. */
11011 bool
11012 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11014 if (!TARGET_SIMD)
11015 return false;
11017 machine_mode vmode;
11018 scalar_int_mode imode;
11019 unsigned HOST_WIDE_INT ival;
11021 if (CONST_DOUBLE_P (x)
11022 && SCALAR_FLOAT_MODE_P (mode))
11024 if (!aarch64_reinterpret_float_as_int (x, &ival))
11025 return false;
11027 /* We make a general exception for 0. */
11028 if (aarch64_float_const_zero_rtx_p (x))
11029 return true;
11031 imode = int_mode_for_mode (mode).require ();
11033 else if (CONST_INT_P (x)
11034 && is_a <scalar_int_mode> (mode, &imode))
11035 ival = INTVAL (x);
11036 else
11037 return false;
11039 /* Use a 64-bit mode for everything except DI/DF/DD mode, where we use
11040 a 128-bit vector mode. */
11041 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11043 vmode = aarch64_simd_container_mode (imode, width);
11044 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11046 return aarch64_simd_valid_immediate (v_op, NULL);
11050 /* Return the fixed registers used for condition codes. */
11052 static bool
11053 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11055 *p1 = CC_REGNUM;
11056 *p2 = INVALID_REGNUM;
11057 return true;
11060 /* Return a fresh memory reference to the current function's TPIDR2 block,
11061 creating a block if necessary. */
11063 static rtx
11064 aarch64_get_tpidr2_block ()
11066 if (!cfun->machine->tpidr2_block)
11067 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
11068 boundary. */
11069 cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
11070 return copy_rtx (cfun->machine->tpidr2_block);
11073 /* Return a fresh register that points to the current function's
11074 TPIDR2 block, creating a block if necessary. */
11076 static rtx
11077 aarch64_get_tpidr2_ptr ()
11079 rtx block = aarch64_get_tpidr2_block ();
11080 return force_reg (Pmode, XEXP (block, 0));
11083 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
11084 current function's TPIDR2 block. */
11086 static void
11087 aarch64_init_tpidr2_block ()
11089 rtx block = aarch64_get_tpidr2_block ();
11091 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
11092 rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
11093 rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
11094 rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
11095 svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
11096 rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
11097 BITS_PER_UNIT, -1, true);
11098 za_save_buffer = force_reg (Pmode, za_save_buffer);
11099 cfun->machine->za_save_buffer = za_save_buffer;
11101 /* The first word of the block points to the save buffer and the second
11102 word is the number of ZA slices to save. */
11103 rtx block_0 = adjust_address (block, DImode, 0);
11104 emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
11106 if (!memory_operand (block, V16QImode))
11107 block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
11108 emit_insn (gen_aarch64_setup_local_tpidr2 (block));
11111 /* Restore the contents of ZA from the lazy save buffer, given that
11112 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
11113 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11115 void
11116 aarch64_restore_za (rtx tpidr2_block)
11118 emit_insn (gen_aarch64_smstart_za ());
11119 if (REGNO (tpidr2_block) != R0_REGNUM)
11120 emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
11121 emit_insn (gen_aarch64_tpidr2_restore ());
11124 /* Return the ZT0 save buffer, creating one if necessary. */
11126 static rtx
11127 aarch64_get_zt0_save_buffer ()
11129 if (!cfun->machine->zt0_save_buffer)
11130 cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
11131 return cfun->machine->zt0_save_buffer;
11134 /* Save ZT0 to the current function's save buffer. */
11136 static void
11137 aarch64_save_zt0 ()
11139 rtx mem = aarch64_get_zt0_save_buffer ();
11140 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11141 emit_insn (gen_aarch64_sme_str_zt0 (mem));
11144 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11145 is true if the load is happening after a call to a private-ZA function,
11146 false if it can be treated as a normal load. */
11148 static void
11149 aarch64_restore_zt0 (bool from_lazy_save_p)
11151 rtx mem = aarch64_get_zt0_save_buffer ();
11152 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11153 emit_insn (from_lazy_save_p
11154 ? gen_aarch64_restore_zt0 (mem)
11155 : gen_aarch64_sme_ldr_zt0 (mem));
11158 /* Implement TARGET_START_CALL_ARGS. */
11160 static void
11161 aarch64_start_call_args (cumulative_args_t ca_v)
11163 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11165 if (!TARGET_SME && (ca->isa_mode & AARCH64_FL_SM_ON))
11167 error ("calling a streaming function requires the ISA extension %qs",
11168 "sme");
11169 inform (input_location, "you can enable %qs using the command-line"
11170 " option %<-march%>, or by using the %<target%>"
11171 " attribute or pragma", "sme");
11174 if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11175 && !aarch64_cfun_has_state ("za"))
11176 error ("call to a function that shares %qs state from a function"
11177 " that has no %qs state", "za", "za");
11178 else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11179 && !aarch64_cfun_has_state ("zt0"))
11180 error ("call to a function that shares %qs state from a function"
11181 " that has no %qs state", "zt0", "zt0");
11182 else if (!TARGET_ZA && (ca->isa_mode & AARCH64_FL_ZA_ON))
11183 error ("call to a function that shares SME state from a function"
11184 " that has no SME state");
11186 /* If this is a call to a private ZA function, emit a marker to
11187 indicate where any necessary set-up code could be inserted.
11188 The code itself is inserted by the mode-switching pass. */
11189 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11190 emit_insn (gen_aarch64_start_private_za_call ());
11192 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11193 save and restore ZT0 around the call. */
11194 if (aarch64_cfun_has_state ("zt0")
11195 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11196 && ca->shared_zt0_flags == 0)
11197 aarch64_save_zt0 ();
11200 /* This function is used by the call expanders of the machine description.
11201 RESULT is the register in which the result is returned. It's NULL for
11202 "call" and "sibcall".
11203 MEM is the location of the function call.
11204 COOKIE is either:
11205 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11206 - a PARALLEL that contains such a const_int as its first element.
11207 The second element is a PARALLEL that lists all the argument
11208 registers that need to be saved and restored around a change
11209 in PSTATE.SM, or const0_rtx if no such switch is needed.
11210 The third and fourth elements are const_ints that contain the
11211 sharing flags for ZA and ZT0 respectively.
11212 SIBCALL indicates whether this function call is a normal call or a
11213 sibling call; a different pattern is generated accordingly. */
11215 void
11216 aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
11218 rtx call, callee, tmp;
11219 rtvec vec;
11220 machine_mode mode;
11222 rtx callee_abi = cookie;
11223 rtx sme_mode_switch_args = const0_rtx;
11224 unsigned int shared_za_flags = 0;
11225 unsigned int shared_zt0_flags = 0;
11226 if (GET_CODE (cookie) == PARALLEL)
11228 callee_abi = XVECEXP (cookie, 0, 0);
11229 sme_mode_switch_args = XVECEXP (cookie, 0, 1);
11230 shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
11231 shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
11234 gcc_assert (CONST_INT_P (callee_abi));
11235 auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
11237 if (aarch64_cfun_has_state ("za")
11238 && (callee_isa_mode & AARCH64_FL_ZA_ON)
11239 && !shared_za_flags)
11241 sorry ("call to a function that shares state other than %qs"
11242 " from a function that has %qs state", "za", "za");
11243 inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
11244 " callee preserves ZA");
11247 gcc_assert (MEM_P (mem));
11248 callee = XEXP (mem, 0);
11250 #if TARGET_PECOFF
11251 tmp = legitimize_pe_coff_symbol (callee, false);
11252 if (tmp)
11253 callee = tmp;
11254 #endif
11256 mode = GET_MODE (callee);
11257 gcc_assert (mode == Pmode);
11259 /* Decide if we should generate indirect calls by loading the
11260 address of the callee into a register before performing
11261 the branch-and-link. */
11262 if (SYMBOL_REF_P (callee)
11263 ? (aarch64_is_long_call_p (callee)
11264 || aarch64_is_noplt_call_p (callee))
11265 : !REG_P (callee))
11266 XEXP (mem, 0) = force_reg (mode, callee);
11268 /* Accumulate the return values, including state that is shared via
11269 attributes. */
11270 auto_vec<rtx, 8> return_values;
11271 if (result)
11273 if (GET_CODE (result) == PARALLEL)
11274 for (int i = 0; i < XVECLEN (result, 0); ++i)
11275 return_values.safe_push (XVECEXP (result, 0, i));
11276 else
11277 return_values.safe_push (result);
11279 unsigned int orig_num_return_values = return_values.length ();
11280 if (shared_za_flags & AARCH64_STATE_OUT)
11281 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11282 /* When calling private-ZA functions from functions with ZA state,
11283 we want to know whether the call committed a lazy save. */
11284 if (TARGET_ZA && !shared_za_flags)
11285 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11286 if (shared_zt0_flags & AARCH64_STATE_OUT)
11287 return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
11289 /* Create the new return value, if necessary. */
11290 if (orig_num_return_values != return_values.length ())
11292 if (return_values.length () == 1)
11293 result = return_values[0];
11294 else
11296 for (rtx &x : return_values)
11297 if (GET_CODE (x) != EXPR_LIST)
11298 x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
11299 rtvec v = gen_rtvec_v (return_values.length (),
11300 return_values.address ());
11301 result = gen_rtx_PARALLEL (VOIDmode, v);
11305 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11307 if (result != NULL_RTX)
11308 call = gen_rtx_SET (result, call);
11310 if (sibcall)
11311 tmp = ret_rtx;
11312 else
11313 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11315 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11316 UNSPEC_CALLEE_ABI);
11318 vec = gen_rtvec (3, call, callee_abi, tmp);
11319 call = gen_rtx_PARALLEL (VOIDmode, vec);
11321 auto call_insn = aarch64_emit_call_insn (call);
11323 /* Check whether the call requires a change to PSTATE.SM. We can't
11324 emit the instructions to change PSTATE.SM yet, since they involve
11325 a change in vector length and a change in instruction set, which
11326 cannot be represented in RTL.
11328 For now, just record which registers will be clobbered and used
11329 by the changes to PSTATE.SM. */
11330 if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
11332 aarch64_sme_mode_switch_regs args_switch;
11333 if (sme_mode_switch_args != const0_rtx)
11335 unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
11336 for (unsigned int i = 0; i < num_args; ++i)
11338 rtx x = XVECEXP (sme_mode_switch_args, 0, i);
11339 args_switch.add_reg (GET_MODE (x), REGNO (x));
11343 aarch64_sme_mode_switch_regs result_switch;
11344 if (result)
11345 result_switch.add_call_result (call_insn);
11347 unsigned int num_gprs = MAX (args_switch.num_gprs (),
11348 result_switch.num_gprs ());
11349 for (unsigned int i = 0; i < num_gprs; ++i)
11350 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11351 gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
11353 for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
11354 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11355 gen_rtx_REG (V4x16QImode, regno));
11357 for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
11358 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11359 gen_rtx_REG (VNx16BImode, regno));
11361 /* Ensure that the VG save slot has been initialized. Also emit
11362 an instruction to model the effect of the temporary clobber
11363 of VG, so that the prologue/epilogue pass sees the need to
11364 save the old value. */
11365 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11366 gen_rtx_REG (DImode, VG_REGNUM));
11367 emit_insn_before (gen_aarch64_update_vg (), call_insn);
11369 cfun->machine->call_switches_pstate_sm = true;
11372 /* Add any ZA-related information.
11374 ZA_REGNUM represents the current function's ZA state, rather than
11375 the contents of the ZA register itself. We ensure that the function's
11376 ZA state is preserved by private-ZA call sequences, so the call itself
11377 does not use or clobber ZA_REGNUM. The same thing applies to
11378 ZT0_REGNUM. */
11379 if (TARGET_ZA)
11381 /* The callee requires ZA to be active if the callee is shared-ZA,
11382 otherwise it requires ZA to be dormant or off. The state of ZA is
11383 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11384 and ZA_SAVED_REGNUM. */
11385 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11386 gen_rtx_REG (DImode, SME_STATE_REGNUM));
11387 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11388 gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
11389 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11390 gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11392 /* Keep the aarch64_start/end_private_za_call markers live. */
11393 if (!(callee_isa_mode & AARCH64_FL_ZA_ON))
11394 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11395 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
11397 /* If the callee is a shared-ZA function, record whether it uses the
11398 current value of ZA and ZT0. */
11399 if (shared_za_flags & AARCH64_STATE_IN)
11400 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11401 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11403 if (shared_zt0_flags & AARCH64_STATE_IN)
11404 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11405 gen_rtx_REG (V8DImode, ZT0_REGNUM));
11409 /* Implement TARGET_END_CALL_ARGS. */
11411 static void
11412 aarch64_end_call_args (cumulative_args_t ca_v)
11414 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11416 /* If this is a call to a private ZA function, emit a marker to
11417 indicate where any necessary restoration code could be inserted.
11418 The code itself is inserted by the mode-switching pass. */
11419 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11420 emit_insn (gen_aarch64_end_private_za_call ());
11422 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11423 save and restore ZT0 around the call. */
11424 if (aarch64_cfun_has_state ("zt0")
11425 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11426 && ca->shared_zt0_flags == 0)
11427 aarch64_restore_zt0 (false);
11430 /* Emit call insn with PAT and do aarch64-specific handling. */
11432 rtx_call_insn *
11433 aarch64_emit_call_insn (rtx pat)
11435 auto insn = emit_call_insn (pat);
11437 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11438 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11439 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11440 return as_a<rtx_call_insn *> (insn);
11443 machine_mode
11444 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11446 machine_mode mode_x = GET_MODE (x);
11447 rtx_code code_x = GET_CODE (x);
11449 /* All floating point compares return CCFP if it is an equality
11450 comparison, and CCFPE otherwise. */
11451 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11453 switch (code)
11455 case EQ:
11456 case NE:
11457 case UNORDERED:
11458 case ORDERED:
11459 case UNLT:
11460 case UNLE:
11461 case UNGT:
11462 case UNGE:
11463 case UNEQ:
11464 return CCFPmode;
11466 case LT:
11467 case LE:
11468 case GT:
11469 case GE:
11470 case LTGT:
11471 return CCFPEmode;
11473 default:
11474 gcc_unreachable ();
11478 /* Equality comparisons of short modes against zero can be performed
11479 using the TST instruction with the appropriate bitmask. */
11480 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11481 && (code == EQ || code == NE)
11482 && (mode_x == HImode || mode_x == QImode))
11483 return CC_Zmode;
11485 /* Similarly, comparisons of zero_extends from shorter modes can
11486 be performed using an ANDS with an immediate mask. */
11487 if (y == const0_rtx && code_x == ZERO_EXTEND
11488 && (mode_x == SImode || mode_x == DImode)
11489 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11490 && (code == EQ || code == NE))
11491 return CC_Zmode;
11493 /* Zero extracts support equality comparisons. */
11494 if ((mode_x == SImode || mode_x == DImode)
11495 && y == const0_rtx
11496 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11497 && CONST_INT_P (XEXP (x, 2)))
11498 && (code == EQ || code == NE))
11499 return CC_Zmode;
11501 /* ANDS/BICS/TST support equality and all signed comparisons. */
11502 if ((mode_x == SImode || mode_x == DImode)
11503 && y == const0_rtx
11504 && (code_x == AND)
11505 && (code == EQ || code == NE || code == LT || code == GE
11506 || code == GT || code == LE))
11507 return CC_NZVmode;
11509 /* ADDS/SUBS correctly set N and Z flags. */
11510 if ((mode_x == SImode || mode_x == DImode)
11511 && y == const0_rtx
11512 && (code == EQ || code == NE || code == LT || code == GE)
11513 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11514 return CC_NZmode;
11516 /* A compare with a shifted operand. Because of canonicalization,
11517 the comparison will have to be swapped when we emit the assembly
11518 code. */
11519 if ((mode_x == SImode || mode_x == DImode)
11520 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11521 && (code_x == ASHIFT || code_x == ASHIFTRT
11522 || code_x == LSHIFTRT
11523 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11524 return CC_SWPmode;
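/* For example, (compare (ashift:DI x 3) y) is output as the swapped
   compare "cmp y, x, lsl #3", which computes y - (x << 3), so
   aarch64_get_condition_code_1 maps GT to LT, GE to LE, and so on for
   CC_SWPmode.  */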
11526 /* Similarly for a negated operand, but we can only do this for
11527 equalities. */
11528 if ((mode_x == SImode || mode_x == DImode)
11529 && (REG_P (y) || SUBREG_P (y))
11530 && (code == EQ || code == NE)
11531 && code_x == NEG)
11532 return CC_Zmode;
11534 /* A test for unsigned overflow from an addition. */
11535 if ((mode_x == DImode || mode_x == TImode)
11536 && (code == LTU || code == GEU)
11537 && code_x == PLUS
11538 && rtx_equal_p (XEXP (x, 0), y))
11539 return CC_Cmode;
11541 /* A test for unsigned overflow from an add with carry. */
11542 if ((mode_x == DImode || mode_x == TImode)
11543 && (code == LTU || code == GEU)
11544 && code_x == PLUS
11545 && CONST_SCALAR_INT_P (y)
11546 && (rtx_mode_t (y, mode_x)
11547 == (wi::shwi (1, mode_x)
11548 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11549 return CC_ADCmode;
11551 /* A test for signed overflow. */
11552 if ((mode_x == DImode || mode_x == TImode)
11553 && code == NE
11554 && code_x == PLUS
11555 && GET_CODE (y) == SIGN_EXTEND)
11556 return CC_Vmode;
11558 /* For everything else, return CCmode. */
11559 return CCmode;
11562 static int
11563 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11565 int
11566 aarch64_get_condition_code (rtx x)
11568 machine_mode mode = GET_MODE (XEXP (x, 0));
11569 enum rtx_code comp_code = GET_CODE (x);
11571 if (GET_MODE_CLASS (mode) != MODE_CC)
11572 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11573 return aarch64_get_condition_code_1 (mode, comp_code);
11576 static int
11577 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11579 switch (mode)
11581 case E_CCFPmode:
11582 case E_CCFPEmode:
11583 switch (comp_code)
11585 case GE: return AARCH64_GE;
11586 case GT: return AARCH64_GT;
11587 case LE: return AARCH64_LS;
11588 case LT: return AARCH64_MI;
11589 case NE: return AARCH64_NE;
11590 case EQ: return AARCH64_EQ;
11591 case ORDERED: return AARCH64_VC;
11592 case UNORDERED: return AARCH64_VS;
11593 case UNLT: return AARCH64_LT;
11594 case UNLE: return AARCH64_LE;
11595 case UNGT: return AARCH64_HI;
11596 case UNGE: return AARCH64_PL;
11597 default: return -1;
11599 break;
11601 case E_CCmode:
11602 switch (comp_code)
11604 case NE: return AARCH64_NE;
11605 case EQ: return AARCH64_EQ;
11606 case GE: return AARCH64_GE;
11607 case GT: return AARCH64_GT;
11608 case LE: return AARCH64_LE;
11609 case LT: return AARCH64_LT;
11610 case GEU: return AARCH64_CS;
11611 case GTU: return AARCH64_HI;
11612 case LEU: return AARCH64_LS;
11613 case LTU: return AARCH64_CC;
11614 default: return -1;
11616 break;
11618 case E_CC_SWPmode:
11619 switch (comp_code)
11621 case NE: return AARCH64_NE;
11622 case EQ: return AARCH64_EQ;
11623 case GE: return AARCH64_LE;
11624 case GT: return AARCH64_LT;
11625 case LE: return AARCH64_GE;
11626 case LT: return AARCH64_GT;
11627 case GEU: return AARCH64_LS;
11628 case GTU: return AARCH64_CC;
11629 case LEU: return AARCH64_CS;
11630 case LTU: return AARCH64_HI;
11631 default: return -1;
11633 break;
11635 case E_CC_NZCmode:
11636 switch (comp_code)
11638 case NE: return AARCH64_NE; /* = any */
11639 case EQ: return AARCH64_EQ; /* = none */
11640 case GE: return AARCH64_PL; /* = nfrst */
11641 case LT: return AARCH64_MI; /* = first */
11642 case GEU: return AARCH64_CS; /* = nlast */
11643 case GTU: return AARCH64_HI; /* = pmore */
11644 case LEU: return AARCH64_LS; /* = plast */
11645 case LTU: return AARCH64_CC; /* = last */
11646 default: return -1;
11648 break;
11650 case E_CC_NZVmode:
11651 switch (comp_code)
11653 case NE: return AARCH64_NE;
11654 case EQ: return AARCH64_EQ;
11655 case GE: return AARCH64_PL;
11656 case LT: return AARCH64_MI;
11657 case GT: return AARCH64_GT;
11658 case LE: return AARCH64_LE;
11659 default: return -1;
11661 break;
11663 case E_CC_NZmode:
11664 switch (comp_code)
11666 case NE: return AARCH64_NE;
11667 case EQ: return AARCH64_EQ;
11668 case GE: return AARCH64_PL;
11669 case LT: return AARCH64_MI;
11670 default: return -1;
11672 break;
11674 case E_CC_Zmode:
11675 switch (comp_code)
11677 case NE: return AARCH64_NE;
11678 case EQ: return AARCH64_EQ;
11679 default: return -1;
11681 break;
11683 case E_CC_Cmode:
11684 switch (comp_code)
11686 case LTU: return AARCH64_CS;
11687 case GEU: return AARCH64_CC;
11688 default: return -1;
11690 break;
11692 case E_CC_ADCmode:
11693 switch (comp_code)
11695 case GEU: return AARCH64_CS;
11696 case LTU: return AARCH64_CC;
11697 default: return -1;
11699 break;
11701 case E_CC_Vmode:
11702 switch (comp_code)
11704 case NE: return AARCH64_VS;
11705 case EQ: return AARCH64_VC;
11706 default: return -1;
11708 break;
11710 default:
11711 return -1;
11714 return -1;
11717 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
11718 duplicate of such constants. If so, store in RET_WI the wide_int
11719 representation of the constant paired with the inner mode of the vector mode
11720 or MODE for scalar X constants. If MODE is not provided then TImode is
11721 used. */
11723 static bool
11724 aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
11725 scalar_mode mode = TImode)
11727 rtx elt = unwrap_const_vec_duplicate (x);
11728 if (!CONST_SCALAR_INT_P (elt))
11729 return false;
11730 scalar_mode smode
11731 = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
11732 *ret_wi = rtx_mode_t (elt, smode);
11733 return true;
11736 /* Return true if X is a scalar or a constant vector of integer
11737 immediates that represent the rounding constant used in the fixed-point
11738 arithmetic instructions.
11739 The accepted form of the constant is (1 << (C - 1)) where C is in the range
11740 [1, MODE_WIDTH/2]. */
11742 bool
11743 aarch64_rnd_imm_p (rtx x)
11745 wide_int rnd_cst;
11746 if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
11747 return false;
11748 int log2 = wi::exact_log2 (rnd_cst);
11749 if (log2 < 0)
11750 return false;
11751 return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
11754 /* Return true if RND is a constant vector of integer rounding constants
11755 corresponding to a constant vector of shifts, SHIFT.
11756 The relationship should be RND == (1 << (SHIFT - 1)). */
11758 bool
11759 aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
11761 wide_int rnd_cst, shft_cst;
11762 if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
11763 || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
11764 return false;
11766 return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
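/* For example, a rounding shift right by SHIFT == 8 pairs with
   RND == 1 << 7 == 0x80: adding half of the weight of the discarded bits
   before shifting implements round-to-nearest.  */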
11769 bool
11770 aarch64_const_vec_all_same_in_range_p (rtx x,
11771 HOST_WIDE_INT minval,
11772 HOST_WIDE_INT maxval)
11774 rtx elt;
11775 return (const_vec_duplicate_p (x, &elt)
11776 && CONST_INT_P (elt)
11777 && IN_RANGE (INTVAL (elt), minval, maxval));
11780 /* Some constants can't be made using normal mov instructions in Advanced SIMD
11781 but we can still create them in various ways. If the constant in VAL can be
11782 created using alternate methods, return true and, if TARGET is not NULL,
11783 additionally emit a sequence that materializes the constant into TARGET.
11784 Otherwise return false. */
11786 bool
11787 aarch64_maybe_generate_simd_constant (rtx target, rtx val, machine_mode mode)
11789 wide_int wval;
11790 auto smode = GET_MODE_INNER (mode);
11791 if (!aarch64_extract_vec_duplicate_wide_int (val, &wval, smode))
11792 return false;
11794 /* For Advanced SIMD we can create an integer with only the top bit set
11795 using fneg (0.0f). */
11796 if (TARGET_SIMD
11797 && !TARGET_SVE
11798 && smode == DImode
11799 && wi::only_sign_bit_p (wval))
11801 if (!target)
11802 return true;
11804 /* Use the same base type as aarch64_gen_shareable_zero. */
11805 rtx zero = CONST0_RTX (V4SImode);
11806 emit_move_insn (lowpart_subreg (V4SImode, target, mode), zero);
11807 rtx neg = lowpart_subreg (V2DFmode, target, mode);
11808 emit_insn (gen_negv2df2 (neg, copy_rtx (neg)));
11809 return true;
11812 return false;
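/* For example, the DImode value 0x8000000000000000 (sign bit only) can be
   built without a literal load as
       movi v0.4s, #0
       fneg v0.2d, v0.2d
   since FNEG of +0.0 merely sets the sign bit of each 64-bit lane; that is
   the sequence emitted above when TARGET is, say, V0.  */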
11815 /* Check if the value in VAL with mode MODE can be created using special
11816 instruction sequences. */
11818 bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
11820 return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
11823 bool
11824 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11826 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11829 /* Return true if VEC is a constant in which every element is in the range
11830 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11832 static bool
11833 aarch64_const_vec_all_in_range_p (rtx vec,
11834 HOST_WIDE_INT minval,
11835 HOST_WIDE_INT maxval)
11837 if (!CONST_VECTOR_P (vec)
11838 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11839 return false;
11841 int nunits;
11842 if (!CONST_VECTOR_STEPPED_P (vec))
11843 nunits = const_vector_encoded_nelts (vec);
11844 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11845 return false;
11847 for (int i = 0; i < nunits; i++)
11849 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11850 if (!CONST_INT_P (vec_elem)
11851 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11852 return false;
11854 return true;
11857 /* N Z C V. */
11858 #define AARCH64_CC_V 1
11859 #define AARCH64_CC_C (1 << 1)
11860 #define AARCH64_CC_Z (1 << 2)
11861 #define AARCH64_CC_N (1 << 3)
11863 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11864 static const int aarch64_nzcv_codes[] =
11866 0, /* EQ, Z == 1. */
11867 AARCH64_CC_Z, /* NE, Z == 0. */
11868 0, /* CS, C == 1. */
11869 AARCH64_CC_C, /* CC, C == 0. */
11870 0, /* MI, N == 1. */
11871 AARCH64_CC_N, /* PL, N == 0. */
11872 0, /* VS, V == 1. */
11873 AARCH64_CC_V, /* VC, V == 0. */
11874 0, /* HI, C == 1 && Z == 0. */
11875 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11876 AARCH64_CC_V, /* GE, N == V. */
11877 0, /* LT, N != V. */
11878 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11879 0, /* LE, !(Z == 0 && N == V). */
11880 0, /* AL, Any. */
11881 0 /* NV, Any. */
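/* Note that each entry above is a flag state under which the indexed
   condition is false: setting Z falsifies NE, setting C (with Z clear)
   falsifies LS, N == V == 0 falsifies LT, and so on.  The value is printed
   by the 'k' operand code as the #nzcv immediate of a CCMP, which supplies
   the flags when the conditional compare's condition does not hold.  */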
11884 /* Print floating-point vector immediate operand X to F, negating it
11885 first if NEGATE is true. Return true on success, false if it isn't
11886 a constant we can handle. */
11888 static bool
11889 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11891 rtx elt;
11893 if (!const_vec_duplicate_p (x, &elt))
11894 return false;
11896 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11897 if (negate)
11898 r = real_value_negate (&r);
11900 /* Handle the SVE single-bit immediates specially, since they have a
11901 fixed form in the assembly syntax. */
11902 if (real_equal (&r, &dconst0))
11903 asm_fprintf (f, "0.0");
11904 else if (real_equal (&r, &dconst2))
11905 asm_fprintf (f, "2.0");
11906 else if (real_equal (&r, &dconst1))
11907 asm_fprintf (f, "1.0");
11908 else if (real_equal (&r, &dconsthalf))
11909 asm_fprintf (f, "0.5");
11910 else
11912 const int buf_size = 20;
11913 char float_buf[buf_size] = {'\0'};
11914 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11915 1, GET_MODE (elt));
11916 asm_fprintf (f, "%s", float_buf);
11919 return true;
11922 /* Return the equivalent letter for size. */
11923 static char
11924 sizetochar (int size)
11926 switch (size)
11928 case 64: return 'd';
11929 case 32: return 's';
11930 case 16: return 'h';
11931 case 8 : return 'b';
11932 default: gcc_unreachable ();
11936 /* Print operand X to file F in a target specific manner according to CODE.
11937 The acceptable formatting commands given by CODE are:
11938 'c': An integer or symbol address without a preceding #
11939 sign.
11940 'C': Take the duplicated element in a vector constant
11941 and print it in hex.
11942 'D': Take the duplicated element in a vector constant
11943 and print it as an unsigned integer, in decimal.
11944 'e': Print the sign/zero-extend size as a character 8->b,
11945 16->h, 32->w. Can also be used for masks:
11946 0xff->b, 0xffff->h, 0xffffffff->w.
11947 'I': If the operand is a duplicated vector constant,
11948 replace it with the duplicated scalar. If the
11949 operand is then a floating-point constant, replace
11950 it with the integer bit representation. Print the
11951 transformed constant as a signed decimal number.
11952 'p': Prints N such that 2^N == X (X must be power of 2 and
11953 const int).
11954 'P': Print the number of non-zero bits in X (a const_int).
11955 'H': Print the higher numbered register of a pair (TImode)
11956 of regs.
11957 'm': Print a condition (eq, ne, etc).
11958 'M': Same as 'm', but invert condition.
11959 'N': Take the duplicated element in a vector constant
11960 and print the negative of it in decimal.
11961 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11962 'Z': Same for SVE registers. ('z' was already taken.)
11963 Note that it is not necessary to use %Z for operands
11964 that have SVE modes. The convention is to use %Z
11965 only for non-SVE (or potentially non-SVE) modes.
11966 'S/T/U/V': Print a FP/SIMD register name for a register list.
11967 The register printed is the FP/SIMD register name
11968 of X + 0/1/2/3 for S/T/U/V.
11969 'R': Print a scalar Integer/FP/SIMD register name + 1.
11970 'X': Print bottom 16 bits of integer constant in hex.
11971 'w/x': Print a general register name or the zero register
11972 (32-bit or 64-bit).
11973 '0': Print a normal operand, if it's a general register,
11974 then we assume DImode.
11975 'k': Print NZCV for conditional compare instructions.
11976 'K': Print a predicate register as pn<N> rather than p<N>
11977 'A': Output address constant representing the first
11978 argument of X, specifying a relocation offset
11979 if appropriate.
11980 'L': Output constant address specified by X
11981 with a relocation offset if appropriate.
11982 'G': Prints address of X, specifying a PC relative
11983 relocation mode if appropriate.
11984 'y': Output address of LDP or STP - this is used for
11985 some LDP/STPs which don't use a PARALLEL in their
11986 pattern (so the mode needs to be adjusted).
11987 'z': Output address of a typical LDP or STP. */
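/* As a hypothetical illustration of some of the codes above, with operand 0
   being (reg:DI x0) and operand 1 being (const_int 0x12345):
     %w0 prints "w0", %x0 prints "x0",
     %H0 prints "x1" (the high half of an x0/x1 pair),
     %X1 prints "0x2345" (bottom 16 bits in hex),
     %P1 prints "7" (number of set bits).  */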
11989 static void
11990 aarch64_print_operand (FILE *f, rtx x, int code)
11992 rtx elt;
11993 switch (code)
11995 case 'c':
11996 if (CONST_INT_P (x))
11997 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11998 else
12000 poly_int64 offset;
12001 rtx base = strip_offset_and_salt (x, &offset);
12002 if (SYMBOL_REF_P (base))
12003 output_addr_const (f, x);
12004 else
12005 output_operand_lossage ("unsupported operand for code '%c'", code);
12007 break;
12009 case 'e':
12011 x = unwrap_const_vec_duplicate (x);
12012 if (!CONST_INT_P (x))
12014 output_operand_lossage ("invalid operand for '%%%c'", code);
12015 return;
12018 HOST_WIDE_INT val = INTVAL (x);
12019 if ((val & ~7) == 8 || val == 0xff)
12020 fputc ('b', f);
12021 else if ((val & ~7) == 16 || val == 0xffff)
12022 fputc ('h', f);
12023 else if ((val & ~7) == 32 || val == 0xffffffff)
12024 fputc ('w', f);
12025 else
12027 output_operand_lossage ("invalid operand for '%%%c'", code);
12028 return;
12031 break;
12033 case 'p':
12035 int n;
12037 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
12039 output_operand_lossage ("invalid operand for '%%%c'", code);
12040 return;
12043 asm_fprintf (f, "%d", n);
12045 break;
12047 case 'P':
12048 if (!CONST_INT_P (x))
12050 output_operand_lossage ("invalid operand for '%%%c'", code);
12051 return;
12054 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
12055 break;
12057 case 'H':
12058 if (x == const0_rtx)
12060 asm_fprintf (f, "xzr");
12061 break;
12064 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
12066 output_operand_lossage ("invalid operand for '%%%c'", code);
12067 return;
12070 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
12071 break;
12073 case 'I':
12075 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
12076 if (CONST_INT_P (x))
12077 asm_fprintf (f, "%wd", INTVAL (x));
12078 else
12080 output_operand_lossage ("invalid operand for '%%%c'", code);
12081 return;
12083 break;
12086 case 'M':
12087 case 'm':
12089 int cond_code;
12090 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12091 if (x == const_true_rtx)
12093 if (code == 'M')
12094 fputs ("nv", f);
12095 return;
12098 if (!COMPARISON_P (x))
12100 output_operand_lossage ("invalid operand for '%%%c'", code);
12101 return;
12104 cond_code = aarch64_get_condition_code (x);
12105 gcc_assert (cond_code >= 0);
12106 if (code == 'M')
12107 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
12108 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
12109 fputs (aarch64_sve_condition_codes[cond_code], f);
12110 else
12111 fputs (aarch64_condition_codes[cond_code], f);
12113 break;
12115 case 'N':
12116 if (!const_vec_duplicate_p (x, &elt))
12118 output_operand_lossage ("invalid vector constant");
12119 return;
12122 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12123 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12124 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12125 && aarch64_print_vector_float_operand (f, x, true))
12127 else
12129 output_operand_lossage ("invalid vector constant");
12130 return;
12132 break;
12134 case 'b':
12135 case 'h':
12136 case 's':
12137 case 'd':
12138 case 'q':
12139 case 'Z':
12140 code = TOLOWER (code);
12141 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12143 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12144 return;
12146 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12147 break;
12149 case 'S':
12150 case 'T':
12151 case 'U':
12152 case 'V':
12153 if (!REG_P (x) || (!FP_REGNUM_P (REGNO (x)) && !PR_REGNUM_P (REGNO (x))))
12155 output_operand_lossage ("incompatible operand for '%%%c'", code);
12156 return;
12158 if (PR_REGNUM_P (REGNO (x)))
12159 asm_fprintf (f, "p%d", REGNO (x) - P0_REGNUM + (code - 'S'));
12160 else
12161 asm_fprintf (f, "%c%d",
12162 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12163 REGNO (x) - V0_REGNUM + (code - 'S'));
12164 break;
12166 case 'R':
12167 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12168 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12169 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12170 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12171 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12172 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12173 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12174 else
12175 output_operand_lossage ("incompatible register operand for '%%%c'",
12176 code);
12177 break;
12179 case 'X':
12180 if (!CONST_INT_P (x))
12182 output_operand_lossage ("invalid operand for '%%%c'", code);
12183 return;
12185 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12186 break;
12188 case 'C':
12190 /* Print a replicated constant in hex. */
12191 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12193 output_operand_lossage ("invalid operand for '%%%c'", code);
12194 return;
12196 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12197 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12199 break;
12201 case 'D':
12203 /* Print a replicated constant in decimal, treating it as
12204 unsigned. */
12205 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12207 output_operand_lossage ("invalid operand for '%%%c'", code);
12208 return;
12210 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12211 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12213 break;
12215 case 'w':
12216 case 'x':
12217 if (aarch64_const_zero_rtx_p (x))
12219 asm_fprintf (f, "%czr", code);
12220 break;
12223 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12225 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12226 break;
12229 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12231 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12232 break;
12235 /* Fall through */
12237 case 0:
12238 if (x == NULL)
12240 output_operand_lossage ("missing operand");
12241 return;
12244 switch (GET_CODE (x))
12246 case CONST_STRING:
12248 asm_fprintf (f, "%s", XSTR (x, 0));
12249 break;
12251 case REG:
12252 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12254 if (REG_NREGS (x) == 1)
12255 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12256 else
12258 char suffix
12259 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12260 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12261 REGNO (x) - V0_REGNUM, suffix,
12262 END_REGNO (x) - V0_REGNUM - 1, suffix);
12265 else
12266 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12267 break;
12269 case MEM:
12270 output_address (GET_MODE (x), XEXP (x, 0));
12271 break;
12273 case LABEL_REF:
12274 case SYMBOL_REF:
12275 output_addr_const (asm_out_file, x);
12276 break;
12278 case CONST_INT:
12279 asm_fprintf (f, "%wd", INTVAL (x));
12280 break;
12282 case CONST:
12283 if (!VECTOR_MODE_P (GET_MODE (x)))
12285 output_addr_const (asm_out_file, x);
12286 break;
12288 /* fall through */
12290 case CONST_VECTOR:
12291 if (!const_vec_duplicate_p (x, &elt))
12293 output_operand_lossage ("invalid vector constant");
12294 return;
12297 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12298 asm_fprintf (f, "%wd", INTVAL (elt));
12299 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12300 && aarch64_print_vector_float_operand (f, x, false))
12302 else
12304 output_operand_lossage ("invalid vector constant");
12305 return;
12307 break;
12309 case CONST_DOUBLE:
12310 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12311 be getting CONST_DOUBLEs holding integers. */
12312 gcc_assert (GET_MODE (x) != VOIDmode);
12313 if (aarch64_float_const_zero_rtx_p (x))
12315 fputc ('0', f);
12316 break;
12318 else if (aarch64_float_const_representable_p (x))
12320 #define buf_size 20
12321 char float_buf[buf_size] = {'\0'};
12322 real_to_decimal_for_mode (float_buf,
12323 CONST_DOUBLE_REAL_VALUE (x),
12324 buf_size, buf_size,
12325 1, GET_MODE (x));
12326 asm_fprintf (asm_out_file, "%s", float_buf);
12327 break;
12328 #undef buf_size
12330 output_operand_lossage ("invalid constant");
12331 return;
12332 default:
12333 output_operand_lossage ("invalid operand");
12334 return;
12336 break;
12338 case 'A':
12339 if (GET_CODE (x) == HIGH)
12340 x = XEXP (x, 0);
12342 switch (aarch64_classify_symbolic_expression (x))
12344 case SYMBOL_SMALL_GOT_4G:
12345 asm_fprintf (asm_out_file, ":got:");
12346 break;
12348 case SYMBOL_SMALL_TLSGD:
12349 asm_fprintf (asm_out_file, ":tlsgd:");
12350 break;
12352 case SYMBOL_SMALL_TLSDESC:
12353 asm_fprintf (asm_out_file, ":tlsdesc:");
12354 break;
12356 case SYMBOL_SMALL_TLSIE:
12357 asm_fprintf (asm_out_file, ":gottprel:");
12358 break;
12360 case SYMBOL_TLSLE24:
12361 asm_fprintf (asm_out_file, ":tprel:");
12362 break;
12364 case SYMBOL_TINY_GOT:
12365 gcc_unreachable ();
12366 break;
12368 default:
12369 break;
12371 output_addr_const (asm_out_file, x);
12372 break;
12374 case 'L':
12375 switch (aarch64_classify_symbolic_expression (x))
12377 case SYMBOL_SMALL_GOT_4G:
12378 asm_fprintf (asm_out_file, ":got_lo12:");
12379 break;
12381 case SYMBOL_SMALL_TLSGD:
12382 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12383 break;
12385 case SYMBOL_SMALL_TLSDESC:
12386 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12387 break;
12389 case SYMBOL_SMALL_TLSIE:
12390 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12391 break;
12393 case SYMBOL_TLSLE12:
12394 asm_fprintf (asm_out_file, ":tprel_lo12:");
12395 break;
12397 case SYMBOL_TLSLE24:
12398 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12399 break;
12401 case SYMBOL_TINY_GOT:
12402 asm_fprintf (asm_out_file, ":got:");
12403 break;
12405 case SYMBOL_TINY_TLSIE:
12406 asm_fprintf (asm_out_file, ":gottprel:");
12407 break;
12409 default:
12410 break;
12412 output_addr_const (asm_out_file, x);
12413 break;
12415 case 'G':
12416 switch (aarch64_classify_symbolic_expression (x))
12418 case SYMBOL_TLSLE24:
12419 asm_fprintf (asm_out_file, ":tprel_hi12:");
12420 break;
12421 default:
12422 break;
12424 output_addr_const (asm_out_file, x);
12425 break;
12427 case 'k':
12429 HOST_WIDE_INT cond_code;
12431 if (!CONST_INT_P (x))
12433 output_operand_lossage ("invalid operand for '%%%c'", code);
12434 return;
12437 cond_code = INTVAL (x);
12438 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12439 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12441 break;
12443 case 'K':
12444 if (!REG_P (x) || !PR_REGNUM_P (REGNO (x)))
12446 output_operand_lossage ("invalid operand for '%%%c'", code);
12447 return;
12449 asm_fprintf (f, "pn%d", REGNO (x) - P0_REGNUM);
12450 break;
12452 case 'y':
12453 case 'z':
12455 machine_mode mode = GET_MODE (x);
12457 if (!MEM_P (x)
12458 || (code == 'y'
12459 && maybe_ne (GET_MODE_SIZE (mode), 8)
12460 && maybe_ne (GET_MODE_SIZE (mode), 16)
12461 && maybe_ne (GET_MODE_SIZE (mode), 32)))
12463 output_operand_lossage ("invalid operand for '%%%c'", code);
12464 return;
12467 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12468 code == 'y'
12469 ? ADDR_QUERY_LDP_STP_N
12470 : ADDR_QUERY_LDP_STP))
12471 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12473 break;
12475 default:
12476 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12477 return;
12481 /* Print address 'x' of a memory access with mode 'mode'.
12482 'type' is the aarch64_addr_query_type context required by
12483 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
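/* For illustration (register numbers and symbol hypothetical), the forms
   emitted below include "[x0]" and "[x0, 16]" for ADDRESS_REG_IMM,
   "[x0, #2, mul vl]" for SVE offsets, "[x0, x1, lsl 3]" for ADDRESS_REG_REG,
   "[x0, w1, sxtw 2]" for extended offsets, "[x0, 16]!" and "[x0], 16" for
   writeback, and "[x0, #:lo12:sym]" for ADDRESS_LO_SUM. */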
12484 static bool
12485 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12486 aarch64_addr_query_type type)
12488 struct aarch64_address_info addr;
12489 unsigned int size, vec_flags;
12491 /* Check all addresses are Pmode - including ILP32. */
12492 if (GET_MODE (x) != Pmode
12493 && (!CONST_INT_P (x)
12494 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12496 output_operand_lossage ("invalid address mode");
12497 return false;
12500 const bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
12501 || type == ADDR_QUERY_LDP_STP_N);
12503 if (aarch64_classify_address (&addr, x, mode, true, type))
12504 switch (addr.type)
12506 case ADDRESS_REG_IMM:
12507 if (known_eq (addr.const_offset, 0))
12509 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12510 return true;
12513 vec_flags = aarch64_classify_vector_mode (mode);
12514 if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
12516 HOST_WIDE_INT vnum
12517 = exact_div (addr.const_offset,
12518 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12519 asm_fprintf (f, "[%s, #%wd, mul vl]",
12520 reg_names[REGNO (addr.base)], vnum);
12521 return true;
12524 if (!CONST_INT_P (addr.offset))
12525 return false;
12527 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12528 INTVAL (addr.offset));
12529 return true;
12531 case ADDRESS_REG_REG:
12532 if (addr.shift == 0)
12533 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12534 reg_names [REGNO (addr.offset)]);
12535 else
12536 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12537 reg_names [REGNO (addr.offset)], addr.shift);
12538 return true;
12540 case ADDRESS_REG_UXTW:
12541 if (addr.shift == 0)
12542 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12543 REGNO (addr.offset) - R0_REGNUM);
12544 else
12545 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12546 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12547 return true;
12549 case ADDRESS_REG_SXTW:
12550 if (addr.shift == 0)
12551 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12552 REGNO (addr.offset) - R0_REGNUM);
12553 else
12554 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12555 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12556 return true;
12558 case ADDRESS_REG_WB:
12559 /* Writeback is only supported for fixed-width modes. */
12560 size = GET_MODE_SIZE (mode).to_constant ();
12561 switch (GET_CODE (x))
12563 case PRE_INC:
12564 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12565 return true;
12566 case POST_INC:
12567 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12568 return true;
12569 case PRE_DEC:
12570 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12571 return true;
12572 case POST_DEC:
12573 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12574 return true;
12575 case PRE_MODIFY:
12576 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12577 INTVAL (addr.offset));
12578 return true;
12579 case POST_MODIFY:
12580 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12581 INTVAL (addr.offset));
12582 return true;
12583 default:
12584 break;
12586 break;
12588 case ADDRESS_LO_SUM:
12589 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12590 output_addr_const (f, addr.offset);
12591 asm_fprintf (f, "]");
12592 return true;
12594 case ADDRESS_SYMBOLIC:
12595 output_addr_const (f, x);
12596 return true;
12599 return false;
12602 /* Print address 'x' of a memory access with mode 'mode'. */
12603 static void
12604 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12606 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12607 output_addr_const (f, x);
12610 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12612 static bool
12613 aarch64_output_addr_const_extra (FILE *file, rtx x)
12615 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12617 output_addr_const (file, XVECEXP (x, 0, 0));
12618 return true;
12620 return false;
12623 bool
12624 aarch64_label_mentioned_p (rtx x)
12626 const char *fmt;
12627 int i;
12629 if (LABEL_REF_P (x))
12630 return true;
12632 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12633 referencing instruction, but they are constant offsets, not
12634 symbols. */
12635 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12636 return false;
12638 fmt = GET_RTX_FORMAT (GET_CODE (x));
12639 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12641 if (fmt[i] == 'E')
12643 int j;
12645 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12646 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12647 return 1;
12649 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12650 return 1;
12653 return 0;
12656 /* Implement REGNO_REG_CLASS. */
12658 enum reg_class
12659 aarch64_regno_regclass (unsigned regno)
12661 if (W8_W11_REGNUM_P (regno))
12662 return W8_W11_REGS;
12664 if (W12_W15_REGNUM_P (regno))
12665 return W12_W15_REGS;
12667 if (STUB_REGNUM_P (regno))
12668 return STUB_REGS;
12670 if (GP_REGNUM_P (regno))
12671 return GENERAL_REGS;
12673 if (regno == SP_REGNUM)
12674 return STACK_REG;
12676 if (regno == FRAME_POINTER_REGNUM
12677 || regno == ARG_POINTER_REGNUM)
12678 return POINTER_REGS;
12680 if (FP_REGNUM_P (regno))
12681 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12682 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12684 if (PR_REGNUM_P (regno))
12685 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12687 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12688 return FFR_REGS;
12690 if (FAKE_REGNUM_P (regno))
12691 return FAKE_REGS;
12693 return NO_REGS;
12696 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12697 If OFFSET is out of range, return an offset of an anchor point
12698 that is in range. Return 0 otherwise. */
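/* For example (illustrative values): a 4-byte access at offset 0x1f3 is not
   a multiple of its size, so the anchor returned is
   (0x1f3 + 0x100) & ~0x1ff = 0x200, leaving a residual offset of -0xd that
   fits the -256...255 range of unscaled addressing. */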
12700 static HOST_WIDE_INT
12701 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12702 machine_mode mode)
12704 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12705 if (size > 16)
12706 return (offset + 0x400) & ~0x7f0;
12708 /* For offsets that aren't a multiple of the access size, the limit is
12709 -256...255. */
12710 if (offset & (size - 1))
12712 /* BLKmode typically uses LDP of X-registers. */
12713 if (mode == BLKmode)
12714 return (offset + 512) & ~0x3ff;
12715 return (offset + 0x100) & ~0x1ff;
12718 /* Small negative offsets are supported. */
12719 if (IN_RANGE (offset, -256, 0))
12720 return 0;
12722 if (mode == TImode || mode == TFmode || mode == TDmode)
12723 return (offset + 0x100) & ~0x1ff;
12725 /* Use a 12-bit offset scaled by the access size. */
12726 return offset & (~0xfff * size);
12729 static rtx
12730 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12732 #if TARGET_PECOFF
12733 rtx tmp = legitimize_pe_coff_symbol (x, true);
12734 if (tmp)
12735 return tmp;
12736 #endif
12738 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12739 where mask is selected by alignment and size of the offset.
12740 We try to pick as large a range for the offset as possible to
12741 maximize the chance of a CSE. However, for aligned addresses
12742 we limit the range to 4k so that structures with different sized
12743 elements are likely to use the same base. We need to be careful
12744 not to split a CONST for some forms of address expression, otherwise
12745 it will generate sub-optimal code. */
12747 /* First split X + CONST (base, offset) into (base + X) + offset. */
12748 if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
12750 poly_int64 offset;
12751 rtx base = strip_offset (XEXP (x, 1), &offset);
12753 base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
12754 NULL_RTX, true, OPTAB_DIRECT);
12755 x = plus_constant (Pmode, base, offset);
12758 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12760 rtx base = XEXP (x, 0);
12761 rtx offset_rtx = XEXP (x, 1);
12762 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12764 if (GET_CODE (base) == PLUS)
12766 rtx op0 = XEXP (base, 0);
12767 rtx op1 = XEXP (base, 1);
12769 /* Force any scaling into a temp for CSE. */
12770 op0 = force_reg (Pmode, op0);
12771 op1 = force_reg (Pmode, op1);
12773 /* Let the pointer register be in op0. */
12774 if (REG_POINTER (op1))
12775 std::swap (op0, op1);
12777 /* If the pointer is virtual or frame related, then we know that
12778 virtual register instantiation or register elimination is going
12779 to apply a second constant. We want the two constants folded
12780 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12781 if (virt_or_elim_regno_p (REGNO (op0)))
12783 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12784 NULL_RTX, true, OPTAB_DIRECT);
12785 return gen_rtx_PLUS (Pmode, base, op1);
12788 /* Otherwise, in order to encourage CSE (and thence loop strength
12789 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12790 base = expand_binop (Pmode, add_optab, op0, op1,
12791 NULL_RTX, true, OPTAB_DIRECT);
12792 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12795 HOST_WIDE_INT size;
12796 if (GET_MODE_SIZE (mode).is_constant (&size))
12798 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12799 mode);
12800 if (base_offset != 0)
12802 base = plus_constant (Pmode, base, base_offset);
12803 base = force_operand (base, NULL_RTX);
12804 return plus_constant (Pmode, base, offset - base_offset);
12809 return x;
12812 static reg_class_t
12813 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12814 reg_class_t rclass,
12815 machine_mode mode,
12816 secondary_reload_info *sri)
12818 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12819 LDR and STR. See the comment at the head of aarch64-sve.md for
12820 more details about the big-endian handling. */
12821 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12822 if (reg_class_subset_p (rclass, FP_REGS)
12823 && !((REG_P (x) && HARD_REGISTER_P (x))
12824 || aarch64_simd_valid_immediate (x, NULL))
12825 && mode != VNx16QImode
12826 && (vec_flags & VEC_SVE_DATA)
12827 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12829 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12830 return NO_REGS;
12833 /* If we have to disable direct literal pool loads and stores because the
12834 function is too big, then we need a scratch register. */
12835 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12836 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12837 || targetm.vector_mode_supported_p (GET_MODE (x)))
12838 && !aarch64_pcrelative_literal_loads)
12840 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12841 return NO_REGS;
12844 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12845 Q register to a Q register directly. We need a scratch. */
12846 if (REG_P (x)
12847 && (mode == TFmode
12848 || mode == TImode
12849 || mode == TDmode
12850 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12851 && mode == GET_MODE (x)
12852 && !TARGET_SIMD
12853 && FP_REGNUM_P (REGNO (x))
12854 && reg_class_subset_p (rclass, FP_REGS))
12856 sri->icode = code_for_aarch64_reload_mov (mode);
12857 return NO_REGS;
12860 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12861 because AArch64 has richer addressing modes for LDR/STR instructions
12862 than LDP/STP instructions. */
12863 if (TARGET_FLOAT && rclass == GENERAL_REGS
12864 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12865 return FP_REGS;
12867 if (rclass == FP_REGS
12868 && (mode == TImode || mode == TFmode || mode == TDmode)
12869 && CONSTANT_P(x))
12870 return GENERAL_REGS;
12872 return NO_REGS;
12875 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12877 static bool
12878 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12879 reg_class_t class2)
12881 if (!TARGET_SIMD
12882 && reg_classes_intersect_p (class1, FP_REGS)
12883 && reg_classes_intersect_p (class2, FP_REGS))
12885 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12886 so we can't easily split a move involving tuples of 128-bit
12887 vectors. Force the copy through memory instead.
12889 (Tuples of 64-bit vectors are fine.) */
12890 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12891 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12892 return true;
12894 return false;
12897 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
12899 static bool
12900 aarch64_frame_pointer_required ()
12902 /* If the function needs to record the incoming value of PSTATE.SM,
12903 make sure that the slot is accessible from the frame pointer. */
12904 return aarch64_need_old_pstate_sm ();
12907 static bool
12908 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12910 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12912 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12913 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12914 if (frame_pointer_needed)
12915 return to == HARD_FRAME_POINTER_REGNUM;
12916 return true;
12919 poly_int64
12920 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12922 aarch64_frame &frame = cfun->machine->frame;
12924 if (to == HARD_FRAME_POINTER_REGNUM)
12926 if (from == ARG_POINTER_REGNUM)
12927 return frame.bytes_above_hard_fp;
12929 if (from == FRAME_POINTER_REGNUM)
12930 return frame.bytes_above_hard_fp - frame.bytes_above_locals;
12933 if (to == STACK_POINTER_REGNUM)
12935 if (from == FRAME_POINTER_REGNUM)
12936 return frame.frame_size - frame.bytes_above_locals;
12939 return frame.frame_size;
12943 /* Get return address without mangling. */
12946 aarch64_return_addr_rtx (void)
12948 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12949 /* Note: aarch64_return_address_signing_enabled only
12950 works after cfun->machine->frame.laid_out is set,
12951 so here we don't know if the return address will
12952 be signed or not. */
12953 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12954 emit_move_insn (lr, val);
12955 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12956 return lr;
12960 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12961 previous frame. */
12964 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12966 if (count != 0)
12967 return const0_rtx;
12968 return aarch64_return_addr_rtx ();
12971 static void
12972 aarch64_asm_trampoline_template (FILE *f)
12974 /* Even if the current function doesn't have branch protection, some
12975 later function might; since this template is only generated once,
12976 we have to add a BTI just in case. */
12977 asm_fprintf (f, "\thint\t34 // bti c\n");
12979 if (TARGET_ILP32)
12981 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12982 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12984 else
12986 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12987 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12989 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12991 /* We always emit a speculation barrier.
12992 This is because the same trampoline template is used for every nested
12993 function. Since nested functions are not particularly common or
12994 performance-critical, we don't worry too much about the extra instructions
12995 to copy around.
12996 This is not yet a problem, since we have not yet implemented function
12997 specific attributes to choose between hardening against straight line
12998 speculation or not, but such function specific attributes are likely to
12999 happen in the future. */
13000 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
13002 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
13003 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
13006 static void
13007 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
13009 rtx fnaddr, mem, a_tramp;
13010 const int tramp_code_sz = 24;
13012 /* Don't need to copy the trailing D-words; we fill those in below. */
13013 /* We create our own memory address in Pmode so that `emit_block_move` can
13014 use parts of the backend which expect Pmode addresses. */
13015 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
13016 emit_block_move (gen_rtx_MEM (BLKmode, temp),
13017 assemble_trampoline_template (),
13018 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
13019 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
13020 fnaddr = XEXP (DECL_RTL (fndecl), 0);
13021 if (GET_MODE (fnaddr) != ptr_mode)
13022 fnaddr = convert_memory_address (ptr_mode, fnaddr);
13023 emit_move_insn (mem, fnaddr);
13025 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
13026 emit_move_insn (mem, chain_value);
13028 /* XXX We should really define a "clear_cache" pattern and use
13029 gen_clear_cache(). */
13030 a_tramp = XEXP (m_tramp, 0);
13031 maybe_emit_call_builtin___clear_cache (a_tramp,
13032 plus_constant (ptr_mode,
13033 a_tramp,
13034 TRAMPOLINE_SIZE));
13037 static unsigned char
13038 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
13040 /* ??? Logically we should only need to provide a value when
13041 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13042 can hold MODE, but at the moment we need to handle all modes.
13043 Just ignore any runtime parts for registers that can't store them. */
13044 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
13045 unsigned int nregs, vec_flags;
13046 switch (regclass)
13048 case W8_W11_REGS:
13049 case W12_W15_REGS:
13050 case STUB_REGS:
13051 case TAILCALL_ADDR_REGS:
13052 case POINTER_REGS:
13053 case GENERAL_REGS:
13054 case ALL_REGS:
13055 case POINTER_AND_FP_REGS:
13056 case FP_REGS:
13057 case FP_LO_REGS:
13058 case FP_LO8_REGS:
13059 vec_flags = aarch64_classify_vector_mode (mode);
13060 if ((vec_flags & VEC_SVE_DATA)
13061 && constant_multiple_p (GET_MODE_SIZE (mode),
13062 aarch64_vl_bytes (mode, vec_flags), &nregs))
13063 return nregs;
13064 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
13065 return GET_MODE_SIZE (mode).to_constant () / 8;
13066 return (vec_flags & VEC_ADVSIMD
13067 ? CEIL (lowest_size, UNITS_PER_VREG)
13068 : CEIL (lowest_size, UNITS_PER_WORD));
13070 case PR_REGS:
13071 case PR_LO_REGS:
13072 case PR_HI_REGS:
13073 return mode == VNx32BImode ? 2 : 1;
13075 case STACK_REG:
13076 case FFR_REGS:
13077 case PR_AND_FFR_REGS:
13078 case FAKE_REGS:
13079 return 1;
13081 case NO_REGS:
13082 return 0;
13084 default:
13085 break;
13087 gcc_unreachable ();
13090 static reg_class_t
13091 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
13093 if (regclass == POINTER_REGS)
13094 return GENERAL_REGS;
13096 if (regclass == STACK_REG)
13098 if (REG_P(x)
13099 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
13100 return regclass;
13102 return NO_REGS;
13105 /* Register elimination can result in a request for
13106 SP+constant->FP_REGS. We cannot support such operations, which
13107 use SP as source and an FP_REG as destination, so reject them
13108 outright. */
13109 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
13111 rtx lhs = XEXP (x, 0);
13113 /* Look through a possible SUBREG introduced by ILP32. */
13114 if (SUBREG_P (lhs))
13115 lhs = SUBREG_REG (lhs);
13117 gcc_assert (REG_P (lhs));
13118 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
13119 POINTER_REGS));
13120 return NO_REGS;
13123 return regclass;
13126 void
13127 aarch64_asm_output_labelref (FILE* f, const char *name)
13129 asm_fprintf (f, "%U%s", name);
13132 static void
13133 aarch64_elf_asm_constructor (rtx symbol, int priority)
13135 if (priority == DEFAULT_INIT_PRIORITY)
13136 default_ctor_section_asm_out_constructor (symbol, priority);
13137 else
13139 section *s;
13140 /* Priority is known to be in the range [0, 65535], so 18 bytes
13141 would be enough, but the compiler might not know that. To avoid
13142 a -Wformat-truncation false positive, use a larger size. */
13143 char buf[23];
13144 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
13145 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13146 switch_to_section (s);
13147 assemble_align (POINTER_SIZE);
13148 assemble_aligned_integer (POINTER_BYTES, symbol);
13152 static void
13153 aarch64_elf_asm_destructor (rtx symbol, int priority)
13155 if (priority == DEFAULT_INIT_PRIORITY)
13156 default_dtor_section_asm_out_destructor (symbol, priority);
13157 else
13159 section *s;
13160 /* Priority is known to be in the range [0, 65535], so 18 bytes
13161 would be enough, but the compiler might not know that. To avoid
13162 a -Wformat-truncation false positive, use a larger size. */
13163 char buf[23];
13164 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
13165 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13166 switch_to_section (s);
13167 assemble_align (POINTER_SIZE);
13168 assemble_aligned_integer (POINTER_BYTES, symbol);
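/* Output the assembly for a casesi jump-table dispatch: load the table entry
   (byte, half or word, chosen from the ADDR_DIFF_VEC element size), form the
   base with ADR, add the scaled entry and branch, followed by an SLS
   speculation barrier when that mitigation is enabled. */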
13172 const char*
13173 aarch64_output_casesi (rtx *operands)
13175 char buf[100];
13176 char label[100];
13177 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
13178 int index;
13179 static const char *const patterns[4][2] =
13182 "ldrb\t%w3, [%0,%w1,uxtw]",
13183 "add\t%3, %4, %w3, sxtb #2"
13186 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13187 "add\t%3, %4, %w3, sxth #2"
13190 "ldr\t%w3, [%0,%w1,uxtw #2]",
13191 "add\t%3, %4, %w3, sxtw #2"
13193 /* We assume that DImode is only generated when not optimizing and
13194 that we don't really need 64-bit address offsets. That would
13195 imply an object file with 8GB of code in a single function! */
13197 "ldr\t%w3, [%0,%w1,uxtw #2]",
13198 "add\t%3, %4, %w3, sxtw #2"
13202 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13204 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13205 index = exact_log2 (GET_MODE_SIZE (mode));
13207 gcc_assert (index >= 0 && index <= 3);
13209 /* Need to implement table size reduction, by changing the code below. */
13210 output_asm_insn (patterns[index][0], operands);
13211 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13212 snprintf (buf, sizeof (buf),
13213 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13214 output_asm_insn (buf, operands);
13215 output_asm_insn (patterns[index][1], operands);
13216 output_asm_insn ("br\t%3", operands);
13217 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13218 operands);
13219 assemble_label (asm_out_file, label);
13220 return "";
13223 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13224 operand is MASK. */
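/* Illustrative mask values, derived from the tile table below: 0 gives
   "zero\t{}", 0xff gives "zero\t{ za }", 0x55 gives "zero\t{ za0.h }" and
   0x99 gives "zero\t{ za0.s, za3.s }". */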
13225 const char *
13226 aarch64_output_sme_zero_za (rtx mask)
13228 auto mask_val = UINTVAL (mask);
13229 if (mask_val == 0)
13230 return "zero\t{}";
13232 if (mask_val == 0xff)
13233 return "zero\t{ za }";
13235 static constexpr struct { unsigned char mask; char letter; } tiles[] = {
13236 { 0xff, 'b' },
13237 { 0x55, 'h' },
13238 { 0x11, 's' },
13239 { 0x01, 'd' }
13241 /* The last entry in the list has the form "za7.d }", but that's the
13242 same length as "za7.d, ". */
13243 static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13244 for (auto &tile : tiles)
13246 unsigned int tile_mask = tile.mask;
13247 unsigned int tile_index = 0;
13248 unsigned int i = snprintf (buffer, sizeof (buffer), "zero\t");
13249 const char *prefix = "{ ";
13250 auto remaining_mask = mask_val;
13251 while (tile_mask < 0x100)
13253 if ((remaining_mask & tile_mask) == tile_mask)
13255 i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
13256 prefix, tile_index, tile.letter);
13257 prefix = ", ";
13258 remaining_mask &= ~tile_mask;
13260 tile_mask <<= 1;
13261 tile_index += 1;
13263 if (remaining_mask == 0)
13265 gcc_assert (i + 3 <= sizeof (buffer));
13266 snprintf (buffer + i, sizeof (buffer) - i, " }");
13267 return buffer;
13270 gcc_unreachable ();
13273 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13274 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13275 operator. */
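/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, since 0x3fc == 0xff << 2:
   the mask keeps exactly the low byte of the pre-shift value, matching a
   UXTB extend combined with a left shift of 2. */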
13278 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13280 if (shift >= 0 && shift <= 4)
13282 int size;
13283 for (size = 8; size <= 32; size *= 2)
13285 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13286 if (mask == bits << shift)
13287 return size;
13290 return 0;
13293 /* Constant pools are per-function only when PC-relative
13294 literal loads are enabled or we are using the large memory
13295 model. */
13297 static inline bool
13298 aarch64_can_use_per_function_literal_pools_p (void)
13300 return (aarch64_pcrelative_literal_loads
13301 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13304 static bool
13305 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13307 /* We can't use blocks for constants when we're using a per-function
13308 constant pool. */
13309 return !aarch64_can_use_per_function_literal_pools_p ();
13312 /* Select appropriate section for constants depending
13313 on where we place literal pools. */
13315 static section *
13316 aarch64_select_rtx_section (machine_mode mode,
13317 rtx x,
13318 unsigned HOST_WIDE_INT align)
13320 if (aarch64_can_use_per_function_literal_pools_p ())
13321 return function_section (current_function_decl);
13323 return default_elf_select_rtx_section (mode, x, align);
13326 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13327 void
13328 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13329 HOST_WIDE_INT offset)
13331 /* When using per-function literal pools, we must ensure that any code
13332 section is aligned to the minimal instruction length, lest we get
13333 errors from the assembler re "unaligned instructions". */
13334 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13335 ASM_OUTPUT_ALIGN (f, 2);
13338 /* Costs. */
13340 /* Helper function for rtx cost calculation. Strip a shift expression
13341 from X. Returns the inner operand if successful, or the original
13342 expression on failure. */
13343 static rtx
13344 aarch64_strip_shift (rtx x)
13346 rtx op = x;
13348 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13349 we can convert both to ROR during final output. */
13350 if ((GET_CODE (op) == ASHIFT
13351 || GET_CODE (op) == ASHIFTRT
13352 || GET_CODE (op) == LSHIFTRT
13353 || GET_CODE (op) == ROTATERT
13354 || GET_CODE (op) == ROTATE)
13355 && CONST_INT_P (XEXP (op, 1)))
13356 return XEXP (op, 0);
13358 if (GET_CODE (op) == MULT
13359 && CONST_INT_P (XEXP (op, 1))
13360 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13361 return XEXP (op, 0);
13363 return x;
13366 /* Helper function for rtx cost calculation. Strip an extend
13367 expression from X. Returns the inner operand if successful, or the
13368 original expression on failure. We deal with a number of possible
13369 canonicalization variations here. If STRIP_SHIFT is true, then
13370 we can strip off a shift also. */
13371 static rtx
13372 aarch64_strip_extend (rtx x, bool strip_shift)
13374 scalar_int_mode mode;
13375 rtx op = x;
13377 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13378 return op;
13380 if (GET_CODE (op) == AND
13381 && GET_CODE (XEXP (op, 0)) == MULT
13382 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13383 && CONST_INT_P (XEXP (op, 1))
13384 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13385 INTVAL (XEXP (op, 1))) != 0)
13386 return XEXP (XEXP (op, 0), 0);
13388 /* Now handle extended register, as this may also have an optional
13389 left shift by 1..4. */
13390 if (strip_shift
13391 && GET_CODE (op) == ASHIFT
13392 && CONST_INT_P (XEXP (op, 1))
13393 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13394 op = XEXP (op, 0);
13396 if (GET_CODE (op) == ZERO_EXTEND
13397 || GET_CODE (op) == SIGN_EXTEND)
13398 op = XEXP (op, 0);
13400 if (op != x)
13401 return op;
13403 return x;
13406 /* Helper function for rtx cost calculation. Strip extension as well as any
13407 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13408 successful, or the original expression on failure. */
13409 static rtx
13410 aarch64_strip_extend_vec_half (rtx x)
13412 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13414 x = XEXP (x, 0);
13415 if (GET_CODE (x) == VEC_SELECT
13416 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13417 XEXP (x, 1)))
13418 x = XEXP (x, 0);
13420 return x;
13423 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13424 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13425 operand if successful, or the original expression on failure. */
13426 static rtx
13427 aarch64_strip_duplicate_vec_elt (rtx x)
13429 if (GET_CODE (x) == VEC_DUPLICATE
13430 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13432 x = XEXP (x, 0);
13433 if (GET_CODE (x) == VEC_SELECT)
13434 x = XEXP (x, 0);
13435 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13436 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13437 x = XEXP (XEXP (x, 0), 0);
13439 return x;
13442 /* Return true iff CODE is a shift supported in combination
13443 with arithmetic instructions. */
13445 static bool
13446 aarch64_shift_p (enum rtx_code code)
13448 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13452 /* Return true iff X is a cheap shift without a sign extend. */
13454 static bool
13455 aarch64_cheap_mult_shift_p (rtx x)
13457 rtx op0, op1;
13459 op0 = XEXP (x, 0);
13460 op1 = XEXP (x, 1);
13462 if (!(aarch64_tune_params.extra_tuning_flags
13463 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13464 return false;
13466 if (GET_CODE (op0) == SIGN_EXTEND)
13467 return false;
13469 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13470 && UINTVAL (op1) <= 4)
13471 return true;
13473 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13474 return false;
13476 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13478 if (l2 > 0 && l2 <= 4)
13479 return true;
13481 return false;
13484 /* Helper function for rtx cost calculation. Calculate the cost of
13485 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13486 Return the calculated cost of the expression, recursing manually in to
13487 operands where needed. */
13489 static int
13490 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13492 rtx op0, op1;
13493 const struct cpu_cost_table *extra_cost
13494 = aarch64_tune_params.insn_extra_cost;
13495 int cost = 0;
13496 bool compound_p = (outer == PLUS || outer == MINUS);
13497 machine_mode mode = GET_MODE (x);
13499 gcc_checking_assert (code == MULT);
13501 op0 = XEXP (x, 0);
13502 op1 = XEXP (x, 1);
13504 if (VECTOR_MODE_P (mode))
13506 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13507 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13509 /* The select-operand-high-half versions of the instruction have the
13510 same cost as the three vector version - don't add the costs of the
13511 extension or selection into the costs of the multiply. */
13512 op0 = aarch64_strip_extend_vec_half (op0);
13513 op1 = aarch64_strip_extend_vec_half (op1);
13514 /* The by-element versions of the instruction have the same costs as
13515 the normal 3-vector version. We make an assumption that the input
13516 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13517 costing of a MUL by element pre RA is a bit optimistic. */
13518 op0 = aarch64_strip_duplicate_vec_elt (op0);
13519 op1 = aarch64_strip_duplicate_vec_elt (op1);
13521 cost += rtx_cost (op0, mode, MULT, 0, speed);
13522 cost += rtx_cost (op1, mode, MULT, 1, speed);
13523 if (speed)
13525 if (GET_CODE (x) == MULT)
13526 cost += extra_cost->vect.mult;
13527 /* This is to catch the SSRA costing currently flowing here. */
13528 else
13529 cost += extra_cost->vect.alu;
13531 return cost;
13534 /* Integer multiply/fma. */
13535 if (GET_MODE_CLASS (mode) == MODE_INT)
13537 /* The multiply will be canonicalized as a shift, cost it as such. */
13538 if (aarch64_shift_p (GET_CODE (x))
13539 || (CONST_INT_P (op1)
13540 && exact_log2 (INTVAL (op1)) > 0))
13542 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13543 || GET_CODE (op0) == SIGN_EXTEND;
13544 if (speed)
13546 if (compound_p)
13548 /* If the shift is considered cheap,
13549 then don't add any cost. */
13550 if (aarch64_cheap_mult_shift_p (x))
13552 else if (REG_P (op1))
13553 /* ARITH + shift-by-register. */
13554 cost += extra_cost->alu.arith_shift_reg;
13555 else if (is_extend)
13556 /* ARITH + extended register. We don't have a cost field
13557 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13558 cost += extra_cost->alu.extend_arith;
13559 else
13560 /* ARITH + shift-by-immediate. */
13561 cost += extra_cost->alu.arith_shift;
13563 else
13564 /* LSL (immediate). */
13565 cost += extra_cost->alu.shift;
13568 /* Strip extends as we will have costed them in the case above. */
13569 if (is_extend)
13570 op0 = aarch64_strip_extend (op0, true);
13572 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13574 return cost;
13577 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13578 compound and let the below cases handle it. After all, MNEG is a
13579 special-case alias of MSUB. */
13580 if (GET_CODE (op0) == NEG)
13582 op0 = XEXP (op0, 0);
13583 compound_p = true;
13586 /* Integer multiplies or FMAs have zero/sign extending variants. */
13587 if ((GET_CODE (op0) == ZERO_EXTEND
13588 && GET_CODE (op1) == ZERO_EXTEND)
13589 || (GET_CODE (op0) == SIGN_EXTEND
13590 && GET_CODE (op1) == SIGN_EXTEND))
13592 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13593 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13595 if (speed)
13597 if (compound_p)
13598 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13599 cost += extra_cost->mult[0].extend_add;
13600 else
13601 /* MUL/SMULL/UMULL. */
13602 cost += extra_cost->mult[0].extend;
13605 return cost;
13608 /* This is either an integer multiply or a MADD. In both cases
13609 we want to recurse and cost the operands. */
13610 cost += rtx_cost (op0, mode, MULT, 0, speed);
13611 cost += rtx_cost (op1, mode, MULT, 1, speed);
13613 if (speed)
13615 if (compound_p)
13616 /* MADD/MSUB. */
13617 cost += extra_cost->mult[mode == DImode].add;
13618 else
13619 /* MUL. */
13620 cost += extra_cost->mult[mode == DImode].simple;
13623 return cost;
13625 else
13627 if (speed)
13629 /* Floating-point FMA/FMUL can also support negations of the
13630 operands, unless the rounding mode is upward or downward in
13631 which case FNMUL is different than FMUL with operand negation. */
13632 bool neg0 = GET_CODE (op0) == NEG;
13633 bool neg1 = GET_CODE (op1) == NEG;
13634 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13636 if (neg0)
13637 op0 = XEXP (op0, 0);
13638 if (neg1)
13639 op1 = XEXP (op1, 0);
13642 if (compound_p)
13643 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13644 cost += extra_cost->fp[mode == DFmode].fma;
13645 else
13646 /* FMUL/FNMUL. */
13647 cost += extra_cost->fp[mode == DFmode].mult;
13650 cost += rtx_cost (op0, mode, MULT, 0, speed);
13651 cost += rtx_cost (op1, mode, MULT, 1, speed);
13652 return cost;
13656 static int
13657 aarch64_address_cost (rtx x,
13658 machine_mode mode,
13659 addr_space_t as ATTRIBUTE_UNUSED,
13660 bool speed)
13662 enum rtx_code c = GET_CODE (x);
13663 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13664 struct aarch64_address_info info;
13665 int cost = 0;
13666 info.shift = 0;
13668 if (!aarch64_classify_address (&info, x, mode, false))
13670 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13672 /* This is a CONST or SYMBOL ref which will be split
13673 in a different way depending on the code model in use.
13674 Cost it through the generic infrastructure. */
13675 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13676 /* Divide through by the cost of one instruction to
13677 bring it to the same units as the address costs. */
13678 cost_symbol_ref /= COSTS_N_INSNS (1);
13679 /* The cost is then the cost of preparing the address,
13680 followed by an immediate (possibly 0) offset. */
13681 return cost_symbol_ref + addr_cost->imm_offset;
13683 else
13685 /* This is most likely a jump table from a case
13686 statement. */
13687 return addr_cost->register_offset;
13691 switch (info.type)
13693 case ADDRESS_LO_SUM:
13694 case ADDRESS_SYMBOLIC:
13695 case ADDRESS_REG_IMM:
13696 cost += addr_cost->imm_offset;
13697 break;
13699 case ADDRESS_REG_WB:
13700 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13701 cost += addr_cost->pre_modify;
13702 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13704 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13705 if (nvectors == 3)
13706 cost += addr_cost->post_modify_ld3_st3;
13707 else if (nvectors == 4)
13708 cost += addr_cost->post_modify_ld4_st4;
13709 else
13710 cost += addr_cost->post_modify;
13712 else
13713 gcc_unreachable ();
13715 break;
13717 case ADDRESS_REG_REG:
13718 cost += addr_cost->register_offset;
13719 break;
13721 case ADDRESS_REG_SXTW:
13722 cost += addr_cost->register_sextend;
13723 break;
13725 case ADDRESS_REG_UXTW:
13726 cost += addr_cost->register_zextend;
13727 break;
13729 default:
13730 gcc_unreachable ();
13734 if (info.shift > 0)
13736 /* For the sake of calculating the cost of the shifted register
13737 component, we can treat same sized modes in the same way. */
13738 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13739 cost += addr_cost->addr_scale_costs.hi;
13740 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13741 cost += addr_cost->addr_scale_costs.si;
13742 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13743 cost += addr_cost->addr_scale_costs.di;
13744 else
13745 /* We can't tell, or this is a 128-bit vector. */
13746 cost += addr_cost->addr_scale_costs.ti;
13749 return cost;
13752 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13753 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13754 to be taken. */
13757 aarch64_branch_cost (bool speed_p, bool predictable_p)
13759 /* When optimizing for speed, use the cost of unpredictable branches. */
13760 const struct cpu_branch_cost *branch_costs =
13761 aarch64_tune_params.branch_costs;
13763 if (!speed_p || predictable_p)
13764 return branch_costs->predictable;
13765 else
13766 return branch_costs->unpredictable;
13769 /* Return true if X is a zero or sign extract
13770 usable in an ADD or SUB (extended register) instruction. */
13771 static bool
13772 aarch64_rtx_arith_op_extract_p (rtx x)
13774 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13775 No shift. */
13776 if (GET_CODE (x) == SIGN_EXTEND
13777 || GET_CODE (x) == ZERO_EXTEND)
13778 return REG_P (XEXP (x, 0));
13780 return false;
13783 static bool
13784 aarch64_frint_unspec_p (unsigned int u)
13786 switch (u)
13788 case UNSPEC_FRINTZ:
13789 case UNSPEC_FRINTP:
13790 case UNSPEC_FRINTM:
13791 case UNSPEC_FRINTA:
13792 case UNSPEC_FRINTN:
13793 case UNSPEC_FRINTX:
13794 case UNSPEC_FRINTI:
13795 return true;
13797 default:
13798 return false;
13802 /* Return true iff X is an rtx that will match an extr instruction
13803 i.e. as described in the *extr<mode>5_insn family of patterns.
13804 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
13805 on success and will be NULL_RTX otherwise. */
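/* For instance, in DImode with illustrative operands X and Y,
   (ior (ashift X (const_int 48)) (lshiftrt Y (const_int 16))) passes the
   check because 48 + 16 == 64; *RES_OP0 is set to X and *RES_OP1 to Y. */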
13807 static bool
13808 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13810 rtx op0, op1;
13811 scalar_int_mode mode;
13812 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13813 return false;
13815 *res_op0 = NULL_RTX;
13816 *res_op1 = NULL_RTX;
13818 if (GET_CODE (x) != IOR)
13819 return false;
13821 op0 = XEXP (x, 0);
13822 op1 = XEXP (x, 1);
13824 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13825 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13827 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13828 if (GET_CODE (op1) == ASHIFT)
13829 std::swap (op0, op1);
13831 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13832 return false;
13834 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13835 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13837 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13838 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13840 *res_op0 = XEXP (op0, 0);
13841 *res_op1 = XEXP (op1, 0);
13842 return true;
13846 return false;
13849 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13850 storing it in *COST. Result is true if the total cost of the operation
13851 has now been calculated. */
13852 static bool
13853 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13855 rtx inner;
13856 rtx comparator;
13857 enum rtx_code cmpcode;
13858 const struct cpu_cost_table *extra_cost
13859 = aarch64_tune_params.insn_extra_cost;
13861 if (COMPARISON_P (op0))
13863 inner = XEXP (op0, 0);
13864 comparator = XEXP (op0, 1);
13865 cmpcode = GET_CODE (op0);
13867 else
13869 inner = op0;
13870 comparator = const0_rtx;
13871 cmpcode = NE;
13874 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13876 /* Conditional branch. */
13877 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13878 return true;
13879 else
13881 if (cmpcode == NE || cmpcode == EQ)
13883 if (comparator == const0_rtx)
13885 /* TBZ/TBNZ/CBZ/CBNZ. */
13886 if (GET_CODE (inner) == ZERO_EXTRACT)
13887 /* TBZ/TBNZ. */
13888 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13889 ZERO_EXTRACT, 0, speed);
13890 else
13891 /* CBZ/CBNZ. */
13892 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13894 return true;
13896 if (register_operand (inner, VOIDmode)
13897 && aarch64_imm24 (comparator, VOIDmode))
13899 /* SUB and SUBS. */
13900 *cost += COSTS_N_INSNS (2);
13901 if (speed)
13902 *cost += extra_cost->alu.arith * 2;
13903 return true;
13906 else if (cmpcode == LT || cmpcode == GE)
13908 /* TBZ/TBNZ. */
13909 if (comparator == const0_rtx)
13910 return true;
13914 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13916 /* CCMP. */
13917 if (GET_CODE (op1) == COMPARE)
13919 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13920 if (XEXP (op1, 1) == const0_rtx)
13921 *cost += 1;
13922 if (speed)
13924 machine_mode mode = GET_MODE (XEXP (op1, 0));
13926 if (GET_MODE_CLASS (mode) == MODE_INT)
13927 *cost += extra_cost->alu.arith;
13928 else
13929 *cost += extra_cost->fp[mode == DFmode].compare;
13931 return true;
13934 /* It's a conditional operation based on the status flags,
13935 so it must be some flavor of CSEL. */
13937 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13938 if (GET_CODE (op1) == NEG
13939 || GET_CODE (op1) == NOT
13940 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13941 op1 = XEXP (op1, 0);
13942 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13944 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13945 op1 = XEXP (op1, 0);
13946 op2 = XEXP (op2, 0);
13948 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13950 inner = XEXP (op1, 0);
13951 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13952 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13953 op1 = XEXP (inner, 0);
13955 else if (op1 == constm1_rtx || op1 == const1_rtx)
13957 /* Use CSINV or CSINC. */
13958 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13959 return true;
13961 else if (op2 == constm1_rtx || op2 == const1_rtx)
13963 /* Use CSINV or CSINC. */
13964 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13965 return true;
13968 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13969 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13970 return true;
13973 /* We don't know what this is; cost all operands. */
13974 return false;
13977 /* Check whether X is a bitfield operation of the form shift + extend that
13978 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13979 operand to which the bitfield operation is applied. Otherwise return
13980 NULL_RTX. */
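/* For example, (sign_extend:DI (ashift:HI R (const_int 3))) returns R here:
   a constant shift of a QImode or HImode value widened to SImode or DImode
   can be implemented with a single SBFIZ/UBFIZ. */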
13982 static rtx
13983 aarch64_extend_bitfield_pattern_p (rtx x)
13985 rtx_code outer_code = GET_CODE (x);
13986 machine_mode outer_mode = GET_MODE (x);
13988 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13989 && outer_mode != SImode && outer_mode != DImode)
13990 return NULL_RTX;
13992 rtx inner = XEXP (x, 0);
13993 rtx_code inner_code = GET_CODE (inner);
13994 machine_mode inner_mode = GET_MODE (inner);
13995 rtx op = NULL_RTX;
13997 switch (inner_code)
13999 case ASHIFT:
14000 if (CONST_INT_P (XEXP (inner, 1))
14001 && (inner_mode == QImode || inner_mode == HImode))
14002 op = XEXP (inner, 0);
14003 break;
14004 case LSHIFTRT:
14005 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
14006 && (inner_mode == QImode || inner_mode == HImode))
14007 op = XEXP (inner, 0);
14008 break;
14009 case ASHIFTRT:
14010 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
14011 && (inner_mode == QImode || inner_mode == HImode))
14012 op = XEXP (inner, 0);
14013 break;
14014 default:
14015 break;
14018 return op;
14021 /* Return true if the mask and a shift amount from an RTX of the form
14022 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
14023 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
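/* For example, in SImode, MASK = 0xf0 with SHFT_AMNT = 4 is accepted: the
   mask shifted right by 4 is the contiguous value 0xf and no mask bits lie
   below the shift amount, so (x << 4) & 0xf0 can become a single UBFIZ of
   width 4. */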
14025 bool
14026 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
14027 rtx shft_amnt)
14029 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
14030 && INTVAL (mask) > 0
14031 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
14032 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
14033 && (UINTVAL (mask)
14034 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
14037 /* Return true if the masks and a shift amount from an RTX of the form
14038 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
14039 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
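/* For example, MASK2 = 0xff00 with SHFT_AMNT = 8 and MASK1 = ~0xff00 is
   accepted: the masks are complementary and 0xff00 + (1 << 8) = 0x10000 is a
   power of two, so the shifted field is the contiguous byte at bits 8-15 and
   a BFI with lsb 8 and width 8 applies. */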
14041 bool
14042 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
14043 unsigned HOST_WIDE_INT mask1,
14044 unsigned HOST_WIDE_INT shft_amnt,
14045 unsigned HOST_WIDE_INT mask2)
14047 unsigned HOST_WIDE_INT t;
14049 /* Verify that there is no overlap in what bits are set in the two masks. */
14050 if (mask1 != ~mask2)
14051 return false;
14053 /* Verify that mask2 is not all zeros or ones. */
14054 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
14055 return false;
14057 /* The shift amount should always be less than the mode size. */
14058 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
14060 /* Verify that the mask being shifted is contiguous and would be in the
14061 least significant bits after shifting by shft_amnt. */
14062 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
14063 return (t == (t & -t));
14066 /* Return true if X is an RTX representing an operation in the ABD family
14067 of instructions. */
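/* That is, X has the form (minus (smax a b) (smin a b)) or its unsigned
   UMAX/UMIN counterpart, which computes the absolute difference |a - b|. */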
14069 static bool
14070 aarch64_abd_rtx_p (rtx x)
14072 if (GET_CODE (x) != MINUS)
14073 return false;
14074 rtx max_arm = XEXP (x, 0);
14075 rtx min_arm = XEXP (x, 1);
14076 if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
14077 return false;
14078 bool signed_p = GET_CODE (max_arm) == SMAX;
14079 if (signed_p && GET_CODE (min_arm) != SMIN)
14080 return false;
14081 else if (!signed_p && GET_CODE (min_arm) != UMIN)
14082 return false;
14084 rtx maxop0 = XEXP (max_arm, 0);
14085 rtx maxop1 = XEXP (max_arm, 1);
14086 rtx minop0 = XEXP (min_arm, 0);
14087 rtx minop1 = XEXP (min_arm, 1);
14088 return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
14091 /* Calculate the cost of calculating X, storing it in *COST. Result
14092 is true if the total cost of the operation has now been calculated. */
14093 static bool
14094 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
14095 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
14097 rtx op0, op1, op2;
14098 const struct cpu_cost_table *extra_cost
14099 = aarch64_tune_params.insn_extra_cost;
14100 rtx_code code = GET_CODE (x);
14101 scalar_int_mode int_mode;
14103 /* By default, assume that everything has equivalent cost to the
14104 cheapest instruction. Any additional costs are applied as a delta
14105 above this default. */
14106 *cost = COSTS_N_INSNS (1);
14108 switch (code)
14110 case SET:
14111 /* The cost depends entirely on the operands to SET. */
14112 *cost = 0;
14113 op0 = SET_DEST (x);
14114 op1 = SET_SRC (x);
14116 switch (GET_CODE (op0))
14118 case MEM:
14119 if (speed)
14121 rtx address = XEXP (op0, 0);
14122 if (VECTOR_MODE_P (mode))
14123 *cost += extra_cost->ldst.storev;
14124 else if (GET_MODE_CLASS (mode) == MODE_INT)
14125 *cost += extra_cost->ldst.store;
14126 else if (mode == SFmode || mode == SDmode)
14127 *cost += extra_cost->ldst.storef;
14128 else if (mode == DFmode || mode == DDmode)
14129 *cost += extra_cost->ldst.stored;
14131 *cost +=
14132 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14133 0, speed));
14136 *cost += rtx_cost (op1, mode, SET, 1, speed);
14137 return true;
14139 case SUBREG:
14140 if (! REG_P (SUBREG_REG (op0)))
14141 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
14143 /* Fall through. */
14144 case REG:
14145 /* The cost is one per vector-register copied. */
14146 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
14148 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
14149 *cost = COSTS_N_INSNS (nregs);
14151 /* const0_rtx is in general free, but we will use an
14152 instruction to set a register to 0. */
14153 else if (REG_P (op1) || op1 == const0_rtx)
14155 /* The cost is 1 per register copied. */
14156 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
14157 *cost = COSTS_N_INSNS (nregs);
14159 else
14160 /* Cost is just the cost of the RHS of the set. */
14161 *cost += rtx_cost (op1, mode, SET, 1, speed);
14162 return true;
14164 case ZERO_EXTRACT:
14165 case SIGN_EXTRACT:
14166 /* Bit-field insertion. Strip any redundant widening of
14167 the RHS to meet the width of the target. */
14168 if (SUBREG_P (op1))
14169 op1 = SUBREG_REG (op1);
14170 if ((GET_CODE (op1) == ZERO_EXTEND
14171 || GET_CODE (op1) == SIGN_EXTEND)
14172 && CONST_INT_P (XEXP (op0, 1))
14173 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
14174 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
14175 op1 = XEXP (op1, 0);
14177 if (CONST_INT_P (op1))
14179 /* MOV immediate is assumed to always be cheap. */
14180 *cost = COSTS_N_INSNS (1);
14182 else
14184 /* BFM. */
14185 if (speed)
14186 *cost += extra_cost->alu.bfi;
14187 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
14190 return true;
14192 default:
14193 /* We can't make sense of this, assume default cost. */
14194 *cost = COSTS_N_INSNS (1);
14195 return false;
14197 return false;
14199 case CONST_INT:
14200 /* If an instruction can incorporate a constant within the
14201 instruction, the instruction's expression avoids calling
14202 rtx_cost() on the constant. If rtx_cost() is called on a
14203 constant, then it is usually because the constant must be
14204 moved into a register by one or more instructions.
14206 The exception is constant 0, which can be expressed
14207 as XZR/WZR and is therefore free. The exception to this is
14208 if we have (set (reg) (const0_rtx)) in which case we must cost
14209 the move. However, we can catch that when we cost the SET, so
14210 we don't need to consider that here. */
14211 if (x == const0_rtx)
14212 *cost = 0;
14213 else
14215 /* To an approximation, building any other constant is
14216 proportionally expensive to the number of instructions
14217 required to build that constant. This is true whether we
14218 are compiling for SPEED or otherwise. */
14219 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
14220 ? SImode : DImode;
14221 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
14222 (NULL_RTX, x, false, imode));
14224 return true;
14226 case CONST_DOUBLE:
14228 /* First determine number of instructions to do the move
14229 as an integer constant. */
14230 if (!aarch64_float_const_representable_p (x)
14231 && !aarch64_can_const_movi_rtx_p (x, mode)
14232 && aarch64_float_const_rtx_p (x))
14234 unsigned HOST_WIDE_INT ival;
14235 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
14236 gcc_assert (succeed);
14238 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
14239 ? DImode : SImode;
14240 int ncost = aarch64_internal_mov_immediate
14241 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
14242 *cost += COSTS_N_INSNS (ncost);
14243 return true;
14246 if (speed)
14248 /* mov[df,sf]_aarch64. */
14249 if (aarch64_float_const_representable_p (x))
14250 /* FMOV (scalar immediate). */
14251 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
14252 else if (!aarch64_float_const_zero_rtx_p (x))
14254 /* This will be a load from memory. */
14255 if (mode == DFmode || mode == DDmode)
14256 *cost += extra_cost->ldst.loadd;
14257 else
14258 *cost += extra_cost->ldst.loadf;
14260 else
14261 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14262 or MOV v0.s[0], wzr - neither of which are modeled by the
14263 cost tables. Just use the default cost. */
14268 return true;
14270 case MEM:
14271 if (speed)
14273 /* For loads we want the base cost of a load, plus an
14274 approximation for the additional cost of the addressing
14275 mode. */
14276 rtx address = XEXP (x, 0);
14277 if (VECTOR_MODE_P (mode))
14278 *cost += extra_cost->ldst.loadv;
14279 else if (GET_MODE_CLASS (mode) == MODE_INT)
14280 *cost += extra_cost->ldst.load;
14281 else if (mode == SFmode || mode == SDmode)
14282 *cost += extra_cost->ldst.loadf;
14283 else if (mode == DFmode || mode == DDmode)
14284 *cost += extra_cost->ldst.loadd;
14286 *cost +=
14287 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14288 0, speed));
14291 return true;
14293 case NEG:
14294 op0 = XEXP (x, 0);
14296 if (VECTOR_MODE_P (mode))
14298 /* Many vector comparison operations are represented as NEG
14299 of a comparison. */
14300 if (COMPARISON_P (op0))
14302 rtx op00 = XEXP (op0, 0);
14303 rtx op01 = XEXP (op0, 1);
14304 machine_mode inner_mode = GET_MODE (op00);
14305 /* FACGE/FACGT. */
14306 if (GET_MODE_CLASS (inner_mode) == MODE_VECTOR_FLOAT
14307 && GET_CODE (op00) == ABS
14308 && GET_CODE (op01) == ABS)
14310 op00 = XEXP (op00, 0);
14311 op01 = XEXP (op01, 0);
14313 *cost += rtx_cost (op00, inner_mode, GET_CODE (op0), 0, speed);
14314 *cost += rtx_cost (op01, inner_mode, GET_CODE (op0), 1, speed);
14315 if (speed)
14316 *cost += extra_cost->vect.alu;
14317 return true;
14319 if (speed)
14321 /* FNEG. */
14322 *cost += extra_cost->vect.alu;
14324 return false;
14327 if (GET_MODE_CLASS (mode) == MODE_INT)
14329 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14330 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14332 /* CSETM. */
14333 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14334 return true;
14337 /* Cost this as SUB wzr, X. */
14338 op0 = CONST0_RTX (mode);
14339 op1 = XEXP (x, 0);
14340 goto cost_minus;
14343 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14345 /* Support (neg(fma...)) as a single instruction only if
14346 sign of zeros is unimportant. This matches the decision
14347 making in aarch64.md. */
14348 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14350 /* FNMADD. */
14351 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14352 return true;
14354 if (GET_CODE (op0) == MULT)
14356 /* FNMUL. */
14357 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14358 return true;
14360 if (speed)
14361 /* FNEG. */
14362 *cost += extra_cost->fp[mode == DFmode].neg;
14363 return false;
14366 return false;
14368 case CLRSB:
14369 case CLZ:
14370 if (speed)
14372 if (VECTOR_MODE_P (mode))
14373 *cost += extra_cost->vect.alu;
14374 else
14375 *cost += extra_cost->alu.clz;
14378 return false;
14380 case CTZ:
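/* Costing sketch: without FEAT_CSSC a scalar CTZ is synthesised as
   RBIT followed by CLZ (two instructions), with CSSC it is a single
   CTZ instruction, and a vector CTZ takes roughly three vector ops;
   the constants below mirror that.  */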
14381 if (VECTOR_MODE_P (mode))
14383 *cost = COSTS_N_INSNS (3);
14384 if (speed)
14385 *cost += extra_cost->vect.alu * 3;
14387 else if (TARGET_CSSC)
14389 *cost = COSTS_N_INSNS (1);
14390 if (speed)
14391 *cost += extra_cost->alu.clz;
14393 else
14395 *cost = COSTS_N_INSNS (2);
14396 if (speed)
14397 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14399 return false;
14401 case COMPARE:
14402 op0 = XEXP (x, 0);
14403 op1 = XEXP (x, 1);
14405 if (op1 == const0_rtx
14406 && GET_CODE (op0) == AND)
14408 x = op0;
14409 mode = GET_MODE (op0);
14410 goto cost_logic;
14413 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14415 /* TODO: A write to the CC flags possibly costs extra; this
14416 needs encoding in the cost tables. */
14418 mode = GET_MODE (op0);
14419 /* ANDS. */
14420 if (GET_CODE (op0) == AND)
14422 x = op0;
14423 goto cost_logic;
14426 if (GET_CODE (op0) == PLUS)
14428 /* ADDS (and CMN alias). */
14429 x = op0;
14430 goto cost_plus;
14433 if (GET_CODE (op0) == MINUS)
14435 /* SUBS. */
14436 x = op0;
14437 goto cost_minus;
14440 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14441 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14442 && CONST_INT_P (XEXP (op0, 2)))
14444 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14445 Handle it here directly rather than going to cost_logic
14446 since we know the immediate generated for the TST is valid
14447 so we can avoid creating an intermediate rtx for it only
14448 for costing purposes. */
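/* For example (shape shown for illustration only):
     (compare (zero_extract (reg:DI x) (const_int 8) (const_int 0))
              (const_int 0))
   corresponds to a TST x, #0xff, so we charge one logical operation
   plus the cost of the extracted register.  */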
14449 if (speed)
14450 *cost += extra_cost->alu.logical;
14452 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14453 ZERO_EXTRACT, 0, speed);
14454 return true;
14457 if (GET_CODE (op1) == NEG)
14459 /* CMN. */
14460 if (speed)
14461 *cost += extra_cost->alu.arith;
14463 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14464 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14465 return true;
14468 /* CMP.
14470 Compare can freely swap the order of operands, and
14471 canonicalization puts the more complex operation first.
14472 But the integer MINUS logic expects the shift/extend
14473 operation in op1. */
14474 if (! (REG_P (op0)
14475 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14477 op0 = XEXP (x, 1);
14478 op1 = XEXP (x, 0);
14480 goto cost_minus;
14483 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14485 /* FCMP. */
14486 if (speed)
14487 *cost += extra_cost->fp[mode == DFmode].compare;
14489 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14491 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14492 /* FCMP supports constant 0.0 for no extra cost. */
14493 return true;
14495 return false;
14498 if (VECTOR_MODE_P (mode))
14500 /* Vector compare. */
14501 if (speed)
14502 *cost += extra_cost->vect.alu;
14504 if (aarch64_float_const_zero_rtx_p (op1))
14506 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14507 cost. */
14508 return true;
14510 return false;
14512 return false;
14514 case MINUS:
14516 op0 = XEXP (x, 0);
14517 op1 = XEXP (x, 1);
14519 cost_minus:
14520 if (VECTOR_MODE_P (mode))
14522 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14523 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14525 /* Recognise the SABD and UABD operation here.
14526 Recursion from the PLUS case will catch the accumulating
14527 forms. */
14528 if (aarch64_abd_rtx_p (x))
14530 if (speed)
14531 *cost += extra_cost->vect.alu;
14532 return true;
14534 /* SUBL2 and SUBW2.
14535 The select-operand-high-half versions of the sub instruction
14536 have the same cost as the regular three vector version -
14537 don't add the costs of the select into the costs of the sub. */
14539 op0 = aarch64_strip_extend_vec_half (op0);
14540 op1 = aarch64_strip_extend_vec_half (op1);
14544 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14546 /* Detect valid immediates. */
14547 if ((GET_MODE_CLASS (mode) == MODE_INT
14548 || (GET_MODE_CLASS (mode) == MODE_CC
14549 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14550 && CONST_INT_P (op1)
14551 && aarch64_uimm12_shift (INTVAL (op1)))
14553 if (speed)
14554 /* SUB(S) (immediate). */
14555 *cost += extra_cost->alu.arith;
14556 return true;
14559 /* Look for SUB (extended register). */
14560 if (is_a <scalar_int_mode> (mode)
14561 && aarch64_rtx_arith_op_extract_p (op1))
14563 if (speed)
14564 *cost += extra_cost->alu.extend_arith;
14566 op1 = aarch64_strip_extend (op1, true);
14567 *cost += rtx_cost (op1, VOIDmode,
14568 (enum rtx_code) GET_CODE (op1), 0, speed);
14569 return true;
14572 rtx new_op1 = aarch64_strip_extend (op1, false);
14574 /* Cost this as an FMA-alike operation. */
14575 if ((GET_CODE (new_op1) == MULT
14576 || aarch64_shift_p (GET_CODE (new_op1)))
14577 && code != COMPARE)
14579 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14580 (enum rtx_code) code,
14581 speed);
14582 return true;
14585 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14587 if (speed)
14589 if (VECTOR_MODE_P (mode))
14591 /* Vector SUB. */
14592 *cost += extra_cost->vect.alu;
14594 else if (GET_MODE_CLASS (mode) == MODE_INT)
14596 /* SUB(S). */
14597 *cost += extra_cost->alu.arith;
14599 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14601 /* FSUB. */
14602 *cost += extra_cost->fp[mode == DFmode].addsub;
14605 return true;
14608 case PLUS:
14610 rtx new_op0;
14612 op0 = XEXP (x, 0);
14613 op1 = XEXP (x, 1);
14615 cost_plus:
14616 if (VECTOR_MODE_P (mode))
14618 /* ADDL2 and ADDW2. */
14619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14620 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14622 /* The select-operand-high-half versions of the add instruction
14623 have the same cost as the regular three vector version -
14624 don't add the costs of the select into the costs of the add. */
14626 op0 = aarch64_strip_extend_vec_half (op0);
14627 op1 = aarch64_strip_extend_vec_half (op1);
14631 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14632 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14634 /* CSINC. */
14635 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14636 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14637 return true;
14640 if (GET_MODE_CLASS (mode) == MODE_INT
14641 && (aarch64_plus_immediate (op1, mode)
14642 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14644 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14646 if (speed)
14648 /* ADD (immediate). */
14649 *cost += extra_cost->alu.arith;
14651 /* Some tunings prefer to not use the VL-based scalar ops.
14652 Increase the cost of the poly immediate to prevent their
14653 formation. */
14654 if (GET_CODE (op1) == CONST_POLY_INT
14655 && (aarch64_tune_params.extra_tuning_flags
14656 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14657 *cost += COSTS_N_INSNS (1);
14659 return true;
14662 if (aarch64_pluslong_immediate (op1, mode))
14664 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14665 if ((INTVAL (op1) & 0xfff) != 0)
14666 *cost += COSTS_N_INSNS (1);
14668 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14669 return true;
14672 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14674 /* Look for ADD (extended register). */
14675 if (is_a <scalar_int_mode> (mode)
14676 && aarch64_rtx_arith_op_extract_p (op0))
14678 if (speed)
14679 *cost += extra_cost->alu.extend_arith;
14681 op0 = aarch64_strip_extend (op0, true);
14682 *cost += rtx_cost (op0, VOIDmode,
14683 (enum rtx_code) GET_CODE (op0), 0, speed);
14684 return true;
14687 /* Strip any extend, leave shifts behind as we will
14688 cost them through mult_cost. */
14689 new_op0 = aarch64_strip_extend (op0, false);
14691 if (GET_CODE (new_op0) == MULT
14692 || aarch64_shift_p (GET_CODE (new_op0)))
14694 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14695 speed);
14696 return true;
14699 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14701 if (speed)
14703 if (VECTOR_MODE_P (mode))
14705 /* Vector ADD. */
14706 *cost += extra_cost->vect.alu;
14708 else if (GET_MODE_CLASS (mode) == MODE_INT)
14710 /* ADD. */
14711 *cost += extra_cost->alu.arith;
14713 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14715 /* FADD. */
14716 *cost += extra_cost->fp[mode == DFmode].addsub;
14719 return true;
14722 case BITREVERSE:
14723 case BSWAP:
14724 *cost = COSTS_N_INSNS (1);
14726 if (speed)
14728 if (VECTOR_MODE_P (mode))
14729 *cost += extra_cost->vect.alu;
14730 else
14731 *cost += extra_cost->alu.rev;
14733 return false;
14735 case IOR:
14736 if (aarch_rev16_p (x))
14738 *cost = COSTS_N_INSNS (1);
14740 if (speed)
14742 if (VECTOR_MODE_P (mode))
14743 *cost += extra_cost->vect.alu;
14744 else
14745 *cost += extra_cost->alu.rev;
14747 return true;
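/* Next, check for the EXTR idiom, roughly
     (ior (ashift a (const_int n)) (lshiftrt b (const_int size - n)))
   where the two shift amounts sum to the mode size; a single EXTR
   (or ROR when a == b) implements this.  aarch64_extr_rtx_p is the
   authoritative check; this is only a sketch of the shape.  */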
14750 if (aarch64_extr_rtx_p (x, &op0, &op1))
14752 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14753 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14754 if (speed)
14755 *cost += extra_cost->alu.shift;
14757 return true;
14759 /* Fall through. */
14760 case XOR:
14761 case AND:
14762 cost_logic:
14763 op0 = XEXP (x, 0);
14764 op1 = XEXP (x, 1);
14766 if (VECTOR_MODE_P (mode))
14768 if (speed)
14769 *cost += extra_cost->vect.alu;
14770 return true;
14773 if (code == AND
14774 && GET_CODE (op0) == MULT
14775 && CONST_INT_P (XEXP (op0, 1))
14776 && CONST_INT_P (op1)
14777 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14778 INTVAL (op1)) != 0)
14780 /* This is a UBFM/SBFM. */
14781 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14782 if (speed)
14783 *cost += extra_cost->alu.bfx;
14784 return true;
14787 if (is_int_mode (mode, &int_mode))
14789 if (CONST_INT_P (op1))
14791 /* We have a mask + shift version of a UBFIZ
14792 i.e. the *andim_ashift<mode>_bfiz pattern. */
14793 if (GET_CODE (op0) == ASHIFT
14794 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14795 XEXP (op0, 1)))
14797 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14798 (enum rtx_code) code, 0, speed);
14799 if (speed)
14800 *cost += extra_cost->alu.bfx;
14802 return true;
14804 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14806 /* We possibly get the immediate for free; this is not
14807 modelled. */
14808 *cost += rtx_cost (op0, int_mode,
14809 (enum rtx_code) code, 0, speed);
14810 if (speed)
14811 *cost += extra_cost->alu.logical;
14813 return true;
14816 else
14818 rtx new_op0 = op0;
14820 /* Handle ORN, EON, or BIC. */
14821 if (GET_CODE (op0) == NOT)
14822 op0 = XEXP (op0, 0);
14824 new_op0 = aarch64_strip_shift (op0);
14826 /* If we had a shift on op0 then this is a logical-shift-
14827 by-register/immediate operation. Otherwise, this is just
14828 a logical operation. */
14829 if (speed)
14831 if (new_op0 != op0)
14833 /* Shift by immediate. */
14834 if (CONST_INT_P (XEXP (op0, 1)))
14835 *cost += extra_cost->alu.log_shift;
14836 else
14837 *cost += extra_cost->alu.log_shift_reg;
14839 else
14840 *cost += extra_cost->alu.logical;
14843 /* In both cases we want to cost both operands. */
14844 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14845 0, speed);
14846 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14847 1, speed);
14849 return true;
14852 return false;
14854 case NOT:
14855 x = XEXP (x, 0);
14856 op0 = aarch64_strip_shift (x);
14858 if (VECTOR_MODE_P (mode))
14860 /* Vector NOT. */
14861 *cost += extra_cost->vect.alu;
14862 return false;
14865 /* MVN-shifted-reg. */
14866 if (op0 != x)
14868 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14870 if (speed)
14871 *cost += extra_cost->alu.log_shift;
14873 return true;
14875 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
14876 Handle the second form here, taking care that 'a' above can
14877 be a shift. */
14878 else if (GET_CODE (op0) == XOR)
14880 rtx newop0 = XEXP (op0, 0);
14881 rtx newop1 = XEXP (op0, 1);
14882 rtx op0_stripped = aarch64_strip_shift (newop0);
14884 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14885 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14887 if (speed)
14889 if (op0_stripped != newop0)
14890 *cost += extra_cost->alu.log_shift;
14891 else
14892 *cost += extra_cost->alu.logical;
14895 return true;
14897 /* MVN. */
14898 if (speed)
14899 *cost += extra_cost->alu.logical;
14901 return false;
14903 case ZERO_EXTEND:
14905 op0 = XEXP (x, 0);
14906 /* If a value is written in SI mode, then zero extended to DI
14907 mode, the operation will in general be free as a write to
14908 a 'w' register implicitly zeroes the upper bits of an 'x'
14909 register. However, if this is
14911 (set (reg) (zero_extend (reg)))
14913 we must cost the explicit register move. */
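/* For example, (zero_extend:DI (plus:SI a b)) is just ADD Wd, Wa, Wb,
   since writing the W register clears bits 63:32, whereas
   (set (reg:DI) (zero_extend:DI (reg:SI))) still needs a MOV Wd, Ws.
   (Illustrative; the checks below implement this.)  */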
14914 if (mode == DImode
14915 && GET_MODE (op0) == SImode)
14917 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14919 /* If OP_COST is non-zero, then the cost of the zero extend
14920 is effectively the cost of the inner operation. Otherwise
14921 we have a MOV instruction and we take the cost from the MOV
14922 itself. This is true independently of whether we are
14923 optimizing for space or time. */
14924 if (op_cost)
14925 *cost = op_cost;
14927 return true;
14929 else if (MEM_P (op0))
14931 /* All loads can zero extend to any size for free. */
14932 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14933 return true;
14936 op0 = aarch64_extend_bitfield_pattern_p (x);
14937 if (op0)
14939 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14940 if (speed)
14941 *cost += extra_cost->alu.bfx;
14942 return true;
14945 if (speed)
14947 if (VECTOR_MODE_P (mode))
14949 /* UMOV. */
14950 *cost += extra_cost->vect.alu;
14952 else
14954 /* We generate an AND instead of UXTB/UXTH. */
14955 *cost += extra_cost->alu.logical;
14958 return false;
14960 case SIGN_EXTEND:
14961 if (MEM_P (XEXP (x, 0)))
14963 /* LDRSH. */
14964 if (speed)
14966 rtx address = XEXP (XEXP (x, 0), 0);
14967 *cost += extra_cost->ldst.load_sign_extend;
14969 *cost +=
14970 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14971 0, speed));
14973 return true;
14976 op0 = aarch64_extend_bitfield_pattern_p (x);
14977 if (op0)
14979 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14980 if (speed)
14981 *cost += extra_cost->alu.bfx;
14982 return true;
14985 if (speed)
14987 if (VECTOR_MODE_P (mode))
14988 *cost += extra_cost->vect.alu;
14989 else
14990 *cost += extra_cost->alu.extend;
14992 return false;
14994 case ROTATE:
14995 case ROTATERT:
14996 case LSHIFTRT:
14997 case ASHIFTRT:
14998 case ASHIFT:
14999 op0 = XEXP (x, 0);
15000 op1 = XEXP (x, 1);
15002 if (CONST_INT_P (op1))
15004 if (speed)
15006 if (VECTOR_MODE_P (mode))
15008 /* Vector shift (immediate). */
15009 *cost += extra_cost->vect.alu;
15011 else
15013 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
15014 These are all aliases. */
15015 *cost += extra_cost->alu.shift;
15019 /* We can incorporate zero/sign extend for free. */
15020 if (GET_CODE (op0) == ZERO_EXTEND
15021 || GET_CODE (op0) == SIGN_EXTEND)
15022 op0 = XEXP (op0, 0);
15024 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
15025 return true;
15027 else
15029 if (VECTOR_MODE_P (mode))
15031 if (speed)
15032 /* Vector shift (register). */
15033 *cost += extra_cost->vect.alu;
15035 else
15037 if (speed)
15038 /* LSLV, ASRV. */
15039 *cost += extra_cost->alu.shift_reg;
15041 /* The register shift amount may be in a shorter mode expressed
15042 as a lowpart SUBREG. For costing purposes just look inside. */
15043 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
15044 op1 = SUBREG_REG (op1);
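/* A mask of "mode size - 1" on the shift amount is likewise free,
   because the variable shift instructions (LSLV, ASRV and friends)
   only consume the low bits of the shift register anyway; e.g.
   (ashift:DI x (and y (const_int 63))) is a single LSLV.  (Sketch of
   the rationale for the test below.)  */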
15045 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
15046 && CONST_INT_P (XEXP (op1, 1))
15047 && known_eq (INTVAL (XEXP (op1, 1)),
15048 GET_MODE_BITSIZE (mode) - 1))
15050 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
15051 /* We already demanded XEXP (op1, 0) to be REG_P, so
15052 don't recurse into it. */
15053 return true;
15056 return false; /* All arguments need to be in registers. */
15059 case SYMBOL_REF:
15061 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
15062 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
15064 /* LDR. */
15065 if (speed)
15066 *cost += extra_cost->ldst.load;
15068 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
15069 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
15071 /* ADRP, followed by ADD. */
15072 *cost += COSTS_N_INSNS (1);
15073 if (speed)
15074 *cost += 2 * extra_cost->alu.arith;
15076 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
15077 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
15079 /* ADR. */
15080 if (speed)
15081 *cost += extra_cost->alu.arith;
15084 if (flag_pic)
15086 /* One extra load instruction, after accessing the GOT. */
15087 *cost += COSTS_N_INSNS (1);
15088 if (speed)
15089 *cost += extra_cost->ldst.load;
15091 return true;
15093 case HIGH:
15094 case LO_SUM:
15095 /* ADRP/ADD (immediate). */
15096 if (speed)
15097 *cost += extra_cost->alu.arith;
15098 return true;
15100 case ZERO_EXTRACT:
15101 case SIGN_EXTRACT:
15102 /* UBFX/SBFX. */
15103 if (speed)
15105 if (VECTOR_MODE_P (mode))
15106 *cost += extra_cost->vect.alu;
15107 else
15108 *cost += extra_cost->alu.bfx;
15111 /* We can trust that the immediates used will be correct (there
15112 are no by-register forms), so we need only cost op0. */
15113 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
15114 return true;
15116 case MULT:
15117 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
15118 /* aarch64_rtx_mult_cost always handles recursion to its
15119 operands. */
15120 return true;
15122 case MOD:
15123 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15124 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as that
15125 of an unconditional negate. This case should only ever be reached through
15126 the set_smod_pow2_cheap check in expmed.cc. */
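/* As a sketch, for SImode "x % 8" the expansion is roughly:
     negs  w1, w0
     and   w0, w0, #7
     and   w1, w1, #7
     csneg w0, w0, w1, mi
   i.e. four instructions, which is why the baseline below is reset
   to COSTS_N_INSNS (4).  (Register choices are illustrative only.)  */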
15127 if (CONST_INT_P (XEXP (x, 1))
15128 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
15129 && (mode == SImode || mode == DImode))
15131 /* We expand to 4 instructions. Reset the baseline. */
15132 *cost = COSTS_N_INSNS (4);
15134 if (speed)
15135 *cost += 2 * extra_cost->alu.logical
15136 + 2 * extra_cost->alu.arith;
15138 return true;
15141 /* Fall-through. */
15142 case UMOD:
15143 if (speed)
15145 /* Slightly prefer UMOD over SMOD. */
15146 if (VECTOR_MODE_P (mode))
15147 *cost += extra_cost->vect.alu;
15148 else if (GET_MODE_CLASS (mode) == MODE_INT)
15149 *cost += (extra_cost->mult[mode == DImode].add
15150 + extra_cost->mult[mode == DImode].idiv
15151 + (code == MOD ? 1 : 0));
15153 return false; /* All arguments need to be in registers. */
15155 case DIV:
15156 case UDIV:
15157 case SQRT:
15158 if (speed)
15160 if (VECTOR_MODE_P (mode))
15161 *cost += extra_cost->vect.alu;
15162 else if (GET_MODE_CLASS (mode) == MODE_INT)
15163 /* There is no integer SQRT, so only DIV and UDIV can get
15164 here. */
15165 *cost += (extra_cost->mult[mode == DImode].idiv
15166 /* Slightly prefer UDIV over SDIV. */
15167 + (code == DIV ? 1 : 0));
15168 else
15169 *cost += extra_cost->fp[mode == DFmode].div;
15171 return false; /* All arguments need to be in registers. */
15173 case IF_THEN_ELSE:
15174 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
15175 XEXP (x, 2), cost, speed);
15177 case EQ:
15178 case NE:
15179 case GT:
15180 case GTU:
15181 case LT:
15182 case LTU:
15183 case GE:
15184 case GEU:
15185 case LE:
15186 case LEU:
15188 return false; /* All arguments must be in registers. */
15190 case FMA:
15191 op0 = XEXP (x, 0);
15192 op1 = XEXP (x, 1);
15193 op2 = XEXP (x, 2);
15195 if (speed)
15197 if (VECTOR_MODE_P (mode))
15198 *cost += extra_cost->vect.alu;
15199 else
15200 *cost += extra_cost->fp[mode == DFmode].fma;
15203 /* FMSUB, FNMADD, and FNMSUB are free. */
15204 if (GET_CODE (op0) == NEG)
15205 op0 = XEXP (op0, 0);
15207 if (GET_CODE (op2) == NEG)
15208 op2 = XEXP (op2, 0);
15210 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15211 and the by-element operand as operand 0. */
15212 if (GET_CODE (op1) == NEG)
15213 op1 = XEXP (op1, 0);
15215 /* Catch vector-by-element operations. The by-element operand can
15216 either be (vec_duplicate (vec_select (x))) or just
15217 (vec_select (x)), depending on whether we are multiplying by
15218 a vector or a scalar.
15220 Canonicalization is not very good in these cases: FMA4 will put the
15221 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
15222 if (GET_CODE (op0) == VEC_DUPLICATE)
15223 op0 = XEXP (op0, 0);
15224 else if (GET_CODE (op1) == VEC_DUPLICATE)
15225 op1 = XEXP (op1, 0);
15227 if (GET_CODE (op0) == VEC_SELECT)
15228 op0 = XEXP (op0, 0);
15229 else if (GET_CODE (op1) == VEC_SELECT)
15230 op1 = XEXP (op1, 0);
15232 /* If the remaining parameters are not registers,
15233 get the cost to put them into registers. */
15234 *cost += rtx_cost (op0, mode, FMA, 0, speed);
15235 *cost += rtx_cost (op1, mode, FMA, 1, speed);
15236 *cost += rtx_cost (op2, mode, FMA, 2, speed);
15237 return true;
15239 case FLOAT:
15240 case UNSIGNED_FLOAT:
15241 if (speed)
15242 *cost += extra_cost->fp[mode == DFmode].fromint;
15243 return false;
15245 case FLOAT_EXTEND:
15246 if (speed)
15248 if (VECTOR_MODE_P (mode))
15250 /* Vector conversion. */
15251 *cost += extra_cost->vect.alu;
15253 else
15254 *cost += extra_cost->fp[mode == DFmode].widen;
15256 return false;
15258 case FLOAT_TRUNCATE:
15259 if (speed)
15261 if (VECTOR_MODE_P (mode))
15263 /* Vector conversion. */
15264 *cost += extra_cost->vect.alu;
15266 else
15267 *cost += extra_cost->fp[mode == DFmode].narrow;
15269 return false;
15271 case FIX:
15272 case UNSIGNED_FIX:
15273 x = XEXP (x, 0);
15274 /* Strip the rounding part. They will all be implemented
15275 by the fcvt* family of instructions anyway. */
15276 if (GET_CODE (x) == UNSPEC)
15278 unsigned int uns_code = XINT (x, 1);
15280 if (uns_code == UNSPEC_FRINTA
15281 || uns_code == UNSPEC_FRINTM
15282 || uns_code == UNSPEC_FRINTN
15283 || uns_code == UNSPEC_FRINTP
15284 || uns_code == UNSPEC_FRINTZ)
15285 x = XVECEXP (x, 0, 0);
15288 if (speed)
15290 if (VECTOR_MODE_P (mode))
15291 *cost += extra_cost->vect.alu;
15292 else
15293 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15296 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15297 fixed-point fcvt. */
15298 if (GET_CODE (x) == MULT
15299 && ((VECTOR_MODE_P (mode)
15300 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15301 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15303 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15304 0, speed);
15305 return true;
15308 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
15309 return true;
15311 case ABS:
15312 if (VECTOR_MODE_P (mode))
15314 /* ABS (vector). */
15315 if (speed)
15316 *cost += extra_cost->vect.alu;
15318 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15320 op0 = XEXP (x, 0);
15322 /* FABD, which is analogous to FADD. */
15323 if (GET_CODE (op0) == MINUS)
15325 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15326 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15327 if (speed)
15328 *cost += extra_cost->fp[mode == DFmode].addsub;
15330 return true;
15332 /* Simple FABS is analogous to FNEG. */
15333 if (speed)
15334 *cost += extra_cost->fp[mode == DFmode].neg;
15336 else
15338 /* Integer ABS will either be split to
15339 two arithmetic instructions, or will be an ABS
15340 (scalar), which we don't model. */
15341 *cost = COSTS_N_INSNS (2);
15342 if (speed)
15343 *cost += 2 * extra_cost->alu.arith;
15345 return false;
15347 case SMAX:
15348 case SMIN:
15349 if (speed)
15351 if (VECTOR_MODE_P (mode))
15352 *cost += extra_cost->vect.alu;
15353 else
15355 /* FMAXNM/FMINNM/FMAX/FMIN.
15356 TODO: This may not be accurate for all implementations, but
15357 we do not model this in the cost tables. */
15358 *cost += extra_cost->fp[mode == DFmode].addsub;
15361 return false;
15363 case UNSPEC:
15364 /* The floating point round to integer frint* instructions. */
15365 if (aarch64_frint_unspec_p (XINT (x, 1)))
15367 if (speed)
15368 *cost += extra_cost->fp[mode == DFmode].roundint;
15370 return false;
15372 break;
15374 case TRUNCATE:
15376 /* Decompose <su>muldi3_highpart. */
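/* That is, match the shape
     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))
   which is a single UMULH/SMULH; the condition below spells this
   out piece by piece.  */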
15377 if (/* (truncate:DI */
15378 mode == DImode
15379 /* (lshiftrt:TI */
15380 && GET_MODE (XEXP (x, 0)) == TImode
15381 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15382 /* (mult:TI */
15383 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15384 /* (ANY_EXTEND:TI (reg:DI))
15385 (ANY_EXTEND:TI (reg:DI))) */
15386 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15387 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15388 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15389 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15390 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15391 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15392 /* (const_int 64) */
15393 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15394 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15396 /* UMULH/SMULH. */
15397 if (speed)
15398 *cost += extra_cost->mult[mode == DImode].extend;
15399 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15400 mode, MULT, 0, speed);
15401 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15402 mode, MULT, 1, speed);
15403 return true;
15405 break;
15406 case CONST_VECTOR:
15408 /* Load using MOVI/MVNI. */
15409 if (aarch64_simd_valid_immediate (x, NULL))
15410 *cost = extra_cost->vect.movi;
15411 else /* Load using constant pool. */
15412 *cost = extra_cost->ldst.load;
15413 break;
15415 case VEC_CONCAT:
15416 /* Depending on the operation, either DUP or INS.
15417 For now, keep default costing. */
15418 break;
15419 case VEC_DUPLICATE:
15420 /* Load using a DUP. */
15421 *cost = extra_cost->vect.dup;
15422 return false;
15423 case VEC_SELECT:
15425 rtx op0 = XEXP (x, 0);
15426 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15428 /* A lowpart selection is free; a highpart costs a DUP; anything else an element extract. */
15429 rtx op1 = XEXP (x, 1);
15430 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15432 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15433 *cost = extra_cost->vect.dup;
15434 else
15435 *cost = extra_cost->vect.extract;
15436 return true;
15438 default:
15439 break;
15442 if (dump_file
15443 && flag_aarch64_verbose_cost)
15444 fprintf (dump_file,
15445 "\nFailed to cost RTX. Assuming default cost.\n");
15447 return true;
15450 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15451 calculated for X. This cost is stored in *COST. Returns true
15452 if the total cost of X was calculated. */
15453 static bool
15454 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15455 int param, int *cost, bool speed)
15457 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15459 if (dump_file
15460 && flag_aarch64_verbose_cost)
15462 print_rtl_single (dump_file, x);
15463 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15464 speed ? "Hot" : "Cold",
15465 *cost, result ? "final" : "partial");
15468 return result;
15471 static int
15472 aarch64_register_move_cost (machine_mode mode,
15473 reg_class_t from_i, reg_class_t to_i)
15475 enum reg_class from = (enum reg_class) from_i;
15476 enum reg_class to = (enum reg_class) to_i;
15477 const struct cpu_regmove_cost *regmove_cost
15478 = aarch64_tune_params.regmove_cost;
15480 /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS. */
15481 if (reg_class_subset_p (to, POINTER_REGS))
15482 to = GENERAL_REGS;
15484 if (reg_class_subset_p (from, POINTER_REGS))
15485 from = GENERAL_REGS;
15487 /* Make RDFFR very expensive. In particular, if we know that the FFR
15488 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15489 as a way of obtaining a PTRUE. */
15490 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15491 && hard_reg_set_subset_p (reg_class_contents[from_i],
15492 reg_class_contents[FFR_REGS]))
15493 return 80;
15495 /* Moving between GPR and stack cost is the same as GP2GP. */
15496 if ((from == GENERAL_REGS && to == STACK_REG)
15497 || (to == GENERAL_REGS && from == STACK_REG))
15498 return regmove_cost->GP2GP;
15500 /* To/From the stack register, we move via the gprs. */
15501 if (to == STACK_REG || from == STACK_REG)
15502 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15503 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15505 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15506 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15507 && known_eq (GET_MODE_SIZE (mode), 16))
15509 /* 128-bit operations on general registers require 2 instructions. */
15510 if (from == GENERAL_REGS && to == GENERAL_REGS)
15511 return regmove_cost->GP2GP * 2;
15512 else if (from == GENERAL_REGS)
15513 return regmove_cost->GP2FP * 2;
15514 else if (to == GENERAL_REGS)
15515 return regmove_cost->FP2GP * 2;
15517 /* When AdvSIMD instructions are disabled it is not possible to move
15518 a 128-bit value directly between Q registers. This is handled in
15519 secondary reload. A general register is used as a scratch to move
15520 the upper DI value and the lower DI value is moved directly,
15521 hence the cost is the sum of three moves. */
15522 if (!TARGET_SIMD && !TARGET_SVE)
15523 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15525 return regmove_cost->FP2FP;
15528 if (from == GENERAL_REGS && to == GENERAL_REGS)
15529 return regmove_cost->GP2GP;
15530 else if (from == GENERAL_REGS)
15531 return regmove_cost->GP2FP;
15532 else if (to == GENERAL_REGS)
15533 return regmove_cost->FP2GP;
15535 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15537 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15538 The cost must be greater than 2 units to indicate that direct
15539 moves aren't possible. */
15540 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15541 + aarch64_tune_params.memmov_cost.store_fp);
15542 return MIN (CEIL (per_vector, 2), 4);
15545 return regmove_cost->FP2FP;
15548 /* Implements TARGET_MEMORY_MOVE_COST. */
15549 static int
15550 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15552 enum reg_class rclass = (enum reg_class) rclass_i;
15553 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15554 ? reg_classes_intersect_p (rclass, PR_REGS)
15555 : reg_class_subset_p (rclass, PR_REGS))
15556 return (in
15557 ? aarch64_tune_params.memmov_cost.load_pred
15558 : aarch64_tune_params.memmov_cost.store_pred);
15560 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15561 ? reg_classes_intersect_p (rclass, FP_REGS)
15562 : reg_class_subset_p (rclass, FP_REGS))
15563 return (in
15564 ? aarch64_tune_params.memmov_cost.load_fp
15565 : aarch64_tune_params.memmov_cost.store_fp);
15567 return (in
15568 ? aarch64_tune_params.memmov_cost.load_int
15569 : aarch64_tune_params.memmov_cost.store_int);
15572 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15573 much more productive here, such as using insn attributes to cost things.
15574 But we don't, not yet.
15576 The main point of this current definition is to make calling insn_cost
15577 on one instruction equivalent to calling seq_cost on a sequence that
15578 contains only that instruction. The default definition would instead
15579 only look at SET_SRCs, ignoring SET_DESTs.
15581 This ensures that, for example, storing a 128-bit zero vector is more
15582 expensive than storing a 128-bit vector register. A move of zero
15583 into a 128-bit vector register followed by multiple stores of that
15584 register is then cheaper than multiple stores of zero (which would
15585 use STP of XZR). This in turn allows STP Qs to be formed. */
15586 static int
15587 aarch64_insn_cost (rtx_insn *insn, bool speed)
15589 if (rtx set = single_set (insn))
15590 return set_rtx_cost (set, speed);
15591 return pattern_cost (PATTERN (insn), speed);
15594 /* Implement TARGET_INIT_BUILTINS. */
15595 static void
15596 aarch64_init_builtins ()
15598 aarch64_general_init_builtins ();
15599 aarch64_sve::init_builtins ();
15600 #ifdef SUBTARGET_INIT_BUILTINS
15601 SUBTARGET_INIT_BUILTINS;
15602 #endif
15605 /* Implement TARGET_FOLD_BUILTIN. */
15606 static tree
15607 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15609 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15610 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15611 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15612 switch (code & AARCH64_BUILTIN_CLASS)
15614 case AARCH64_BUILTIN_GENERAL:
15615 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15617 case AARCH64_BUILTIN_SVE:
15618 return NULL_TREE;
15620 gcc_unreachable ();
15623 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15624 static bool
15625 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15627 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15628 tree fndecl = gimple_call_fndecl (stmt);
15629 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15630 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15631 gimple *new_stmt = NULL;
15632 switch (code & AARCH64_BUILTIN_CLASS)
15634 case AARCH64_BUILTIN_GENERAL:
15635 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15636 break;
15638 case AARCH64_BUILTIN_SVE:
15639 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15640 break;
15643 if (!new_stmt)
15644 return false;
15646 gsi_replace (gsi, new_stmt, false);
15647 return true;
15650 /* Implement TARGET_EXPAND_BUILTIN. */
15651 static rtx
15652 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15654 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15655 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15656 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15657 switch (code & AARCH64_BUILTIN_CLASS)
15659 case AARCH64_BUILTIN_GENERAL:
15660 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15662 case AARCH64_BUILTIN_SVE:
15663 return aarch64_sve::expand_builtin (subcode, exp, target);
15665 gcc_unreachable ();
15668 /* Implement TARGET_BUILTIN_DECL. */
15669 static tree
15670 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15672 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15673 switch (code & AARCH64_BUILTIN_CLASS)
15675 case AARCH64_BUILTIN_GENERAL:
15676 return aarch64_general_builtin_decl (subcode, initialize_p);
15678 case AARCH64_BUILTIN_SVE:
15679 return aarch64_sve::builtin_decl (subcode, initialize_p);
15681 gcc_unreachable ();
15684 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15685 to optimize 1.0/sqrt. */
15687 static bool
15688 use_rsqrt_p (machine_mode mode)
15690 return (!flag_trapping_math
15691 && flag_unsafe_math_optimizations
15692 && ((aarch64_tune_params.approx_modes->recip_sqrt
15693 & AARCH64_APPROX_MODE (mode))
15694 || flag_mrecip_low_precision_sqrt));
15697 /* Function to decide when to use the approximate reciprocal square root
15698 builtin. */
15700 static tree
15701 aarch64_builtin_reciprocal (tree fndecl)
15703 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15705 if (!use_rsqrt_p (mode))
15706 return NULL_TREE;
15707 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15708 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15709 switch (code & AARCH64_BUILTIN_CLASS)
15711 case AARCH64_BUILTIN_GENERAL:
15712 return aarch64_general_builtin_rsqrt (subcode);
15714 case AARCH64_BUILTIN_SVE:
15715 return NULL_TREE;
15717 gcc_unreachable ();
15720 /* Emit code to perform the floating-point operation:
15722 DST = SRC1 * SRC2
15724 where all three operands are already known to be registers.
15725 If the operation is an SVE one, PTRUE is a suitable all-true
15726 predicate. */
15728 static void
15729 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15731 if (ptrue)
15732 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15733 dst, ptrue, src1, src2,
15734 gen_int_mode (SVE_RELAXED_GP, SImode)));
15735 else
15736 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15739 /* Emit instruction sequence to compute either the approximate square root
15740 or its approximate reciprocal, depending on the flag RECP, and return
15741 whether the sequence was emitted or not. */
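/* The sequence below is the usual Newton-Raphson scheme built from
   FRSQRTE/FRSQRTS: starting from an estimate x0 of 1/sqrt(d), each
   step computes x_{n+1} = x_n * (3 - d * x_n * x_n) / 2, where the
   (3 - a * b) / 2 part is what FRSQRTS returns.  For the
   non-reciprocal case the result is then multiplied by d
   (sqrt(d) = d * 1/sqrt(d)), with d == 0 handled specially.
   (Descriptive summary of the code below.)  */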
15743 bool
15744 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15746 machine_mode mode = GET_MODE (dst);
15748 if (GET_MODE_INNER (mode) == HFmode)
15750 gcc_assert (!recp);
15751 return false;
15754 if (!recp)
15756 if (!(flag_mlow_precision_sqrt
15757 || (aarch64_tune_params.approx_modes->sqrt
15758 & AARCH64_APPROX_MODE (mode))))
15759 return false;
15761 if (!flag_finite_math_only
15762 || flag_trapping_math
15763 || !flag_unsafe_math_optimizations
15764 || optimize_function_for_size_p (cfun))
15765 return false;
15767 else
15768 /* Caller assumes we cannot fail. */
15769 gcc_assert (use_rsqrt_p (mode));
15771 rtx pg = NULL_RTX;
15772 if (aarch64_sve_mode_p (mode))
15773 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15774 machine_mode mmsk = (VECTOR_MODE_P (mode)
15775 ? related_int_vector_mode (mode).require ()
15776 : int_mode_for_mode (mode).require ());
15777 rtx xmsk = NULL_RTX;
15778 if (!recp)
15780 /* When calculating the approximate square root, compare the
15781 argument with 0.0 and create a mask. */
15782 rtx zero = CONST0_RTX (mode);
15783 if (pg)
15785 xmsk = gen_reg_rtx (GET_MODE (pg));
15786 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15787 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15788 xmsk, pg, hint, src, zero));
15790 else
15792 xmsk = gen_reg_rtx (mmsk);
15793 emit_insn (gen_rtx_SET (xmsk,
15794 gen_rtx_NEG (mmsk,
15795 gen_rtx_EQ (mmsk, src, zero))));
15799 /* Estimate the approximate reciprocal square root. */
15800 rtx xdst = gen_reg_rtx (mode);
15801 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15803 /* Iterate over the series twice for SF and thrice for DF. */
15804 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15806 /* Optionally iterate over the series once less for faster performance
15807 at the cost of some accuracy. */
15808 if ((recp && flag_mrecip_low_precision_sqrt)
15809 || (!recp && flag_mlow_precision_sqrt))
15810 iterations--;
15812 /* Iterate over the series to calculate the approximate reciprocal square
15813 root. */
15814 rtx x1 = gen_reg_rtx (mode);
15815 while (iterations--)
15817 rtx x2 = gen_reg_rtx (mode);
15818 aarch64_emit_mult (x2, pg, xdst, xdst);
15820 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15822 if (iterations > 0)
15823 aarch64_emit_mult (xdst, pg, xdst, x1);
15826 if (!recp)
15828 if (pg)
15829 /* Multiply nonzero source values by the corresponding intermediate
15830 result elements, so that the final calculation is the approximate
15831 square root rather than its reciprocal. Select a zero result for
15832 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15833 otherwise. */
15834 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15835 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15836 else
15838 /* Qualify the approximate reciprocal square root when the
15839 argument is 0.0 by squashing the intermediary result to 0.0. */
15840 rtx xtmp = gen_reg_rtx (mmsk);
15841 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15842 gen_rtx_SUBREG (mmsk, xdst, 0)));
15843 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15845 /* Calculate the approximate square root. */
15846 aarch64_emit_mult (xdst, pg, xdst, src);
15850 /* Finalize the approximation. */
15851 aarch64_emit_mult (dst, pg, xdst, x1);
15853 return true;
15856 /* Emit the instruction sequence to compute the approximation for the division
15857 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
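/* This is again a Newton-Raphson refinement, this time built from
   FRECPE/FRECPS: starting from an estimate x0 of 1/den, each step
   computes x_{n+1} = x_n * (2 - den * x_n), where the (2 - a * b)
   part is what FRECPS returns; the quotient is then num * 1/den.
   (Descriptive summary of the code below.)  */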
15859 bool
15860 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15862 machine_mode mode = GET_MODE (quo);
15864 if (GET_MODE_INNER (mode) == HFmode)
15865 return false;
15867 bool use_approx_division_p = (flag_mlow_precision_div
15868 || (aarch64_tune_params.approx_modes->division
15869 & AARCH64_APPROX_MODE (mode)));
15871 if (!flag_finite_math_only
15872 || flag_trapping_math
15873 || !flag_unsafe_math_optimizations
15874 || optimize_function_for_size_p (cfun)
15875 || !use_approx_division_p)
15876 return false;
15878 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15879 return false;
15881 rtx pg = NULL_RTX;
15882 if (aarch64_sve_mode_p (mode))
15883 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15885 /* Estimate the approximate reciprocal. */
15886 rtx xrcp = gen_reg_rtx (mode);
15887 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15889 /* Iterate over the series twice for SF and thrice for DF. */
15890 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15892 /* Optionally iterate over the series less for faster performance,
15893 at the cost of some accuracy. The default is 2 for DF and 1 for SF. */
15894 if (flag_mlow_precision_div)
15895 iterations = (GET_MODE_INNER (mode) == DFmode
15896 ? aarch64_double_recp_precision
15897 : aarch64_float_recp_precision);
15899 /* Iterate over the series to calculate the approximate reciprocal. */
15900 rtx xtmp = gen_reg_rtx (mode);
15901 while (iterations--)
15903 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15905 if (iterations > 0)
15906 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15909 if (num != CONST1_RTX (mode))
15911 /* As the approximate reciprocal of DEN is already calculated, only
15912 calculate the approximate division when NUM is not 1.0. */
15913 rtx xnum = force_reg (mode, num);
15914 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15917 /* Finalize the approximation. */
15918 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15919 return true;
15922 /* Return the number of instructions that can be issued per cycle. */
15923 static int
15924 aarch64_sched_issue_rate (void)
15926 return aarch64_tune_params.issue_rate;
15929 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15930 static int
15931 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15933 if (DEBUG_INSN_P (insn))
15934 return more;
15936 rtx_code code = GET_CODE (PATTERN (insn));
15937 if (code == USE || code == CLOBBER)
15938 return more;
15940 if (get_attr_type (insn) == TYPE_NO_INSN)
15941 return more;
15943 return more - 1;
15946 static int
15947 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15949 int issue_rate = aarch64_sched_issue_rate ();
15951 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15955 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15956 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15957 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15959 static int
15960 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15961 int ready_index)
15963 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15967 /* Vectorizer cost model target hooks. */
15969 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15970 return the decl that should be recorded. Return null otherwise. */
15971 tree
15972 aarch64_vector_load_decl (tree addr)
15974 if (TREE_CODE (addr) != ADDR_EXPR)
15975 return NULL_TREE;
15976 tree base = get_base_address (TREE_OPERAND (addr, 0));
15977 if (TREE_CODE (base) != VAR_DECL)
15978 return NULL_TREE;
15979 return base;
15982 /* Return true if STMT_INFO accesses a decl that is known to be the
15983 argument to a vld1 in the same function. */
15984 static bool
15985 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15987 if (!cfun->machine->vector_load_decls)
15988 return false;
15989 auto dr = STMT_VINFO_DATA_REF (stmt_info);
15990 if (!dr)
15991 return false;
15992 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15993 return decl && cfun->machine->vector_load_decls->contains (decl);
15996 /* Information about how the CPU would issue the scalar, Advanced SIMD
15997 or SVE version of a vector loop, using the scheme defined by the
15998 aarch64_base_vec_issue_info hierarchy of structures. */
15999 class aarch64_vec_op_count
16001 public:
16002 aarch64_vec_op_count () = default;
16003 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
16004 unsigned int = 1);
16006 unsigned int vec_flags () const { return m_vec_flags; }
16007 unsigned int vf_factor () const { return m_vf_factor; }
16009 const aarch64_base_vec_issue_info *base_issue_info () const;
16010 const aarch64_simd_vec_issue_info *simd_issue_info () const;
16011 const aarch64_sve_vec_issue_info *sve_issue_info () const;
16013 fractional_cost rename_cycles_per_iter () const;
16014 fractional_cost min_nonpred_cycles_per_iter () const;
16015 fractional_cost min_pred_cycles_per_iter () const;
16016 fractional_cost min_cycles_per_iter () const;
16018 void dump () const;
16020 /* The number of individual "general" operations. See the comments
16021 in aarch64_base_vec_issue_info for details. */
16022 unsigned int general_ops = 0;
16024 /* The number of load and store operations, under the same scheme
16025 as above. */
16026 unsigned int loads = 0;
16027 unsigned int stores = 0;
16029 /* The minimum number of cycles needed to execute all loop-carried
16030 operations, which in the vector code become associated with
16031 reductions. */
16032 unsigned int reduction_latency = 0;
16034 /* The number of individual predicate operations. See the comments
16035 in aarch64_sve_vec_issue_info for details. */
16036 unsigned int pred_ops = 0;
16038 private:
16039 /* The issue information for the core. */
16040 const aarch64_vec_issue_info *m_issue_info = nullptr;
16042 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
16043 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
16044 Advanced SIMD code.
16045 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
16046 SVE code. */
16047 unsigned int m_vec_flags = 0;
16049 /* Assume that, when the code is executing on the core described
16050 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
16051 times more data than the vectorizer anticipates.
16053 This is only ever different from 1 for SVE. It allows us to consider
16054 what would happen on a 256-bit SVE target even when the -mtune
16055 parameters say that the “likely” SVE length is 128 bits. */
16056 unsigned int m_vf_factor = 1;
16059 aarch64_vec_op_count::
16060 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
16061 unsigned int vec_flags, unsigned int vf_factor)
16062 : m_issue_info (issue_info),
16063 m_vec_flags (vec_flags),
16064 m_vf_factor (vf_factor)
16068 /* Return the base issue information (i.e. the parts that make sense
16069 for both scalar and vector code). Return null if we have no issue
16070 information. */
16071 const aarch64_base_vec_issue_info *
16072 aarch64_vec_op_count::base_issue_info () const
16074 if (auto *ret = simd_issue_info ())
16075 return ret;
16076 return m_issue_info->scalar;
16079 /* If the structure describes vector code and we have associated issue
16080 information, return that issue information, otherwise return null. */
16081 const aarch64_simd_vec_issue_info *
16082 aarch64_vec_op_count::simd_issue_info () const
16084 if (auto *ret = sve_issue_info ())
16085 return ret;
16086 if (m_vec_flags)
16087 return m_issue_info->advsimd;
16088 return nullptr;
16091 /* If the structure describes SVE code and we have associated issue
16092 information, return that issue information, otherwise return null. */
16093 const aarch64_sve_vec_issue_info *
16094 aarch64_vec_op_count::sve_issue_info () const
16096 if (m_vec_flags & VEC_ANY_SVE)
16097 return m_issue_info->sve;
16098 return nullptr;
16101 /* Estimate the minimum number of cycles per iteration needed to rename
16102 the instructions.
16104 ??? For now this is done inline rather than via cost tables, since it
16105 isn't clear how it should be parameterized for the general case. */
16106 fractional_cost
16107 aarch64_vec_op_count::rename_cycles_per_iter () const
16109 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
16110 || sve_issue_info () == &neoversen2_sve_issue_info
16111 || sve_issue_info () == &neoversev2_sve_issue_info)
16112 /* + 1 for an addition. We've already counted a general op for each
16113 store, so we don't need to account for stores separately. The branch
16114 reads no registers and so does not need to be counted either.
16116 ??? This value is very much on the pessimistic side, but seems to work
16117 pretty well in practice. */
16118 return { general_ops + loads + pred_ops + 1, 5 };
16120 return 0;
16123 /* Like min_cycles_per_iter, but excluding predicate operations. */
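/* Roughly:

     cycles = max (reduction_latency,
                   stores / stores_per_cycle,
                   (loads + stores) / loads_stores_per_cycle,
                   general_ops / general_ops_per_cycle,
                   rename_cycles_per_iter ())

   i.e. an iteration can complete no faster than its most contended
   resource allows.  */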
16124 fractional_cost
16125 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
16127 auto *issue_info = base_issue_info ();
16129 fractional_cost cycles = MAX (reduction_latency, 1);
16130 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
16131 cycles = std::max (cycles, { loads + stores,
16132 issue_info->loads_stores_per_cycle });
16133 cycles = std::max (cycles, { general_ops,
16134 issue_info->general_ops_per_cycle });
16135 cycles = std::max (cycles, rename_cycles_per_iter ());
16136 return cycles;
16139 /* Like min_cycles_per_iter, but including only the predicate operations. */
16140 fractional_cost
16141 aarch64_vec_op_count::min_pred_cycles_per_iter () const
16143 if (auto *issue_info = sve_issue_info ())
16144 return { pred_ops, issue_info->pred_ops_per_cycle };
16145 return 0;
16148 /* Estimate the minimum number of cycles needed to issue the operations.
16149 This is a very simplistic model! */
16150 fractional_cost
16151 aarch64_vec_op_count::min_cycles_per_iter () const
16153 return std::max (min_nonpred_cycles_per_iter (),
16154 min_pred_cycles_per_iter ());
16157 /* Dump information about the structure. */
16158 void
16159 aarch64_vec_op_count::dump () const
16161 dump_printf_loc (MSG_NOTE, vect_location,
16162 " load operations = %d\n", loads);
16163 dump_printf_loc (MSG_NOTE, vect_location,
16164 " store operations = %d\n", stores);
16165 dump_printf_loc (MSG_NOTE, vect_location,
16166 " general operations = %d\n", general_ops);
16167 if (sve_issue_info ())
16168 dump_printf_loc (MSG_NOTE, vect_location,
16169 " predicate operations = %d\n", pred_ops);
16170 dump_printf_loc (MSG_NOTE, vect_location,
16171 " reduction latency = %d\n", reduction_latency);
16172 if (auto rcpi = rename_cycles_per_iter ())
16173 dump_printf_loc (MSG_NOTE, vect_location,
16174 " estimated cycles per iteration to rename = %f\n",
16175 rcpi.as_double ());
16176 if (auto pred_cpi = min_pred_cycles_per_iter ())
16178 dump_printf_loc (MSG_NOTE, vect_location,
16179 " estimated min cycles per iteration"
16180 " without predication = %f\n",
16181 min_nonpred_cycles_per_iter ().as_double ());
16182 dump_printf_loc (MSG_NOTE, vect_location,
16183 " estimated min cycles per iteration"
16184 " for predication = %f\n", pred_cpi.as_double ());
16186 if (auto cpi = min_cycles_per_iter ())
16187 dump_printf_loc (MSG_NOTE, vect_location,
16188 " estimated min cycles per iteration = %f\n",
16189 cpi.as_double ());
16192 /* Information about vector code that we're in the process of costing. */
16193 class aarch64_vector_costs : public vector_costs
16195 public:
16196 aarch64_vector_costs (vec_info *, bool);
16198 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
16199 stmt_vec_info stmt_info, slp_tree, tree vectype,
16200 int misalign,
16201 vect_cost_model_location where) override;
16202 void finish_cost (const vector_costs *) override;
16203 bool better_main_loop_than_p (const vector_costs *other) const override;
16205 private:
16206 void record_potential_advsimd_unrolling (loop_vec_info);
16207 void analyze_loop_vinfo (loop_vec_info);
16208 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
16209 aarch64_vec_op_count *);
16210 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
16211 fractional_cost, unsigned int,
16212 unsigned int *, bool *);
16213 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
16214 unsigned int);
16215 bool prefer_unrolled_loop () const;
16216 unsigned int determine_suggested_unroll_factor ();
16218 /* True if we have performed one-time initialization based on the
16219 vec_info. */
16220 bool m_analyzed_vinfo = false;
16222 /* This loop uses an average operation that is not supported by SVE, but is
16223 supported by Advanced SIMD and SVE2. */
16224 bool m_has_avg = false;
16226 /* True if the vector body contains a store to a decl and if the
16227 function is known to have a vld1 from the same decl.
16229 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16230 initializing a vector is:
16232 float f[4] = { elts };
16233 float32x4_t x = vld1q_f32(f);
16235 We should strongly prefer vectorization of the initialization of f,
16236 so that the store to f and the load back can be optimized away,
16237 leaving a vectorization of { elts }. */
16238 bool m_stores_to_vector_load_decl = false;
16240 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16241 In this case the value is the number of insns in the last operation.
16243 On AArch64, vector promotions and demotions require us to first widen or
16244 narrow the input and only after that emit conversion instructions. For
16245 costing this means we need to emit the cost of the final conversions as
16246 well. */
16247 unsigned int m_num_last_promote_demote = 0;
16249 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16250 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16251 SIMD code.
16252 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16253 unsigned int m_vec_flags = 0;
16255 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16256 This means that code such as:
16258 a[0] = x;
16259 a[1] = x;
16261 will be costed as two scalar instructions and two vector instructions
16262 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16263 wins if the costs are equal, because the vector costs include
16264 constant initializations whereas the scalar costs don't.
16265 We would therefore tend to vectorize the code above, even though
16266 the scalar version can use a single STP.
16268 We should eventually fix this and model LDP and STP in the main costs;
16269 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16270 Until then, we look specifically for code that does nothing more than
16271 STP-like operations. We cost them on that basis in addition to the
16272 normal latency-based costs.
16274 If the scalar or vector code could be a sequence of STPs +
16275 initialization, this variable counts the cost of the sequence,
16276 with 2 units per instruction. The variable is ~0U for other
16277 kinds of code. */
16278 unsigned int m_stp_sequence_cost = 0;
16280 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16281 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16282 situations, we try to predict whether an Advanced SIMD implementation
16283 of the loop could be completely unrolled and become straight-line code.
16284 If so, it is generally better to use the Advanced SIMD version rather
16285 than length-agnostic SVE, since the SVE loop would execute an unknown
16286 number of times and so could not be completely unrolled in the same way.
16288 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16289 number of Advanced SIMD loop iterations that would be unrolled and
16290 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16291 in the unrolled loop. Both values are zero if we're not applying
16292 the heuristic. */
16293 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
16294 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
16296 /* If we're vectorizing a loop that executes a constant number of times,
16297 this variable gives the number of times that the vector loop would
16298 iterate, otherwise it is zero. */
16299 uint64_t m_num_vector_iterations = 0;
16301 /* Used only when vectorizing loops. Estimates the number and kind of
16302 operations that would be needed by one iteration of the scalar
16303 or vector loop. There is one entry for each tuning option of
16304 interest. */
16305 auto_vec<aarch64_vec_op_count, 2> m_ops;
16308 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
16309 bool costing_for_scalar)
16310 : vector_costs (vinfo, costing_for_scalar),
16311 m_vec_flags (costing_for_scalar ? 0
16312 : aarch64_classify_vector_mode (vinfo->vector_mode))
16314 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
16316 m_ops.quick_push ({ issue_info, m_vec_flags });
16317 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16319 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
16320 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
16321 vf_factor });
16326 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
16327 vector_costs *
16328 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
16330 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16333 /* Return true if the current CPU should use the new costs defined
16334 in GCC 11. This should be removed for GCC 12 and above, with the
16335 costs applying to all CPUs instead. */
16336 static bool
16337 aarch64_use_new_vector_costs_p ()
16339 return (aarch64_tune_params.extra_tuning_flags
16340 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16343 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16344 static const simd_vec_cost *
16345 aarch64_simd_vec_costs (tree vectype)
16347 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16348 if (vectype != NULL
16349 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16350 && costs->sve != NULL)
16351 return costs->sve;
16352 return costs->advsimd;
16355 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16356 static const simd_vec_cost *
16357 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16359 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16360 if ((flags & VEC_ANY_SVE) && costs->sve)
16361 return costs->sve;
16362 return costs->advsimd;
16365 /* If STMT_INFO is a memory reference, return the scalar memory type,
16366 otherwise return null. */
16367 static tree
16368 aarch64_dr_type (stmt_vec_info stmt_info)
16370 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16371 return TREE_TYPE (DR_REF (dr));
16372 return NULL_TREE;
16375 /* Decide whether to use the unrolling heuristic described above
16376 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16377 describes the loop that we're vectorizing. */
16378 void
16379 aarch64_vector_costs::
16380 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16382 /* The heuristic only makes sense on targets that have the same
16383 vector throughput for SVE and Advanced SIMD. */
16384 if (!(aarch64_tune_params.extra_tuning_flags
16385 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16386 return;
16388 /* We only want to apply the heuristic if LOOP_VINFO is being
16389 vectorized for SVE. */
16390 if (!(m_vec_flags & VEC_ANY_SVE))
16391 return;
16393 /* Check whether it is possible in principle to use Advanced SIMD
16394 instead. */
16395 if (aarch64_autovec_preference == 2)
16396 return;
16398 /* We don't want to apply the heuristic to outer loops, since it's
16399 harder to track two levels of unrolling. */
16400 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16401 return;
16403 /* Only handle cases in which the number of Advanced SIMD iterations
16404 would be known at compile time but the number of SVE iterations
16405 would not. */
16406 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16407 || aarch64_sve_vg.is_constant ())
16408 return;
16410 /* Guess how many times the Advanced SIMD loop would iterate and make
16411 sure that it is within the complete unrolling limit. Even if the
16412 number of iterations is small enough, the number of statements might
16413 not be, which is why we need to estimate the number of statements too. */
16414 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16415 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16416 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16417 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16418 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16419 return;
16421 /* Record that we're applying the heuristic and should try to estimate
16422 the number of statements in the Advanced SIMD loop. */
16423 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
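/* Worked example, illustrative only (all numbers assumed): if the SVE
   loop has a costing VF of 8 elements and the estimated SVE VQ is 2,
   the guessed Advanced SIMD VF is CEIL (8, 2) == 4, so a loop with 64
   known iterations would be unrolled into 64 / 4 == 16 Advanced SIMD
   iterations before being checked against
   param_max_completely_peel_times.  */
static_assert (CEIL (8, 2) == 4, "assumed Advanced SIMD VF");
static_assert (64 / CEIL (8, 2) == 16, "assumed unrolled iteration count");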
16426 /* Do one-time initialization of the aarch64_vector_costs given that we're
16427 costing the loop vectorization described by LOOP_VINFO. */
16428 void
16429 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16431 /* Record the number of times that the vector loop would execute,
16432 if known. */
16433 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16434 auto scalar_niters = max_stmt_executions_int (loop);
16435 if (scalar_niters >= 0)
16437 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16438 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16439 m_num_vector_iterations = scalar_niters / vf;
16440 else
16441 m_num_vector_iterations = CEIL (scalar_niters, vf);
16444 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16445 heuristic described above m_unrolled_advsimd_niters. */
16446 record_potential_advsimd_unrolling (loop_vinfo);
16449 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16450 static int
16451 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16452 tree vectype,
16453 int misalign ATTRIBUTE_UNUSED)
16455 unsigned elements;
16456 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16457 bool fp = false;
16459 if (vectype != NULL)
16460 fp = FLOAT_TYPE_P (vectype);
16462 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16464 switch (type_of_cost)
16466 case scalar_stmt:
16467 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16469 case scalar_load:
16470 return costs->scalar_load_cost;
16472 case scalar_store:
16473 return costs->scalar_store_cost;
16475 case vector_stmt:
16476 return fp ? simd_costs->fp_stmt_cost
16477 : simd_costs->int_stmt_cost;
16479 case vector_load:
16480 return simd_costs->align_load_cost;
16482 case vector_store:
16483 return simd_costs->store_cost;
16485 case vec_to_scalar:
16486 return simd_costs->vec_to_scalar_cost;
16488 case scalar_to_vec:
16489 return simd_costs->scalar_to_vec_cost;
16491 case unaligned_load:
16492 case vector_gather_load:
16493 return simd_costs->unalign_load_cost;
16495 case unaligned_store:
16496 case vector_scatter_store:
16497 return simd_costs->unalign_store_cost;
16499 case cond_branch_taken:
16500 return costs->cond_taken_branch_cost;
16502 case cond_branch_not_taken:
16503 return costs->cond_not_taken_branch_cost;
16505 case vec_perm:
16506 return simd_costs->permute_cost;
16508 case vec_promote_demote:
16509 return fp ? simd_costs->fp_stmt_cost
16510 : simd_costs->int_stmt_cost;
16512 case vec_construct:
16513 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16514 return elements / 2 + 1;
16516 default:
16517 gcc_unreachable ();
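/* Worked example (illustrative, element count assumed): the vec_construct
   case above charges elements / 2 + 1 units, on the basis that pairs of
   elements can usually be combined before being inserted into the full
   vector.  For a 4-element vector that gives 4 / 2 + 1 == 3 units.  */
static_assert (4 / 2 + 1 == 3, "assumed vec_construct cost");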
16521 /* Check whether an access of kind KIND for STMT_INFO represents one
16522 vector of an LD[234] or ST[234] operation. Return the total number of
16523 vectors (2, 3 or 4) if so, otherwise return a value outside that range. */
16524 static int
16525 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16527 if ((kind == vector_load
16528 || kind == unaligned_load
16529 || kind == vector_store
16530 || kind == unaligned_store)
16531 && STMT_VINFO_DATA_REF (stmt_info))
16533 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16534 if (stmt_info
16535 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16536 return DR_GROUP_SIZE (stmt_info);
16538 return 0;
16541 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16542 vectors would produce a series of LDP or STP operations. KIND is the
16543 kind of statement that STMT_INFO represents. */
16544 static bool
16545 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16546 stmt_vec_info stmt_info)
16548 switch (kind)
16550 case vector_load:
16551 case vector_store:
16552 case unaligned_load:
16553 case unaligned_store:
16554 break;
16556 default:
16557 return false;
16560 return is_gimple_assign (stmt_info->stmt);
16563 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16564 or multiply-subtract sequence that might be suitable for fusing into a
16565 single instruction. If VEC_FLAGS is zero, analyze the operation as
16566 a scalar one, otherwise analyze it as an operation on vectors with those
16567 VEC_* flags. */
16568 static bool
16569 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16570 unsigned int vec_flags)
16572 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16573 if (!assign)
16574 return false;
16575 tree_code code = gimple_assign_rhs_code (assign);
16576 if (code != PLUS_EXPR && code != MINUS_EXPR)
16577 return false;
16579 auto is_mul_result = [&](int i)
16581 tree rhs = gimple_op (assign, i);
16582 /* ??? Should we try to check for a single use as well? */
16583 if (TREE_CODE (rhs) != SSA_NAME)
16584 return false;
16586 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16587 if (!def_stmt_info
16588 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16589 return false;
16590 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16591 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16592 return false;
16594 if (vec_flags & VEC_ADVSIMD)
16596 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16597 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16598 only supports MLA forms, so will require a move if the result
16599 cannot be tied to the accumulator. The most important case in
16600 which this is true is when the accumulator input is invariant. */
16601 rhs = gimple_op (assign, 3 - i);
16602 if (TREE_CODE (rhs) != SSA_NAME)
16603 return false;
16604 def_stmt_info = vinfo->lookup_def (rhs);
16605 if (!def_stmt_info
16606 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
16607 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
16608 return false;
16611 return true;
16614 if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
16615 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16616 multiplication must be on the second operand (to form an FMLS).
16617 But if both operands are multiplications and the second operand
16618 is used more than once, we'll instead negate the second operand
16619 and use it as an accumulator for the first operand. */
16620 return (is_mul_result (2)
16621 && (has_single_use (gimple_assign_rhs2 (assign))
16622 || !is_mul_result (1)));
16624 return is_mul_result (1) || is_mul_result (2);
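/* Illustrative source-level example (not part of GCC) of the two-statement
   pattern that the function above looks for: the multiplication feeds one
   operand of the addition, so AArch64 can usually fuse the pair into a
   single FMLA/MLA (or an FMLS in the MINUS_EXPR case).  */
static inline float
example_multiply_add (float accumulator, float x, float y)
{
  float product = x * y;		/* The MULT_EXPR definition.  */
  return accumulator + product;		/* The PLUS_EXPR costed here.  */
}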
16627 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16628 expression sequence that might be suitable for fusing into a
16629 single instruction. If VEC_FLAGS is zero, analyze the operation as
16630 a scalar one, otherwise analyze it as an operation on vectors with those
16631 VEC_* flags. */
16633 static bool
16634 aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
16635 unsigned int vec_flags)
16637 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16638 if (!assign
16639 || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
16640 || !STMT_VINFO_VECTYPE (stmt_info)
16641 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
16642 return false;
16644 for (int i = 1; i < 3; ++i)
16646 tree rhs = gimple_op (assign, i);
16648 if (TREE_CODE (rhs) != SSA_NAME)
16649 continue;
16651 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16652 if (!def_stmt_info
16653 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16654 continue;
16656 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16657 if (!rhs_assign
16658 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
16659 != tcc_comparison)
16660 continue;
16662 if (vec_flags & VEC_ADVSIMD)
16663 return false;
16665 return true;
16667 return false;
16670 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16671 in-loop reduction that SVE supports directly, return its latency in cycles,
16672 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16673 instructions. */
16674 static unsigned int
16675 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16676 stmt_vec_info stmt_info,
16677 const sve_vec_cost *sve_costs)
16679 switch (vect_reduc_type (vinfo, stmt_info))
16681 case EXTRACT_LAST_REDUCTION:
16682 return sve_costs->clast_cost;
16684 case FOLD_LEFT_REDUCTION:
16685 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16687 case E_HFmode:
16688 case E_BFmode:
16689 return sve_costs->fadda_f16_cost;
16691 case E_SFmode:
16692 return sve_costs->fadda_f32_cost;
16694 case E_DFmode:
16695 return sve_costs->fadda_f64_cost;
16697 default:
16698 break;
16700 break;
16703 return 0;
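/* Illustrative source-level example (not part of GCC) of a reduction that
   is classified as FOLD_LEFT_REDUCTION above: without -ffast-math the FP
   additions must stay in order, which SVE can vectorize using FADDA, with
   the latency taken from the fadda_*_cost fields.  */
static inline double
example_fold_left_reduction (const double *values, int n)
{
  double sum = 0.0;
  for (int i = 0; i < n; ++i)
    sum += values[i];	/* Strictly ordered FP accumulation.  */
  return sum;
}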
16706 /* STMT_INFO describes a loop-carried operation in the original scalar code
16707 that we are considering implementing as a reduction. Return one of the
16708 following values, depending on VEC_FLAGS:
16710 - If VEC_FLAGS is zero, return the loop carry latency of the original
16711 scalar operation.
16713 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16714 Advanced SIMD implementation.
16716 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16717 SVE implementation. */
16718 static unsigned int
16719 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16720 unsigned int vec_flags)
16722 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16723 const sve_vec_cost *sve_costs = nullptr;
16724 if (vec_flags & VEC_ANY_SVE)
16725 sve_costs = aarch64_tune_params.vec_costs->sve;
16727 /* If the caller is asking for the SVE latency, check for forms of reduction
16728 that only SVE can handle directly. */
16729 if (sve_costs)
16731 unsigned int latency
16732 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16733 if (latency)
16734 return latency;
16737 /* Handle scalar costs. */
16738 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16739 if (vec_flags == 0)
16741 if (is_float)
16742 return vec_costs->scalar_fp_stmt_cost;
16743 return vec_costs->scalar_int_stmt_cost;
16746 /* Otherwise, the loop body just contains normal integer or FP operations,
16747 with a vector reduction outside the loop. */
16748 const simd_vec_cost *simd_costs
16749 = aarch64_simd_vec_costs_for_flags (vec_flags);
16750 if (is_float)
16751 return simd_costs->fp_stmt_cost;
16752 return simd_costs->int_stmt_cost;
16755 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16756 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16757 try to subdivide the target-independent categorization provided by KIND
16758 to get a more accurate cost. */
16759 static fractional_cost
16760 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16761 stmt_vec_info stmt_info,
16762 fractional_cost stmt_cost)
16764 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16765 the extension with the load. */
16766 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16767 return 0;
16769 return stmt_cost;
16772 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16773 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16774 when vectorized would operate on vector type VECTYPE. Try to subdivide
16775 the target-independent categorization provided by KIND to get a more
16776 accurate cost. WHERE specifies where the cost associated with KIND
16777 occurs. */
16778 static fractional_cost
16779 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16780 stmt_vec_info stmt_info, tree vectype,
16781 enum vect_cost_model_location where,
16782 fractional_cost stmt_cost)
16784 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16785 const sve_vec_cost *sve_costs = nullptr;
16786 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16787 sve_costs = aarch64_tune_params.vec_costs->sve;
16789 /* It's generally better to avoid costing inductions, since the induction
16790 will usually be hidden by other operations. This is particularly true
16791 for things like COND_REDUCTIONS. */
16792 if (is_a<gphi *> (stmt_info->stmt))
16793 return 0;
16795 /* Detect cases in which vec_to_scalar is describing the extraction of a
16796 vector element in preparation for a scalar store. The store itself is
16797 costed separately. */
16798 if (vect_is_store_elt_extraction (kind, stmt_info))
16799 return simd_costs->store_elt_extra_cost;
16801 /* Detect SVE gather loads, which are costed as a single scalar_load
16802 for each element. We therefore need to divide the full-instruction
16803 cost by the number of elements in the vector. */
16804 if (kind == scalar_load
16805 && sve_costs
16806 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16808 unsigned int nunits = vect_nunits_for_cost (vectype);
16809 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16810 return { sve_costs->gather_load_x64_cost, nunits };
16811 return { sve_costs->gather_load_x32_cost, nunits };
16814 /* Detect cases in which a scalar_store is really storing one element
16815 in a scatter operation. */
16816 if (kind == scalar_store
16817 && sve_costs
16818 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16819 return sve_costs->scatter_store_elt_cost;
16821 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16822 if (kind == vec_to_scalar
16823 && where == vect_body
16824 && sve_costs)
16826 unsigned int latency
16827 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16828 if (latency)
16829 return latency;
16832 /* Detect cases in which vec_to_scalar represents a single reduction
16833 instruction like FADDP or MAXV. */
16834 if (kind == vec_to_scalar
16835 && where == vect_epilogue
16836 && vect_is_reduction (stmt_info))
16837 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16839 case E_QImode:
16840 return simd_costs->reduc_i8_cost;
16842 case E_HImode:
16843 return simd_costs->reduc_i16_cost;
16845 case E_SImode:
16846 return simd_costs->reduc_i32_cost;
16848 case E_DImode:
16849 return simd_costs->reduc_i64_cost;
16851 case E_HFmode:
16852 case E_BFmode:
16853 return simd_costs->reduc_f16_cost;
16855 case E_SFmode:
16856 return simd_costs->reduc_f32_cost;
16858 case E_DFmode:
16859 return simd_costs->reduc_f64_cost;
16861 default:
16862 break;
16865 /* Otherwise stick with the original categorization. */
16866 return stmt_cost;
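/* Worked example (illustrative, numbers assumed): for an SVE gather with
   a gather_load_x32_cost of 12 and 8 32-bit elements per vector, each
   element's scalar_load is costed as the fraction 12/8 = 1.5, so the
   eight per-element costs sum back to the 12-unit instruction cost.  */
static_assert (8 * (12.0 / 8) == 12.0, "assumed SVE gather cost");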
16869 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16870 for STMT_INFO, which has cost kind KIND and which when vectorized would
16871 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16872 targets. */
16873 static fractional_cost
16874 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16875 stmt_vec_info stmt_info, tree vectype,
16876 fractional_cost stmt_cost)
16878 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16879 vector register size or number of units. Integer promotions of this
16880 type therefore map to SXT[BHW] or UXT[BHW].
16882 Most loads have extending forms that can do the sign or zero extension
16883 on the fly. Optimistically assume that a load followed by an extension
16884 will fold to this form during combine, and that the extension therefore
16885 comes for free. */
16886 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16887 stmt_cost = 0;
16889 /* For similar reasons, vector_stmt integer truncations are a no-op,
16890 because we can just ignore the unused upper bits of the source. */
16891 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16892 stmt_cost = 0;
16894 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16895 but there are no equivalent instructions for SVE. This means that
16896 (all other things being equal) 128-bit SVE needs twice as many load
16897 and store instructions as Advanced SIMD in order to process vector pairs.
16899 Also, scalar code can often use LDP and STP to access pairs of values,
16900 so it is too simplistic to say that one SVE load or store replaces
16901 VF scalar loads and stores.
16903 Ideally we would account for this in the scalar and Advanced SIMD
16904 costs by making suitable load/store pairs as cheap as a single
16905 load/store. However, that would be a very invasive change and in
16906 practice it tends to stress other parts of the cost model too much.
16907 E.g. stores of scalar constants currently count just a store,
16908 whereas stores of vector constants count a store and a vec_init.
16909 This is an artificial distinction for AArch64, where stores of
16910 nonzero scalar constants need the same kind of register invariant
16911 as vector stores.
16913 An alternative would be to double the cost of any SVE loads and stores
16914 that could be paired in Advanced SIMD (and possibly also paired in
16915 scalar code). But this tends to stress other parts of the cost model
16916 in the same way. It also means that we can fall back to Advanced SIMD
16917 even if full-loop predication would have been useful.
16919 Here we go for a more conservative version: double the costs of SVE
16920 loads and stores if one iteration of the scalar loop processes enough
16921 elements for it to use a whole number of Advanced SIMD LDP or STP
16922 instructions. This makes it very likely that the VF would be 1 for
16923 Advanced SIMD, and so no epilogue should be needed. */
16924 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16926 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16927 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16928 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16929 if (multiple_p (count * elt_bits, 256)
16930 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16931 stmt_cost *= 2;
16934 return stmt_cost;
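/* Worked example (illustrative, numbers assumed): a grouped access of
   4 DImode elements per scalar iteration covers 4 * 64 == 256 bits,
   i.e. a whole number of Advanced SIMD LDP/STP pairs, so the SVE
   load/store cost above is doubled.  A 3-element SImode group covers
   only 96 bits and is left alone.  */
static_assert ((4 * 64) % 256 == 0, "assumed LDP/STP-sized group");
static_assert ((3 * 32) % 256 != 0, "assumed non-LDP/STP-sized group");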
16937 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16938 and which when vectorized would operate on vector type VECTYPE. Add the
16939 cost of any embedded operations. */
16940 static fractional_cost
16941 aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
16942 stmt_vec_info stmt_info, tree vectype,
16943 unsigned vec_flags, fractional_cost stmt_cost)
16945 if (vectype)
16947 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16949 /* Detect cases in which a vector load or store represents an
16950 LD[234] or ST[234] instruction. */
16951 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16953 case 2:
16954 stmt_cost += simd_costs->ld2_st2_permute_cost;
16955 break;
16957 case 3:
16958 stmt_cost += simd_costs->ld3_st3_permute_cost;
16959 break;
16961 case 4:
16962 stmt_cost += simd_costs->ld4_st4_permute_cost;
16963 break;
16966 gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
16967 if ((kind == scalar_stmt || kind == vector_stmt) && assign)
16969 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
16970 if (!vect_is_reduction (stmt_info)
16971 && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
16972 return 0;
16974 /* For vector boolean ANDs with a compare operand we just need
16975 one insn. */
16976 if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
16977 return 0;
16980 if (kind == vector_stmt || kind == vec_to_scalar)
16981 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16983 if (FLOAT_TYPE_P (cmp_type))
16984 stmt_cost += simd_costs->fp_stmt_cost;
16985 else
16986 stmt_cost += simd_costs->int_stmt_cost;
16990 if (kind == scalar_stmt)
16991 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16993 if (FLOAT_TYPE_P (cmp_type))
16994 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16995 else
16996 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16999 return stmt_cost;
17002 /* Return true if STMT_INFO is part of a reduction that has the form:
17004 r = r op ...;
17005 r = r op ...;
17007 with the single accumulator being read and written multiple times. */
17008 static bool
17009 aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
17011 if (!STMT_VINFO_REDUC_DEF (stmt_info))
17012 return false;
17014 auto reduc_info = info_for_reduction (vinfo, stmt_info);
17015 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
17018 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
17019 and they describe an operation in the body of a vector loop. Record issue
17020 information relating to the vector operation in OPS. */
17021 void
17022 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
17023 stmt_vec_info stmt_info,
17024 aarch64_vec_op_count *ops)
17026 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
17027 if (!base_issue)
17028 return;
17029 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
17030 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
17032 /* Calculate the minimum cycles per iteration imposed by a reduction
17033 operation. */
17034 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17035 && vect_is_reduction (stmt_info))
17037 unsigned int base
17038 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
17039 if (aarch64_force_single_cycle (m_vinfo, stmt_info))
17040 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17041 and then accumulate that, but at the moment the loop-carried
17042 dependency includes all copies. */
17043 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
17044 else
17045 ops->reduction_latency = MAX (ops->reduction_latency, base);
17048 if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))
17050 /* Assume that multiply-adds will become a single operation. */
17051 if (aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
17052 return;
17054 /* Assume that bool AND with compare operands will become a single
17055 operation. */
17056 if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
17057 return;
17061 /* Count the basic operation cost associated with KIND. */
17062 switch (kind)
17064 case cond_branch_taken:
17065 case cond_branch_not_taken:
17066 case vector_gather_load:
17067 case vector_scatter_store:
17068 /* We currently don't expect these to be used in a loop body. */
17069 break;
17071 case vec_perm:
17072 case vec_promote_demote:
17073 case vec_construct:
17074 case vec_to_scalar:
17075 case scalar_to_vec:
17076 case vector_stmt:
17077 case scalar_stmt:
17078 ops->general_ops += count;
17079 break;
17081 case scalar_load:
17082 case vector_load:
17083 case unaligned_load:
17084 ops->loads += count;
17085 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17086 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
17087 break;
17089 case vector_store:
17090 case unaligned_store:
17091 case scalar_store:
17092 ops->stores += count;
17093 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17094 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
17095 break;
17098 /* Add any embedded comparison operations. */
17099 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17100 && vect_embedded_comparison_type (stmt_info))
17101 ops->general_ops += count;
17103 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17104 have only accounted for one. */
17105 if ((kind == vector_stmt || kind == vec_to_scalar)
17106 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
17107 ops->general_ops += count;
17109 /* Count the predicate operations needed by an SVE comparison. */
17110 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
17111 if (tree type = vect_comparison_type (stmt_info))
17113 unsigned int base = (FLOAT_TYPE_P (type)
17114 ? sve_issue->fp_cmp_pred_ops
17115 : sve_issue->int_cmp_pred_ops);
17116 ops->pred_ops += base * count;
17119 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17120 if (simd_issue)
17121 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
17123 case 2:
17124 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
17125 break;
17127 case 3:
17128 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
17129 break;
17131 case 4:
17132 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
17133 break;
17136 /* Add any overhead associated with gather loads and scatter stores. */
17137 if (sve_issue
17138 && (kind == scalar_load || kind == scalar_store)
17139 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
17141 unsigned int pairs = CEIL (count, 2);
17142 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
17143 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
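/* Worked example (illustrative, numbers assumed): an SVE gather of 8
   elements is costed above as CEIL (8, 2) == 4 element pairs, so it adds
   4 * gather_scatter_pair_pred_ops predicate operations and
   4 * gather_scatter_pair_general_ops general operations.  */
static_assert (CEIL (8, 2) == 4, "assumed gather element pairs");
static_assert (CEIL (7, 2) == 4, "assumed rounding up for odd counts");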
17147 /* Return true if STMT_INFO contains a memory access and if the constant
17148 component of the memory address is aligned to SIZE bytes. */
17149 static bool
17150 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
17151 poly_uint64 size)
17153 if (!STMT_VINFO_DATA_REF (stmt_info))
17154 return false;
17156 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
17157 stmt_info = first_stmt;
17158 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
17159 /* Needed for gathers & scatters, for example. */
17160 if (!constant_offset)
17161 return false;
17163 return multiple_p (wi::to_poly_offset (constant_offset), size);
17166 /* Check if a scalar or vector stmt could be part of a region of code
17167 that does nothing more than store values to memory, in the scalar
17168 case using STP. Return the cost of the stmt if so, counting 2 for
17169 one instruction. Return ~0U otherwise.
17171 The arguments are a subset of those passed to add_stmt_cost. */
17172 unsigned int
17173 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
17174 stmt_vec_info stmt_info, tree vectype)
17176 /* Code that stores vector constants uses a vector_load to create
17177 the constant. We don't apply the heuristic to that case for two
17178 main reasons:
17180 - At the moment, STPs are only formed via peephole2, and the
17181 constant scalar moves would often come between STRs and so
17182 prevent STP formation.
17184 - The scalar code also has to load the constant somehow, and that
17185 isn't costed. */
17186 switch (kind)
17188 case scalar_to_vec:
17189 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17190 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
17192 case vec_construct:
17193 if (FLOAT_TYPE_P (vectype))
17194 /* Count 1 insn for the maximum number of FP->SIMD INS
17195 instructions. */
17196 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
17198 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17199 maximum number of GPR->SIMD INS instructions. */
17200 return vect_nunits_for_cost (vectype) * 4 * count;
17202 case vector_store:
17203 case unaligned_store:
17204 /* Count 1 insn per vector if we can't form STP Q pairs. */
17205 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17206 return count * 2;
17208 if (stmt_info)
17210 /* Assume we won't be able to use STP if the constant offset
17211 component of the address is misaligned. ??? This could be
17212 removed if we formed STP pairs earlier, rather than relying
17213 on peephole2. */
17214 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
17215 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17216 return count * 2;
17218 return CEIL (count, 2) * 2;
17220 case scalar_store:
17221 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
17223 /* Check for a mode in which STP pairs can be formed. */
17224 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
17225 if (maybe_ne (size, 4) && maybe_ne (size, 8))
17226 return ~0U;
17228 /* Assume we won't be able to use STP if the constant offset
17229 component of the address is misaligned. ??? This could be
17230 removed if we formed STP pairs earlier, rather than relying
17231 on peephole2. */
17232 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17233 return ~0U;
17235 return count;
17237 default:
17238 return ~0U;
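/* Worked example (illustrative, all counts assumed): storing a GPR value
   to four consecutive 64-bit slots is costed here at 4 units as scalar
   code (two STP instructions at 2 units each), whereas a V2DImode vector
   version needs a GPR->SIMD dup (4 units) plus paired Q stores (2 units),
   so the scalar sequence is cheaper and the comparison in finish_cost
   ends up preferring it.  */
static_assert (4 < 4 + 2, "assumed STP sequence comparison");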
17242 unsigned
17243 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
17244 stmt_vec_info stmt_info, slp_tree,
17245 tree vectype, int misalign,
17246 vect_cost_model_location where)
17248 fractional_cost stmt_cost
17249 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
17251 bool in_inner_loop_p = (where == vect_body
17252 && stmt_info
17253 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
17255 /* Do one-time initialization based on the vinfo. */
17256 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17257 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
17259 if (loop_vinfo)
17260 analyze_loop_vinfo (loop_vinfo);
17262 m_analyzed_vinfo = true;
17265 /* Apply the heuristic described above m_stp_sequence_cost. */
17266 if (m_stp_sequence_cost != ~0U)
17268 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
17269 stmt_info, vectype);
17270 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
17273 /* Try to get a more accurate cost by looking at STMT_INFO instead
17274 of just looking at KIND. */
17275 if (stmt_info && aarch64_use_new_vector_costs_p ())
17277 /* If we scalarize a strided store, the vectorizer costs one
17278 vec_to_scalar for each element. However, we can store the first
17279 element using an FP store without a separate extract step. */
17280 if (vect_is_store_elt_extraction (kind, stmt_info))
17281 count -= 1;
17283 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
17284 stmt_info, stmt_cost);
17286 if (vectype && m_vec_flags)
17287 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
17288 stmt_info, vectype,
17289 where, stmt_cost);
17292 /* Do any SVE-specific adjustments to the cost. */
17293 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
17294 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17295 vectype, stmt_cost);
17297 /* Vector promotion and demotion requires us to widen the operation first
17298 and only after that perform the conversion. Unfortunately the mid-end
17299 expects this to be doable as a single operation and doesn't pass on
17300 enough context here for us to tell which operation is happening. To
17301 account for this we count every promote-demote operation twice, and if
17302 the previously costed operation was also a promote-demote we reduce
17303 the cost of the operation currently being costed to simulate the final
17304 conversion cost. Note that for SVE we can do better here if the converted
17305 value comes from a load since the widening load would consume the widening
17306 operations. However since we're in stage 3 we can't change the helper
17307 vect_is_extending_load and duplicating the code seems not useful. */
17308 gassign *assign = NULL;
17309 if (kind == vec_promote_demote
17310 && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
17311 && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
17313 auto new_count = count * 2 - m_num_last_promote_demote;
17314 m_num_last_promote_demote = count;
17315 count = new_count;
17317 else
17318 m_num_last_promote_demote = 0;
17320 if (stmt_info && aarch64_use_new_vector_costs_p ())
17322 /* Account for any extra "embedded" costs that apply additively
17323 to the base cost calculated above. */
17324 stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17325 vectype, m_vec_flags, stmt_cost);
17327 /* If we're recording a nonzero vector loop body cost for the
17328 innermost loop, also estimate the operations that would need
17329 to be issued by all relevant implementations of the loop. */
17330 if (loop_vinfo
17331 && (m_costing_for_scalar || where == vect_body)
17332 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
17333 && stmt_cost != 0)
17334 for (auto &ops : m_ops)
17335 count_ops (count, kind, stmt_info, &ops);
17337 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17338 estimate the number of statements in the unrolled Advanced SIMD
17339 loop. For simplicity, we assume that one iteration of the
17340 Advanced SIMD loop would need the same number of statements
17341 as one iteration of the SVE loop. */
17342 if (where == vect_body && m_unrolled_advsimd_niters)
17343 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
17345 /* Detect the use of an averaging operation. */
17346 gimple *stmt = stmt_info->stmt;
17347 if (is_gimple_call (stmt)
17348 && gimple_call_internal_p (stmt))
17350 switch (gimple_call_internal_fn (stmt))
17352 case IFN_AVG_FLOOR:
17353 case IFN_AVG_CEIL:
17354 m_has_avg = true;
17355 default:
17356 break;
17361 /* If the statement stores to a decl that is known to be the argument
17362 to a vld1 in the same function, ignore the store for costing purposes.
17363 See the comment above m_stores_to_vector_load_decl for more details. */
17364 if (stmt_info
17365 && (kind == vector_store || kind == unaligned_store)
17366 && aarch64_accesses_vector_load_decl_p (stmt_info))
17368 stmt_cost = 0;
17369 m_stores_to_vector_load_decl = true;
17372 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
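/* Worked example (illustrative, counts assumed) of the promote/demote
   adjustment above: the first FLOAT_EXPR promotion costed with COUNT == 2
   becomes 2 * 2 - 0 == 4 copies, and an immediately following one becomes
   2 * 2 - 2 == 2, simulating the shared final conversion step.  */
static_assert (2 * 2 - 0 == 4, "assumed first promote/demote count");
static_assert (2 * 2 - 2 == 2, "assumed follow-on promote/demote count");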
17375 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17376 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17377 says that we should prefer the Advanced SIMD loop. */
17378 bool
17379 aarch64_vector_costs::prefer_unrolled_loop () const
17381 if (!m_unrolled_advsimd_stmts)
17382 return false;
17384 if (dump_enabled_p ())
17385 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
17386 " unrolled Advanced SIMD loop = "
17387 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
17388 m_unrolled_advsimd_stmts);
17390 /* The balance here is tricky. On the one hand, we can't be sure whether
17391 the code is vectorizable with Advanced SIMD or not. However, even if
17392 it isn't vectorizable with Advanced SIMD, there's a possibility that
17393 the scalar code could also be unrolled. Some of the code might then
17394 benefit from SLP, or from using LDP and STP. We therefore apply
17395 the heuristic regardless of can_use_advsimd_p. */
17396 return (m_unrolled_advsimd_stmts
17397 && (m_unrolled_advsimd_stmts
17398 <= (unsigned int) param_max_completely_peeled_insns));
17401 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
17402 how fast the SVE code can be issued and compare it to the equivalent value
17403 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
17404 also compare it to the issue rate of Advanced SIMD code
17405 (ADVSIMD_CYCLES_PER_ITER).
17407 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17408 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17409 is true if we think the loop body is too expensive. */
17411 fractional_cost
17412 aarch64_vector_costs::
17413 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
17414 fractional_cost scalar_cycles_per_iter,
17415 unsigned int orig_body_cost, unsigned int *body_cost,
17416 bool *should_disparage)
17418 if (dump_enabled_p ())
17419 ops->dump ();
17421 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
17422 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
17424 /* If the scalar version of the loop could issue at least as
17425 quickly as the predicate parts of the SVE loop, make the SVE loop
17426 prohibitively expensive. In this case vectorization is adding an
17427 overhead that the original scalar code didn't have.
17429 This is mostly intended to detect cases in which WHILELOs dominate
17430 for very tight loops, which is something that normal latency-based
17431 costs would not model. Adding this kind of cliffedge would be
17432 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17433 code in the caller handles that case in a more conservative way. */
17434 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17435 if (scalar_cycles_per_iter < sve_estimate)
17437 unsigned int min_cost
17438 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17439 if (*body_cost < min_cost)
17441 if (dump_enabled_p ())
17442 dump_printf_loc (MSG_NOTE, vect_location,
17443 "Increasing body cost to %d because the"
17444 " scalar code could issue within the limit"
17445 " imposed by predicate operations\n",
17446 min_cost);
17447 *body_cost = min_cost;
17448 *should_disparage = true;
17452 return sve_cycles_per_iter;
17455 unsigned int
17456 aarch64_vector_costs::determine_suggested_unroll_factor ()
17458 bool sve = m_vec_flags & VEC_ANY_SVE;
17459 /* If we are trying to unroll an Advanced SIMD main loop that contains
17460 an averaging operation that we do not support with SVE, and we might use
17461 a predicated epilogue, be conservative and block unrolling, since it
17462 could leave a less optimal loop for the first and only epilogue, which
17463 uses the original loop's vectorization factor.
17464 TODO: Remove this constraint when we add support for multiple epilogue
17465 vectorization. */
17466 if (!sve && !TARGET_SVE2 && m_has_avg)
17467 return 1;
17469 unsigned int max_unroll_factor = 1;
17470 for (auto vec_ops : m_ops)
17472 aarch64_simd_vec_issue_info const *vec_issue
17473 = vec_ops.simd_issue_info ();
17474 if (!vec_issue)
17475 return 1;
17476 /* Limit the unroll factor to a value the user can adjust; the default
17477 value is 4. */
17478 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17479 unsigned int factor
17480 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17481 unsigned int temp;
17483 /* Sanity check; this should never happen. */
17484 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17485 return 1;
17487 /* Check stores. */
17488 if (vec_ops.stores > 0)
17490 temp = CEIL (factor * vec_issue->stores_per_cycle,
17491 vec_ops.stores);
17492 unroll_factor = MIN (unroll_factor, temp);
17495 /* Check loads + stores. */
17496 if (vec_ops.loads > 0)
17498 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17499 vec_ops.loads + vec_ops.stores);
17500 unroll_factor = MIN (unroll_factor, temp);
17503 /* Check general ops. */
17504 if (vec_ops.general_ops > 0)
17506 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17507 vec_ops.general_ops);
17508 unroll_factor = MIN (unroll_factor, temp);
17510 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17513 /* Make sure unroll factor is power of 2. */
17514 return 1 << ceil_log2 (max_unroll_factor);
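/* Worked example (illustrative, all numbers assumed): with one store, one
   load and two general ops per vector iteration, issue rates of 2 stores,
   3 loads+stores and 4 general ops per cycle, no multi-cycle reduction
   (FACTOR == 1) and an unroll limit of at least 2, each limit above comes
   out as 2, so the suggested unroll factor is 2 (already a power of 2).  */
static_assert (CEIL (1 * 2, 1) == 2, "assumed store-limited factor");
static_assert (CEIL (1 * 3, 1 + 1) == 2, "assumed load/store-limited factor");
static_assert (CEIL (1 * 4, 2) == 2, "assumed general-op-limited factor");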
17517 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17518 and return the new cost. */
17519 unsigned int
17520 aarch64_vector_costs::
17521 adjust_body_cost (loop_vec_info loop_vinfo,
17522 const aarch64_vector_costs *scalar_costs,
17523 unsigned int body_cost)
17525 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17526 return body_cost;
17528 const auto &scalar_ops = scalar_costs->m_ops[0];
17529 const auto &vector_ops = m_ops[0];
17530 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17531 unsigned int orig_body_cost = body_cost;
17532 bool should_disparage = false;
17534 if (dump_enabled_p ())
17535 dump_printf_loc (MSG_NOTE, vect_location,
17536 "Original vector body cost = %d\n", body_cost);
17538 fractional_cost scalar_cycles_per_iter
17539 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17541 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17543 if (dump_enabled_p ())
17545 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17546 dump_printf_loc (MSG_NOTE, vect_location,
17547 "Vector loop iterates at most %wd times\n",
17548 m_num_vector_iterations);
17549 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17550 scalar_ops.dump ();
17551 dump_printf_loc (MSG_NOTE, vect_location,
17552 " estimated cycles per vector iteration"
17553 " (for VF %d) = %f\n",
17554 estimated_vf, scalar_cycles_per_iter.as_double ());
17557 if (vector_ops.sve_issue_info ())
17559 if (dump_enabled_p ())
17560 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17561 vector_cycles_per_iter
17562 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17563 orig_body_cost, &body_cost, &should_disparage);
17565 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17567 /* Also take Neoverse V1 tuning into account, doubling the
17568 scalar and Advanced SIMD estimates to account for the
17569 doubling in SVE vector length. */
17570 if (dump_enabled_p ())
17571 dump_printf_loc (MSG_NOTE, vect_location,
17572 "Neoverse V1 estimate:\n");
17573 auto vf_factor = m_ops[1].vf_factor ();
17574 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17575 orig_body_cost, &body_cost, &should_disparage);
17578 else
17580 if (dump_enabled_p ())
17582 dump_printf_loc (MSG_NOTE, vect_location,
17583 "Vector issue estimate:\n");
17584 vector_ops.dump ();
17588 /* Decide whether to stick to latency-based costs or whether to try to
17589 take issue rates into account. */
17590 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17591 if (m_vec_flags & VEC_ANY_SVE)
17592 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17594 if (m_num_vector_iterations >= 1
17595 && m_num_vector_iterations < threshold)
17597 if (dump_enabled_p ())
17598 dump_printf_loc (MSG_NOTE, vect_location,
17599 "Low iteration count, so using pure latency"
17600 " costs\n");
17602 /* Increase the cost of the vector code if it looks like the scalar code
17603 could issue more quickly. These values are only rough estimates,
17604 so minor differences should only result in minor changes. */
17605 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17607 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17608 scalar_cycles_per_iter);
17609 if (dump_enabled_p ())
17610 dump_printf_loc (MSG_NOTE, vect_location,
17611 "Increasing body cost to %d because scalar code"
17612 " would issue more quickly\n", body_cost);
17614 /* In general, it's expected that the proposed vector code would be able
17615 to issue more quickly than the original scalar code. This should
17616 already be reflected to some extent in the latency-based costs.
17618 However, the latency-based costs effectively assume that the scalar
17619 code and the vector code execute serially, which tends to underplay
17620 one important case: if the real (non-serialized) execution time of
17621 a scalar iteration is dominated by loop-carried dependencies,
17622 and if the vector code is able to reduce both the length of
17623 the loop-carried dependencies *and* the number of cycles needed
17624 to issue the code in general, we can be more confident that the
17625 vector code is an improvement, even if adding the other (non-loop-carried)
17626 latencies tends to hide this saving. We therefore reduce the cost of the
17627 vector loop body in proportion to the saving. */
17628 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17629 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17630 && scalar_cycles_per_iter > vector_cycles_per_iter
17631 && !should_disparage)
17633 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17634 scalar_cycles_per_iter);
17635 if (dump_enabled_p ())
17636 dump_printf_loc (MSG_NOTE, vect_location,
17637 "Decreasing body cost to %d account for smaller"
17638 " reduction latency\n", body_cost);
17641 return body_cost;
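/* Worked example (illustrative, numbers assumed) of the scaling above: if
   the scalar code could issue one vector iteration's worth of work in 4
   cycles but the vector code needs 6, a body cost of 20 is scaled by 6/4
   up to 30; the reduction-latency case scales the cost down by the same
   kind of ratio.  */
static_assert (20 * 6 / 4 == 30, "assumed issue-rate scaling");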
17644 void
17645 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17647 /* Record the issue information for any SVE WHILE instructions that the
17648 loop needs. */
17649 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17650 if (!m_ops.is_empty ()
17651 && loop_vinfo
17652 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
17654 unsigned int num_masks = 0;
17655 rgroup_controls *rgm;
17656 unsigned int num_vectors_m1;
17657 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
17658 num_vectors_m1, rgm)
17659 if (rgm->type)
17660 num_masks += num_vectors_m1 + 1;
17661 for (auto &ops : m_ops)
17662 if (auto *issue = ops.sve_issue_info ())
17663 ops.pred_ops += num_masks * issue->while_pred_ops;
17666 auto *scalar_costs
17667 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17668 if (loop_vinfo
17669 && m_vec_flags
17670 && aarch64_use_new_vector_costs_p ())
17672 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17673 m_costs[vect_body]);
17674 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17677 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17678 the scalar code in the event of a tie, since there is more chance
17679 of scalar code being optimized with surrounding operations.
17681 In addition, if the vector body is a simple store to a decl that
17682 is elsewhere loaded using vld1, strongly prefer the vector form,
17683 to the extent of giving the prologue a zero cost. See the comment
17684 above m_stores_to_vector_load_decl for details. */
17685 if (!loop_vinfo
17686 && scalar_costs
17687 && m_stp_sequence_cost != ~0U)
17689 if (m_stores_to_vector_load_decl)
17690 m_costs[vect_prologue] = 0;
17691 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17692 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17695 vector_costs::finish_cost (scalar_costs);
17698 bool
17699 aarch64_vector_costs::
17700 better_main_loop_than_p (const vector_costs *uncast_other) const
17702 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17704 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17705 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17707 if (dump_enabled_p ())
17708 dump_printf_loc (MSG_NOTE, vect_location,
17709 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17710 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17711 vect_vf_for_cost (this_loop_vinfo),
17712 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17713 vect_vf_for_cost (other_loop_vinfo));
17715 /* Apply the unrolling heuristic described above
17716 m_unrolled_advsimd_niters. */
17717 if (bool (m_unrolled_advsimd_stmts)
17718 != bool (other->m_unrolled_advsimd_stmts))
17720 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17721 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17722 if (this_prefer_unrolled != other_prefer_unrolled)
17724 if (dump_enabled_p ())
17725 dump_printf_loc (MSG_NOTE, vect_location,
17726 "Preferring Advanced SIMD loop because"
17727 " it can be unrolled\n");
17728 return other_prefer_unrolled;
17732 for (unsigned int i = 0; i < m_ops.length (); ++i)
17734 if (dump_enabled_p ())
17736 if (i)
17737 dump_printf_loc (MSG_NOTE, vect_location,
17738 "Reconsidering with subtuning %d\n", i);
17739 dump_printf_loc (MSG_NOTE, vect_location,
17740 "Issue info for %s loop:\n",
17741 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17742 this->m_ops[i].dump ();
17743 dump_printf_loc (MSG_NOTE, vect_location,
17744 "Issue info for %s loop:\n",
17745 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17746 other->m_ops[i].dump ();
17749 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17750 * this->m_ops[i].vf_factor ());
17751 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17752 * other->m_ops[i].vf_factor ());
17754 /* If it appears that one loop could process the same amount of data
17755 in fewer cycles, prefer that loop over the other one. */
17756 fractional_cost this_cost
17757 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17758 fractional_cost other_cost
17759 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17760 if (dump_enabled_p ())
17762 dump_printf_loc (MSG_NOTE, vect_location,
17763 "Weighted cycles per iteration of %s loop ~= %f\n",
17764 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17765 this_cost.as_double ());
17766 dump_printf_loc (MSG_NOTE, vect_location,
17767 "Weighted cycles per iteration of %s loop ~= %f\n",
17768 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17769 other_cost.as_double ());
17771 if (this_cost != other_cost)
17773 if (dump_enabled_p ())
17774 dump_printf_loc (MSG_NOTE, vect_location,
17775 "Preferring loop with lower cycles"
17776 " per iteration\n");
17777 return this_cost < other_cost;
17780 /* If the issue rate of SVE code is limited by predicate operations
17781 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17782 and if Advanced SIMD code could issue within the limit imposed
17783 by the predicate operations, the predicate operations are adding an
17784 overhead that the original code didn't have and so we should prefer
17785 the Advanced SIMD version. */
17786 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17787 const aarch64_vec_op_count &b) -> bool
17789 if (a.pred_ops == 0
17790 && (b.min_pred_cycles_per_iter ()
17791 > b.min_nonpred_cycles_per_iter ()))
17793 if (dump_enabled_p ())
17794 dump_printf_loc (MSG_NOTE, vect_location,
17795 "Preferring Advanced SIMD loop since"
17796 " SVE loop is predicate-limited\n");
17797 return true;
17799 return false;
17801 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17802 return true;
17803 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17804 return false;
17807 return vector_costs::better_main_loop_than_p (other);
17810 static void initialize_aarch64_code_model (struct gcc_options *);
17812 /* Parse the TO_PARSE string and put the architecture struct that it
17813 selects into RES and the architectural features into ISA_FLAGS.
17814 Return an aarch_parse_opt_result describing the parse result.
17815 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17816 When the TO_PARSE string contains an invalid extension,
17817 a copy of the string is created and stored to INVALID_EXTENSION. */
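/* For example (purely illustrative), an -march value of "armv8.2-a+sve"
   would match the "armv8.2-a" entry in all_architectures and then hand
   "+sve" to aarch64_parse_extension to update the ISA flags.  */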
17819 static enum aarch_parse_opt_result
17820 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17821 aarch64_feature_flags *isa_flags,
17822 std::string *invalid_extension)
17824 const char *ext;
17825 const struct processor *arch;
17826 size_t len;
17828 ext = strchr (to_parse, '+');
17830 if (ext != NULL)
17831 len = ext - to_parse;
17832 else
17833 len = strlen (to_parse);
17835 if (len == 0)
17836 return AARCH_PARSE_MISSING_ARG;
17839 /* Loop through the list of supported ARCHes to find a match. */
17840 for (arch = all_architectures; arch->name != NULL; arch++)
17842 if (strlen (arch->name) == len
17843 && strncmp (arch->name, to_parse, len) == 0)
17845 auto isa_temp = arch->flags;
17847 if (ext != NULL)
17849 /* TO_PARSE string contains at least one extension. */
17850 enum aarch_parse_opt_result ext_res
17851 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17853 if (ext_res != AARCH_PARSE_OK)
17854 return ext_res;
17856 /* Extension parsing was successful. Confirm the result
17857 arch and ISA flags. */
17858 *res = arch;
17859 *isa_flags = isa_temp;
17860 return AARCH_PARSE_OK;
17864 /* ARCH name not found in list. */
17865 return AARCH_PARSE_INVALID_ARG;
17868 /* Parse the TO_PARSE string and put the CPU struct that it selects into RES
17869 and the architectural features into ISA_FLAGS. Return an aarch_parse_opt_result
17870 describing the parse result. If there is an error parsing, RES and
17871 ISA_FLAGS are left unchanged.
17872 When the TO_PARSE string contains an invalid extension,
17873 a copy of the string is created and stored to INVALID_EXTENSION. */
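/* For example (purely illustrative), an -mcpu value of "cortex-a76+crypto"
   would match the "cortex-a76" entry in all_cores and then hand "+crypto"
   to aarch64_parse_extension.  */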
17875 static enum aarch_parse_opt_result
17876 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17877 aarch64_feature_flags *isa_flags,
17878 std::string *invalid_extension)
17880 const char *ext;
17881 const struct processor *cpu;
17882 size_t len;
17884 ext = strchr (to_parse, '+');
17886 if (ext != NULL)
17887 len = ext - to_parse;
17888 else
17889 len = strlen (to_parse);
17891 if (len == 0)
17892 return AARCH_PARSE_MISSING_ARG;
17895 /* Loop through the list of supported CPUs to find a match. */
17896 for (cpu = all_cores; cpu->name != NULL; cpu++)
17898 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17900 auto isa_temp = cpu->flags;
17902 if (ext != NULL)
17904 /* TO_PARSE string contains at least one extension. */
17905 enum aarch_parse_opt_result ext_res
17906 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17908 if (ext_res != AARCH_PARSE_OK)
17909 return ext_res;
17911 /* Extension parsing was successful. Confirm the result
17912 cpu and ISA flags. */
17913 *res = cpu;
17914 *isa_flags = isa_temp;
17915 return AARCH_PARSE_OK;
17919 /* CPU name not found in list. */
17920 return AARCH_PARSE_INVALID_ARG;
17923 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17924 Return an aarch_parse_opt_result describing the parse result.
17925 If the parsing fails, RES is left unchanged. */
17927 static enum aarch_parse_opt_result
17928 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17930 const struct processor *cpu;
17932 /* Loop through the list of supported CPUs to find a match. */
17933 for (cpu = all_cores; cpu->name != NULL; cpu++)
17935 if (strcmp (cpu->name, to_parse) == 0)
17937 *res = cpu;
17938 return AARCH_PARSE_OK;
17942 /* CPU name not found in list. */
17943 return AARCH_PARSE_INVALID_ARG;
17946 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17947 described in FLAG. If it is, return the index bit for that fusion type.
17948 If not, error (printing OPTION_NAME) and return zero. */
17950 static unsigned int
17951 aarch64_parse_one_option_token (const char *token,
17952 size_t length,
17953 const struct aarch64_flag_desc *flag,
17954 const char *option_name)
17956 for (; flag->name != NULL; flag++)
17958 if (length == strlen (flag->name)
17959 && !strncmp (flag->name, token, length))
17960 return flag->flag;
17963 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17964 return 0;
17967 /* Parse OPTION which is a dot-separated list of flags to enable.
17968 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17969 default state we inherit from the CPU tuning structures. OPTION_NAME
17970 gives the top-level option we are parsing in the -moverride string,
17971 for use in error messages. */
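/* For instance (purely illustrative), "-moverride=fuse=adrp+add" would add
   ADRP/ADD fusion on top of whatever the selected tuning already enables,
   while "fuse=none.adrp+add" would enable ADRP/ADD fusion alone.  */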
17973 static unsigned int
17974 aarch64_parse_boolean_options (const char *option,
17975 const struct aarch64_flag_desc *flags,
17976 unsigned int initial_state,
17977 const char *option_name)
17979 const char separator = '.';
17980 const char* specs = option;
17981 const char* ntoken = option;
17982 unsigned int found_flags = initial_state;
17984 while ((ntoken = strchr (specs, separator)))
17986 size_t token_length = ntoken - specs;
17987 unsigned token_ops = aarch64_parse_one_option_token (specs,
17988 token_length,
17989 flags,
17990 option_name);
17991 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17992 in the token stream, reset the supported operations. So:
17994 adrp+add.cmp+branch.none.adrp+add
17996 would have the result of turning on only adrp+add fusion. */
17997 if (!token_ops)
17998 found_flags = 0;
18000 found_flags |= token_ops;
18001 specs = ++ntoken;
18004 /* The string ended with a separator (or was empty); report an error. */
18005 if (!(*specs))
18007 error ("%qs string ill-formed", option_name);
18008 return 0;
18011 /* We still have one more token to parse. */
18012 size_t token_length = strlen (specs);
18013 unsigned token_ops = aarch64_parse_one_option_token (specs,
18014 token_length,
18015 flags,
18016 option_name);
18017 if (!token_ops)
18018 found_flags = 0;
18020 found_flags |= token_ops;
18021 return found_flags;
18024 /* Support for overriding instruction fusion. */
18026 static void
18027 aarch64_parse_fuse_string (const char *fuse_string,
18028 struct tune_params *tune)
18030 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
18031 aarch64_fusible_pairs,
18032 tune->fusible_ops,
18033 "fuse=");
18036 /* Support for overriding other tuning flags. */
18038 static void
18039 aarch64_parse_tune_string (const char *tune_string,
18040 struct tune_params *tune)
18042 tune->extra_tuning_flags
18043 = aarch64_parse_boolean_options (tune_string,
18044 aarch64_tuning_flags,
18045 tune->extra_tuning_flags,
18046 "tune=");
18049 /* Parse the sve_width -moverride string in TUNE_STRING.
18050 Accept the valid SVE vector widths allowed by
18051 aarch64_sve_vector_bits_enum and use it to override sve_width
18052 in TUNE. */
18054 static void
18055 aarch64_parse_sve_width_string (const char *tune_string,
18056 struct tune_params *tune)
18058 int width = -1;
18060 int n = sscanf (tune_string, "%d", &width);
18061 if (n == EOF)
18063 error ("invalid format for %<sve_width%>");
18064 return;
18066 switch (width)
18068 case SVE_128:
18069 case SVE_256:
18070 case SVE_512:
18071 case SVE_1024:
18072 case SVE_2048:
18073 break;
18074 default:
18075 error ("invalid %<sve_width%> value: %d", width);
18077 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
18080 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
18081 we understand. If it is, extract the option string and hand it off to
18082 the appropriate function. */
18084 void
18085 aarch64_parse_one_override_token (const char* token,
18086 size_t length,
18087 struct tune_params *tune)
18089 const struct aarch64_tuning_override_function *fn
18090 = aarch64_tuning_override_functions;
18092 const char *option_part = strchr (token, '=');
18093 if (!option_part)
18095 error ("tuning string missing in option (%s)", token);
18096 return;
18099 /* Get the length of the option name. */
18100 length = option_part - token;
18101 /* Skip the '=' to get to the option string. */
18102 option_part++;
18104 for (; fn->name != NULL; fn++)
18106 if (!strncmp (fn->name, token, length))
18108 fn->parse_override (option_part, tune);
18109 return;
18113 error ("unknown tuning option (%s)", token);
18114 return;
18117 /* Set the default TLS size and clamp it to the maximum allowed by the code model in OPTS. */
18119 static void
18120 initialize_aarch64_tls_size (struct gcc_options *opts)
18122 if (aarch64_tls_size == 0)
18123 aarch64_tls_size = 24;
18125 switch (opts->x_aarch64_cmodel_var)
18127 case AARCH64_CMODEL_TINY:
18128 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
18129 needs two instructions to address, so we clamp the size to 24. */
18130 if (aarch64_tls_size > 24)
18131 aarch64_tls_size = 24;
18132 break;
18133 case AARCH64_CMODEL_SMALL:
18134 /* The maximum TLS size allowed under small is 4G. */
18135 if (aarch64_tls_size > 32)
18136 aarch64_tls_size = 32;
18137 break;
18138 case AARCH64_CMODEL_LARGE:
18139 /* The maximum TLS size allowed under large is 16E.
18140 FIXME: 16E would need a 64-bit offset, but we only support a 48-bit offset for now. */
18141 if (aarch64_tls_size > 48)
18142 aarch64_tls_size = 48;
18143 break;
18144 default:
18145 gcc_unreachable ();
18148 return;
18151 /* Return the CPU corresponding to the enum CPU. */
18153 static const struct processor *
18154 aarch64_get_tune_cpu (enum aarch64_processor cpu)
18156 gcc_assert (cpu != aarch64_none);
18158 return &all_cores[cpu];
18161 /* Return the architecture corresponding to the enum ARCH. */
18163 static const struct processor *
18164 aarch64_get_arch (enum aarch64_arch arch)
18166 gcc_assert (arch != aarch64_no_arch);
18168 return &all_architectures[arch];
18171 /* Parse STRING looking for options in the format:
18172 string :: option:string
18173 option :: name=substring
18174 name :: {a-z}
18175 substring :: defined by option. */
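/* A complete (illustrative) -moverride string might therefore look like
   "fuse=adrp+add:sve_width=256": it is split at ':' into "fuse=adrp+add"
   and "sve_width=256", each of which is handled by
   aarch64_parse_one_override_token.  */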
18177 static void
18178 aarch64_parse_override_string (const char* input_string,
18179 struct tune_params* tune)
18181 const char separator = ':';
18182 size_t string_length = strlen (input_string) + 1;
18183 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
18184 char *string = string_root;
18185 strncpy (string, input_string, string_length);
18186 string[string_length - 1] = '\0';
18188 char* ntoken = string;
18190 while ((ntoken = strchr (string, separator)))
18192 size_t token_length = ntoken - string;
18193 /* Make this substring look like a string. */
18194 *ntoken = '\0';
18195 aarch64_parse_one_override_token (string, token_length, tune);
18196 string = ++ntoken;
18199 /* One last option to parse. */
18200 aarch64_parse_one_override_token (string, strlen (string), tune);
18201 free (string_root);
18204 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18205 are best for a generic target with the currently-enabled architecture
18206 extensions. */
18207 static void
18208 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
18210 /* Neoverse V1 is the only core that is known to benefit from
18211 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18212 point enabling it for SVE2 and above. */
18213 if (TARGET_SVE2)
18214 current_tune.extra_tuning_flags
18215 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
18218 static void
18219 aarch64_override_options_after_change_1 (struct gcc_options *opts)
18221 /* PR 70044: We have to be careful about being called multiple times for the
18222 same function. This means all changes should be repeatable. */
18224 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18225 Disable the frame pointer flag so the mid-end will not use a frame
18226 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18227 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18228 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18229 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
18230 if (opts->x_flag_omit_frame_pointer == 0)
18231 opts->x_flag_omit_frame_pointer = 2;
18233 /* If not optimizing for size, set the default
18234 alignment to what the target wants. */
18235 if (!opts->x_optimize_size)
18237 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
18238 opts->x_str_align_loops = aarch64_tune_params.loop_align;
18239 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
18240 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
18241 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
18242 opts->x_str_align_functions = aarch64_tune_params.function_align;
18245 /* We default to no pc-relative literal loads. */
18247 aarch64_pcrelative_literal_loads = false;
18249 /* If -mpc-relative-literal-loads is set on the command line, this
18250 implies that the user asked for PC relative literal loads. */
18251 if (opts->x_pcrelative_literal_loads == 1)
18252 aarch64_pcrelative_literal_loads = true;
18254 /* In the tiny memory model it makes no sense to disallow PC relative
18255 literal pool loads. */
18256 if (aarch64_cmodel == AARCH64_CMODEL_TINY
18257 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
18258 aarch64_pcrelative_literal_loads = true;
18260 /* When enabling the lower precision Newton series for the square root, also
18261 enable it for the reciprocal square root, since the latter is an
18262 intermediary step for the former. */
18263 if (flag_mlow_precision_sqrt)
18264 flag_mrecip_low_precision_sqrt = true;
18267 /* 'Unpack' the internal tuning structs and update the options
18268 in OPTS. The caller must have set up selected_tune and selected_arch
18269 as all the other target-specific codegen decisions are
18270 derived from them. */
18272 void
18273 aarch64_override_options_internal (struct gcc_options *opts)
18275 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
18276 aarch64_tune_flags = tune->flags;
18277 aarch64_tune = tune->sched_core;
18278 /* Make a copy of the tuning parameters attached to the core, which
18279 we may later overwrite. */
18280 aarch64_tune_params = *(tune->tune);
18281 if (tune->tune == &generic_tunings)
18282 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
18284 if (opts->x_aarch64_override_tune_string)
18285 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
18286 &aarch64_tune_params);
18288 if (opts->x_aarch64_ldp_policy_param)
18289 aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;
18291 if (opts->x_aarch64_stp_policy_param)
18292 aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;
18294 /* This target defaults to strict volatile bitfields. */
18295 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
18296 opts->x_flag_strict_volatile_bitfields = 1;
18298 if (aarch64_stack_protector_guard == SSP_GLOBAL
18299 && opts->x_aarch64_stack_protector_guard_offset_str)
18301 error ("incompatible options %<-mstack-protector-guard=global%> and "
18302 "%<-mstack-protector-guard-offset=%s%>",
18303 aarch64_stack_protector_guard_offset_str);
18306 if (aarch64_stack_protector_guard == SSP_SYSREG
18307 && !(opts->x_aarch64_stack_protector_guard_offset_str
18308 && opts->x_aarch64_stack_protector_guard_reg_str))
18310 error ("both %<-mstack-protector-guard-offset%> and "
18311 "%<-mstack-protector-guard-reg%> must be used "
18312 "with %<-mstack-protector-guard=sysreg%>");
18315 if (opts->x_aarch64_stack_protector_guard_reg_str)
18317 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
18318 error ("specify a system register with a small string length");
18321 if (opts->x_aarch64_stack_protector_guard_offset_str)
18323 char *end;
18324 const char *str = aarch64_stack_protector_guard_offset_str;
18325 errno = 0;
18326 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
18327 if (!*str || *end || errno)
18328 error ("%qs is not a valid offset in %qs", str,
18329 "-mstack-protector-guard-offset=");
18330 aarch64_stack_protector_guard_offset = offs;
18333 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
18334 && !fixed_regs[R18_REGNUM])
18335 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18337 if ((opts->x_aarch64_isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
18338 && !(opts->x_aarch64_isa_flags & AARCH64_FL_SME))
18340 if (opts->x_aarch64_isa_flags & AARCH64_FL_SM_ON)
18341 error ("streaming functions require the ISA extension %qs", "sme");
18342 else
18343 error ("functions with SME state require the ISA extension %qs",
18344 "sme");
18345 inform (input_location, "you can enable %qs using the command-line"
18346 " option %<-march%>, or by using the %<target%>"
18347 " attribute or pragma", "sme");
18348 opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
18349 auto new_flags = (opts->x_aarch64_asm_isa_flags
18350 | feature_deps::SME ().enable);
18351 aarch64_set_asm_isa_flags (opts, new_flags);
18354 initialize_aarch64_code_model (opts);
18355 initialize_aarch64_tls_size (opts);
18356 aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;
18358 int queue_depth = 0;
18359 switch (aarch64_tune_params.autoprefetcher_model)
18361 case tune_params::AUTOPREFETCHER_OFF:
18362 queue_depth = -1;
18363 break;
18364 case tune_params::AUTOPREFETCHER_WEAK:
18365 queue_depth = 0;
18366 break;
18367 case tune_params::AUTOPREFETCHER_STRONG:
18368 queue_depth = max_insn_queue_index + 1;
18369 break;
18370 default:
18371 gcc_unreachable ();
18374 /* We don't mind passing in global_options_set here as we don't use
18375 the *options_set structs anyway. */
18376 SET_OPTION_IF_UNSET (opts, &global_options_set,
18377 param_sched_autopref_queue_depth, queue_depth);
18379 /* Set up parameters to be used in prefetching algorithm. Do not
18380 override the defaults unless we are tuning for a core we have
18381 researched values for. */
18382 if (aarch64_tune_params.prefetch->num_slots > 0)
18383 SET_OPTION_IF_UNSET (opts, &global_options_set,
18384 param_simultaneous_prefetches,
18385 aarch64_tune_params.prefetch->num_slots);
18386 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
18387 SET_OPTION_IF_UNSET (opts, &global_options_set,
18388 param_l1_cache_size,
18389 aarch64_tune_params.prefetch->l1_cache_size);
18390 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18391 SET_OPTION_IF_UNSET (opts, &global_options_set,
18392 param_l1_cache_line_size,
18393 aarch64_tune_params.prefetch->l1_cache_line_size);
18395 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18397 SET_OPTION_IF_UNSET (opts, &global_options_set,
18398 param_destruct_interfere_size,
18399 aarch64_tune_params.prefetch->l1_cache_line_size);
18400 SET_OPTION_IF_UNSET (opts, &global_options_set,
18401 param_construct_interfere_size,
18402 aarch64_tune_params.prefetch->l1_cache_line_size);
18404 else
18406 /* For a generic AArch64 target, cover the current range of cache line
18407 sizes. */
18408 SET_OPTION_IF_UNSET (opts, &global_options_set,
18409 param_destruct_interfere_size,
18410 256);
18411 SET_OPTION_IF_UNSET (opts, &global_options_set,
18412 param_construct_interfere_size,
18413 64);
18416 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
18417 SET_OPTION_IF_UNSET (opts, &global_options_set,
18418 param_l2_cache_size,
18419 aarch64_tune_params.prefetch->l2_cache_size);
18420 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
18421 SET_OPTION_IF_UNSET (opts, &global_options_set,
18422 param_prefetch_dynamic_strides, 0);
18423 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
18424 SET_OPTION_IF_UNSET (opts, &global_options_set,
18425 param_prefetch_minimum_stride,
18426 aarch64_tune_params.prefetch->minimum_stride);
18428 /* Use the alternative scheduling-pressure algorithm by default. */
18429 SET_OPTION_IF_UNSET (opts, &global_options_set,
18430 param_sched_pressure_algorithm,
18431 SCHED_PRESSURE_MODEL);
18433 /* Validate the guard size. */
18434 int guard_size = param_stack_clash_protection_guard_size;
18436 if (guard_size != 12 && guard_size != 16)
18437 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18438 "size. Given value %d (%llu KB) is out of range",
18439 guard_size, (1ULL << guard_size) / 1024ULL);
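/* For reference, the guard size is a power-of-two exponent, so 12 means
   1 << 12 = 4096 bytes (4 KB) and 16 means 1 << 16 = 65536 bytes (64 KB),
   matching the values printed in the error above.  */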
18441 /* Enforce that the probing interval is the same as the guard size so the
18442 mid-end does the right thing. */
18443 SET_OPTION_IF_UNSET (opts, &global_options_set,
18444 param_stack_clash_protection_probe_interval,
18445 guard_size);
18447 /* The maybe_set calls won't update the value if the user has explicitly set
18448 one. This means we need to validate that the probing interval and guard size
18449 are equal. */
18450 int probe_interval
18451 = param_stack_clash_protection_probe_interval;
18452 if (guard_size != probe_interval)
18453 error ("stack clash guard size %<%d%> must be equal to probing interval "
18454 "%<%d%>", guard_size, probe_interval);
18456 /* Enable software prefetching at the specified optimization level for
18457 CPUs that have prefetch tuning data. Lower the optimization level threshold by 1
18458 when profiling is enabled. */
18459 if (opts->x_flag_prefetch_loop_arrays < 0
18460 && !opts->x_optimize_size
18461 && aarch64_tune_params.prefetch->default_opt_level >= 0
18462 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18463 opts->x_flag_prefetch_loop_arrays = 1;
18465 /* Avoid loop-dependent FMA chains. */
18466 if (aarch64_tune_params.extra_tuning_flags
18467 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
18468 SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
18469 512);
18471 /* Consider fully pipelined FMA in reassociation. */
18472 if (aarch64_tune_params.extra_tuning_flags
18473 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
18474 SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
18477 aarch64_override_options_after_change_1 (opts);
18480 /* Print a hint with a suggestion for a core or architecture name that
18481 most closely resembles what the user passed in STR. ARCH is true if
18482 the user is asking for an architecture name. ARCH is false if the user
18483 is asking for a core name. */
18485 static void
18486 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
18488 auto_vec<const char *> candidates;
18489 const struct processor *entry = arch ? all_architectures : all_cores;
18490 for (; entry->name != NULL; entry++)
18491 candidates.safe_push (entry->name);
18493 #ifdef HAVE_LOCAL_CPU_DETECT
18494 /* Also add "native" as a possible value. */
18495 if (arch)
18496 candidates.safe_push ("native");
18497 #endif
18499 char *s;
18500 const char *hint = candidates_list_and_hint (str, s, candidates);
18501 if (hint)
18502 inform (input_location, "valid arguments are: %s;"
18503 " did you mean %qs?", s, hint);
18504 else
18505 inform (input_location, "valid arguments are: %s", s);
18507 XDELETEVEC (s);
18510 /* Print a hint with a suggestion for a core name that most closely resembles
18511 what the user passed in STR. */
18513 inline static void
18514 aarch64_print_hint_for_core (const char *str)
18516 aarch64_print_hint_for_core_or_arch (str, false);
18519 /* Print a hint with a suggestion for an architecture name that most closely
18520 resembles what the user passed in STR. */
18522 inline static void
18523 aarch64_print_hint_for_arch (const char *str)
18525 aarch64_print_hint_for_core_or_arch (str, true);
18529 /* Print a hint with a suggestion for an extension name
18530 that most closely resembles what the user passed in STR. */
18532 void
18533 aarch64_print_hint_for_extensions (const std::string &str)
18535 auto_vec<const char *> candidates;
18536 aarch64_get_all_extension_candidates (&candidates);
18537 char *s;
18538 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18539 if (hint)
18540 inform (input_location, "valid arguments are: %s;"
18541 " did you mean %qs?", s, hint);
18542 else
18543 inform (input_location, "valid arguments are: %s", s);
18545 XDELETEVEC (s);
18548 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18549 specified in STR and throw errors if appropriate. Put the results if
18550 they are valid in RES and ISA_FLAGS. Return whether the option is
18551 valid. */
18553 static bool
18554 aarch64_validate_mcpu (const char *str, const struct processor **res,
18555 aarch64_feature_flags *isa_flags)
18557 std::string invalid_extension;
18558 enum aarch_parse_opt_result parse_res
18559 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18561 if (parse_res == AARCH_PARSE_OK)
18562 return true;
18564 switch (parse_res)
18566 case AARCH_PARSE_MISSING_ARG:
18567 error ("missing cpu name in %<-mcpu=%s%>", str);
18568 break;
18569 case AARCH_PARSE_INVALID_ARG:
18570 error ("unknown value %qs for %<-mcpu%>", str);
18571 aarch64_print_hint_for_core (str);
18572 /* A common user error is confusing -march and -mcpu.
18573 If the -mcpu string matches a known architecture then suggest
18574 -march=. */
18575 parse_res = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18576 if (parse_res == AARCH_PARSE_OK)
18577 inform (input_location, "did you mean %<-march=%s%>?", str);
18578 break;
18579 case AARCH_PARSE_INVALID_FEATURE:
18580 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18581 invalid_extension.c_str (), str);
18582 aarch64_print_hint_for_extensions (invalid_extension);
18583 break;
18584 default:
18585 gcc_unreachable ();
18588 return false;
18591 /* Straight line speculation indicators. */
18592 enum aarch64_sls_hardening_type
18594 SLS_NONE = 0,
18595 SLS_RETBR = 1,
18596 SLS_BLR = 2,
18597 SLS_ALL = 3,
18599 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18601 /* Return whether we should mitigate Straight Line Speculation for the RET
18602 and BR instructions. */
18603 bool
18604 aarch64_harden_sls_retbr_p (void)
18606 return aarch64_sls_hardening & SLS_RETBR;
18609 /* Return whether we should mitigate Straight Line Speculation for the BLR
18610 instruction. */
18611 bool
18612 aarch64_harden_sls_blr_p (void)
18614 return aarch64_sls_hardening & SLS_BLR;
18617 /* For now we only allow setting these options globally; in the future we may
18618 allow setting them per function. */
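/* Illustrative examples of accepted values: "-mharden-sls=none",
   "-mharden-sls=all", or a comma-separated list of individual mitigations
   such as "-mharden-sls=retbr,blr" (which enables the same set as "all").
   "none" and "all" may not be combined with other values.  */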
18619 static void
18620 aarch64_validate_sls_mitigation (const char *const_str)
18622 char *token_save = NULL;
18623 char *str = NULL;
18625 if (strcmp (const_str, "none") == 0)
18627 aarch64_sls_hardening = SLS_NONE;
18628 return;
18630 if (strcmp (const_str, "all") == 0)
18632 aarch64_sls_hardening = SLS_ALL;
18633 return;
18636 char *str_root = xstrdup (const_str);
18637 str = strtok_r (str_root, ",", &token_save);
18638 if (!str)
18639 error ("invalid argument given to %<-mharden-sls=%>");
18641 int temp = SLS_NONE;
18642 while (str)
18644 if (strcmp (str, "blr") == 0)
18645 temp |= SLS_BLR;
18646 else if (strcmp (str, "retbr") == 0)
18647 temp |= SLS_RETBR;
18648 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18650 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18651 break;
18653 else
18655 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18656 break;
18658 str = strtok_r (NULL, ",", &token_save);
18660 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18661 free (str_root);
18664 /* Validate a command-line -march option. Parse the arch and extensions
18665 (if any) specified in STR and throw errors if appropriate. Put the
18666 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18667 option is valid. */
18669 static bool
18670 aarch64_validate_march (const char *str, const struct processor **res,
18671 aarch64_feature_flags *isa_flags)
18673 std::string invalid_extension;
18674 enum aarch_parse_opt_result parse_res
18675 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18677 if (parse_res == AARCH_PARSE_OK)
18678 return true;
18680 switch (parse_res)
18682 case AARCH_PARSE_MISSING_ARG:
18683 error ("missing arch name in %<-march=%s%>", str);
18684 break;
18685 case AARCH_PARSE_INVALID_ARG:
18686 error ("unknown value %qs for %<-march%>", str);
18687 aarch64_print_hint_for_arch (str);
18688 /* A common user error is confusing -march and -mcpu.
18689 If the -march string matches a known CPU suggest -mcpu. */
18690 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18691 if (parse_res == AARCH_PARSE_OK)
18692 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18693 break;
18694 case AARCH_PARSE_INVALID_FEATURE:
18695 error ("invalid feature modifier %qs in %<-march=%s%>",
18696 invalid_extension.c_str (), str);
18697 aarch64_print_hint_for_extensions (invalid_extension);
18698 break;
18699 default:
18700 gcc_unreachable ();
18703 return false;
18706 /* Validate a command-line -mtune option. Parse the cpu
18707 specified in STR and throw errors if appropriate. Put the
18708 result, if it is valid, in RES. Return whether the option is
18709 valid. */
18711 static bool
18712 aarch64_validate_mtune (const char *str, const struct processor **res)
18714 enum aarch_parse_opt_result parse_res
18715 = aarch64_parse_tune (str, res);
18717 if (parse_res == AARCH_PARSE_OK)
18718 return true;
18720 switch (parse_res)
18722 case AARCH_PARSE_MISSING_ARG:
18723 error ("missing cpu name in %<-mtune=%s%>", str);
18724 break;
18725 case AARCH_PARSE_INVALID_ARG:
18726 error ("unknown value %qs for %<-mtune%>", str);
18727 aarch64_print_hint_for_core (str);
18728 break;
18729 default:
18730 gcc_unreachable ();
18732 return false;
18735 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18737 static poly_uint16
18738 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18740 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18741 on big-endian targets, so we would need to forbid subregs that convert
18742 from one to the other. By default a reinterpret sequence would then
18743 involve a store to memory in one mode and a load back in the other.
18744 Even if we optimize that sequence using reverse instructions,
18745 it would still be a significant potential overhead.
18747 For now, it seems better to generate length-agnostic code for that
18748 case instead. */
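/* Otherwise the conversion is a straightforward division: for example,
   -msve-vector-bits=256 gives a VG of 256 / 64 = 4 and
   -msve-vector-bits=2048 gives 2048 / 64 = 32.  */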
18749 if (value == SVE_SCALABLE
18750 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18751 return poly_uint16 (2, 2);
18752 else
18753 return (int) value / 64;
18756 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18757 aarch64_isa_flags accordingly. */
18759 void
18760 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18762 aarch64_set_asm_isa_flags (&global_options, flags);
18765 static void
18766 aarch64_handle_no_branch_protection (void)
18768 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18769 aarch_enable_bti = 0;
18772 static void
18773 aarch64_handle_standard_branch_protection (void)
18775 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18776 aarch64_ra_sign_key = AARCH64_KEY_A;
18777 aarch_enable_bti = 1;
18780 static void
18781 aarch64_handle_pac_ret_protection (void)
18783 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18784 aarch64_ra_sign_key = AARCH64_KEY_A;
18787 static void
18788 aarch64_handle_pac_ret_leaf (void)
18790 aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
18793 static void
18794 aarch64_handle_pac_ret_b_key (void)
18796 aarch64_ra_sign_key = AARCH64_KEY_B;
18799 static void
18800 aarch64_handle_bti_protection (void)
18802 aarch_enable_bti = 1;
18805 static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
18806 { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
18807 { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
18808 { NULL, false, NULL, NULL, 0 }
18811 static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
18813 { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
18814 { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
18815 { "pac-ret", false, aarch64_handle_pac_ret_protection,
18816 aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
18817 { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
18818 { NULL, false, NULL, NULL, 0 }
18821 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18822 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18823 tuning structs. In particular it must set selected_tune and
18824 aarch64_asm_isa_flags that define the available ISA features and tuning
18825 decisions. It must also set selected_arch as this will be used to
18826 output the .arch asm tags for each function. */
18828 static void
18829 aarch64_override_options (void)
18831 aarch64_feature_flags cpu_isa = 0;
18832 aarch64_feature_flags arch_isa = 0;
18833 aarch64_set_asm_isa_flags (0);
18835 const struct processor *cpu = NULL;
18836 const struct processor *arch = NULL;
18837 const struct processor *tune = NULL;
18839 if (aarch64_harden_sls_string)
18840 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18842 if (aarch64_branch_protection_string)
18843 aarch_validate_mbranch_protection (aarch64_branch_protect_types,
18844 aarch64_branch_protection_string,
18845 "-mbranch-protection=");
18847 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18848 If either of -march or -mtune is given, they override their
18849 respective component of -mcpu. */
18850 if (aarch64_cpu_string)
18851 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18853 if (aarch64_arch_string)
18854 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18856 if (aarch64_tune_string)
18857 aarch64_validate_mtune (aarch64_tune_string, &tune);
18859 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18860 SUBTARGET_OVERRIDE_OPTIONS;
18861 #endif
18863 auto isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
18864 if (cpu && arch)
18866 /* If both -mcpu and -march are specified, warn if they are not
18867 feature compatible. Feature compatible means that the inclusion of the
18868 cpu features would not end up disabling an architecture feature. In
18869 other words, the cpu features need to be a superset of the arch
18870 features, and if so we prefer the -march ISA flags. */
18871 auto full_arch_flags = arch->flags | arch_isa;
18872 auto full_cpu_flags = cpu->flags | cpu_isa;
18873 if (~full_cpu_flags & full_arch_flags)
18875 std::string ext_diff
18876 = aarch64_get_extension_string_for_isa_flags (full_arch_flags,
18877 full_cpu_flags);
18878 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18879 "and resulted in options %<%s%> being added",
18880 aarch64_cpu_string,
18881 aarch64_arch_string,
18882 ext_diff.c_str ());
18885 selected_arch = arch->arch;
18886 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18888 else if (cpu)
18890 selected_arch = cpu->arch;
18891 aarch64_set_asm_isa_flags (cpu_isa | isa_mode);
18893 else if (arch)
18895 cpu = &all_cores[arch->ident];
18896 selected_arch = arch->arch;
18897 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18899 else
18901 /* No -mcpu or -march specified, so use the default CPU. */
18902 cpu = &all_cores[TARGET_CPU_DEFAULT];
18903 selected_arch = cpu->arch;
18904 aarch64_set_asm_isa_flags (cpu->flags | isa_mode);
18907 selected_tune = tune ? tune->ident : cpu->ident;
18909 if (aarch_enable_bti == 2)
18911 #ifdef TARGET_ENABLE_BTI
18912 aarch_enable_bti = 1;
18913 #else
18914 aarch_enable_bti = 0;
18915 #endif
18918 /* Return address signing is currently not supported for ILP32 targets. For
18919 LP64 targets use the configured option in the absence of a command-line
18920 option for -mbranch-protection. */
18921 if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
18923 #ifdef TARGET_ENABLE_PAC_RET
18924 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18925 #else
18926 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18927 #endif
18930 #ifndef HAVE_AS_MABI_OPTION
18931 /* The compiler may have been configured with 2.23.* binutils, which does
18932 not have support for ILP32. */
18933 if (TARGET_ILP32)
18934 error ("assembler does not support %<-mabi=ilp32%>");
18935 #endif
18937 /* Convert -msve-vector-bits to a VG count. */
18938 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18940 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
18941 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18943 /* The pass to insert speculation tracking runs before
18944 shrink-wrapping and the latter does not know how to update the
18945 tracking status. So disable it in this case. */
18946 if (aarch64_track_speculation)
18947 flag_shrink_wrap = 0;
18949 aarch64_override_options_internal (&global_options);
18951 /* Save these options as the default ones in case we push and pop them later
18952 while processing functions with potential target attributes. */
18953 target_option_default_node = target_option_current_node
18954 = build_target_option_node (&global_options, &global_options_set);
18957 /* Implement targetm.override_options_after_change. */
18959 static void
18960 aarch64_override_options_after_change (void)
18962 aarch64_override_options_after_change_1 (&global_options);
18965 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18966 static char *
18967 aarch64_offload_options (void)
18969 if (TARGET_ILP32)
18970 return xstrdup ("-foffload-abi=ilp32");
18971 else
18972 return xstrdup ("-foffload-abi=lp64");
18975 static struct machine_function *
18976 aarch64_init_machine_status (void)
18978 struct machine_function *machine;
18979 machine = ggc_cleared_alloc<machine_function> ();
18980 return machine;
18983 void
18984 aarch64_init_expanders (void)
18986 init_machine_status = aarch64_init_machine_status;
18989 /* Initialize aarch64_cmodel from the code model given in OPTS, mapping it to a PIC variant where appropriate. */
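/* For example (illustrative), "-mcmodel=small -fPIC" maps to
   AARCH64_CMODEL_SMALL_PIC (or AARCH64_CMODEL_SMALL_SPIC for -fpic, when
   the assembler supports the small PIC relocations), while
   "-mcmodel=large" combined with -fpic is rejected with a "sorry"
   diagnostic.  */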
18990 static void
18991 initialize_aarch64_code_model (struct gcc_options *opts)
18993 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18994 switch (opts->x_aarch64_cmodel_var)
18996 case AARCH64_CMODEL_TINY:
18997 if (opts->x_flag_pic)
18998 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18999 break;
19000 case AARCH64_CMODEL_SMALL:
19001 if (opts->x_flag_pic)
19003 #ifdef HAVE_AS_SMALL_PIC_RELOCS
19004 aarch64_cmodel = (flag_pic == 2
19005 ? AARCH64_CMODEL_SMALL_PIC
19006 : AARCH64_CMODEL_SMALL_SPIC);
19007 #else
19008 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
19009 #endif
19011 break;
19012 case AARCH64_CMODEL_LARGE:
19013 if (opts->x_flag_pic)
19014 sorry ("code model %qs with %<-f%s%>", "large",
19015 opts->x_flag_pic > 1 ? "PIC" : "pic");
19016 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
19017 sorry ("code model %qs not supported in ilp32 mode", "large");
19018 break;
19019 case AARCH64_CMODEL_TINY_PIC:
19020 case AARCH64_CMODEL_SMALL_PIC:
19021 case AARCH64_CMODEL_SMALL_SPIC:
19022 gcc_unreachable ();
19026 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
19027 using the information saved in PTR. */
19029 static void
19030 aarch64_option_restore (struct gcc_options *opts,
19031 struct gcc_options * /* opts_set */,
19032 struct cl_target_option * /* ptr */)
19034 aarch64_override_options_internal (opts);
19037 /* Implement TARGET_OPTION_PRINT. */
19039 static void
19040 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
19042 const struct processor *cpu
19043 = aarch64_get_tune_cpu (ptr->x_selected_tune);
19044 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
19045 std::string extension
19046 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
19047 arch->flags);
19049 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
19050 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
19051 arch->name, extension.c_str ());
19054 static GTY(()) tree aarch64_previous_fndecl;
19056 void
19057 aarch64_reset_previous_fndecl (void)
19059 aarch64_previous_fndecl = NULL;
19062 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19063 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19064 make sure optab availability predicates are recomputed when necessary. */
19066 void
19067 aarch64_save_restore_target_globals (tree new_tree)
19069 if (TREE_TARGET_GLOBALS (new_tree))
19070 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
19071 else if (new_tree == target_option_default_node)
19072 restore_target_globals (&default_target_globals);
19073 else
19074 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
19077 /* Return the target_option_node for FNDECL, or the current options
19078 if FNDECL is null. */
19080 static tree
19081 aarch64_fndecl_options (tree fndecl)
19083 if (!fndecl)
19084 return target_option_current_node;
19086 if (tree options = DECL_FUNCTION_SPECIFIC_TARGET (fndecl))
19087 return options;
19089 return target_option_default_node;
19092 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
19093 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
19094 of the function, if such exists. This function may be called multiple
19095 times on a single function so use aarch64_previous_fndecl to avoid
19096 setting up identical state. */
19098 static void
19099 aarch64_set_current_function (tree fndecl)
19101 tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
19102 tree new_tree = aarch64_fndecl_options (fndecl);
19104 auto new_isa_mode = (fndecl
19105 ? aarch64_fndecl_isa_mode (fndecl)
19106 : AARCH64_FL_DEFAULT_ISA_MODE);
19107 auto isa_flags = TREE_TARGET_OPTION (new_tree)->x_aarch64_isa_flags;
19109 static bool reported_zt0_p;
19110 if (!reported_zt0_p
19111 && !(isa_flags & AARCH64_FL_SME2)
19112 && fndecl
19113 && aarch64_fndecl_has_state (fndecl, "zt0"))
19115 error ("functions with %qs state require the ISA extension %qs",
19116 "zt0", "sme2");
19117 inform (input_location, "you can enable %qs using the command-line"
19118 " option %<-march%>, or by using the %<target%>"
19119 " attribute or pragma", "sme2");
19120 reported_zt0_p = true;
19123 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
19124 the default have been handled by aarch64_save_restore_target_globals from
19125 aarch64_pragma_target_parse. */
19126 if (old_tree == new_tree
19127 && (!fndecl || aarch64_previous_fndecl)
19128 && (isa_flags & AARCH64_FL_ISA_MODES) == new_isa_mode)
19130 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19131 return;
19134 aarch64_previous_fndecl = fndecl;
19136 /* First set the target options. */
19137 cl_target_option_restore (&global_options, &global_options_set,
19138 TREE_TARGET_OPTION (new_tree));
19140 /* The ISA mode can vary based on function type attributes and
19141 function declaration attributes. Make sure that the target
19142 options correctly reflect these attributes. */
19143 if ((isa_flags & AARCH64_FL_ISA_MODES) != new_isa_mode)
19145 auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
19146 aarch64_set_asm_isa_flags (base_flags | new_isa_mode);
19148 aarch64_override_options_internal (&global_options);
19149 new_tree = build_target_option_node (&global_options,
19150 &global_options_set);
19151 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;
19153 tree new_optimize = build_optimization_node (&global_options,
19154 &global_options_set);
19155 if (new_optimize != optimization_default_node)
19156 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19159 aarch64_save_restore_target_globals (new_tree);
19161 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19164 /* Enum describing the various ways we can handle attributes.
19165 In many cases we can reuse the generic option handling machinery. */
19167 enum aarch64_attr_opt_type
19169 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
19170 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
19171 aarch64_attr_enum, /* Attribute sets an enum variable. */
19172 aarch64_attr_custom /* Attribute requires a custom handling function. */
19175 /* All the information needed to handle a target attribute.
19176 NAME is the name of the attribute.
19177 ATTR_TYPE specifies the type of behavior of the attribute as described
19178 in the definition of enum aarch64_attr_opt_type.
19179 ALLOW_NEG is true if the attribute supports a "no-" form.
19180 HANDLER is the function that takes the attribute string as an argument.
19181 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
19182 OPT_NUM is the enum specifying the option that the attribute modifies.
19183 This is needed for attributes that mirror the behavior of a command-line
19184 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
19185 aarch64_attr_enum. */
19187 struct aarch64_attribute_info
19189 const char *name;
19190 enum aarch64_attr_opt_type attr_type;
19191 bool allow_neg;
19192 bool (*handler) (const char *);
19193 enum opt_code opt_num;
19196 /* Handle the ARCH_STR argument to the arch= target attribute. */
19198 static bool
19199 aarch64_handle_attr_arch (const char *str)
19201 const struct processor *tmp_arch = NULL;
19202 std::string invalid_extension;
19203 aarch64_feature_flags tmp_flags;
19204 enum aarch_parse_opt_result parse_res
19205 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
19207 if (parse_res == AARCH_PARSE_OK)
19209 gcc_assert (tmp_arch);
19210 selected_arch = tmp_arch->arch;
19211 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19212 return true;
19215 switch (parse_res)
19217 case AARCH_PARSE_MISSING_ARG:
19218 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19219 break;
19220 case AARCH_PARSE_INVALID_ARG:
19221 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
19222 aarch64_print_hint_for_arch (str);
19223 break;
19224 case AARCH_PARSE_INVALID_FEATURE:
19225 error ("invalid feature modifier %s of value %qs in "
19226 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19227 aarch64_print_hint_for_extensions (invalid_extension);
19228 break;
19229 default:
19230 gcc_unreachable ();
19233 return false;
19236 /* Handle the argument CPU_STR to the cpu= target attribute. */
19238 static bool
19239 aarch64_handle_attr_cpu (const char *str)
19241 const struct processor *tmp_cpu = NULL;
19242 std::string invalid_extension;
19243 aarch64_feature_flags tmp_flags;
19244 enum aarch_parse_opt_result parse_res
19245 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
19247 if (parse_res == AARCH_PARSE_OK)
19249 gcc_assert (tmp_cpu);
19250 selected_tune = tmp_cpu->ident;
19251 selected_arch = tmp_cpu->arch;
19252 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19253 return true;
19256 switch (parse_res)
19258 case AARCH_PARSE_MISSING_ARG:
19259 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19260 break;
19261 case AARCH_PARSE_INVALID_ARG:
19262 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
19263 aarch64_print_hint_for_core (str);
19264 break;
19265 case AARCH_PARSE_INVALID_FEATURE:
19266 error ("invalid feature modifier %qs of value %qs in "
19267 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19268 aarch64_print_hint_for_extensions (invalid_extension);
19269 break;
19270 default:
19271 gcc_unreachable ();
19274 return false;
19277 /* Handle the argument STR to the branch-protection= attribute. */
19279 static bool
19280 aarch64_handle_attr_branch_protection (const char* str)
19282 return aarch_validate_mbranch_protection (aarch64_branch_protect_types, str,
19283 "target(\"branch-protection=\")");
19286 /* Handle the argument STR to the tune= target attribute. */
19288 static bool
19289 aarch64_handle_attr_tune (const char *str)
19291 const struct processor *tmp_tune = NULL;
19292 enum aarch_parse_opt_result parse_res
19293 = aarch64_parse_tune (str, &tmp_tune);
19295 if (parse_res == AARCH_PARSE_OK)
19297 gcc_assert (tmp_tune);
19298 selected_tune = tmp_tune->ident;
19299 return true;
19302 switch (parse_res)
19304 case AARCH_PARSE_INVALID_ARG:
19305 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
19306 aarch64_print_hint_for_core (str);
19307 break;
19308 default:
19309 gcc_unreachable ();
19312 return false;
19315 /* Parse an architecture extensions target attribute string specified in STR.
19316 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19317 if successful. Update aarch64_isa_flags to reflect the ISA features
19318 modified. */
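/* For example (illustrative), __attribute__ ((target ("+nothing+simd")))
   first clears the architectural features and then enables SIMD together
   with its dependencies, whereas target ("+nosimd") only removes SIMD
   (and anything that requires it) from the current set.  */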
19320 static bool
19321 aarch64_handle_attr_isa_flags (char *str)
19323 enum aarch_parse_opt_result parse_res;
19324 auto isa_flags = aarch64_asm_isa_flags;
19326 /* We allow "+nothing" in the beginning to clear out all architectural
19327 features if the user wants to handpick specific features. */
19328 if (strncmp ("+nothing", str, 8) == 0)
19330 isa_flags = AARCH64_ISA_MODE;
19331 str += 8;
19334 std::string invalid_extension;
19335 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
19337 if (parse_res == AARCH_PARSE_OK)
19339 aarch64_set_asm_isa_flags (isa_flags);
19340 return true;
19343 switch (parse_res)
19345 case AARCH_PARSE_MISSING_ARG:
19346 error ("missing value in %<target()%> pragma or attribute");
19347 break;
19349 case AARCH_PARSE_INVALID_FEATURE:
19350 error ("invalid feature modifier %qs of value %qs in "
19351 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19352 break;
19354 default:
19355 gcc_unreachable ();
19358 return false;
19361 /* The target attributes that we support. On top of these we also support just
19362 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19363 handled explicitly in aarch64_process_one_target_attr. */
19365 static const struct aarch64_attribute_info aarch64_attributes[] =
19367 { "general-regs-only", aarch64_attr_mask, false, NULL,
19368 OPT_mgeneral_regs_only },
19369 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
19370 OPT_mfix_cortex_a53_835769 },
19371 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
19372 OPT_mfix_cortex_a53_843419 },
19373 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
19374 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
19375 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
19376 OPT_momit_leaf_frame_pointer },
19377 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
19378 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
19379 OPT_march_ },
19380 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
19381 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
19382 OPT_mtune_ },
19383 { "branch-protection", aarch64_attr_custom, false,
19384 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
19385 { "sign-return-address", aarch64_attr_enum, false, NULL,
19386 OPT_msign_return_address_ },
19387 { "outline-atomics", aarch64_attr_bool, true, NULL,
19388 OPT_moutline_atomics},
19389 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
19392 /* Parse ARG_STR which contains the definition of one target attribute.
19393 Show appropriate errors if any or return true if the attribute is valid. */
19395 static bool
19396 aarch64_process_one_target_attr (char *arg_str)
19398 bool invert = false;
19400 size_t len = strlen (arg_str);
19402 if (len == 0)
19404 error ("malformed %<target()%> pragma or attribute");
19405 return false;
19408 auto_vec<char, 32> buffer;
19409 buffer.safe_grow (len + 1);
19410 char *str_to_check = buffer.address ();
19411 memcpy (str_to_check, arg_str, len + 1);
19413 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19414 It is easier to detect and handle it explicitly here rather than going
19415 through the machinery for the rest of the target attributes in this
19416 function. */
19417 if (*str_to_check == '+')
19418 return aarch64_handle_attr_isa_flags (str_to_check);
19420 if (len > 3 && startswith (str_to_check, "no-"))
19422 invert = true;
19423 str_to_check += 3;
19425 char *arg = strchr (str_to_check, '=');
19427 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19428 and point ARG to "foo". */
19429 if (arg)
19431 *arg = '\0';
19432 arg++;
19434 const struct aarch64_attribute_info *p_attr;
19435 bool found = false;
19436 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
19438 /* If the names don't match up, or the user has given an argument
19439 to an attribute that doesn't accept one, or didn't give an argument
19440 to an attribute that expects one, fail to match. */
19441 if (strcmp (str_to_check, p_attr->name) != 0)
19442 continue;
19444 found = true;
19445 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
19446 || p_attr->attr_type == aarch64_attr_enum;
19448 if (attr_need_arg_p ^ (arg != NULL))
19450 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
19451 return false;
19454 /* If the name matches but the attribute does not allow "no-" versions
19455 then we can't match. */
19456 if (invert && !p_attr->allow_neg)
19458 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
19459 return false;
19462 switch (p_attr->attr_type)
19464 /* Has a custom handler registered.
19465 For example, cpu=, arch=, tune=. */
19466 case aarch64_attr_custom:
19467 gcc_assert (p_attr->handler);
19468 if (!p_attr->handler (arg))
19469 return false;
19470 break;
19472 /* Either set or unset a boolean option. */
19473 case aarch64_attr_bool:
19475 struct cl_decoded_option decoded;
19477 generate_option (p_attr->opt_num, NULL, !invert,
19478 CL_TARGET, &decoded);
19479 aarch64_handle_option (&global_options, &global_options_set,
19480 &decoded, input_location);
19481 break;
19483 /* Set or unset a bit in the target_flags. aarch64_handle_option
19484 should know what mask to apply given the option number. */
19485 case aarch64_attr_mask:
19487 struct cl_decoded_option decoded;
19488 /* We only need to specify the option number.
19489 aarch64_handle_option will know which mask to apply. */
19490 decoded.opt_index = p_attr->opt_num;
19491 decoded.value = !invert;
19492 aarch64_handle_option (&global_options, &global_options_set,
19493 &decoded, input_location);
19494 break;
19496 /* Use the option setting machinery to set an option to an enum. */
19497 case aarch64_attr_enum:
19499 gcc_assert (arg);
19500 bool valid;
19501 int value;
19502 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19503 &value, CL_TARGET);
19504 if (valid)
19506 set_option (&global_options, NULL, p_attr->opt_num, value,
19507 NULL, DK_UNSPECIFIED, input_location,
19508 global_dc);
19510 else
19512 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19514 break;
19516 default:
19517 gcc_unreachable ();
19521 /* If we reached here we either have found an attribute and validated
19522 it or didn't match any. If we matched an attribute but its arguments
19523 were malformed we will have returned false already. */
19524 return found;
19527 /* Count how many times the character C appears in
19528 NULL-terminated string STR. */
19530 static unsigned int
19531 num_occurences_in_str (char c, char *str)
19533 unsigned int res = 0;
19534 while (*str != '\0')
19536 if (*str == c)
19537 res++;
19539 str++;
19542 return res;
19545 /* Parse the tree in ARGS that contains the target attribute information
19546 and update the global target options space. */
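/* As an illustrative example, __attribute__ ((target ("arch=armv8.2-a,tune=cortex-a55")))
   is split at the commas into "arch=armv8.2-a" and "tune=cortex-a55", and
   each piece is passed to aarch64_process_one_target_attr.  An empty piece,
   as in "arch=armv8.2-a,,tune=cortex-a55", is caught by the comma-counting
   check below.  */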
19548 bool
19549 aarch64_process_target_attr (tree args)
19551 if (TREE_CODE (args) == TREE_LIST)
19555 tree head = TREE_VALUE (args);
19556 if (head)
19558 if (!aarch64_process_target_attr (head))
19559 return false;
19561 args = TREE_CHAIN (args);
19562 } while (args);
19564 return true;
19567 if (TREE_CODE (args) != STRING_CST)
19569 error ("attribute %<target%> argument not a string");
19570 return false;
19573 size_t len = strlen (TREE_STRING_POINTER (args));
19574 auto_vec<char, 32> buffer;
19575 buffer.safe_grow (len + 1);
19576 char *str_to_check = buffer.address ();
19577 memcpy (str_to_check, TREE_STRING_POINTER (args), len + 1);
19579 if (len == 0)
19581 error ("malformed %<target()%> pragma or attribute");
19582 return false;
19585 /* Used to catch empty strings between commas, i.e.
19586 attribute ((target ("attr1,,attr2"))). */
19587 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19589 /* Handle multiple target attributes separated by ','. */
19590 char *token = strtok_r (str_to_check, ",", &str_to_check);
19592 unsigned int num_attrs = 0;
19593 while (token)
19595 num_attrs++;
19596 if (!aarch64_process_one_target_attr (token))
19598 /* Check if token is possibly an arch extension without
19599 leading '+'. */
19600 aarch64_feature_flags isa_temp = 0;
19601 auto with_plus = std::string ("+") + token;
19602 enum aarch_parse_opt_result ext_res
19603 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19605 if (ext_res == AARCH_PARSE_OK)
19606 error ("arch extension %<%s%> should be prefixed by %<+%>",
19607 token);
19608 else
19609 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19610 return false;
19613 token = strtok_r (NULL, ",", &str_to_check);
19616 if (num_attrs != num_commas + 1)
19618 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19619 return false;
19622 return true;
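/* Illustrative only (the extension and CPU names are plausible examples,
   not a claim about any particular configuration): the parser above
   accepts comma-separated strings such as

     __attribute__ ((target ("arch=armv8-a+crc,tune=cortex-a72"))) void f (void);
     #pragma GCC target ("no-outline-atomics")

   It rejects empty elements as in "attr1,,attr2" via the comma count, and
   for a bare extension name such as "sve" it suggests writing "+sve".  */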
19625 static bool aarch64_process_target_version_attr (tree args);
19627 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19628 process attribute ((target ("..."))). */
19630 static bool
19631 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19633 struct cl_target_option cur_target;
19634 bool ret;
19635 tree old_optimize;
19636 tree new_target, new_optimize;
19637 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19639 /* If what we're processing is the current pragma string then the
19640 target option node is already stored in target_option_current_node
19641 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19642 having to re-parse the string. This is especially useful to keep
19643 arm_neon.h compile times down since that header contains a lot
19644 of intrinsics enclosed in pragmas. */
19645 if (!existing_target && args == current_target_pragma)
19647 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19648 return true;
19650 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19652 old_optimize
19653 = build_optimization_node (&global_options, &global_options_set);
19654 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19656 /* If the function changed the optimization levels as well as setting
19657 target options, start with the optimizations specified. */
19658 if (func_optimize && func_optimize != old_optimize)
19659 cl_optimization_restore (&global_options, &global_options_set,
19660 TREE_OPTIMIZATION (func_optimize));
19662 /* Save the current target options to restore at the end. */
19663 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19665 /* If fndecl already has some target attributes applied to it, unpack
19666 them so that we add this attribute on top of them, rather than
19667 overwriting them. */
19668 if (existing_target)
19670 struct cl_target_option *existing_options
19671 = TREE_TARGET_OPTION (existing_target);
19673 if (existing_options)
19674 cl_target_option_restore (&global_options, &global_options_set,
19675 existing_options);
19677 else
19678 cl_target_option_restore (&global_options, &global_options_set,
19679 TREE_TARGET_OPTION (target_option_current_node));
19681 ret = aarch64_process_target_attr (args);
19682 if (ret)
19684 tree version_attr = lookup_attribute ("target_version",
19685 DECL_ATTRIBUTES (fndecl));
19686 if (version_attr != NULL_TREE)
19688 /* Reapply any target_version attribute after target attribute.
19689 This should be equivalent to applying the target_version once
19690 after processing all target attributes. */
19691 tree version_args = TREE_VALUE (version_attr);
19692 ret = aarch64_process_target_version_attr (version_args);
19696 /* Set up any additional state. */
19697 if (ret)
19699 aarch64_override_options_internal (&global_options);
19700 new_target = build_target_option_node (&global_options,
19701 &global_options_set);
19703 else
19704 new_target = NULL;
19706 new_optimize = build_optimization_node (&global_options,
19707 &global_options_set);
19709 if (fndecl && ret)
19711 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19713 if (old_optimize != new_optimize)
19714 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19717 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19719 if (old_optimize != new_optimize)
19720 cl_optimization_restore (&global_options, &global_options_set,
19721 TREE_OPTIMIZATION (old_optimize));
19722 return ret;
19725 typedef unsigned long long aarch64_fmv_feature_mask;
19727 typedef struct
19729 const char *name;
19730 aarch64_fmv_feature_mask feature_mask;
19731 aarch64_feature_flags opt_flags;
19732 } aarch64_fmv_feature_datum;
19734 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19735 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19737 /* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
19738 feature_deps name. */
19739 #define FEAT_RDMA FEAT_RDM
19741 /* FMV features are listed in priority order, to make it easier to sort target
19742 strings. */
19743 static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
19744 #include "config/aarch64/aarch64-option-extensions.def"
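/* As a sketch of how this table is populated (the entry below is
   hypothetical; the real ones live in the .def file): a line such as

     AARCH64_FMV_FEATURE ("sve", SVE, (AARCH64_FL_SVE))

   expands via the macro above to

     {"sve", 1ULL << FEAT_SVE, ::feature_deps::fmv_deps_SVE},

   i.e. the feature's FMV priority bit plus the ISA flags it implies;
   the third macro argument is unused by this expansion.  */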
19747 /* Parse a function multiversioning feature string STR, as found in a
19748 target_version or target_clones attribute.
19750 If ISA_FLAGS is nonnull, then update it with the specified architecture
19751 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19752 representing the set of features explicitly specified in the feature string.
19753 Return an aarch_parse_opt_result describing the result.
19755 When the STR string contains an invalid or duplicate extension, a copy of
19756 the extension string is created and stored to INVALID_EXTENSION. */
19758 static enum aarch_parse_opt_result
19759 aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
19760 aarch64_fmv_feature_mask *feature_mask,
19761 std::string *invalid_extension)
19763 if (feature_mask)
19764 *feature_mask = 0ULL;
19766 if (strcmp (str, "default") == 0)
19767 return AARCH_PARSE_OK;
19769 while (str != NULL && *str != 0)
19771 const char *ext;
19772 size_t len;
19774 ext = strchr (str, '+');
19776 if (ext != NULL)
19777 len = ext - str;
19778 else
19779 len = strlen (str);
19781 if (len == 0)
19782 return AARCH_PARSE_MISSING_ARG;
19784 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19785 int i;
19786 for (i = 0; i < num_features; i++)
19788 if (strlen (aarch64_fmv_feature_data[i].name) == len
19789 && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
19791 if (isa_flags)
19792 *isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
19793 if (feature_mask)
19795 auto old_feature_mask = *feature_mask;
19796 *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
19797 if (*feature_mask == old_feature_mask)
19799 /* Duplicate feature. */
19800 if (invalid_extension)
19801 *invalid_extension = std::string (str, len);
19802 return AARCH_PARSE_DUPLICATE_FEATURE;
19805 break;
19809 if (i == num_features)
19811 /* Feature not found in list. */
19812 if (invalid_extension)
19813 *invalid_extension = std::string (str, len);
19814 return AARCH_PARSE_INVALID_FEATURE;
19817 str = ext;
19818 if (str)
19819 /* Skip over the next '+'. */
19820 str++;
19823 return AARCH_PARSE_OK;
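/* Example of the strings this parser accepts (feature names illustrative):
   for a declaration like

     __attribute__ ((target_version ("sve+dotprod"))) int f (void);

   the string is split at each '+', each component is looked up in
   aarch64_fmv_feature_data and its feature_mask bit and opt_flags are
   accumulated; a repeated component yields AARCH_PARSE_DUPLICATE_FEATURE,
   an unknown one AARCH_PARSE_INVALID_FEATURE, and the plain string
   "default" returns immediately with an empty mask.  */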
19826 /* Parse the tree in ARGS that contains the target_version attribute
19827 information and update the global target options space. */
19829 static bool
19830 aarch64_process_target_version_attr (tree args)
19832 if (TREE_CODE (args) == TREE_LIST)
19834 if (TREE_CHAIN (args))
19836 error ("attribute %<target_version%> has multiple values");
19837 return false;
19839 args = TREE_VALUE (args);
19842 if (!args || TREE_CODE (args) != STRING_CST)
19844 error ("attribute %<target_version%> argument not a string");
19845 return false;
19848 const char *str = TREE_STRING_POINTER (args);
19850 enum aarch_parse_opt_result parse_res;
19851 auto isa_flags = aarch64_asm_isa_flags;
19853 std::string invalid_extension;
19854 parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
19855 &invalid_extension);
19857 if (parse_res == AARCH_PARSE_OK)
19859 aarch64_set_asm_isa_flags (isa_flags);
19860 return true;
19863 switch (parse_res)
19865 case AARCH_PARSE_MISSING_ARG:
19866 error ("missing value in %<target_version%> attribute");
19867 break;
19869 case AARCH_PARSE_INVALID_FEATURE:
19870 error ("invalid feature modifier %qs of value %qs in "
19871 "%<target_version%> attribute", invalid_extension.c_str (),
19872 str);
19873 break;
19875 case AARCH_PARSE_DUPLICATE_FEATURE:
19876 error ("duplicate feature modifier %qs of value %qs in "
19877 "%<target_version%> attribute", invalid_extension.c_str (),
19878 str);
19879 break;
19881 default:
19882 gcc_unreachable ();
19885 return false;
19888 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
19889 process attribute ((target_version ("..."))). */
19891 static bool
19892 aarch64_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
19894 struct cl_target_option cur_target;
19895 bool ret;
19896 tree new_target;
19897 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19899 /* Save the current target options to restore at the end. */
19900 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19902 /* If fndecl already has some target attributes applied to it, unpack
19903 them so that we add this attribute on top of them, rather than
19904 overwriting them. */
19905 if (existing_target)
19907 struct cl_target_option *existing_options
19908 = TREE_TARGET_OPTION (existing_target);
19910 if (existing_options)
19911 cl_target_option_restore (&global_options, &global_options_set,
19912 existing_options);
19914 else
19915 cl_target_option_restore (&global_options, &global_options_set,
19916 TREE_TARGET_OPTION (target_option_current_node));
19918 ret = aarch64_process_target_version_attr (args);
19920 /* Set up any additional state. */
19921 if (ret)
19923 aarch64_override_options_internal (&global_options);
19924 new_target = build_target_option_node (&global_options,
19925 &global_options_set);
19927 else
19928 new_target = NULL;
19930 if (fndecl && ret)
19931 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19933 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19935 return ret;
19938 /* This parses the attribute arguments to target_version in DECL and computes
19939 the feature mask required to select those targets. No adjustments are made to
19940 add or remove redundant feature requirements. */
19942 static aarch64_fmv_feature_mask
19943 get_feature_mask_for_version (tree decl)
19945 tree version_attr = lookup_attribute ("target_version",
19946 DECL_ATTRIBUTES (decl));
19947 if (version_attr == NULL)
19948 return 0;
19950 const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
19951 (version_attr)));
19952 enum aarch_parse_opt_result parse_res;
19953 aarch64_fmv_feature_mask feature_mask;
19955 parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
19956 NULL);
19958 /* We should have detected any errors before getting here. */
19959 gcc_assert (parse_res == AARCH_PARSE_OK);
19961 return feature_mask;
19964 /* Compare priorities of two feature masks. Return:
19965 1: mask1 is higher priority
19966 -1: mask2 is higher priority
19967 0: masks are equal. */
19969 static int
19970 compare_feature_masks (aarch64_fmv_feature_mask mask1,
19971 aarch64_fmv_feature_mask mask2)
19973 int pop1 = popcount_hwi (mask1);
19974 int pop2 = popcount_hwi (mask2);
19975 if (pop1 > pop2)
19976 return 1;
19977 if (pop2 > pop1)
19978 return -1;
19980 auto diff_mask = mask1 ^ mask2;
19981 if (diff_mask == 0ULL)
19982 return 0;
19983 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19984 for (int i = num_features - 1; i >= 0; i--)
19986 auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
19987 if (diff_mask & bit_mask)
19988 return (mask1 & bit_mask) ? 1 : -1;
19990 gcc_unreachable();
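/* Worked example (bit positions illustrative): if mask1 = 0b0101 and
   mask2 = 0b0110, both have two bits set, so the popcount test ties.
   The differing bits are mask1 ^ mask2 = 0b0011, and the loop scans
   aarch64_fmv_feature_data from the end, so whichever mask owns the
   differing feature that appears later in the priority-ordered table
   wins.  */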
19993 /* Compare priorities of two version decls. */
19995 int
19996 aarch64_compare_version_priority (tree decl1, tree decl2)
19998 auto mask1 = get_feature_mask_for_version (decl1);
19999 auto mask2 = get_feature_mask_for_version (decl2);
20001 return compare_feature_masks (mask1, mask2);
20004 /* Build the struct __ifunc_arg_t type:
20006 struct __ifunc_arg_t
20008 unsigned long _size; // Size of the struct, so it can grow.
20009 unsigned long _hwcap;
20010 unsigned long _hwcap2;
20014 static tree
20015 build_ifunc_arg_type ()
20017 tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
20018 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20019 get_identifier ("_size"),
20020 long_unsigned_type_node);
20021 tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20022 get_identifier ("_hwcap"),
20023 long_unsigned_type_node);
20024 tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20025 get_identifier ("_hwcap2"),
20026 long_unsigned_type_node);
20028 DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
20029 DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
20030 DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
20032 TYPE_FIELDS (ifunc_arg_type) = field1;
20033 DECL_CHAIN (field1) = field2;
20034 DECL_CHAIN (field2) = field3;
20036 layout_type (ifunc_arg_type);
20038 tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
20039 tree pointer_type = build_pointer_type (const_type);
20041 return pointer_type;
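/* The tree built above corresponds roughly to the C type
   'const struct __ifunc_arg_t *', i.e. the type of the second argument
   an AArch64 ifunc resolver receives alongside the hwcap word.  */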
20044 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20045 suffixes. */
20047 tree
20048 aarch64_mangle_decl_assembler_name (tree decl, tree id)
20050 /* For function version, add the target suffix to the assembler name. */
20051 if (TREE_CODE (decl) == FUNCTION_DECL
20052 && DECL_FUNCTION_VERSIONED (decl))
20054 aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);
20056 std::string name = IDENTIFIER_POINTER (id);
20058 /* For the default version, append ".default". */
20059 if (feature_mask == 0ULL)
20061 name += ".default";
20062 return get_identifier (name.c_str());
20065 name += "._";
20067 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
20068 for (int i = 0; i < num_features; i++)
20070 if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
20072 name += "M";
20073 name += aarch64_fmv_feature_data[i].name;
20077 if (DECL_ASSEMBLER_NAME_SET_P (decl))
20078 SET_DECL_RTL (decl, NULL);
20080 id = get_identifier (name.c_str());
20082 return id;
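/* For example (feature names illustrative): the default version of 'foo'
   is renamed 'foo.default', while a version whose mask includes the "sve"
   and "dotprod" bits becomes something like 'foo._MdotprodMsve', with one
   'M'-prefixed feature name appended per set bit, in the order the
   features appear in aarch64_fmv_feature_data.  */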
20085 /* Return an identifier for the assembler name of a versioned function, with
20086 SUFFIX appended. This is computed by taking the default version's assembler
20087 name and stripping off any ".default" suffix that has already been appended. */
20089 static tree
20090 get_suffixed_assembler_name (tree default_decl, const char *suffix)
20092 std::string name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl));
20094 auto size = name.size ();
20095 if (size >= 8 && name.compare (size - 8, 8, ".default") == 0)
20096 name.resize (size - 8);
20097 name += suffix;
20098 return get_identifier (name.c_str());
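/* E.g. with SUFFIX ".resolver", an assembler name "foo.default" becomes
   "foo.resolver", and a plain "foo" likewise becomes "foo.resolver".  */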
20101 /* Make the resolver function decl to dispatch the versions of
20102 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20103 ifunc alias that will point to the created resolver. Create an
20104 empty basic block in the resolver and store the pointer in
20105 EMPTY_BB. Return the decl of the resolver function. */
20107 static tree
20108 make_resolver_func (const tree default_decl,
20109 const tree ifunc_alias_decl,
20110 basic_block *empty_bb)
20112 tree decl, type, t;
20114 /* Create resolver function name based on default_decl. We need to remove an
20115 existing ".default" suffix if this has already been appended. */
20116 tree decl_name = get_suffixed_assembler_name (default_decl, ".resolver");
20117 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
20119 /* The resolver function should have signature
20120 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20121 type = build_function_type_list (ptr_type_node,
20122 uint64_type_node,
20123 build_ifunc_arg_type (),
20124 NULL_TREE);
20126 decl = build_fn_decl (resolver_name, type);
20127 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
20129 DECL_NAME (decl) = decl_name;
20130 TREE_USED (decl) = 1;
20131 DECL_ARTIFICIAL (decl) = 1;
20132 DECL_IGNORED_P (decl) = 1;
20133 TREE_PUBLIC (decl) = 0;
20134 DECL_UNINLINABLE (decl) = 1;
20136 /* Resolver is not external, body is generated. */
20137 DECL_EXTERNAL (decl) = 0;
20138 DECL_EXTERNAL (ifunc_alias_decl) = 0;
20140 DECL_CONTEXT (decl) = NULL_TREE;
20141 DECL_INITIAL (decl) = make_node (BLOCK);
20142 DECL_STATIC_CONSTRUCTOR (decl) = 0;
20144 if (DECL_COMDAT_GROUP (default_decl)
20145 || TREE_PUBLIC (default_decl))
20147 /* In this case, each translation unit with a call to this
20148 versioned function will put out a resolver. Ensure it
20149 is comdat to keep just one copy. */
20150 DECL_COMDAT (decl) = 1;
20151 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
20153 else
20154 TREE_PUBLIC (ifunc_alias_decl) = 0;
20156 /* Build result decl and add to function_decl. */
20157 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
20158 DECL_CONTEXT (t) = decl;
20159 DECL_ARTIFICIAL (t) = 1;
20160 DECL_IGNORED_P (t) = 1;
20161 DECL_RESULT (decl) = t;
20163 /* Build parameter decls and add to function_decl. */
20164 tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20165 get_identifier ("hwcap"),
20166 uint64_type_node);
20167 tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20168 get_identifier ("arg"),
20169 build_ifunc_arg_type());
20170 DECL_CONTEXT (arg1) = decl;
20171 DECL_CONTEXT (arg2) = decl;
20172 DECL_ARTIFICIAL (arg1) = 1;
20173 DECL_ARTIFICIAL (arg2) = 1;
20174 DECL_IGNORED_P (arg1) = 1;
20175 DECL_IGNORED_P (arg2) = 1;
20176 DECL_ARG_TYPE (arg1) = uint64_type_node;
20177 DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
20178 DECL_ARGUMENTS (decl) = arg1;
20179 TREE_CHAIN (arg1) = arg2;
20181 gimplify_function_tree (decl);
20182 push_cfun (DECL_STRUCT_FUNCTION (decl));
20183 *empty_bb = init_lowered_empty_function (decl, false,
20184 profile_count::uninitialized ());
20186 cgraph_node::add_new_function (decl, true);
20187 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
20189 pop_cfun ();
20191 gcc_assert (ifunc_alias_decl != NULL);
20192 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20193 DECL_ATTRIBUTES (ifunc_alias_decl)
20194 = make_attribute ("ifunc", resolver_name,
20195 DECL_ATTRIBUTES (ifunc_alias_decl));
20197 /* Create the alias for dispatch to resolver here. */
20198 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
20199 return decl;
20202 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20203 to return a pointer to VERSION_DECL if all feature bits specified in
20204 FEATURE_MASK are not set in MASK_VAR. This function will be called during
20205 version dispatch to decide which function version to execute. It returns
20206 the basic block at the end, to which more conditions can be added. */
20207 static basic_block
20208 add_condition_to_bb (tree function_decl, tree version_decl,
20209 aarch64_fmv_feature_mask feature_mask,
20210 tree mask_var, basic_block new_bb)
20212 gimple *return_stmt;
20213 tree convert_expr, result_var;
20214 gimple *convert_stmt;
20215 gimple *if_else_stmt;
20217 basic_block bb1, bb2, bb3;
20218 edge e12, e23;
20220 gimple_seq gseq;
20222 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
20224 gcc_assert (new_bb != NULL);
20225 gseq = bb_seq (new_bb);
20227 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
20228 build_fold_addr_expr (version_decl));
20229 result_var = create_tmp_var (ptr_type_node);
20230 convert_stmt = gimple_build_assign (result_var, convert_expr);
20231 return_stmt = gimple_build_return (result_var);
20233 if (feature_mask == 0ULL)
20235 /* Default version. */
20236 gimple_seq_add_stmt (&gseq, convert_stmt);
20237 gimple_seq_add_stmt (&gseq, return_stmt);
20238 set_bb_seq (new_bb, gseq);
20239 gimple_set_bb (convert_stmt, new_bb);
20240 gimple_set_bb (return_stmt, new_bb);
20241 pop_cfun ();
20242 return new_bb;
20245 tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
20246 tree and_expr = build2 (BIT_AND_EXPR,
20247 long_long_unsigned_type_node,
20248 mask_var,
20249 build_int_cst (long_long_unsigned_type_node,
20250 feature_mask));
20251 gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
20252 gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
20253 gimple_set_bb (and_stmt, new_bb);
20254 gimple_seq_add_stmt (&gseq, and_stmt);
20256 tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
20257 if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
20258 NULL_TREE, NULL_TREE);
20259 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
20260 gimple_set_bb (if_else_stmt, new_bb);
20261 gimple_seq_add_stmt (&gseq, if_else_stmt);
20263 gimple_seq_add_stmt (&gseq, convert_stmt);
20264 gimple_seq_add_stmt (&gseq, return_stmt);
20265 set_bb_seq (new_bb, gseq);
20267 bb1 = new_bb;
20268 e12 = split_block (bb1, if_else_stmt);
20269 bb2 = e12->dest;
20270 e12->flags &= ~EDGE_FALLTHRU;
20271 e12->flags |= EDGE_TRUE_VALUE;
20273 e23 = split_block (bb2, return_stmt);
20275 gimple_set_bb (convert_stmt, bb2);
20276 gimple_set_bb (return_stmt, bb2);
20278 bb3 = e23->dest;
20279 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
20281 remove_edge (e23);
20282 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
20284 pop_cfun ();
20286 return bb3;
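/* Shape of the code added for one non-default version, as illustrative
   pseudo-GIMPLE (names invented):

     _and = MASK_VAR & FEATURE_MASK;
     if (_and == 0)
       return (void *) &VERSION_DECL;
     // otherwise fall through to the returned block

   Since the caller passes the bitwise NOT of the runtime feature word in
   MASK_VAR, a zero result means every required feature bit is present.  */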
20289 /* This function generates the dispatch function for
20290 multi-versioned functions. DISPATCH_DECL is the function which will
20291 contain the dispatch logic. FNDECLS are the function choices for
20292 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20293 in DISPATCH_DECL in which the dispatch code is generated. */
20295 static int
20296 dispatch_function_versions (tree dispatch_decl,
20297 void *fndecls_p,
20298 basic_block *empty_bb)
20300 gimple *ifunc_cpu_init_stmt;
20301 gimple_seq gseq;
20302 vec<tree> *fndecls;
20304 gcc_assert (dispatch_decl != NULL
20305 && fndecls_p != NULL
20306 && empty_bb != NULL);
20308 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
20310 gseq = bb_seq (*empty_bb);
20311 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20312 constructors, so explicitly call __init_cpu_features_resolver here. */
20313 tree init_fn_type = build_function_type_list (void_type_node,
20314 long_unsigned_type_node,
20315 build_ifunc_arg_type(),
20316 NULL);
20317 tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
20318 tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
20319 init_fn_id, init_fn_type);
20320 tree arg1 = DECL_ARGUMENTS (dispatch_decl);
20321 tree arg2 = TREE_CHAIN (arg1);
20322 ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
20323 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
20324 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
20326 /* Build the struct type for __aarch64_cpu_features. */
20327 tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
20328 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20329 get_identifier ("features"),
20330 long_long_unsigned_type_node);
20331 DECL_FIELD_CONTEXT (field1) = global_type;
20332 TYPE_FIELDS (global_type) = field1;
20333 layout_type (global_type);
20335 tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
20336 get_identifier ("__aarch64_cpu_features"),
20337 global_type);
20338 DECL_EXTERNAL (global_var) = 1;
20339 tree mask_var = create_tmp_var (long_long_unsigned_type_node);
20341 tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
20342 global_var, field1, NULL_TREE);
20343 gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
20344 gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
20345 gimple_set_bb (component_stmt, *empty_bb);
20346 gimple_seq_add_stmt (&gseq, component_stmt);
20348 tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
20349 gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
20350 gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
20351 gimple_set_bb (not_stmt, *empty_bb);
20352 gimple_seq_add_stmt (&gseq, not_stmt);
20354 set_bb_seq (*empty_bb, gseq);
20356 pop_cfun ();
20358 /* fndecls_p is actually a vector. */
20359 fndecls = static_cast<vec<tree> *> (fndecls_p);
20361 /* At least one more version other than the default. */
20362 unsigned int num_versions = fndecls->length ();
20363 gcc_assert (num_versions >= 2);
20365 struct function_version_info
20367 tree version_decl;
20368 aarch64_fmv_feature_mask feature_mask;
20369 } *function_versions;
20371 function_versions = (struct function_version_info *)
20372 XNEWVEC (struct function_version_info, (num_versions));
20374 unsigned int actual_versions = 0;
20376 for (tree version_decl : *fndecls)
20378 aarch64_fmv_feature_mask feature_mask;
20379 /* Get attribute string, parse it and find the right features. */
20380 feature_mask = get_feature_mask_for_version (version_decl);
20381 function_versions [actual_versions].version_decl = version_decl;
20382 function_versions [actual_versions].feature_mask = feature_mask;
20383 actual_versions++;
20386 auto compare_feature_version_info = [](const void *p1, const void *p2) {
20387 const function_version_info v1 = *(const function_version_info *)p1;
20388 const function_version_info v2 = *(const function_version_info *)p2;
20389 return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
20392 /* Sort the versions according to descending order of dispatch priority. */
20393 qsort (function_versions, actual_versions,
20394 sizeof (struct function_version_info), compare_feature_version_info);
20396 for (unsigned int i = 0; i < actual_versions; ++i)
20397 *empty_bb = add_condition_to_bb (dispatch_decl,
20398 function_versions[i].version_decl,
20399 function_versions[i].feature_mask,
20400 mask_var,
20401 *empty_bb);
20403 free (function_versions);
20404 return 0;
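/* Taken together with add_condition_to_bb, the generated resolver behaves
   roughly like the following pseudo-C (names and masks invented):

     void *resolver (uint64_t hwcap, const __ifunc_arg_t *arg)
     {
       __init_cpu_features_resolver (hwcap, arg);
       unsigned long long mask = ~__aarch64_cpu_features.features;
       if ((mask & MASK_HIGHEST) == 0) return foo_highest;
       if ((mask & MASK_NEXT) == 0) return foo_next;
       return foo_default;
     }

   with the versions tested in decreasing dispatch priority.  */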
20407 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20409 tree
20410 aarch64_generate_version_dispatcher_body (void *node_p)
20412 tree resolver_decl;
20413 basic_block empty_bb;
20414 tree default_ver_decl;
20415 struct cgraph_node *versn;
20416 struct cgraph_node *node;
20418 struct cgraph_function_version_info *node_version_info = NULL;
20419 struct cgraph_function_version_info *versn_info = NULL;
20421 node = (cgraph_node *)node_p;
20423 node_version_info = node->function_version ();
20424 gcc_assert (node->dispatcher_function
20425 && node_version_info != NULL);
20427 if (node_version_info->dispatcher_resolver)
20428 return node_version_info->dispatcher_resolver;
20430 /* The first version in the chain corresponds to the default version. */
20431 default_ver_decl = node_version_info->next->this_node->decl;
20433 /* node is going to be an alias, so remove the finalized bit. */
20434 node->definition = false;
20436 resolver_decl = make_resolver_func (default_ver_decl,
20437 node->decl, &empty_bb);
20439 node_version_info->dispatcher_resolver = resolver_decl;
20441 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
20443 auto_vec<tree, 2> fn_ver_vec;
20445 for (versn_info = node_version_info->next; versn_info;
20446 versn_info = versn_info->next)
20448 versn = versn_info->this_node;
20449 /* Check for virtual functions here again, as by this time it should
20450 have been determined if this function needs a vtable index or
20451 not. This happens for methods in derived classes that override
20452 virtual methods in base classes but are not explicitly marked as
20453 virtual. */
20454 if (DECL_VINDEX (versn->decl))
20455 sorry ("virtual function multiversioning not supported");
20457 fn_ver_vec.safe_push (versn->decl);
20460 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
20461 cgraph_edge::rebuild_edges ();
20462 pop_cfun ();
20464 /* Fix up symbol names. First we need to obtain the base name, which may
20465 have already been mangled. */
20466 tree base_name = get_suffixed_assembler_name (default_ver_decl, "");
20468 /* We need to redo the version mangling on the non-default versions for the
20469 target_clones case. Redoing the mangling for the target_version case is
20470 redundant but does no harm. We need to skip the default version, because
20471 expand_clones will append ".default" later; fortunately that suffix is the
20472 one we want anyway. */
20473 for (versn_info = node_version_info->next->next; versn_info;
20474 versn_info = versn_info->next)
20476 tree version_decl = versn_info->this_node->decl;
20477 tree name = aarch64_mangle_decl_assembler_name (version_decl,
20478 base_name);
20479 symtab->change_decl_assembler_name (version_decl, name);
20482 /* We also need to use the base name for the ifunc declaration. */
20483 symtab->change_decl_assembler_name (node->decl, base_name);
20485 return resolver_decl;
20488 /* Make a dispatcher declaration for the multi-versioned function DECL.
20489 Calls to the function DECL will be replaced with calls to the dispatcher
20490 by the front-end. Returns the decl of the dispatcher function. */
20492 tree
20493 aarch64_get_function_versions_dispatcher (void *decl)
20495 tree fn = (tree) decl;
20496 struct cgraph_node *node = NULL;
20497 struct cgraph_node *default_node = NULL;
20498 struct cgraph_function_version_info *node_v = NULL;
20499 struct cgraph_function_version_info *first_v = NULL;
20501 tree dispatch_decl = NULL;
20503 struct cgraph_function_version_info *default_version_info = NULL;
20505 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
20507 node = cgraph_node::get (fn);
20508 gcc_assert (node != NULL);
20510 node_v = node->function_version ();
20511 gcc_assert (node_v != NULL);
20513 if (node_v->dispatcher_resolver != NULL)
20514 return node_v->dispatcher_resolver;
20516 /* Find the default version and make it the first node. */
20517 first_v = node_v;
20518 /* Go to the beginning of the chain. */
20519 while (first_v->prev != NULL)
20520 first_v = first_v->prev;
20521 default_version_info = first_v;
20522 while (default_version_info != NULL)
20524 if (get_feature_mask_for_version
20525 (default_version_info->this_node->decl) == 0ULL)
20526 break;
20527 default_version_info = default_version_info->next;
20530 /* If there is no default node, just return NULL. */
20531 if (default_version_info == NULL)
20532 return NULL;
20534 /* Make default info the first node. */
20535 if (first_v != default_version_info)
20537 default_version_info->prev->next = default_version_info->next;
20538 if (default_version_info->next)
20539 default_version_info->next->prev = default_version_info->prev;
20540 first_v->prev = default_version_info;
20541 default_version_info->next = first_v;
20542 default_version_info->prev = NULL;
20545 default_node = default_version_info->this_node;
20547 if (targetm.has_ifunc_p ())
20549 struct cgraph_function_version_info *it_v = NULL;
20550 struct cgraph_node *dispatcher_node = NULL;
20551 struct cgraph_function_version_info *dispatcher_version_info = NULL;
20553 /* Right now, the dispatching is done via ifunc. */
20554 dispatch_decl = make_dispatcher_decl (default_node->decl);
20555 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
20557 dispatcher_node = cgraph_node::get_create (dispatch_decl);
20558 gcc_assert (dispatcher_node != NULL);
20559 dispatcher_node->dispatcher_function = 1;
20560 dispatcher_version_info
20561 = dispatcher_node->insert_new_function_version ();
20562 dispatcher_version_info->next = default_version_info;
20563 dispatcher_node->definition = 1;
20565 /* Set the dispatcher for all the versions. */
20566 it_v = default_version_info;
20567 while (it_v != NULL)
20569 it_v->dispatcher_resolver = dispatch_decl;
20570 it_v = it_v->next;
20573 else
20575 error_at (DECL_SOURCE_LOCATION (default_node->decl),
20576 "multiversioning needs %<ifunc%> which is not supported "
20577 "on this target");
20580 return dispatch_decl;
20583 /* This function returns true if FN1 and FN2 are versions of the same function,
20584 that is, the target_version attributes of the function decls are different.
20585 This assumes that FN1 and FN2 have the same signature. */
20587 bool
20588 aarch64_common_function_versions (tree fn1, tree fn2)
20590 if (TREE_CODE (fn1) != FUNCTION_DECL
20591 || TREE_CODE (fn2) != FUNCTION_DECL)
20592 return false;
20594 return (aarch64_compare_version_priority (fn1, fn2) != 0);
20597 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20598 rather than an opt-in list. */
20600 static bool
20601 aarch64_function_attribute_inlinable_p (const_tree fndecl)
20603 /* A function that has local SME state cannot be inlined into its caller,
20604 since we only support managing PSTATE.ZA switches at function scope. */
20605 return (!aarch64_fndecl_has_new_state (fndecl, "za")
20606 && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
20609 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20610 tri-bool options (yes, no, don't care) and the default value is
20611 DEF, determine whether to reject inlining. */
20613 static bool
20614 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
20615 int dont_care, int def)
20617 /* If the callee doesn't care, always allow inlining. */
20618 if (callee == dont_care)
20619 return true;
20621 /* If the caller doesn't care, always allow inlining. */
20622 if (caller == dont_care)
20623 return true;
20625 /* Otherwise, allow inlining if either the callee and caller values
20626 agree, or if the callee is using the default value. */
20627 return (callee == caller || callee == def);
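/* For instance, with DONT_CARE == 2 and DEF == 1 (as in the
   -momit-leaf-frame-pointer check below): callee == 2 always allows
   inlining, callee == 1 inlines into any caller because it matches the
   default, and callee == 0 only inlines into callers that also use 0 or
   don't care.  */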
20630 /* Bit allocations for ipa_fn_summary::target_info. */
20632 /* Set if the function contains a stmt that relies on the function's
20633 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20634 Not meaningful for streaming-compatible functions. */
20635 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
20637 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20638 have ZA state. */
20639 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
20640 constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20642 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20644 static bool
20645 aarch64_need_ipa_fn_target_info (const_tree, unsigned int &)
20647 /* We could in principle skip this for streaming-compatible functions
20648 that have ZA state, but that's a rare combination. */
20649 return true;
20652 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20654 static bool
20655 aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
20657 if (auto *ga = dyn_cast<const gasm *> (stmt))
20659 /* We don't know what the asm does, so conservatively assume that
20660 it requires the function's current SM mode. */
20661 info |= AARCH64_IPA_SM_FIXED;
20662 for (unsigned int i = 0; i < gimple_asm_nclobbers (ga); ++i)
20664 tree op = gimple_asm_clobber_op (ga, i);
20665 const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
20666 if (strcmp (clobber, "za") == 0)
20667 info |= AARCH64_IPA_CLOBBERS_ZA;
20668 if (strcmp (clobber, "zt0") == 0)
20669 info |= AARCH64_IPA_CLOBBERS_ZT0;
20672 if (auto *call = dyn_cast<const gcall *> (stmt))
20674 if (gimple_call_builtin_p (call, BUILT_IN_MD))
20676 /* The attributes on AArch64 builtins are supposed to be accurate.
20677 If the function isn't marked streaming-compatible then it
20678 needs whichever SM mode it selects. */
20679 tree decl = gimple_call_fndecl (call);
20680 if (aarch64_fndecl_pstate_sm (decl) != 0)
20681 info |= AARCH64_IPA_SM_FIXED;
20684 return true;
20687 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20688 to inline CALLEE into CALLER based on target-specific info.
20689 Make sure that the caller and callee have compatible architectural
20690 features. Then go through the other possible target attributes
20691 and see if they can block inlining. Try not to reject always_inline
20692 callees unless they are incompatible architecturally. */
20694 static bool
20695 aarch64_can_inline_p (tree caller, tree callee)
20697 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
20698 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
20700 struct cl_target_option *caller_opts
20701 = TREE_TARGET_OPTION (caller_tree ? caller_tree
20702 : target_option_default_node);
20704 struct cl_target_option *callee_opts
20705 = TREE_TARGET_OPTION (callee_tree ? callee_tree
20706 : target_option_default_node);
20708 /* Callee's ISA flags should be a subset of the caller's. */
20709 auto caller_asm_isa = (caller_opts->x_aarch64_asm_isa_flags
20710 & ~AARCH64_FL_ISA_MODES);
20711 auto callee_asm_isa = (callee_opts->x_aarch64_asm_isa_flags
20712 & ~AARCH64_FL_ISA_MODES);
20713 if (callee_asm_isa & ~caller_asm_isa)
20714 return false;
20716 auto caller_isa = (caller_opts->x_aarch64_isa_flags
20717 & ~AARCH64_FL_ISA_MODES);
20718 auto callee_isa = (callee_opts->x_aarch64_isa_flags
20719 & ~AARCH64_FL_ISA_MODES);
20720 if (callee_isa & ~caller_isa)
20721 return false;
20723 /* Return true if the callee might have target_info property PROPERTY.
20724 The answer must be true unless we have positive proof to the contrary. */
20725 auto callee_has_property = [&](unsigned int property)
20727 if (ipa_fn_summaries)
20728 if (auto *summary = ipa_fn_summaries->get (cgraph_node::get (callee)))
20729 if (!(summary->target_info & property))
20730 return false;
20731 return true;
20734 /* Streaming-compatible code can be inlined into functions with any
20735 PSTATE.SM mode. Otherwise the caller and callee must agree on
20736 PSTATE.SM mode, unless we can prove that the callee is naturally
20737 streaming-compatible. */
20738 auto caller_sm = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20739 auto callee_sm = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20740 if (callee_sm
20741 && caller_sm != callee_sm
20742 && callee_has_property (AARCH64_IPA_SM_FIXED))
20743 return false;
20745 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20746 functions from being inlined into others. We also need to prevent
20747 inlining of shared-ZA functions into functions without ZA state,
20748 since this is an error condition.
20750 The only other problematic case for ZA is inlining a function that
20751 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20752 auto caller_za = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20753 auto callee_za = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20754 if (!caller_za && callee_za)
20755 return false;
20756 if (!callee_za
20757 && aarch64_fndecl_has_state (caller, "za")
20758 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
20759 return false;
20760 if (!callee_za
20761 && aarch64_fndecl_has_state (caller, "zt0")
20762 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
20763 return false;
20765 /* Allow non-strict-aligned functions to be inlined into strict-aligned
20766 ones, but not the other way around. */
20767 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
20768 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
20769 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
20770 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
20771 return false;
20773 bool always_inline = lookup_attribute ("always_inline",
20774 DECL_ATTRIBUTES (callee));
20776 /* If the architectural features match up and the callee is always_inline
20777 then the other attributes don't matter. */
20778 if (always_inline)
20779 return true;
20781 if (caller_opts->x_aarch64_cmodel_var
20782 != callee_opts->x_aarch64_cmodel_var)
20783 return false;
20785 if (caller_opts->x_aarch64_tls_dialect
20786 != callee_opts->x_aarch64_tls_dialect)
20787 return false;
20789 /* Honour explicit requests to work around errata. */
20790 if (!aarch64_tribools_ok_for_inlining_p (
20791 caller_opts->x_aarch64_fix_a53_err835769,
20792 callee_opts->x_aarch64_fix_a53_err835769,
20793 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
20794 return false;
20796 if (!aarch64_tribools_ok_for_inlining_p (
20797 caller_opts->x_aarch64_fix_a53_err843419,
20798 callee_opts->x_aarch64_fix_a53_err843419,
20799 2, TARGET_FIX_ERR_A53_843419))
20800 return false;
20802 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20803 caller and callee and they don't match up, reject inlining. */
20804 if (!aarch64_tribools_ok_for_inlining_p (
20805 caller_opts->x_flag_omit_leaf_frame_pointer,
20806 callee_opts->x_flag_omit_leaf_frame_pointer,
20807 2, 1))
20808 return false;
20810 /* If the callee has specific tuning overrides, respect them. */
20811 if (callee_opts->x_aarch64_override_tune_string != NULL
20812 && caller_opts->x_aarch64_override_tune_string == NULL)
20813 return false;
20815 /* If the user specified tuning override strings for the
20816 caller and callee and they don't match up, reject inlining.
20817 We just do a string compare here, we don't analyze the meaning
20818 of the string, as it would be too costly for little gain. */
20819 if (callee_opts->x_aarch64_override_tune_string
20820 && caller_opts->x_aarch64_override_tune_string
20821 && (strcmp (callee_opts->x_aarch64_override_tune_string,
20822 caller_opts->x_aarch64_override_tune_string) != 0))
20823 return false;
20825 return true;
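/* A concrete case of the ISA subset rule above (extension name
   illustrative): a callee compiled with an extra "+sve" cannot be inlined
   into a caller built without it, since the callee's ISA flags would not
   be a subset of the caller's; the reverse direction is allowed.  */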
20828 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
20829 hasn't been already. */
20831 arm_pcs
20832 aarch64_tlsdesc_abi_id ()
20834 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
20835 if (!tlsdesc_abi.initialized_p ())
20837 HARD_REG_SET full_reg_clobbers;
20838 CLEAR_HARD_REG_SET (full_reg_clobbers);
20839 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
20840 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
20841 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
20842 SET_HARD_REG_BIT (full_reg_clobbers, regno);
20843 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
20845 return ARM_PCS_TLSDESC;
20848 /* Return true if SYMBOL_REF X binds locally. */
20850 static bool
20851 aarch64_symbol_binds_local_p (const_rtx x)
20853 return (SYMBOL_REF_DECL (x)
20854 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
20855 : SYMBOL_REF_LOCAL_P (x));
20858 /* Return true if SYMBOL_REF X is thread-local. */
20859 static bool
20860 aarch64_tls_symbol_p (rtx x)
20862 if (! TARGET_HAVE_TLS)
20863 return false;
20865 x = strip_salt (x);
20866 if (!SYMBOL_REF_P (x))
20867 return false;
20869 return SYMBOL_REF_TLS_MODEL (x) != 0;
20872 /* Classify a TLS symbol into one of the TLS kinds. */
20873 enum aarch64_symbol_type
20874 aarch64_classify_tls_symbol (rtx x)
20876 enum tls_model tls_kind = tls_symbolic_operand_type (x);
20878 switch (tls_kind)
20880 case TLS_MODEL_GLOBAL_DYNAMIC:
20881 case TLS_MODEL_LOCAL_DYNAMIC:
20882 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
20884 case TLS_MODEL_INITIAL_EXEC:
20885 switch (aarch64_cmodel)
20887 case AARCH64_CMODEL_TINY:
20888 case AARCH64_CMODEL_TINY_PIC:
20889 return SYMBOL_TINY_TLSIE;
20890 default:
20891 return SYMBOL_SMALL_TLSIE;
20894 case TLS_MODEL_LOCAL_EXEC:
20895 if (aarch64_tls_size == 12)
20896 return SYMBOL_TLSLE12;
20897 else if (aarch64_tls_size == 24)
20898 return SYMBOL_TLSLE24;
20899 else if (aarch64_tls_size == 32)
20900 return SYMBOL_TLSLE32;
20901 else if (aarch64_tls_size == 48)
20902 return SYMBOL_TLSLE48;
20903 else
20904 gcc_unreachable ();
20906 case TLS_MODEL_EMULATED:
20907 case TLS_MODEL_NONE:
20908 return SYMBOL_FORCE_TO_MEM;
20910 default:
20911 gcc_unreachable ();
20915 /* Return the correct method for accessing X + OFFSET, where X is either
20916 a SYMBOL_REF or LABEL_REF. */
20918 enum aarch64_symbol_type
20919 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
20921 x = strip_salt (x);
20923 if (LABEL_REF_P (x))
20925 switch (aarch64_cmodel)
20927 case AARCH64_CMODEL_LARGE:
20928 return SYMBOL_FORCE_TO_MEM;
20930 case AARCH64_CMODEL_TINY_PIC:
20931 case AARCH64_CMODEL_TINY:
20932 return SYMBOL_TINY_ABSOLUTE;
20934 case AARCH64_CMODEL_SMALL_SPIC:
20935 case AARCH64_CMODEL_SMALL_PIC:
20936 case AARCH64_CMODEL_SMALL:
20937 return SYMBOL_SMALL_ABSOLUTE;
20939 default:
20940 gcc_unreachable ();
20944 if (SYMBOL_REF_P (x))
20946 if (aarch64_tls_symbol_p (x))
20947 return aarch64_classify_tls_symbol (x);
20949 switch (aarch64_cmodel)
20951 case AARCH64_CMODEL_TINY_PIC:
20952 case AARCH64_CMODEL_TINY:
20953 /* With -fPIC non-local symbols use the GOT. For orthogonality
20954 always use the GOT for extern weak symbols. */
20955 if ((flag_pic || SYMBOL_REF_WEAK (x))
20956 && !aarch64_symbol_binds_local_p (x))
20957 return SYMBOL_TINY_GOT;
20959 /* When we retrieve symbol + offset address, we have to make sure
20960 the offset does not cause overflow of the final address. But
20961 we have no way of knowing the address of symbol at compile time
20962 so we can't accurately say if the distance between the PC and
20963 symbol + offset is outside the addressable range of +/-1MB in the
20964 TINY code model. So we limit the maximum offset to +/-64KB and
20965 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
20966 If offset_within_block_p is true we allow larger offsets. */
20967 if (!(IN_RANGE (offset, -0x10000, 0x10000)
20968 || offset_within_block_p (x, offset)))
20969 return SYMBOL_FORCE_TO_MEM;
20971 return SYMBOL_TINY_ABSOLUTE;
20974 case AARCH64_CMODEL_SMALL_SPIC:
20975 case AARCH64_CMODEL_SMALL_PIC:
20976 case AARCH64_CMODEL_SMALL:
20977 if ((flag_pic || SYMBOL_REF_WEAK (x))
20978 && !aarch64_symbol_binds_local_p (x))
20979 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
20980 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
20982 /* Same reasoning as the tiny code model, but the offset cap here is
20983 1MB, allowing +/-3.9GB for the offset to the symbol. */
20984 if (!(IN_RANGE (offset, -0x100000, 0x100000)
20985 || offset_within_block_p (x, offset)))
20986 return SYMBOL_FORCE_TO_MEM;
20988 return SYMBOL_SMALL_ABSOLUTE;
20990 case AARCH64_CMODEL_LARGE:
20991 /* This is alright even in PIC code as the constant
20992 pool reference is always PC relative and within
20993 the same translation unit. */
20994 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
20995 return SYMBOL_SMALL_ABSOLUTE;
20996 else
20997 return SYMBOL_FORCE_TO_MEM;
20999 default:
21000 gcc_unreachable ();
21004 /* By default push everything into the constant pool. */
21005 return SYMBOL_FORCE_TO_MEM;
21008 bool
21009 aarch64_constant_address_p (rtx x)
21011 return (CONSTANT_P (x) && memory_address_p (DImode, x));
21014 bool
21015 aarch64_legitimate_pic_operand_p (rtx x)
21017 poly_int64 offset;
21018 x = strip_offset_and_salt (x, &offset);
21019 if (SYMBOL_REF_P (x))
21020 return false;
21022 return true;
21025 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
21026 that should be rematerialized rather than spilled. */
21028 static bool
21029 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
21031 /* Support CSE and rematerialization of common constants. */
21032 if (CONST_INT_P (x)
21033 || CONST_DOUBLE_P (x))
21034 return true;
21036 /* Only accept variable-length vector constants if they can be
21037 handled directly.
21039 ??? It would be possible (but complex) to handle rematerialization
21040 of other constants via secondary reloads. */
21041 if (!GET_MODE_SIZE (mode).is_constant ())
21042 return aarch64_simd_valid_immediate (x, NULL);
21044 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21045 least be forced to memory and loaded from there. */
21046 if (CONST_VECTOR_P (x))
21047 return !targetm.cannot_force_const_mem (mode, x);
21049 /* Do not allow vector struct mode constants for Advanced SIMD.
21050 We could support 0 and -1 easily, but they need support in
21051 aarch64-simd.md. */
21052 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21053 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21054 return false;
21056 if (GET_CODE (x) == HIGH)
21057 x = XEXP (x, 0);
21059 /* Accept polynomial constants that can be calculated by using the
21060 destination of a move as the sole temporary. Constants that
21061 require a second temporary cannot be rematerialized (they can't be
21062 forced to memory and also aren't legitimate constants). */
21063 poly_int64 offset;
21064 if (poly_int_rtx_p (x, &offset))
21065 return aarch64_offset_temporaries (false, offset) <= 1;
21067 /* If an offset is being added to something else, we need to allow the
21068 base to be moved into the destination register, meaning that there
21069 are no free temporaries for the offset. */
21070 x = strip_offset_and_salt (x, &offset);
21071 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
21072 return false;
21074 /* Do not allow const (plus (anchor_symbol, const_int)). */
21075 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
21076 return false;
21078 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21079 so spilling them is better than rematerialization. */
21080 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
21081 return true;
21083 /* Label references are always constant. */
21084 if (LABEL_REF_P (x))
21085 return true;
21087 return false;
21090 rtx
21091 aarch64_load_tp (rtx target)
21093 if (!target
21094 || GET_MODE (target) != Pmode
21095 || !register_operand (target, Pmode))
21096 target = gen_reg_rtx (Pmode);
21098 /* Can return in any reg. */
21099 emit_insn (gen_aarch64_load_tp_hard (target));
21100 return target;
21103 /* On AAPCS systems, this is the "struct __va_list". */
21104 static GTY(()) tree va_list_type;
21106 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
21107 Return the type to use as __builtin_va_list.
21109 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
21111 struct __va_list
21113 void *__stack;
21114 void *__gr_top;
21115 void *__vr_top;
21116 int __gr_offs;
21117 int __vr_offs;
21118 }; */
21120 static tree
21121 aarch64_build_builtin_va_list (void)
21123 tree va_list_name;
21124 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21126 /* Create the type. */
21127 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
21128 /* Give it the required name. */
21129 va_list_name = build_decl (BUILTINS_LOCATION,
21130 TYPE_DECL,
21131 get_identifier ("__va_list"),
21132 va_list_type);
21133 DECL_ARTIFICIAL (va_list_name) = 1;
21134 TYPE_NAME (va_list_type) = va_list_name;
21135 TYPE_STUB_DECL (va_list_type) = va_list_name;
21137 /* Create the fields. */
21138 f_stack = build_decl (BUILTINS_LOCATION,
21139 FIELD_DECL, get_identifier ("__stack"),
21140 ptr_type_node);
21141 f_grtop = build_decl (BUILTINS_LOCATION,
21142 FIELD_DECL, get_identifier ("__gr_top"),
21143 ptr_type_node);
21144 f_vrtop = build_decl (BUILTINS_LOCATION,
21145 FIELD_DECL, get_identifier ("__vr_top"),
21146 ptr_type_node);
21147 f_groff = build_decl (BUILTINS_LOCATION,
21148 FIELD_DECL, get_identifier ("__gr_offs"),
21149 integer_type_node);
21150 f_vroff = build_decl (BUILTINS_LOCATION,
21151 FIELD_DECL, get_identifier ("__vr_offs"),
21152 integer_type_node);
21154 /* Tell tree-stdarg pass about our internal offset fields.
21155 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
21156 purposes, to identify whether the code is updating the va_list internal
21157 offset fields in an irregular way. */
21158 va_list_gpr_counter_field = f_groff;
21159 va_list_fpr_counter_field = f_vroff;
21161 DECL_ARTIFICIAL (f_stack) = 1;
21162 DECL_ARTIFICIAL (f_grtop) = 1;
21163 DECL_ARTIFICIAL (f_vrtop) = 1;
21164 DECL_ARTIFICIAL (f_groff) = 1;
21165 DECL_ARTIFICIAL (f_vroff) = 1;
21167 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
21168 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
21169 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
21170 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
21171 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
21173 TYPE_FIELDS (va_list_type) = f_stack;
21174 DECL_CHAIN (f_stack) = f_grtop;
21175 DECL_CHAIN (f_grtop) = f_vrtop;
21176 DECL_CHAIN (f_vrtop) = f_groff;
21177 DECL_CHAIN (f_groff) = f_vroff;
21179 /* Compute its layout. */
21180 layout_type (va_list_type);
21182 return va_list_type;
21185 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21186 static void
21187 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
21189 const CUMULATIVE_ARGS *cum;
21190 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21191 tree stack, grtop, vrtop, groff, vroff;
21192 tree t;
21193 int gr_save_area_size = cfun->va_list_gpr_size;
21194 int vr_save_area_size = cfun->va_list_fpr_size;
21195 int vr_offset;
21197 cum = &crtl->args.info;
21198 if (cfun->va_list_gpr_size)
21199 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
21200 cfun->va_list_gpr_size);
21201 if (cfun->va_list_fpr_size)
21202 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
21203 * UNITS_PER_VREG, cfun->va_list_fpr_size);
21205 if (!TARGET_FLOAT)
21207 gcc_assert (cum->aapcs_nvrn == 0);
21208 vr_save_area_size = 0;
21211 f_stack = TYPE_FIELDS (va_list_type_node);
21212 f_grtop = DECL_CHAIN (f_stack);
21213 f_vrtop = DECL_CHAIN (f_grtop);
21214 f_groff = DECL_CHAIN (f_vrtop);
21215 f_vroff = DECL_CHAIN (f_groff);
21217 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
21218 NULL_TREE);
21219 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
21220 NULL_TREE);
21221 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
21222 NULL_TREE);
21223 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
21224 NULL_TREE);
21225 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
21226 NULL_TREE);
21228 /* Emit code to initialize STACK, which points to the next varargs stack
21229 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21230 by named arguments. STACK is 8-byte aligned. */
21231 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
21232 if (cum->aapcs_stack_size > 0)
21233 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
21234 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
21235 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21237 /* Emit code to initialize GRTOP, the top of the GR save area.
21238 virtual_incoming_args_rtx should have been 16 byte aligned. */
21239 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
21240 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
21241 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21243 /* Emit code to initialize VRTOP, the top of the VR save area.
21244 This address is gr_save_area_bytes below GRTOP, rounded
21245 down to the next 16-byte boundary. */
21246 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
21247 vr_offset = ROUND_UP (gr_save_area_size,
21248 STACK_BOUNDARY / BITS_PER_UNIT);
21250 if (vr_offset)
21251 t = fold_build_pointer_plus_hwi (t, -vr_offset);
21252 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
21253 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21255 /* Emit code to initialize GROFF, the offset from GRTOP of the
21256 next GPR argument. */
21257 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
21258 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
21259 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21261 /* Likewise emit code to initialize VROFF, the offset from VRTOP
21262 of the next VR argument. */
21263 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
21264 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
21265 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
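/* Worked example (assuming the usual eight 64-bit GP argument registers,
   eight 128-bit FP/SIMD argument registers, and no va_list size limits):
   for 'void f (int fixed, ...)' one GP register is consumed by the named
   argument, so __gr_offs is initialized to -(8 - 1) * 8 == -56 and
   __vr_offs to -8 * 16 == -128, while __gr_top is the incoming-argument
   pointer and __vr_top sits ROUND_UP (56, 16) == 64 bytes below it.  */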
21268 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21270 static tree
21271 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
21272 gimple_seq *post_p ATTRIBUTE_UNUSED)
21274 tree addr;
21275 bool indirect_p;
21276 bool is_ha; /* is HFA or HVA. */
21277 bool dw_align; /* double-word align. */
21278 machine_mode ag_mode = VOIDmode;
21279 int nregs;
21280 machine_mode mode;
21282 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21283 tree stack, f_top, f_off, off, arg, roundup, on_stack;
21284 HOST_WIDE_INT size, rsize, adjust, align;
21285 tree t, u, cond1, cond2;
21287 indirect_p = pass_va_arg_by_reference (type);
21288 if (indirect_p)
21289 type = build_pointer_type (type);
21291 mode = TYPE_MODE (type);
21293 f_stack = TYPE_FIELDS (va_list_type_node);
21294 f_grtop = DECL_CHAIN (f_stack);
21295 f_vrtop = DECL_CHAIN (f_grtop);
21296 f_groff = DECL_CHAIN (f_vrtop);
21297 f_vroff = DECL_CHAIN (f_groff);
21299 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
21300 f_stack, NULL_TREE);
21301 size = int_size_in_bytes (type);
21303 unsigned int abi_break_gcc_9;
21304 unsigned int abi_break_gcc_13;
21305 unsigned int abi_break_gcc_14;
21306 align
21307 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
21308 &abi_break_gcc_13, &abi_break_gcc_14)
21309 / BITS_PER_UNIT;
21311 dw_align = false;
21312 adjust = 0;
21313 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
21314 &is_ha, false))
21316 /* No frontends can create types with variable-sized modes, so we
21317 shouldn't be asked to pass or return them. */
21318 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
21320 /* TYPE passed in fp/simd registers. */
21321 if (!TARGET_FLOAT)
21322 aarch64_err_no_fpadvsimd (mode);
21324 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
21325 unshare_expr (valist), f_vrtop, NULL_TREE);
21326 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
21327 unshare_expr (valist), f_vroff, NULL_TREE);
21329 rsize = nregs * UNITS_PER_VREG;
21331 if (is_ha)
21333 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
21334 adjust = UNITS_PER_VREG - ag_size;
21336 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21337 && size < UNITS_PER_VREG)
21339 adjust = UNITS_PER_VREG - size;
21342 else
21344 /* TYPE passed in general registers. */
21345 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
21346 unshare_expr (valist), f_grtop, NULL_TREE);
21347 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
21348 unshare_expr (valist), f_groff, NULL_TREE);
21349 rsize = ROUND_UP (size, UNITS_PER_WORD);
21350 nregs = rsize / UNITS_PER_WORD;
21352 if (align <= 8
21353 && abi_break_gcc_13
21354 && warn_psabi
21355 && !bitint_or_aggr_of_bitint_p (type))
21356 inform (input_location, "parameter passing for argument of type "
21357 "%qT changed in GCC 13.1", type);
21359 if (warn_psabi
21360 && abi_break_gcc_14
21361 && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8)
21362 && !bitint_or_aggr_of_bitint_p (type))
21363 inform (input_location, "parameter passing for argument of type "
21364 "%qT changed in GCC 14.1", type);
21366 if (align > 8)
21368 if (abi_break_gcc_9
21369 && warn_psabi
21370 && !bitint_or_aggr_of_bitint_p (type))
21371 inform (input_location, "parameter passing for argument of type "
21372 "%qT changed in GCC 9.1", type);
21373 dw_align = true;
21376 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21377 && size < UNITS_PER_WORD)
21379 adjust = UNITS_PER_WORD - size;
21383 /* Get a local temporary for the field value. */
21384 off = get_initialized_tmp_var (f_off, pre_p, NULL);
21386 /* Emit code to branch if off >= 0. */
21387 t = build2 (GE_EXPR, boolean_type_node, off,
21388 build_int_cst (TREE_TYPE (off), 0));
21389 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
21391 if (dw_align)
21393 /* Emit: offs = (offs + 15) & -16. */
21394 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21395 build_int_cst (TREE_TYPE (off), 15));
21396 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
21397 build_int_cst (TREE_TYPE (off), -16));
21398 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
21400 else
21401 roundup = NULL;
21403 /* Update ap.__[g|v]r_offs */
21404 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21405 build_int_cst (TREE_TYPE (off), rsize));
21406 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
21408 /* String up. */
21409 if (roundup)
21410 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21412 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21413 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
21414 build_int_cst (TREE_TYPE (f_off), 0));
21415 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
21417 /* String up: make sure the assignment happens before the use. */
21418 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
21419 COND_EXPR_ELSE (cond1) = t;
21421 /* Prepare the trees handling the argument that is passed on the stack;
21422 the top level node will store in ON_STACK. */
21423 arg = get_initialized_tmp_var (stack, pre_p, NULL);
21424 if (align > 8)
21426 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21427 t = fold_build_pointer_plus_hwi (arg, 15);
21428 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21429 build_int_cst (TREE_TYPE (t), -16));
21430 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
21432 else
21433 roundup = NULL;
21434 /* Advance ap.__stack */
21435 t = fold_build_pointer_plus_hwi (arg, size + 7);
21436 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21437 build_int_cst (TREE_TYPE (t), -8));
21438 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
21439 /* String up roundup and advance. */
21440 if (roundup)
21441 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21442 /* String up with arg */
21443 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
21444 /* Big-endianness related address adjustment. */
21445 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21446 && size < UNITS_PER_WORD)
21448 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
21449 size_int (UNITS_PER_WORD - size));
21450 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
21453 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
21454 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
21456 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21457 t = off;
21458 if (adjust)
21459 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
21460 build_int_cst (TREE_TYPE (off), adjust));
21462 t = fold_convert (sizetype, t);
21463 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
21465 if (is_ha)
21467 /* type ha; // treat as "struct {ftype field[n];}"
21468 ... [computing offs]
21469 for (i = 0; i < nregs; ++i, offs += 16)
21470 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21471 return ha; */
21472 int i;
21473 tree tmp_ha, field_t, field_ptr_t;
21475 /* Declare a local variable. */
21476 tmp_ha = create_tmp_var_raw (type, "ha");
21477 gimple_add_tmp_var (tmp_ha);
21479 /* Establish the base type. */
21480 switch (ag_mode)
21482 case E_SFmode:
21483 field_t = float_type_node;
21484 field_ptr_t = float_ptr_type_node;
21485 break;
21486 case E_DFmode:
21487 field_t = double_type_node;
21488 field_ptr_t = double_ptr_type_node;
21489 break;
21490 case E_TFmode:
21491 field_t = long_double_type_node;
21492 field_ptr_t = long_double_ptr_type_node;
21493 break;
21494 case E_SDmode:
21495 field_t = dfloat32_type_node;
21496 field_ptr_t = build_pointer_type (dfloat32_type_node);
21497 break;
21498 case E_DDmode:
21499 field_t = dfloat64_type_node;
21500 field_ptr_t = build_pointer_type (dfloat64_type_node);
21501 break;
21502 case E_TDmode:
21503 field_t = dfloat128_type_node;
21504 field_ptr_t = build_pointer_type (dfloat128_type_node);
21505 break;
21506 case E_HFmode:
21507 field_t = aarch64_fp16_type_node;
21508 field_ptr_t = aarch64_fp16_ptr_type_node;
21509 break;
21510 case E_BFmode:
21511 field_t = bfloat16_type_node;
21512 field_ptr_t = aarch64_bf16_ptr_type_node;
21513 break;
21514 case E_V2SImode:
21515 case E_V4SImode:
21517 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
21518 field_t = build_vector_type_for_mode (innertype, ag_mode);
21519 field_ptr_t = build_pointer_type (field_t);
21521 break;
21522 default:
21523 gcc_assert (0);
21526 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
21527 TREE_ADDRESSABLE (tmp_ha) = 1;
21528 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
21529 addr = t;
21530 t = fold_convert (field_ptr_t, addr);
21531 t = build2 (MODIFY_EXPR, field_t,
21532 build1 (INDIRECT_REF, field_t, tmp_ha),
21533 build1 (INDIRECT_REF, field_t, t));
21535 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21536 for (i = 1; i < nregs; ++i)
21538 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
21539 u = fold_convert (field_ptr_t, addr);
21540 u = build2 (MODIFY_EXPR, field_t,
21541 build2 (MEM_REF, field_t, tmp_ha,
21542 build_int_cst (field_ptr_t,
21543 (i *
21544 int_size_in_bytes (field_t)))),
21545 build1 (INDIRECT_REF, field_t, u));
21546 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
21549 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
21550 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
21553 COND_EXPR_ELSE (cond2) = t;
21554 addr = fold_convert (build_pointer_type (type), cond1);
21555 addr = build_va_arg_indirect_ref (addr);
21557 if (indirect_p)
21558 addr = build_va_arg_indirect_ref (addr);
21560 return addr;
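/* Rough sketch (added note) of the tree built above for a GPR-class argument;
   the FP/SIMD case is analogous with __vr_top/__vr_offs, and the align > 8
   rounding of OFF and the HFA copy loop are omitted:

       off = ap.__gr_offs;
       if (off >= 0)
         goto on_stack;
       ap.__gr_offs = off + rsize;          // rsize = size rounded up to 8
       if (ap.__gr_offs > 0)
         goto on_stack;
       addr = ap.__gr_top + off;            // plus any big-endian adjustment
       goto done;
     on_stack:
       addr = ap.__stack;                   // 16-byte realigned if align > 8
       ap.__stack = (ap.__stack + size + 7) & -8;
     done:
       result = *(type *) addr;  */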
21563 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21565 static void
21566 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
21567 const function_arg_info &arg,
21568 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
21570 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
21571 CUMULATIVE_ARGS local_cum;
21572 int gr_saved = cfun->va_list_gpr_size;
21573 int vr_saved = cfun->va_list_fpr_size;
21575 /* The caller has advanced CUM up to, but not beyond, the last named
21576 argument. Advance a local copy of CUM past the last "real" named
21577 argument, to find out how many registers are left over. */
21578 local_cum = *cum;
21579 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
21580 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
21582 /* Find out how many registers we need to save.
21583 Honor the tree-stdarg analysis results. */
21584 if (cfun->va_list_gpr_size)
21585 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
21586 cfun->va_list_gpr_size / UNITS_PER_WORD);
21587 if (cfun->va_list_fpr_size)
21588 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
21589 cfun->va_list_fpr_size / UNITS_PER_VREG);
21591 if (!TARGET_FLOAT)
21593 gcc_assert (local_cum.aapcs_nvrn == 0);
21594 vr_saved = 0;
21597 if (!no_rtl)
21599 if (gr_saved > 0)
21601 rtx ptr, mem;
21603 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21604 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
21605 - gr_saved * UNITS_PER_WORD);
21606 mem = gen_frame_mem (BLKmode, ptr);
21607 set_mem_alias_set (mem, get_varargs_alias_set ());
21609 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
21610 mem, gr_saved);
21612 if (vr_saved > 0)
21614 /* We can't use move_block_from_reg, because it will use
21615 the wrong mode, storing D regs only. */
21616 machine_mode mode = TImode;
21617 int off, i, vr_start;
21619 /* Set OFF to the offset from virtual_incoming_args_rtx of
21620 the first vector register. The VR save area lies below
21621 the GR one, and is aligned to 16 bytes. */
21622 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
21623 STACK_BOUNDARY / BITS_PER_UNIT);
21624 off -= vr_saved * UNITS_PER_VREG;
21626 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
21627 for (i = 0; i < vr_saved; ++i)
21629 rtx ptr, mem;
21631 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
21632 mem = gen_frame_mem (mode, ptr);
21633 set_mem_alias_set (mem, get_varargs_alias_set ());
21634 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
21635 off += UNITS_PER_VREG;
21640 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21641 any complication of having crtl->args.pretend_args_size changed. */
21642 cfun->machine->frame.saved_varargs_size
21643 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
21644 STACK_BOUNDARY / BITS_PER_UNIT)
21645 + vr_saved * UNITS_PER_VREG);
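/* Illustrative layout (added note): with gr_saved == 3 and vr_saved == 2,
   and assuming five GPRs and six VRs were used by named arguments, the save
   area built above sits immediately below virtual_incoming_args_rtx:

       bytes -24 ..  -1   x5-x7, stored by move_block_from_reg
       bytes -64 .. -33   q6-q7, stored as TImode (area 16-byte aligned)

   and saved_varargs_size is ROUND_UP (24, 16) + 2 * 16 == 64 bytes.  */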
21648 static void
21649 aarch64_conditional_register_usage (void)
21651 int i;
21652 if (!TARGET_FLOAT)
21654 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
21656 fixed_regs[i] = 1;
21657 call_used_regs[i] = 1;
21658 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21661 if (!TARGET_SVE)
21662 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
21664 fixed_regs[i] = 1;
21665 call_used_regs[i] = 1;
21668 /* Only allow these registers to be accessed via special patterns. */
21669 CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
21670 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
21671 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
21672 for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
21673 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21675 /* When tracking speculation, we need a couple of call-clobbered registers
21676 to track the speculation state. It would be nice to just use
21677 IP0 and IP1, but currently there are numerous places that just
21678 assume these registers are free for other uses (eg pointer
21679 authentication). */
21680 if (aarch64_track_speculation)
21682 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
21683 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
21684 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21685 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21689 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21691 bool
21692 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
21694 /* For records we're passed a FIELD_DECL, for arrays we're passed
21695 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21696 const_tree type = TREE_TYPE (field_or_array);
21698 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21699 For structures, the "multiple" case is indicated by MODE being
21700 VOIDmode. */
21701 unsigned int num_zr, num_pr;
21702 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
21704 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
21705 return !simple_cst_equal (TYPE_SIZE (field_or_array),
21706 TYPE_SIZE (type));
21707 return mode == VOIDmode;
21710 return default_member_type_forces_blk (field_or_array, mode);
21713 /* Bitmasks that indicate whether earlier versions of GCC would have
21714 taken a different path through the ABI logic. This should result in
21715 a -Wpsabi warning if the earlier path led to a different ABI decision.
21717 WARN_PSABI_EMPTY_CXX17_BASE
21718 Indicates that the type includes an artificial empty C++17 base field
21719 that, prior to GCC 10.1, would prevent the type from being treated as
21720 a HFA or HVA. See PR94383 for details.
21722 WARN_PSABI_NO_UNIQUE_ADDRESS
21723 Indicates that the type includes an empty [[no_unique_address]] field
21724 that, prior to GCC 10.1, would prevent the type from being treated as
21725 a HFA or HVA. */
21726 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
21727 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
21728 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
21730 /* Walk down the type tree of TYPE counting consecutive base elements.
21731 If *MODEP is VOIDmode, then set it to the first valid floating point
21732 type. If a non-floating point type is found, or if a floating point
21733 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21734 otherwise return the count in the sub-tree.
21736 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21737 function has changed its behavior relative to earlier versions of GCC.
21738 Normally the argument should be nonnull and point to a zero-initialized
21739 variable. The function then records whether the ABI decision might
21740 be affected by a known fix to the ABI logic, setting the associated
21741 WARN_PSABI_* bits if so.
21743 When the argument is instead a null pointer, the function tries to
21744 simulate the behavior of GCC before all such ABI fixes were made.
21745 This is useful to check whether the function returns something
21746 different after the ABI fixes. */
21747 static int
21748 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
21749 unsigned int *warn_psabi_flags)
21751 machine_mode mode;
21752 HOST_WIDE_INT size;
21754 if (aarch64_sve::builtin_type_p (type))
21755 return -1;
21757 switch (TREE_CODE (type))
21759 case REAL_TYPE:
21760 mode = TYPE_MODE (type);
21761 if (mode != DFmode && mode != SFmode
21762 && mode != TFmode && mode != HFmode
21763 && mode != SDmode && mode != DDmode && mode != TDmode)
21764 return -1;
21766 if (*modep == VOIDmode)
21767 *modep = mode;
21769 if (*modep == mode)
21770 return 1;
21772 break;
21774 case COMPLEX_TYPE:
21775 mode = TYPE_MODE (TREE_TYPE (type));
21776 if (mode != DFmode && mode != SFmode
21777 && mode != TFmode && mode != HFmode)
21778 return -1;
21780 if (*modep == VOIDmode)
21781 *modep = mode;
21783 if (*modep == mode)
21784 return 2;
21786 break;
21788 case VECTOR_TYPE:
21789 /* Use V2SImode and V4SImode as representatives of all 64-bit
21790 and 128-bit vector types. */
21791 size = int_size_in_bytes (type);
21792 switch (size)
21794 case 8:
21795 mode = V2SImode;
21796 break;
21797 case 16:
21798 mode = V4SImode;
21799 break;
21800 default:
21801 return -1;
21804 if (*modep == VOIDmode)
21805 *modep = mode;
21807 /* Vector modes are considered to be opaque: two vectors are
21808 equivalent for the purposes of being homogeneous aggregates
21809 if they are the same size. */
21810 if (*modep == mode)
21811 return 1;
21813 break;
21815 case ARRAY_TYPE:
21817 int count;
21818 tree index = TYPE_DOMAIN (type);
21820 /* Can't handle incomplete types nor sizes that are not
21821 fixed. */
21822 if (!COMPLETE_TYPE_P (type)
21823 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21824 return -1;
21826 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
21827 warn_psabi_flags);
21828 if (count == -1
21829 || !index
21830 || !TYPE_MAX_VALUE (index)
21831 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
21832 || !TYPE_MIN_VALUE (index)
21833 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
21834 || count < 0)
21835 return -1;
21837 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
21838 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
21840 /* There must be no padding. */
21841 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21842 count * GET_MODE_BITSIZE (*modep)))
21843 return -1;
21845 return count;
21848 case RECORD_TYPE:
21850 int count = 0;
21851 int sub_count;
21852 tree field;
21854 /* Can't handle incomplete types nor sizes that are not
21855 fixed. */
21856 if (!COMPLETE_TYPE_P (type)
21857 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21858 return -1;
21860 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21862 if (TREE_CODE (field) != FIELD_DECL)
21863 continue;
21865 if (DECL_FIELD_ABI_IGNORED (field))
21867 /* See whether this is something that earlier versions of
21868 GCC failed to ignore. */
21869 unsigned int flag;
21870 if (lookup_attribute ("no_unique_address",
21871 DECL_ATTRIBUTES (field)))
21872 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
21873 else if (cxx17_empty_base_field_p (field))
21874 flag = WARN_PSABI_EMPTY_CXX17_BASE;
21875 else
21876 /* No compatibility problem. */
21877 continue;
21879 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
21880 if (warn_psabi_flags)
21882 *warn_psabi_flags |= flag;
21883 continue;
21886 /* A zero-width bitfield may affect layout in some
21887 circumstances, but adds no members. The determination
21888 of whether or not a type is an HFA is performed after
21889 layout is complete, so if the type still looks like an
21890 HFA afterwards, it is still classed as one. This is
21891 potentially an ABI break for the hard-float ABI. */
21892 else if (DECL_BIT_FIELD (field)
21893 && integer_zerop (DECL_SIZE (field)))
21895 /* Prior to GCC-12 these fields were stripped early,
21896 hiding them from the back-end entirely and
21897 resulting in the correct behaviour for argument
21898 passing. Simulate that old behaviour without
21899 generating a warning. */
21900 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
21901 continue;
21902 if (warn_psabi_flags)
21904 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
21905 continue;
21909 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21910 warn_psabi_flags);
21911 if (sub_count < 0)
21912 return -1;
21913 count += sub_count;
21916 /* There must be no padding. */
21917 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21918 count * GET_MODE_BITSIZE (*modep)))
21919 return -1;
21921 return count;
21924 case UNION_TYPE:
21925 case QUAL_UNION_TYPE:
21927 /* These aren't very interesting except in a degenerate case. */
21928 int count = 0;
21929 int sub_count;
21930 tree field;
21932 /* Can't handle incomplete types nor sizes that are not
21933 fixed. */
21934 if (!COMPLETE_TYPE_P (type)
21935 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21936 return -1;
21938 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21940 if (TREE_CODE (field) != FIELD_DECL)
21941 continue;
21943 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21944 warn_psabi_flags);
21945 if (sub_count < 0)
21946 return -1;
21947 count = count > sub_count ? count : sub_count;
21950 /* There must be no padding. */
21951 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21952 count * GET_MODE_BITSIZE (*modep)))
21953 return -1;
21955 return count;
21958 default:
21959 break;
21962 return -1;
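/* Illustrative examples (added note, not exhaustive):

     struct { float x, y, z; }        -> 3, *modep == SFmode
     double[2]                        -> 2, *modep == DFmode
     _Complex double                  -> 2, *modep == DFmode
     struct { float f; double d; }    -> -1 (mixed base types)
     struct { float f; int i; }       -> -1 (non-floating-point member)

   The caller (aarch64_vfp_is_call_or_return_candidate) additionally limits
   HFAs/HVAs to HA_MAX_NUM_FLDS members.  */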
21965 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
21966 type as described in AAPCS64 \S 4.1.2.
21968 See the comment above aarch64_composite_type_p for the notes on MODE. */
21970 static bool
21971 aarch64_short_vector_p (const_tree type,
21972 machine_mode mode)
21974 poly_int64 size = -1;
21976 if (type && VECTOR_TYPE_P (type))
21978 if (aarch64_sve::builtin_type_p (type))
21979 return false;
21980 size = int_size_in_bytes (type);
21982 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
21983 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
21985 /* The containing "else if" is too loose: it means that we look at TYPE
21986 if the type is a vector type (good), but that we otherwise ignore TYPE
21987 and look only at the mode. This is wrong because the type describes
21988 the language-level information whereas the mode is purely an internal
21989 GCC concept. We can therefore reach here for types that are not
21990 vectors in the AAPCS64 sense.
21992 We can't "fix" that for the traditional Advanced SIMD vector modes
21993 without breaking backwards compatibility. However, there's no such
21994 baggage for the structure modes, which were introduced in GCC 12. */
21995 if (aarch64_advsimd_struct_mode_p (mode))
21996 return false;
21998 /* For similar reasons, rely only on the type, not the mode, when
21999 processing SVE types. */
22000 if (type && aarch64_some_values_include_pst_objects_p (type))
22001 /* Leave later code to report an error if SVE is disabled. */
22002 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
22003 else
22004 size = GET_MODE_SIZE (mode);
22006 if (known_eq (size, 8) || known_eq (size, 16))
22008 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
22009 they are being treated as scalable AAPCS64 types. */
22010 gcc_assert (!aarch64_sve_mode_p (mode)
22011 && !aarch64_advsimd_struct_mode_p (mode));
22012 return true;
22014 return false;
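/* Illustrative examples (added note): the 8-byte and 16-byte arm_neon.h
   vector types such as int32x2_t and float32x4_t are short vectors in the
   AAPCS64 sense, whereas SVE types (svint32_t, ...) and the Advanced SIMD
   tuple modes used for register pairs/triples/quads are not.  */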
22017 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
22018 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
22019 array types. The C99 floating-point complex types are also considered
22020 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
22021 types, which are GCC extensions and out of the scope of AAPCS64, are
22022 treated as composite types here as well.
22024 Note that MODE itself is not sufficient in determining whether a type
22025 is such a composite type or not. This is because
22026 stor-layout.cc:compute_record_mode may have already changed the MODE
22027 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
22028 structure with only one field may have its MODE set to the mode of the
22029 field. Also an integer mode whose size matches the size of the
22030 RECORD_TYPE type may be used to substitute the original mode
22031 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
22032 solely relied on. */
22034 static bool
22035 aarch64_composite_type_p (const_tree type,
22036 machine_mode mode)
22038 if (aarch64_short_vector_p (type, mode))
22039 return false;
22041 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
22042 return true;
22044 if (type
22045 && TREE_CODE (type) == BITINT_TYPE
22046 && int_size_in_bytes (type) > 16)
22047 return true;
22049 if (mode == BLKmode
22050 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
22051 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
22052 return true;
22054 return false;
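/* Illustrative examples (added note): aggregates (structs, unions, arrays),
   _Complex float/double, complex integer types and _BitInt types wider than
   16 bytes are composite; a plain double or a short vector such as
   int32x4_t is not.  */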
22057 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22058 shall be passed or returned in simd/fp register(s) (providing these
22059 parameter passing registers are available).
22061 Upon successful return, *COUNT returns the number of needed registers,
22062 *BASE_MODE returns the mode of the individual register and when IS_HA
22063 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22064 floating-point aggregate or a homogeneous short-vector aggregate.
22066 SILENT_P is true if the function should refrain from reporting any
22067 diagnostics. This should only be used if the caller is certain that
22068 any ABI decisions would eventually come through this function with
22069 SILENT_P set to false. */
22071 static bool
22072 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
22073 const_tree type,
22074 machine_mode *base_mode,
22075 int *count,
22076 bool *is_ha,
22077 bool silent_p)
22079 if (is_ha != NULL) *is_ha = false;
22081 machine_mode new_mode = VOIDmode;
22082 bool composite_p = aarch64_composite_type_p (type, mode);
22084 if ((!composite_p
22085 && (GET_MODE_CLASS (mode) == MODE_FLOAT
22086 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
22087 || aarch64_short_vector_p (type, mode))
22089 *count = 1;
22090 new_mode = mode;
22092 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
22094 if (is_ha != NULL) *is_ha = true;
22095 *count = 2;
22096 new_mode = GET_MODE_INNER (mode);
22098 else if (type && composite_p)
22100 unsigned int warn_psabi_flags = 0;
22101 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
22102 &warn_psabi_flags);
22103 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
22105 static unsigned last_reported_type_uid;
22106 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
22107 int alt;
22108 if (!silent_p
22109 && warn_psabi
22110 && warn_psabi_flags
22111 && uid != last_reported_type_uid
22112 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
22113 != ag_count))
22115 const char *url10
22116 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
22117 const char *url12
22118 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
22119 gcc_assert (alt == -1);
22120 last_reported_type_uid = uid;
22121 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22122 qualification. */
22123 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
22124 inform (input_location, "parameter passing for argument of "
22125 "type %qT with %<[[no_unique_address]]%> members "
22126 "changed %{in GCC 10.1%}",
22127 TYPE_MAIN_VARIANT (type), url10);
22128 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
22129 inform (input_location, "parameter passing for argument of "
22130 "type %qT when C++17 is enabled changed to match "
22131 "C++14 %{in GCC 10.1%}",
22132 TYPE_MAIN_VARIANT (type), url10);
22133 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
22134 inform (input_location, "parameter passing for argument of "
22135 "type %qT changed %{in GCC 12.1%}",
22136 TYPE_MAIN_VARIANT (type), url12);
22139 if (is_ha != NULL) *is_ha = true;
22140 *count = ag_count;
22142 else
22143 return false;
22145 else
22146 return false;
22148 gcc_assert (!aarch64_sve_mode_p (new_mode));
22149 *base_mode = new_mode;
22150 return true;
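/* Illustrative examples (added note):

     double                    -> *count 1, *base_mode DFmode, *is_ha false
     _Complex float            -> *count 2, *base_mode SFmode, *is_ha true
     struct { double d[3]; }   -> *count 3, *base_mode DFmode, *is_ha true
     struct { float f[5]; }    -> not a candidate (more than HA_MAX_NUM_FLDS
                                  members)  */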
22153 /* Implement TARGET_STRUCT_VALUE_RTX. */
22155 static rtx
22156 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
22157 int incoming ATTRIBUTE_UNUSED)
22159 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
22162 /* Implements target hook vector_mode_supported_p. */
22163 static bool
22164 aarch64_vector_mode_supported_p (machine_mode mode)
22166 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22167 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22170 /* Implements target hook vector_mode_supported_any_target_p. */
22171 static bool
22172 aarch64_vector_mode_supported_any_target_p (machine_mode mode)
22174 unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
22175 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22178 /* Return the full-width SVE vector mode for element mode MODE, if one
22179 exists. */
22180 opt_machine_mode
22181 aarch64_full_sve_mode (scalar_mode mode)
22183 switch (mode)
22185 case E_DFmode:
22186 return VNx2DFmode;
22187 case E_SFmode:
22188 return VNx4SFmode;
22189 case E_HFmode:
22190 return VNx8HFmode;
22191 case E_BFmode:
22192 return VNx8BFmode;
22193 case E_DImode:
22194 return VNx2DImode;
22195 case E_SImode:
22196 return VNx4SImode;
22197 case E_HImode:
22198 return VNx8HImode;
22199 case E_QImode:
22200 return VNx16QImode;
22201 default:
22202 return opt_machine_mode ();
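/* For example (added note): aarch64_full_sve_mode (SFmode).require () is
   VNx4SFmode and aarch64_full_sve_mode (QImode).require () is VNx16QImode;
   element modes with no SVE container yield an empty opt_machine_mode.  */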
22206 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
22207 if it exists. */
22208 opt_machine_mode
22209 aarch64_vq_mode (scalar_mode mode)
22211 switch (mode)
22213 case E_DFmode:
22214 return V2DFmode;
22215 case E_SFmode:
22216 return V4SFmode;
22217 case E_HFmode:
22218 return V8HFmode;
22219 case E_BFmode:
22220 return V8BFmode;
22221 case E_SImode:
22222 return V4SImode;
22223 case E_HImode:
22224 return V8HImode;
22225 case E_QImode:
22226 return V16QImode;
22227 case E_DImode:
22228 return V2DImode;
22229 default:
22230 return opt_machine_mode ();
22234 /* Return appropriate SIMD container
22235 for MODE within a vector of WIDTH bits. */
22236 static machine_mode
22237 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
22239 if (TARGET_SVE
22240 && maybe_ne (width, 128)
22241 && known_eq (width, BITS_PER_SVE_VECTOR))
22242 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22244 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
22245 if (TARGET_BASE_SIMD)
22247 if (known_eq (width, 128))
22248 return aarch64_vq_mode (mode).else_mode (word_mode);
22249 else
22250 switch (mode)
22252 case E_SFmode:
22253 return V2SFmode;
22254 case E_HFmode:
22255 return V4HFmode;
22256 case E_BFmode:
22257 return V4BFmode;
22258 case E_SImode:
22259 return V2SImode;
22260 case E_HImode:
22261 return V4HImode;
22262 case E_QImode:
22263 return V8QImode;
22264 default:
22265 break;
22268 return word_mode;
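/* For example (added note): aarch64_simd_container_mode (SImode, 128) is
   V4SImode and aarch64_simd_container_mode (SImode, 64) is V2SImode; with
   SVE enabled and WIDTH equal to BITS_PER_SVE_VECTOR (when that is not 128)
   the result is VNx4SImode.  Anything else falls back to word_mode.  */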
22271 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22272 and return whether the SVE mode should be preferred over the
22273 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22274 static bool
22275 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
22277 /* Take into account the aarch64-autovec-preference param if non-zero. */
22278 bool only_asimd_p = aarch64_autovec_preference == 1;
22279 bool only_sve_p = aarch64_autovec_preference == 2;
22281 if (only_asimd_p)
22282 return false;
22283 if (only_sve_p)
22284 return true;
22286 /* The preference in case of a tie in costs. */
22287 bool prefer_asimd = aarch64_autovec_preference == 3;
22288 bool prefer_sve = aarch64_autovec_preference == 4;
22290 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
22291 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
22292 /* If the CPU information does not have an SVE width registered use the
22293 generic poly_int comparison that prefers SVE. If a preference is
22294 explicitly requested avoid this path. */
22295 if (aarch64_tune_params.sve_width == SVE_SCALABLE
22296 && !prefer_asimd
22297 && !prefer_sve)
22298 return maybe_gt (nunits_sve, nunits_asimd);
22300 /* Otherwise estimate the runtime width of the modes involved. */
22301 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
22302 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
22304 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22305 is clearly wider. */
22306 if (prefer_sve)
22307 return est_sve >= est_asimd;
22308 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22309 is clearly wider. */
22310 if (prefer_asimd)
22311 return est_sve > est_asimd;
22313 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22314 return est_sve > est_asimd;
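/* Summary of the aarch64-autovec-preference values handled above (added
   note restating the code): 0 (the default) estimates and compares the mode
   widths, preferring Advanced SIMD on a tie; 1 uses Advanced SIMD only;
   2 uses SVE only; 3 prefers Advanced SIMD on a tie; 4 prefers SVE on a
   tie.  */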
22317 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22318 static machine_mode
22319 aarch64_preferred_simd_mode (scalar_mode mode)
22321 /* Take into account explicit auto-vectorization ISA preferences through
22322 aarch64_cmp_autovec_modes. */
22323 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
22324 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22325 if (TARGET_SIMD)
22326 return aarch64_vq_mode (mode).else_mode (word_mode);
22327 return word_mode;
22330 /* Return a list of possible vector sizes for the vectorizer
22331 to iterate over. */
22332 static unsigned int
22333 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
22335 static const machine_mode sve_modes[] = {
22336 /* Try using full vectors for all element types. */
22337 VNx16QImode,
22339 /* Try using 16-bit containers for 8-bit elements and full vectors
22340 for wider elements. */
22341 VNx8QImode,
22343 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22344 full vectors for wider elements. */
22345 VNx4QImode,
22347 /* Try using 64-bit containers for all element types. */
22348 VNx2QImode
22351 static const machine_mode advsimd_modes[] = {
22352 /* Try using 128-bit vectors for all element types. */
22353 V16QImode,
22355 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22356 for wider elements. */
22357 V8QImode,
22359 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22360 for wider elements.
22362 TODO: We could support a limited form of V4QImode too, so that
22363 we use 32-bit vectors for 8-bit elements. */
22364 V4HImode,
22366 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22367 for 64-bit elements.
22369 TODO: We could similarly support limited forms of V2QImode and V2HImode
22370 for this case. */
22371 V2SImode
22374 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22375 This is because:
22377 - If we can't use N-byte Advanced SIMD vectors then the placement
22378 doesn't matter; we'll just continue as though the Advanced SIMD
22379 entry didn't exist.
22381 - If an SVE main loop with N bytes ends up being cheaper than an
22382 Advanced SIMD main loop with N bytes then by default we'll replace
22383 the Advanced SIMD version with the SVE one.
22385 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22386 than an SVE main loop with N bytes then by default we'll try to
22387 use the SVE loop to vectorize the epilogue instead. */
22389 bool only_asimd_p = aarch64_autovec_preference == 1;
22390 bool only_sve_p = aarch64_autovec_preference == 2;
22392 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
22393 unsigned int advsimd_i = 0;
22395 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
22397 if (sve_i < ARRAY_SIZE (sve_modes)
22398 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
22399 advsimd_modes[advsimd_i]))
22400 modes->safe_push (sve_modes[sve_i++]);
22401 else
22402 modes->safe_push (advsimd_modes[advsimd_i++]);
22404 while (sve_i < ARRAY_SIZE (sve_modes))
22405 modes->safe_push (sve_modes[sve_i++]);
22407 unsigned int flags = 0;
22408 if (aarch64_vect_compare_costs)
22409 flags |= VECT_COMPARE_COSTS;
22410 return flags;
22413 /* Implement TARGET_MANGLE_TYPE. */
22415 static const char *
22416 aarch64_mangle_type (const_tree type)
22418 /* The AArch64 ABI documents say that "__va_list" has to be
22419 mangled as if it is in the "std" namespace. */
22420 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
22421 return "St9__va_list";
22423 /* Half-precision floating point types. */
22424 if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
22426 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
22427 return NULL;
22428 if (TYPE_MODE (type) == BFmode)
22429 return "u6__bf16";
22430 else
22431 return "Dh";
22434 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22435 builtin types. */
22436 if (TYPE_NAME (type) != NULL)
22438 const char *res;
22439 if ((res = aarch64_general_mangle_builtin_type (type))
22440 || (res = aarch64_sve::mangle_builtin_type (type)))
22441 return res;
22444 /* Use the default mangling. */
22445 return NULL;
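/* Illustrative manglings (added note, following the rules above):
   __builtin_va_list mangles as "St9__va_list", __fp16 as "Dh" and __bf16 as
   "u6__bf16", so e.g. "void f (__fp16);" becomes _Z1fDh.  */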
22448 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22450 static bool
22451 aarch64_verify_type_context (location_t loc, type_context_kind context,
22452 const_tree type, bool silent_p)
22454 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
22457 /* Find the first rtx_insn before insn that will generate an assembly
22458 instruction. */
22460 static rtx_insn *
22461 aarch64_prev_real_insn (rtx_insn *insn)
22463 if (!insn)
22464 return NULL;
22466 do
22468 insn = prev_real_insn (insn);
22470 while (insn && recog_memoized (insn) < 0);
22472 return insn;
22475 static bool
22476 is_madd_op (enum attr_type t1)
22478 unsigned int i;
22479 /* A number of these may be AArch32 only. */
22480 enum attr_type mlatypes[] = {
22481 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
22482 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
22483 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
22486 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
22488 if (t1 == mlatypes[i])
22489 return true;
22492 return false;
22495 /* Check if there is a register dependency between a load and the insn
22496 for which we hold recog_data. */
22498 static bool
22499 dep_between_memop_and_curr (rtx memop)
22501 rtx load_reg;
22502 int opno;
22504 gcc_assert (GET_CODE (memop) == SET);
22506 if (!REG_P (SET_DEST (memop)))
22507 return false;
22509 load_reg = SET_DEST (memop);
22510 for (opno = 1; opno < recog_data.n_operands; opno++)
22512 rtx operand = recog_data.operand[opno];
22513 if (REG_P (operand)
22514 && reg_overlap_mentioned_p (load_reg, operand))
22515 return true;
22518 return false;
22522 /* When working around the Cortex-A53 erratum 835769,
22523 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22524 instruction and has a preceding memory instruction such that a NOP
22525 should be inserted between them. */
22527 bool
22528 aarch64_madd_needs_nop (rtx_insn* insn)
22530 enum attr_type attr_type;
22531 rtx_insn *prev;
22532 rtx body;
22534 if (!TARGET_FIX_ERR_A53_835769)
22535 return false;
22537 if (!INSN_P (insn) || recog_memoized (insn) < 0)
22538 return false;
22540 attr_type = get_attr_type (insn);
22541 if (!is_madd_op (attr_type))
22542 return false;
22544 prev = aarch64_prev_real_insn (insn);
22545 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22546 Restore recog state to INSN to avoid state corruption. */
22547 extract_constrain_insn_cached (insn);
22549 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
22550 return false;
22552 body = single_set (prev);
22554 /* If the previous insn is a memory op and there is no dependency between
22555 it and the DImode madd, emit a NOP between them. If body is NULL then we
22556 have a complex memory operation, probably a load/store pair.
22557 Be conservative for now and emit a NOP. */
22558 if (GET_MODE (recog_data.operand[0]) == DImode
22559 && (!body || !dep_between_memop_and_curr (body)))
22560 return true;
22562 return false;
22567 /* Implement FINAL_PRESCAN_INSN. */
22569 void
22570 aarch64_final_prescan_insn (rtx_insn *insn)
22572 if (aarch64_madd_needs_nop (insn))
22573 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
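/* Illustrative output (added note): with -mfix-cortex-a53-835769, a 64-bit
   multiply-accumulate that directly follows a memory access and has no
   register dependency on it is separated from it, e.g.

       ldr     x1, [x2]
       nop     // between mem op and mult-accumulate
       madd    x0, x3, x4, x5
*/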
22577 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22578 instruction. */
22580 bool
22581 aarch64_sve_index_immediate_p (rtx base_or_step)
22583 return (CONST_INT_P (base_or_step)
22584 && IN_RANGE (INTVAL (base_or_step), -16, 15));
22587 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22588 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22590 bool
22591 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
22593 rtx elt = unwrap_const_vec_duplicate (x);
22594 if (!CONST_INT_P (elt))
22595 return false;
22597 HOST_WIDE_INT val = INTVAL (elt);
22598 if (negate_p)
22599 val = -val;
22600 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
22602 if (val & 0xff)
22603 return IN_RANGE (val, 0, 0xff);
22604 return IN_RANGE (val, 0, 0xff00);
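/* For example (added note): after masking to the element size, 0-255 and
   multiples of 256 up to 65280 are accepted, so "ADD z0.s, z0.s, #255" and
   "#7680" (30 << 8) are representable while "#257" is not and must be
   synthesised some other way.  */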
22607 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22608 instructions when applied to mode MODE. Negate X first if NEGATE_P
22609 is true. */
22611 bool
22612 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
22614 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
22615 return false;
22617 /* After the optional negation, the immediate must be nonnegative.
22618 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22619 instead of SQADD Zn.B, Zn.B, #129. */
22620 rtx elt = unwrap_const_vec_duplicate (x);
22621 return negate_p == (INTVAL (elt) < 0);
22624 /* Return true if X is a valid immediate operand for an SVE logical
22625 instruction such as AND. */
22627 bool
22628 aarch64_sve_bitmask_immediate_p (rtx x)
22630 rtx elt;
22632 return (const_vec_duplicate_p (x, &elt)
22633 && CONST_INT_P (elt)
22634 && aarch64_bitmask_imm (INTVAL (elt),
22635 GET_MODE_INNER (GET_MODE (x))));
22638 /* Return true if X is a valid immediate for the SVE DUP and CPY
22639 instructions. */
22641 bool
22642 aarch64_sve_dup_immediate_p (rtx x)
22644 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
22645 if (!CONST_INT_P (x))
22646 return false;
22648 HOST_WIDE_INT val = INTVAL (x);
22649 if (val & 0xff)
22650 return IN_RANGE (val, -0x80, 0x7f);
22651 return IN_RANGE (val, -0x8000, 0x7f00);
22654 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22655 SIGNED_P says whether the operand is signed rather than unsigned. */
22657 bool
22658 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
22660 x = unwrap_const_vec_duplicate (x);
22661 return (CONST_INT_P (x)
22662 && (signed_p
22663 ? IN_RANGE (INTVAL (x), -16, 15)
22664 : IN_RANGE (INTVAL (x), 0, 127)));
22667 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22668 instruction. Negate X first if NEGATE_P is true. */
22670 bool
22671 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
22673 rtx elt;
22674 REAL_VALUE_TYPE r;
22676 if (!const_vec_duplicate_p (x, &elt)
22677 || !CONST_DOUBLE_P (elt))
22678 return false;
22680 r = *CONST_DOUBLE_REAL_VALUE (elt);
22682 if (negate_p)
22683 r = real_value_negate (&r);
22685 if (real_equal (&r, &dconst1))
22686 return true;
22687 if (real_equal (&r, &dconsthalf))
22688 return true;
22689 return false;
22692 /* Return true if X is a valid immediate operand for an SVE FMUL
22693 instruction. */
22695 bool
22696 aarch64_sve_float_mul_immediate_p (rtx x)
22698 rtx elt;
22700 return (const_vec_duplicate_p (x, &elt)
22701 && CONST_DOUBLE_P (elt)
22702 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
22703 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
22706 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22707 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22708 is nonnull, use it to describe valid immediates. */
22709 static bool
22710 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
22711 simd_immediate_info *info,
22712 enum simd_immediate_check which,
22713 simd_immediate_info::insn_type insn)
22715 /* Try a 4-byte immediate with LSL. */
22716 for (unsigned int shift = 0; shift < 32; shift += 8)
22717 if ((val32 & (0xff << shift)) == val32)
22719 if (info)
22720 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22721 simd_immediate_info::LSL, shift);
22722 return true;
22725 /* Try a 2-byte immediate with LSL. */
22726 unsigned int imm16 = val32 & 0xffff;
22727 if (imm16 == (val32 >> 16))
22728 for (unsigned int shift = 0; shift < 16; shift += 8)
22729 if ((imm16 & (0xff << shift)) == imm16)
22731 if (info)
22732 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
22733 simd_immediate_info::LSL, shift);
22734 return true;
22737 /* Try a 4-byte immediate with MSL, except for cases that MVN
22738 can handle. */
22739 if (which == AARCH64_CHECK_MOV)
22740 for (unsigned int shift = 8; shift < 24; shift += 8)
22742 unsigned int low = (1 << shift) - 1;
22743 if (((val32 & (0xff << shift)) | low) == val32)
22745 if (info)
22746 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22747 simd_immediate_info::MSL, shift);
22748 return true;
22752 return false;
22755 /* Return true if replicating VAL64 is a valid immediate for the
22756 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22757 use it to describe valid immediates. */
22758 static bool
22759 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
22760 simd_immediate_info *info,
22761 enum simd_immediate_check which)
22763 unsigned int val32 = val64 & 0xffffffff;
22764 unsigned int val16 = val64 & 0xffff;
22765 unsigned int val8 = val64 & 0xff;
22767 if (val32 == (val64 >> 32))
22769 if ((which & AARCH64_CHECK_ORR) != 0
22770 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
22771 simd_immediate_info::MOV))
22772 return true;
22774 if ((which & AARCH64_CHECK_BIC) != 0
22775 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
22776 simd_immediate_info::MVN))
22777 return true;
22779 /* Try using a replicated byte. */
22780 if (which == AARCH64_CHECK_MOV
22781 && val16 == (val32 >> 16)
22782 && val8 == (val16 >> 8))
22784 if (info)
22785 *info = simd_immediate_info (QImode, val8);
22786 return true;
22790 /* Try using a bit-to-bytemask. */
22791 if (which == AARCH64_CHECK_MOV)
22793 unsigned int i;
22794 for (i = 0; i < 64; i += 8)
22796 unsigned char byte = (val64 >> i) & 0xff;
22797 if (byte != 0 && byte != 0xff)
22798 break;
22800 if (i == 64)
22802 if (info)
22803 *info = simd_immediate_info (DImode, val64);
22804 return true;
22807 return false;
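/* Illustrative examples (added note): a 64-bit value of 0x0000004100000041
   is accepted as an SImode byte with LSL #0, 0x0041ffff0041ffff matches the
   MSL form (for MOV), and 0x00ff0000ffff00ff is a valid DImode per-byte
   0x00/0xff mask; 0x0000000100000002 repeats neither every 32 bits nor per
   byte and is rejected.  */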
22810 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22811 instruction. If INFO is nonnull, use it to describe valid immediates. */
22813 static bool
22814 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
22815 simd_immediate_info *info)
22817 scalar_int_mode mode = DImode;
22818 unsigned int val32 = val64 & 0xffffffff;
22819 if (val32 == (val64 >> 32))
22821 mode = SImode;
22822 unsigned int val16 = val32 & 0xffff;
22823 if (val16 == (val32 >> 16))
22825 mode = HImode;
22826 unsigned int val8 = val16 & 0xff;
22827 if (val8 == (val16 >> 8))
22828 mode = QImode;
22831 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
22832 if (IN_RANGE (val, -0x80, 0x7f))
22834 /* DUP with no shift. */
22835 if (info)
22836 *info = simd_immediate_info (mode, val);
22837 return true;
22839 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
22841 /* DUP with LSL #8. */
22842 if (info)
22843 *info = simd_immediate_info (mode, val);
22844 return true;
22846 if (aarch64_bitmask_imm (val64, mode))
22848 /* DUPM. */
22849 if (info)
22850 *info = simd_immediate_info (mode, val);
22851 return true;
22853 return false;
22856 /* Return true if X is an UNSPEC_PTRUE constant of the form:
22858 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
22860 where PATTERN is the svpattern as a CONST_INT and where ZERO
22861 is a zero constant of the required PTRUE mode (which can have
22862 fewer elements than X's mode, if zero bits are significant).
22864 If so, and if INFO is nonnull, describe the immediate in INFO. */
22865 bool
22866 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
22868 if (GET_CODE (x) != CONST)
22869 return false;
22871 x = XEXP (x, 0);
22872 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
22873 return false;
22875 if (info)
22877 aarch64_svpattern pattern
22878 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
22879 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
22880 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
22881 *info = simd_immediate_info (int_mode, pattern);
22883 return true;
22886 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
22887 it to describe valid immediates. */
22889 static bool
22890 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
22892 if (aarch64_sve_ptrue_svpattern_p (x, info))
22893 return true;
22895 if (x == CONST0_RTX (GET_MODE (x)))
22897 if (info)
22898 *info = simd_immediate_info (DImode, 0);
22899 return true;
22902 /* Analyze the value as a VNx16BImode. This should be relatively
22903 efficient, since rtx_vector_builder has enough built-in capacity
22904 to store all VLA predicate constants without needing the heap. */
22905 rtx_vector_builder builder;
22906 if (!aarch64_get_sve_pred_bits (builder, x))
22907 return false;
22909 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
22910 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
22912 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
22913 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
22914 if (pattern != AARCH64_NUM_SVPATTERNS)
22916 if (info)
22918 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
22919 *info = simd_immediate_info (int_mode, pattern);
22921 return true;
22924 return false;
22927 /* Return true if OP is a valid SIMD immediate for the operation
22928 described by WHICH. If INFO is nonnull, use it to describe valid
22929 immediates. */
22930 bool
22931 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
22932 enum simd_immediate_check which)
22934 machine_mode mode = GET_MODE (op);
22935 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22936 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
22937 return false;
22939 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
22940 return false;
22942 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
22943 return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
22945 if (vec_flags & VEC_SVE_PRED)
22946 return aarch64_sve_pred_valid_immediate (op, info);
22948 scalar_mode elt_mode = GET_MODE_INNER (mode);
22949 rtx base, step;
22950 unsigned int n_elts;
22951 if (CONST_VECTOR_P (op)
22952 && CONST_VECTOR_DUPLICATE_P (op))
22953 n_elts = CONST_VECTOR_NPATTERNS (op);
22954 else if ((vec_flags & VEC_SVE_DATA)
22955 && const_vec_series_p (op, &base, &step))
22957 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
22958 if (!aarch64_sve_index_immediate_p (base)
22959 || !aarch64_sve_index_immediate_p (step))
22960 return false;
22962 if (info)
22964 /* Get the corresponding container mode. E.g. an INDEX on VNx2SI
22965 should yield two integer values per 128-bit block, meaning
22966 that we need to treat it in the same way as VNx2DI and then
22967 ignore the upper 32 bits of each element. */
22968 elt_mode = aarch64_sve_container_int_mode (mode);
22969 *info = simd_immediate_info (elt_mode, base, step);
22971 return true;
22973 else if (CONST_VECTOR_P (op)
22974 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
22975 /* N_ELTS set above. */;
22976 else
22977 return false;
22979 scalar_float_mode elt_float_mode;
22980 if (n_elts == 1
22981 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
22983 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
22984 if (aarch64_float_const_zero_rtx_p (elt)
22985 || aarch64_float_const_representable_p (elt))
22987 if (info)
22988 *info = simd_immediate_info (elt_float_mode, elt);
22989 return true;
22993 /* If all elements in an SVE vector have the same value, we have a free
22994 choice between using the element mode and using the container mode.
22995 Using the element mode means that unused parts of the vector are
22996 duplicates of the used elements, while using the container mode means
22997 that the unused parts are an extension of the used elements. Using the
22998 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
22999 for its container mode VNx4SI while 0x00000101 isn't.
23001 If not all elements in an SVE vector have the same value, we need the
23002 transition from one element to the next to occur at container boundaries.
23003 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
23004 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
23005 scalar_int_mode elt_int_mode;
23006 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
23007 elt_int_mode = aarch64_sve_container_int_mode (mode);
23008 else
23009 elt_int_mode = int_mode_for_mode (elt_mode).require ();
23011 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
23012 if (elt_size > 8)
23013 return false;
23015 /* Expand the vector constant out into a byte vector, with the least
23016 significant byte of the register first. */
23017 auto_vec<unsigned char, 16> bytes;
23018 bytes.reserve (n_elts * elt_size);
23019 for (unsigned int i = 0; i < n_elts; i++)
23021 /* The vector is provided in gcc endian-neutral fashion.
23022 For aarch64_be Advanced SIMD, it must be laid out in the vector
23023 register in reverse order. */
23024 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
23025 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
23027 if (elt_mode != elt_int_mode)
23028 elt = gen_lowpart (elt_int_mode, elt);
23030 if (!CONST_INT_P (elt))
23031 return false;
23033 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
23034 for (unsigned int byte = 0; byte < elt_size; byte++)
23036 bytes.quick_push (elt_val & 0xff);
23037 elt_val >>= BITS_PER_UNIT;
23041 /* The immediate must repeat every eight bytes. */
23042 unsigned int nbytes = bytes.length ();
23043 for (unsigned i = 8; i < nbytes; ++i)
23044 if (bytes[i] != bytes[i - 8])
23045 return false;
23047 /* Get the repeating 8-byte value as an integer. No endian correction
23048 is needed here because bytes is already in lsb-first order. */
23049 unsigned HOST_WIDE_INT val64 = 0;
23050 for (unsigned int i = 0; i < 8; i++)
23051 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
23052 << (i * BITS_PER_UNIT));
23054 if (vec_flags & VEC_SVE_DATA)
23055 return aarch64_sve_valid_immediate (val64, info);
23056 else
23057 return aarch64_advsimd_valid_immediate (val64, info, which);
23060 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
23061 has a step in the range of INDEX. Return the index expression if so,
23062 otherwise return null. */
23063 rtx
23064 aarch64_check_zero_based_sve_index_immediate (rtx x)
23066 rtx base, step;
23067 if (const_vec_series_p (x, &base, &step)
23068 && base == const0_rtx
23069 && aarch64_sve_index_immediate_p (step))
23070 return step;
23071 return NULL_RTX;
23074 /* Check if immediate shift constants are within range. */
23075 bool
23076 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
23078 x = unwrap_const_vec_duplicate (x);
23079 if (!CONST_INT_P (x))
23080 return false;
23081 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
23082 if (left)
23083 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
23084 else
23085 return IN_RANGE (INTVAL (x), 1, bit_width);
23088 /* Return the bitmask CONST_INT to select the bits required by a zero extract
23089 operation of width WIDTH at bit position POS. */
23091 rtx
23092 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
23094 gcc_assert (CONST_INT_P (width));
23095 gcc_assert (CONST_INT_P (pos));
23097 unsigned HOST_WIDE_INT mask
23098 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
23099 return GEN_INT (mask << UINTVAL (pos));
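/* For example (added note): WIDTH == 8 and POS == 16 give
   ((1 << 8) - 1) << 16 == 0x00ff0000.  */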
23102 bool
23103 aarch64_mov_operand_p (rtx x, machine_mode mode)
23105 if (GET_CODE (x) == HIGH
23106 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
23107 return true;
23109 if (CONST_INT_P (x))
23110 return true;
23112 if (VECTOR_MODE_P (GET_MODE (x)))
23114 /* Require predicate constants to be VNx16BI before RA, so that we
23115 force everything to have a canonical form. */
23116 if (!lra_in_progress
23117 && !reload_completed
23118 && aarch64_sve_pred_mode_p (GET_MODE (x))
23119 && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
23120 && GET_MODE (x) != VNx16BImode)
23121 return false;
23123 return aarch64_simd_valid_immediate (x, NULL);
23126 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
23127 x = strip_salt (x);
23129 /* GOT accesses are valid moves. */
23130 if (SYMBOL_REF_P (x)
23131 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
23132 return true;
23134 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
23135 return true;
23137 if (TARGET_SVE
23138 && (aarch64_sve_cnt_immediate_p (x)
23139 || aarch64_sve_rdvl_immediate_p (x)))
23140 return true;
23142 if (aarch64_rdsvl_immediate_p (x))
23143 return true;
23145 return aarch64_classify_symbolic_expression (x)
23146 == SYMBOL_TINY_ABSOLUTE;
23149 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
23150 caches instructions that set up such registers, so that they can be
23151 reused by future calls. */
23153 static rtx
23154 aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
23156 rtx_insn *insn = *cached_insn;
23157 if (insn && INSN_P (insn) && !insn->deleted ())
23159 rtx pat = PATTERN (insn);
23160 if (GET_CODE (pat) == SET)
23162 rtx dest = SET_DEST (pat);
23163 if (REG_P (dest)
23164 && !HARD_REGISTER_P (dest)
23165 && rtx_equal_p (SET_SRC (pat), value))
23166 return dest;
23169 rtx reg = gen_reg_rtx (GET_MODE (value));
23170 *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
23171 function_beg_insn);
23172 return reg;
23175 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
23176 the constant creation. */
23179 aarch64_gen_shareable_zero (machine_mode mode)
23181 rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
23182 CONST0_RTX (V4SImode));
23183 return lowpart_subreg (mode, reg, GET_MODE (reg));
23186 /* INSN is some form of extension or shift that can be split into a
23187 permutation involving a shared zero. Return true if we should
23188 perform such a split.
23190 ??? For now, make sure that the split instruction executes more
23191 frequently than the zero that feeds it. In future it would be good
23192 to split without that restriction and instead recombine shared zeros
23193 if they turn out not to be worthwhile. This would allow splits in
23194 single-block functions and would also cope more naturally with
23195 rematerialization. The downside of not doing this is that we lose the
23196 optimizations for vector epilogues as well. */
23198 bool
23199 aarch64_split_simd_shift_p (rtx_insn *insn)
23201 return (can_create_pseudo_p ()
23202 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
23203 && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
23204 < BLOCK_FOR_INSN (insn)->count));
23207 /* Return a constant vector of mode MODE with every element equal to VAL. */
23209 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
23211 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
23212 return gen_const_vec_duplicate (mode, c);
23215 /* Check OP is a legal scalar immediate for the MOVI instruction. */
23217 bool
23218 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
23220 machine_mode vmode;
23222 vmode = aarch64_simd_container_mode (mode, 64);
23223 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
23224 return aarch64_simd_valid_immediate (op_v, NULL);
23227 /* Construct and return a PARALLEL RTX vector with elements numbering the
23228 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
23229 the vector - from the perspective of the architecture. This does not
23230 line up with GCC's perspective on lane numbers, so we end up with
23231 different masks depending on our target endian-ness. The diagram
23232 below may help. We must draw the distinction when building masks
23233 which select one half of the vector. An instruction selecting
23234 architectural low-lanes for a big-endian target must be described using
23235 a mask selecting GCC high-lanes.
23237 Big-Endian Little-Endian
23239 GCC 0 1 2 3 3 2 1 0
23240 | x | x | x | x | | x | x | x | x |
23241 Architecture 3 2 1 0 3 2 1 0
23243 Low Mask: { 2, 3 } { 0, 1 }
23244 High Mask: { 0, 1 } { 2, 3 }
23246 MODE Is the mode of the vector and NUNITS is the number of units in it. */
23249 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
23251 rtvec v = rtvec_alloc (nunits / 2);
23252 int high_base = nunits / 2;
23253 int low_base = 0;
23254 int base;
23255 rtx t1;
23256 int i;
23258 if (BYTES_BIG_ENDIAN)
23259 base = high ? low_base : high_base;
23260 else
23261 base = high ? high_base : low_base;
23263 for (i = 0; i < nunits / 2; i++)
23264 RTVEC_ELT (v, i) = GEN_INT (base + i);
23266 t1 = gen_rtx_PARALLEL (mode, v);
23267 return t1;
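/* As an illustration (values derived from the diagram above): for V4SImode
   with NUNITS == 4, HIGH == true yields (parallel [2 3]) on a little-endian
   target but (parallel [0 1]) on a big-endian target.  */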
23270 /* Check OP for validity as a PARALLEL RTX vector with elements
23271 numbering the lanes of either the high (HIGH == TRUE) or low half,
23272 from the perspective of the architecture. See the diagram above
23273 aarch64_simd_vect_par_cnst_half for more details. */
23275 bool
23276 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
23277 bool high)
23279 int nelts;
23280 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
23281 return false;
23283 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
23284 HOST_WIDE_INT count_op = XVECLEN (op, 0);
23285 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
23286 int i = 0;
23288 if (count_op != count_ideal)
23289 return false;
23291 for (i = 0; i < count_ideal; i++)
23293 rtx elt_op = XVECEXP (op, 0, i);
23294 rtx elt_ideal = XVECEXP (ideal, 0, i);
23296 if (!CONST_INT_P (elt_op)
23297 || INTVAL (elt_ideal) != INTVAL (elt_op))
23298 return false;
23300 return true;
23303 /* Return a PARALLEL containing NELTS elements, with element I equal
23304 to BASE + I * STEP. */
23307 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
23309 rtvec vec = rtvec_alloc (nelts);
23310 for (unsigned int i = 0; i < nelts; ++i)
23311 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
23312 return gen_rtx_PARALLEL (VOIDmode, vec);
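/* For example, NELTS == 4, BASE == 1 and STEP == 2 produce
   (parallel [1 3 5 7]), which aarch64_stepped_int_parallel_p below would
   recognise with STEP == 2.  */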
23315 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23316 series with step STEP. */
23318 bool
23319 aarch64_stepped_int_parallel_p (rtx op, int step)
23321 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
23322 return false;
23324 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
23325 for (int i = 1; i < XVECLEN (op, 0); ++i)
23326 if (!CONST_INT_P (XVECEXP (op, 0, i))
23327 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
23328 return false;
23330 return true;
23333 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23334 sequence of strided registers, with the stride being equal to STRIDE.
23335 The operands are already known to be FPRs. */
23336 bool
23337 aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
23338 unsigned int stride)
23340 for (unsigned int i = 1; i < num_operands; ++i)
23341 if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
23342 return false;
23343 return true;
23346 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23347 HIGH (exclusive). */
23348 void
23349 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
23350 const_tree exp)
23352 HOST_WIDE_INT lane;
23353 gcc_assert (CONST_INT_P (operand));
23354 lane = INTVAL (operand);
23356 if (lane < low || lane >= high)
23358 if (exp)
23359 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
23360 lane, low, high - 1);
23361 else
23362 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
23366 /* Perform endian correction on lane number N, which indexes a vector
23367 of mode MODE, and return the result as an SImode rtx. */
23370 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
23372 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
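/* For example, lane 1 of a V4SImode vector stays (const_int 1) on a
   little-endian target; on big-endian, assuming ENDIAN_LANE_N mirrors the
   lane numbering, it becomes (const_int 2).  */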
23375 /* Return TRUE if OP is a valid vector addressing mode. */
23377 bool
23378 aarch64_simd_mem_operand_p (rtx op)
23380 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
23381 || REG_P (XEXP (op, 0)));
23384 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23386 bool
23387 aarch64_sve_ld1r_operand_p (rtx op)
23389 struct aarch64_address_info addr;
23390 scalar_mode mode;
23392 return (MEM_P (op)
23393 && is_a <scalar_mode> (GET_MODE (op), &mode)
23394 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
23395 && addr.type == ADDRESS_REG_IMM
23396 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
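/* For example, assuming offset_6bit_unsigned_scaled_p accepts offsets of
   0...63 times the element size, an SImode LD1R operand can use addresses
   from [Xn] up to [Xn, #252] in steps of 4.  */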
23399 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23400 where the size of the read data is specified by `mode` and the size of the
23401 vector elements is specified by `elem_mode`. */
23402 bool
23403 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
23404 scalar_mode elem_mode)
23406 struct aarch64_address_info addr;
23407 if (!MEM_P (op)
23408 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
23409 return false;
23411 if (addr.type == ADDRESS_REG_IMM)
23412 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
23414 if (addr.type == ADDRESS_REG_REG)
23415 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
23417 return false;
23420 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23421 bool
23422 aarch64_sve_ld1rq_operand_p (rtx op)
23424 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
23425 GET_MODE_INNER (GET_MODE (op)));
23428 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23429 accessing a vector where the element size is specified by `elem_mode`. */
23430 bool
23431 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
23433 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
23436 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23437 bool
23438 aarch64_sve_ldff1_operand_p (rtx op)
23440 if (!MEM_P (op))
23441 return false;
23443 struct aarch64_address_info addr;
23444 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
23445 return false;
23447 if (addr.type == ADDRESS_REG_IMM)
23448 return known_eq (addr.const_offset, 0);
23450 return addr.type == ADDRESS_REG_REG;
23453 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23454 bool
23455 aarch64_sve_ldnf1_operand_p (rtx op)
23457 struct aarch64_address_info addr;
23459 return (MEM_P (op)
23460 && aarch64_classify_address (&addr, XEXP (op, 0),
23461 GET_MODE (op), false)
23462 && addr.type == ADDRESS_REG_IMM);
23465 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23466 The conditions for STR are the same. */
23467 bool
23468 aarch64_sve_ldr_operand_p (rtx op)
23470 struct aarch64_address_info addr;
23472 return (MEM_P (op)
23473 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
23474 false, ADDR_QUERY_ANY)
23475 && addr.type == ADDRESS_REG_IMM);
23478 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23479 addressing memory of mode MODE. */
23480 bool
23481 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
23483 struct aarch64_address_info addr;
23484 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
23485 return false;
23487 if (addr.type == ADDRESS_REG_IMM)
23488 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
23490 return addr.type == ADDRESS_REG_REG;
23493 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23494 We need to be able to access the individual pieces, so the range
23495 is different from LD[234] and ST[234]. */
23496 bool
23497 aarch64_sve_struct_memory_operand_p (rtx op)
23499 if (!MEM_P (op))
23500 return false;
23502 machine_mode mode = GET_MODE (op);
23503 struct aarch64_address_info addr;
23504 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
23505 ADDR_QUERY_ANY)
23506 || addr.type != ADDRESS_REG_IMM)
23507 return false;
23509 poly_int64 first = addr.const_offset;
23510 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
23511 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
23512 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
23515 /* Return true if OFFSET is a constant integer and if VNUM is
23516 OFFSET * the number of bytes in an SVE vector. This is the requirement
23517 that exists in SME LDR and STR instructions, where the VL offset must
23518 equal the ZA slice offset. */
23519 bool
23520 aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
23522 if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
23523 return false;
23525 if (TARGET_STREAMING)
23527 poly_int64 const_vnum;
23528 return (poly_int_rtx_p (vnum, &const_vnum)
23529 && known_eq (const_vnum,
23530 INTVAL (offset) * BYTES_PER_SVE_VECTOR));
23532 else
23534 HOST_WIDE_INT factor;
23535 return (aarch64_sme_vq_unspec_p (vnum, &factor)
23536 && factor == INTVAL (offset) * 16);
23540 /* Emit a register copy from operand to operand, taking care not to
23541 early-clobber source registers in the process.
23543 COUNT is the number of components into which the copy needs to be
23544 decomposed. */
23545 void
23546 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
23547 unsigned int count)
23549 unsigned int i;
23550 int rdest = REGNO (operands[0]);
23551 int rsrc = REGNO (operands[1]);
23553 if (!reg_overlap_mentioned_p (operands[0], operands[1])
23554 || rdest < rsrc)
23555 for (i = 0; i < count; i++)
23556 emit_move_insn (gen_rtx_REG (mode, rdest + i),
23557 gen_rtx_REG (mode, rsrc + i));
23558 else
23559 for (i = 0; i < count; i++)
23560 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
23561 gen_rtx_REG (mode, rsrc + count - i - 1));
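/* For example, moving a two-register group from V0-V1 to the overlapping
   range V1-V2 takes the descending branch above: V2 is written from V1
   before V1 is overwritten from V0, so no source register is clobbered
   before it has been read.  */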
23564 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23565 one of the VSTRUCT modes: OI, CI, or XI.
23567 aarch64_simd_attr_length_rglist (machine_mode mode)
23569 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23570 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
23573 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23574 alignment of a vector to 128 bits. SVE predicates have an alignment of
23575 16 bits. */
23576 static HOST_WIDE_INT
23577 aarch64_simd_vector_alignment (const_tree type)
23579 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23580 be set for non-predicate vectors of booleans. Modes are the most
23581 direct way we have of identifying real SVE predicate types. */
23582 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
23583 return 16;
23584 widest_int min_size
23585 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
23586 return wi::umin (min_size, 128).to_uhwi ();
23589 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23590 static poly_uint64
23591 aarch64_vectorize_preferred_vector_alignment (const_tree type)
23593 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
23595 /* If the length of the vector is a fixed power of 2, try to align
23596 to that length, otherwise don't try to align at all. */
23597 HOST_WIDE_INT result;
23598 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
23599 || !pow2p_hwi (result))
23600 result = TYPE_ALIGN (TREE_TYPE (type));
23601 return result;
23603 return TYPE_ALIGN (type);
23606 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23607 static bool
23608 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
23610 if (is_packed)
23611 return false;
23613 /* For fixed-length vectors, check that the vectorizer will aim for
23614 full-vector alignment. This isn't true for generic GCC vectors
23615 that are wider than the ABI maximum of 128 bits. */
23616 poly_uint64 preferred_alignment =
23617 aarch64_vectorize_preferred_vector_alignment (type);
23618 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23619 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
23620 preferred_alignment))
23621 return false;
23623 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23624 return true;
23627 /* Return true if the vector misalignment factor is supported by the
23628 target. */
23629 static bool
23630 aarch64_builtin_support_vector_misalignment (machine_mode mode,
23631 const_tree type, int misalignment,
23632 bool is_packed)
23634 if (TARGET_SIMD && STRICT_ALIGNMENT)
23636 /* Return false if the movmisalign pattern is not supported for this mode. */
23637 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
23638 return false;
23640 /* Misalignment factor is unknown at compile time. */
23641 if (misalignment == -1)
23642 return false;
23644 return default_builtin_support_vector_misalignment (mode, type, misalignment,
23645 is_packed);
23648 /* If VALS is a vector constant that can be loaded into a register
23649 using DUP, generate instructions to do so and return an RTX to
23650 assign to the register. Otherwise return NULL_RTX. */
23651 static rtx
23652 aarch64_simd_dup_constant (rtx vals)
23654 machine_mode mode = GET_MODE (vals);
23655 machine_mode inner_mode = GET_MODE_INNER (mode);
23656 rtx x;
23658 if (!const_vec_duplicate_p (vals, &x))
23659 return NULL_RTX;
23661 /* We can load this constant by using DUP and a constant in a
23662 single ARM register. This will be cheaper than a vector
23663 load. */
23664 x = force_reg (inner_mode, x);
23665 return gen_vec_duplicate (mode, x);
23669 /* Generate code to load VALS, which is a PARALLEL containing only
23670 constants (for vec_init) or CONST_VECTOR, efficiently into a
23671 register. Returns an RTX to copy into the register, or NULL_RTX
23672 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23673 static rtx
23674 aarch64_simd_make_constant (rtx vals)
23676 machine_mode mode = GET_MODE (vals);
23677 rtx const_dup;
23678 rtx const_vec = NULL_RTX;
23679 int n_const = 0;
23680 int i;
23682 if (CONST_VECTOR_P (vals))
23683 const_vec = vals;
23684 else if (GET_CODE (vals) == PARALLEL)
23686 /* A CONST_VECTOR must contain only CONST_INTs and
23687 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
23688 Only store valid constants in a CONST_VECTOR. */
23689 int n_elts = XVECLEN (vals, 0);
23690 for (i = 0; i < n_elts; ++i)
23692 rtx x = XVECEXP (vals, 0, i);
23693 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23694 n_const++;
23696 if (n_const == n_elts)
23697 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
23699 else
23700 gcc_unreachable ();
23702 if (const_vec != NULL_RTX
23703 && aarch64_simd_valid_immediate (const_vec, NULL))
23704 /* Load using MOVI/MVNI. */
23705 return const_vec;
23706 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
23707 /* Loaded using DUP. */
23708 return const_dup;
23709 else if (const_vec != NULL_RTX)
23710 /* Load from constant pool. We cannot take advantage of single-cycle
23711 LD1 because we need a PC-relative addressing mode. */
23712 return const_vec;
23713 else
23714 /* A PARALLEL containing something not valid inside CONST_VECTOR.
23715 We cannot construct an initializer. */
23716 return NULL_RTX;
23719 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23720 The caller has already tried a divide-and-conquer approach, so do
23721 not consider that case here. */
23723 void
23724 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
23726 machine_mode mode = GET_MODE (target);
23727 scalar_mode inner_mode = GET_MODE_INNER (mode);
23728 /* The number of vector elements. */
23729 int n_elts = XVECLEN (vals, 0);
23730 /* The number of vector elements which are not constant. */
23731 int n_var = 0;
23732 rtx any_const = NULL_RTX;
23733 /* The first element of vals. */
23734 rtx v0 = XVECEXP (vals, 0, 0);
23735 bool all_same = true;
23737 /* This is a special vec_init<M><N> where N is not an element mode but a
23738 vector mode with half the elements of M. We expect to find two entries
23739 of mode N in VALS and we must put their concatenation into TARGET. */
23740 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
23742 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
23743 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
23744 && known_eq (GET_MODE_SIZE (mode),
23745 2 * GET_MODE_SIZE (narrow_mode)));
23746 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
23747 XVECEXP (vals, 0, 0),
23748 XVECEXP (vals, 0, 1)));
23749 return;
23752 /* Count the number of variable elements to initialise. */
23753 for (int i = 0; i < n_elts; ++i)
23755 rtx x = XVECEXP (vals, 0, i);
23756 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
23757 ++n_var;
23758 else
23759 any_const = x;
23761 all_same &= rtx_equal_p (x, v0);
23764 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23765 how best to handle this. */
23766 if (n_var == 0)
23768 rtx constant = aarch64_simd_make_constant (vals);
23769 if (constant != NULL_RTX)
23771 emit_move_insn (target, constant);
23772 return;
23776 /* Splat a single non-constant element if we can. */
23777 if (all_same)
23779 rtx x = force_reg (inner_mode, v0);
23780 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23781 return;
23784 enum insn_code icode = optab_handler (vec_set_optab, mode);
23785 gcc_assert (icode != CODE_FOR_nothing);
23787 /* If there are only variable elements, try to optimize
23788 the insertion using dup for the most common element
23789 followed by insertions. */
23791 /* The algorithm will fill matches[*][0] with the earliest matching element,
23792 and matches[X][1] with the count of duplicate elements (if X is the
23793 earliest element which has duplicates). */
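/* As an illustration (example values, not from the surrounding code): for
   VALS == {x, y, x, x} the loop below leaves matches[0][1] == 3 and
   matches[1][1] == 1, so x (element 0) is chosen as the element to
   duplicate.  */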
23795 if (n_var >= n_elts - 1 && n_elts <= 16)
23797 int matches[16][2] = {0};
23798 for (int i = 0; i < n_elts; i++)
23800 for (int j = 0; j <= i; j++)
23802 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
23804 matches[i][0] = j;
23805 matches[j][1]++;
23806 break;
23810 int maxelement = 0;
23811 int maxv = 0;
23812 rtx const_elem = NULL_RTX;
23813 int const_elem_pos = 0;
23815 for (int i = 0; i < n_elts; i++)
23817 if (matches[i][1] > maxv)
23819 maxelement = i;
23820 maxv = matches[i][1];
23822 if (CONST_INT_P (XVECEXP (vals, 0, i))
23823 || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
23825 const_elem_pos = i;
23826 const_elem = XVECEXP (vals, 0, i);
23830 /* Create a duplicate of the most common element, unless all elements
23831 are equally useless to us, in which case just immediately set the
23832 vector register using the first element. */
23834 if (maxv == 1)
23836 /* For vectors of two 64-bit elements, we can do even better. */
23837 if (n_elts == 2
23838 && (inner_mode == E_DImode
23839 || inner_mode == E_DFmode))
23842 rtx x0 = XVECEXP (vals, 0, 0);
23843 rtx x1 = XVECEXP (vals, 0, 1);
23844 /* Combine can pick up this case, but handling it directly
23845 here leaves clearer RTL.
23847 This is load_pair_lanes<mode>, and also gives us a clean-up
23848 for store_pair_lanes<mode>. */
23849 if (memory_operand (x0, inner_mode)
23850 && memory_operand (x1, inner_mode)
23851 && aarch64_mergeable_load_pair_p (mode, x0, x1))
23853 rtx t;
23854 if (inner_mode == DFmode)
23855 t = gen_load_pair_lanesdf (target, x0, x1);
23856 else
23857 t = gen_load_pair_lanesdi (target, x0, x1);
23858 emit_insn (t);
23859 return;
23862 /* The subreg-move sequence below will move into lane zero of the
23863 vector register. For big-endian we want that position to hold
23864 the last element of VALS. */
23865 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
23867 /* If we have a single constant element, use that for duplicating
23868 instead. */
23869 if (const_elem)
23871 maxelement = const_elem_pos;
23872 aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
23874 else
23876 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23877 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
23880 else
23882 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23883 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23886 /* Insert the rest. */
23887 for (int i = 0; i < n_elts; i++)
23889 rtx x = XVECEXP (vals, 0, i);
23890 if (matches[i][0] == maxelement)
23891 continue;
23892 x = force_reg (inner_mode, x);
23893 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23895 return;
23898 /* Initialise a vector which is part-variable. We want to first try
23899 to build those lanes which are constant in the most efficient way we
23900 can. */
23901 if (n_var != n_elts)
23903 rtx copy = copy_rtx (vals);
23905 /* Load constant part of vector. We really don't care what goes into the
23906 parts we will overwrite, but we're more likely to be able to load the
23907 constant efficiently if it has fewer, larger, repeating parts
23908 (see aarch64_simd_valid_immediate). */
23909 for (int i = 0; i < n_elts; i++)
23911 rtx x = XVECEXP (vals, 0, i);
23912 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23913 continue;
23914 rtx subst = any_const;
23915 for (int bit = n_elts / 2; bit > 0; bit /= 2)
23917 /* Look in the copied vector, as more elements are const. */
23918 rtx test = XVECEXP (copy, 0, i ^ bit);
23919 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
23921 subst = test;
23922 break;
23925 XVECEXP (copy, 0, i) = subst;
23927 aarch64_expand_vector_init_fallback (target, copy);
23930 /* Insert the variable lanes directly. */
23931 for (int i = 0; i < n_elts; i++)
23933 rtx x = XVECEXP (vals, 0, i);
23934 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23935 continue;
23936 x = force_reg (inner_mode, x);
23937 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23941 /* Return even or odd half of VALS depending on EVEN_P. */
23943 static rtx
23944 aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
23946 int n = XVECLEN (vals, 0);
23947 machine_mode new_mode
23948 = aarch64_simd_container_mode (GET_MODE_INNER (mode),
23949 GET_MODE_BITSIZE (mode).to_constant () / 2);
23950 rtvec vec = rtvec_alloc (n / 2);
23951 for (int i = 0; i < n / 2; i++)
23952 RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
23953 : XVECEXP (vals, 0, 2 * i + 1);
23954 return gen_rtx_PARALLEL (new_mode, vec);
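/* For example, a V8HImode PARALLEL {a, b, c, d, e, f, g, h} returns the
   V4HImode PARALLEL {a, c, e, g} when EVEN_P and {b, d, f, h} otherwise.  */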
23957 /* Return true if SET is a scalar move. */
23959 static bool
23960 scalar_move_insn_p (rtx set)
23962 rtx src = SET_SRC (set);
23963 rtx dest = SET_DEST (set);
23964 return (is_a<scalar_mode> (GET_MODE (dest))
23965 && aarch64_mov_operand (src, GET_MODE (dest)));
23968 /* Similar to seq_cost, but ignore cost for scalar moves. */
23970 static unsigned
23971 seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed)
23973 unsigned cost = 0;
23975 for (; seq; seq = NEXT_INSN (seq))
23976 if (NONDEBUG_INSN_P (seq))
23978 if (rtx set = single_set (seq))
23980 if (!scalar_move_insn_p (set))
23981 cost += set_rtx_cost (set, speed);
23983 else
23985 int this_cost = insn_cost (CONST_CAST_RTX_INSN (seq), speed);
23986 if (this_cost > 0)
23987 cost += this_cost;
23988 else
23989 cost++;
23993 return cost;
23996 /* Expand a vector initialization sequence, such that TARGET is
23997 initialized to contain VALS. */
23999 void
24000 aarch64_expand_vector_init (rtx target, rtx vals)
24002 /* Try decomposing the initializer into even and odd halves and
24003 then ZIP them together. Use the resulting sequence if it is
24004 strictly cheaper than loading VALS directly.
24006 Prefer the fallback sequence in the event of a tie, since it
24007 will tend to use fewer registers. */
24009 machine_mode mode = GET_MODE (target);
24010 int n_elts = XVECLEN (vals, 0);
24012 if (n_elts < 4
24013 || maybe_ne (GET_MODE_BITSIZE (mode), 128))
24015 aarch64_expand_vector_init_fallback (target, vals);
24016 return;
24019 start_sequence ();
24020 rtx halves[2];
24021 unsigned costs[2];
24022 for (int i = 0; i < 2; i++)
24024 start_sequence ();
24025 rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
24026 rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
24027 aarch64_expand_vector_init (tmp_reg, new_vals);
24028 halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
24029 rtx_insn *rec_seq = get_insns ();
24030 end_sequence ();
24031 costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size);
24032 emit_insn (rec_seq);
24035 rtvec v = gen_rtvec (2, halves[0], halves[1]);
24036 rtx_insn *zip1_insn
24037 = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24038 unsigned seq_total_cost
24039 = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
24040 seq_total_cost += insn_cost (zip1_insn, !optimize_size);
24042 rtx_insn *seq = get_insns ();
24043 end_sequence ();
24045 start_sequence ();
24046 aarch64_expand_vector_init_fallback (target, vals);
24047 rtx_insn *fallback_seq = get_insns ();
24048 unsigned fallback_seq_cost
24049 = seq_cost_ignoring_scalar_moves (fallback_seq, !optimize_size);
24050 end_sequence ();
24052 emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
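/* As a rough illustration for a V4SImode initializer {a, b, c, d}: the even
   elements {a, c} and the odd elements {b, d} are built in 64-bit registers
   by the recursive calls above and then interleaved with ZIP1 to give
   {a, b, c, d}; that sequence is kept only if it costs strictly less than
   the fallback expansion.  */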
24055 /* Emit RTL corresponding to:
24056 insr TARGET, ELEM. */
24058 static void
24059 emit_insr (rtx target, rtx elem)
24061 machine_mode mode = GET_MODE (target);
24062 scalar_mode elem_mode = GET_MODE_INNER (mode);
24063 elem = force_reg (elem_mode, elem);
24065 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
24066 gcc_assert (icode != CODE_FOR_nothing);
24067 emit_insn (GEN_FCN (icode) (target, target, elem));
24070 /* Subroutine of aarch64_sve_expand_vector_init for handling
24071 trailing constants.
24072 This function works as follows:
24073 (a) Create a new vector consisting of trailing constants.
24074 (b) Initialize TARGET with the constant vector using emit_move_insn.
24075 (c) Insert remaining elements in TARGET using insr.
24076 NELTS is the total number of elements in the original vector, while
24077 NELTS_REQD is the number of elements that are actually
24078 significant.
24080 ??? The heuristic used is to do the above only if the number of constants
24081 is at least half the total number of elements. May need fine-tuning.
24083 static bool
24084 aarch64_sve_expand_vector_init_handle_trailing_constants
24085 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
24087 machine_mode mode = GET_MODE (target);
24088 scalar_mode elem_mode = GET_MODE_INNER (mode);
24089 int n_trailing_constants = 0;
24091 for (int i = nelts_reqd - 1;
24092 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
24093 i--)
24094 n_trailing_constants++;
24096 if (n_trailing_constants >= nelts_reqd / 2)
24098 /* Try to use the natural pattern of BUILDER to extend the trailing
24099 constant elements to a full vector. Replace any variables in the
24100 extra elements with zeros.
24102 ??? It would be better if the builders supported "don't care"
24103 elements, with the builder filling in whichever elements
24104 give the most compact encoding. */
24105 rtx_vector_builder v (mode, nelts, 1);
24106 for (int i = 0; i < nelts; i++)
24108 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
24109 if (!valid_for_const_vector_p (elem_mode, x))
24110 x = CONST0_RTX (elem_mode);
24111 v.quick_push (x);
24113 rtx const_vec = v.build ();
24114 emit_move_insn (target, const_vec);
24116 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
24117 emit_insr (target, builder.elt (i));
24119 return true;
24122 return false;
24125 /* Subroutine of aarch64_sve_expand_vector_init.
24126 Works as follows:
24127 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24128 (b) Skip trailing elements from BUILDER, which are the same as
24129 element NELTS_REQD - 1.
24130 (c) Insert earlier elements in reverse order in TARGET using insr. */
24132 static void
24133 aarch64_sve_expand_vector_init_insert_elems (rtx target,
24134 const rtx_vector_builder &builder,
24135 int nelts_reqd)
24137 machine_mode mode = GET_MODE (target);
24138 scalar_mode elem_mode = GET_MODE_INNER (mode);
24140 struct expand_operand ops[2];
24141 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
24142 gcc_assert (icode != CODE_FOR_nothing);
24144 create_output_operand (&ops[0], target, mode);
24145 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
24146 expand_insn (icode, 2, ops);
24148 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24149 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
24150 emit_insr (target, builder.elt (i));
24153 /* Subroutine of aarch64_sve_expand_vector_init to handle case
24154 when all trailing elements of builder are same.
24155 This works as follows:
24156 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24157 (b) Insert remaining elements in TARGET using insr.
24159 ??? The heuristic used is to do the above if the number of identical
24160 trailing elements is at least 3/4 of the total number of elements, loosely based on
24161 heuristic from mostly_zeros_p. May need fine-tuning. */
24163 static bool
24164 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24165 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
24167 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24168 if (ndups >= (3 * nelts_reqd) / 4)
24170 aarch64_sve_expand_vector_init_insert_elems (target, builder,
24171 nelts_reqd - ndups + 1);
24172 return true;
24175 return false;
24178 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24179 of elements in BUILDER.
24181 The function tries to initialize TARGET from BUILDER if it fits one
24182 of the special cases outlined below.
24184 Failing that, the function divides BUILDER into two sub-vectors:
24185 v_even = even elements of BUILDER;
24186 v_odd = odd elements of BUILDER;
24188 and recursively calls itself with v_even and v_odd.
24190 if (recursive call succeeded for v_even or v_odd)
24191 TARGET = zip (v_even, v_odd)
24193 The function returns true if it managed to build TARGET from BUILDER
24194 with one of the special cases, false otherwise.
24196 Example: {a, 1, b, 2, c, 3, d, 4}
24198 The vector gets divided into:
24199 v_even = {a, b, c, d}
24200 v_odd = {1, 2, 3, 4}
24202 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24203 initializes tmp2 from the constant vector v_odd using emit_move_insn.
24205 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24206 4 elements, so we construct tmp1 from v_even using insr:
24207 tmp1 = dup(d)
24208 insr tmp1, c
24209 insr tmp1, b
24210 insr tmp1, a
24212 And finally:
24213 TARGET = zip (tmp1, tmp2)
24214 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24216 static bool
24217 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
24218 int nelts, int nelts_reqd)
24220 machine_mode mode = GET_MODE (target);
24222 /* Case 1: Vector contains trailing constants. */
24224 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24225 (target, builder, nelts, nelts_reqd))
24226 return true;
24228 /* Case 2: Vector contains leading constants. */
24230 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
24231 for (int i = 0; i < nelts_reqd; i++)
24232 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
24233 rev_builder.finalize ();
24235 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24236 (target, rev_builder, nelts, nelts_reqd))
24238 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24239 return true;
24242 /* Case 3: Vector contains trailing same element. */
24244 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24245 (target, builder, nelts_reqd))
24246 return true;
24248 /* Case 4: Vector contains leading same element. */
24250 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24251 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
24253 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24254 return true;
24257 /* Avoid recursing below 4 elements.
24258 ??? The threshold 4 may need fine-tuning. */
24260 if (nelts_reqd <= 4)
24261 return false;
24263 rtx_vector_builder v_even (mode, nelts, 1);
24264 rtx_vector_builder v_odd (mode, nelts, 1);
24266 for (int i = 0; i < nelts * 2; i += 2)
24268 v_even.quick_push (builder.elt (i));
24269 v_odd.quick_push (builder.elt (i + 1));
24272 v_even.finalize ();
24273 v_odd.finalize ();
24275 rtx tmp1 = gen_reg_rtx (mode);
24276 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
24277 nelts, nelts_reqd / 2);
24279 rtx tmp2 = gen_reg_rtx (mode);
24280 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
24281 nelts, nelts_reqd / 2);
24283 if (!did_even_p && !did_odd_p)
24284 return false;
24286 /* For whichever of v_even and v_odd didn't match any of the special
24287 cases, initialize it using INSR, then zip v_even and v_odd. */
24289 if (!did_even_p)
24290 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
24292 if (!did_odd_p)
24293 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
24295 rtvec v = gen_rtvec (2, tmp1, tmp2);
24296 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24297 return true;
24300 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24302 void
24303 aarch64_sve_expand_vector_init (rtx target, rtx vals)
24305 machine_mode mode = GET_MODE (target);
24306 int nelts = XVECLEN (vals, 0);
24308 rtx_vector_builder v (mode, nelts, 1);
24309 for (int i = 0; i < nelts; i++)
24310 v.quick_push (XVECEXP (vals, 0, i));
24311 v.finalize ();
24313 /* If neither sub-vector of v could be initialized specially,
24314 then use INSR to insert all elements from v into TARGET.
24315 ??? This might not be optimal for vectors with large
24316 initializers like 16-element or above.
24317 For nelts < 4, it probably isn't useful to handle specially. */
24319 if (nelts < 4
24320 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
24321 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
24324 /* Check whether VALUE is a vector constant in which every element
24325 is either a power of 2 or a negated power of 2. If so, return
24326 a constant vector of log2s, and flip CODE between PLUS and MINUS
24327 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24329 static rtx
24330 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
24332 if (!CONST_VECTOR_P (value))
24333 return NULL_RTX;
24335 rtx_vector_builder builder;
24336 if (!builder.new_unary_operation (GET_MODE (value), value, false))
24337 return NULL_RTX;
24339 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
24340 /* 1 if the result of the multiplication must be negated,
24341 0 if it mustn't, or -1 if we don't yet care. */
24342 int negate = -1;
24343 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
24344 for (unsigned int i = 0; i < encoded_nelts; ++i)
24346 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
24347 if (!CONST_SCALAR_INT_P (elt))
24348 return NULL_RTX;
24349 rtx_mode_t val (elt, int_mode);
24350 wide_int pow2 = wi::neg (val);
24351 if (val != pow2)
24353 /* It matters whether we negate or not. Make that choice,
24354 and make sure that it's consistent with previous elements. */
24355 if (negate == !wi::neg_p (val))
24356 return NULL_RTX;
24357 negate = wi::neg_p (val);
24358 if (!negate)
24359 pow2 = val;
24361 /* POW2 is now the value that we want to be a power of 2. */
24362 int shift = wi::exact_log2 (pow2);
24363 if (shift < 0)
24364 return NULL_RTX;
24365 builder.quick_push (gen_int_mode (shift, int_mode));
24367 if (negate == -1)
24368 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24369 code = PLUS;
24370 else if (negate == 1)
24371 code = code == PLUS ? MINUS : PLUS;
24372 return builder.build ();
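/* For example, the constant vector {4, 4, 4, 4} is converted to the shift
   amounts {2, 2, 2, 2} with CODE unchanged, while {-8, -8, -8, -8} gives
   {3, 3, 3, 3} and flips CODE between PLUS and MINUS.  */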
24375 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24376 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24377 operands array, in the same order as for fma_optab. Return true if
24378 the function emitted all the necessary instructions, false if the caller
24379 should generate the pattern normally with the new OPERANDS array. */
24381 bool
24382 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
24384 machine_mode mode = GET_MODE (operands[0]);
24385 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
24387 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
24388 NULL_RTX, true, OPTAB_DIRECT);
24389 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
24390 operands[3], product, operands[0], true,
24391 OPTAB_DIRECT);
24392 return true;
24394 operands[2] = force_reg (mode, operands[2]);
24395 return false;
24398 /* Likewise, but for a conditional pattern. */
24400 bool
24401 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
24403 machine_mode mode = GET_MODE (operands[0]);
24404 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
24406 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
24407 NULL_RTX, true, OPTAB_DIRECT);
24408 emit_insn (gen_cond (code, mode, operands[0], operands[1],
24409 operands[4], product, operands[5]));
24410 return true;
24412 operands[3] = force_reg (mode, operands[3]);
24413 return false;
24416 static unsigned HOST_WIDE_INT
24417 aarch64_shift_truncation_mask (machine_mode mode)
24419 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
24420 return 0;
24421 return GET_MODE_UNIT_BITSIZE (mode) - 1;
24424 /* Select a format to encode pointers in exception handling data. */
24426 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
24428 int type;
24429 switch (aarch64_cmodel)
24431 case AARCH64_CMODEL_TINY:
24432 case AARCH64_CMODEL_TINY_PIC:
24433 case AARCH64_CMODEL_SMALL:
24434 case AARCH64_CMODEL_SMALL_PIC:
24435 case AARCH64_CMODEL_SMALL_SPIC:
24436 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24437 for everything. */
24438 type = DW_EH_PE_sdata4;
24439 break;
24440 default:
24441 /* No assumptions here. 8-byte relocs required. */
24442 type = DW_EH_PE_sdata8;
24443 break;
24445 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
24448 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24450 static void
24451 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
24453 if (TREE_CODE (decl) == FUNCTION_DECL)
24455 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
24456 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
24458 fprintf (stream, "\t.variant_pcs\t");
24459 assemble_name (stream, name);
24460 fprintf (stream, "\n");
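/* For example, a function declared with the aarch64_vector_pcs attribute
   resolves to ARM_PCS_SIMD above, so the assembler sees a line of the form
   ".variant_pcs  <assembler name>".  */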
24465 /* The last .arch and .tune assembly strings that we printed. */
24466 static std::string aarch64_last_printed_arch_string;
24467 static std::string aarch64_last_printed_tune_string;
24469 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24470 by the function fndecl. */
24472 void
24473 aarch64_declare_function_name (FILE *stream, const char* name,
24474 tree fndecl)
24476 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
24478 struct cl_target_option *targ_options;
24479 if (target_parts)
24480 targ_options = TREE_TARGET_OPTION (target_parts);
24481 else
24482 targ_options = TREE_TARGET_OPTION (target_option_current_node);
24483 gcc_assert (targ_options);
24485 const struct processor *this_arch
24486 = aarch64_get_arch (targ_options->x_selected_arch);
24488 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
24489 std::string extension
24490 = aarch64_get_extension_string_for_isa_flags (isa_flags,
24491 this_arch->flags);
24492 /* Only update the assembler .arch string if it is distinct from the last
24493 such string we printed. */
24494 std::string to_print = this_arch->name + extension;
24495 if (to_print != aarch64_last_printed_arch_string)
24497 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
24498 aarch64_last_printed_arch_string = to_print;
24501 /* Print the cpu name we're tuning for in the comments; this might be
24502 useful to readers of the generated asm. Do it only when it changes
24503 from function to function and verbose assembly is requested. */
24504 const struct processor *this_tune
24505 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
24507 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
24509 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
24510 this_tune->name);
24511 aarch64_last_printed_tune_string = this_tune->name;
24514 aarch64_asm_output_variant_pcs (stream, fndecl, name);
24516 /* Don't forget the type directive for ELF. */
24517 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
24518 ASM_OUTPUT_FUNCTION_LABEL (stream, name, fndecl);
24520 cfun->machine->label_is_assembled = true;
24523 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24525 void
24526 aarch64_print_patchable_function_entry (FILE *file,
24527 unsigned HOST_WIDE_INT patch_area_size,
24528 bool record_p)
24530 if (!cfun->machine->label_is_assembled)
24532 /* Emit the patching area before the entry label, if any. */
24533 default_print_patchable_function_entry (file, patch_area_size,
24534 record_p);
24535 return;
24538 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
24539 GEN_INT (record_p));
24540 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
24542 if (!aarch_bti_enabled ()
24543 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
24545 /* Emit the patchable_area at the beginning of the function. */
24546 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
24547 INSN_ADDRESSES_NEW (insn, -1);
24548 return;
24551 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
24552 if (!insn
24553 || !INSN_P (insn)
24554 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
24555 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
24557 /* Emit a BTI_C. */
24558 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
24561 /* Emit the patchable_area after BTI_C. */
24562 insn = emit_insn_after (pa, insn);
24563 INSN_ADDRESSES_NEW (insn, -1);
24566 /* Output patchable area. */
24568 void
24569 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
24571 default_print_patchable_function_entry (asm_out_file, patch_area_size,
24572 record_p);
24575 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24577 void
24578 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
24580 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
24581 const char *value = IDENTIFIER_POINTER (target);
24582 aarch64_asm_output_variant_pcs (stream, decl, name);
24583 ASM_OUTPUT_DEF (stream, name, value);
24586 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24587 function symbol references. */
24589 void
24590 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
24592 default_elf_asm_output_external (stream, decl, name);
24593 aarch64_asm_output_variant_pcs (stream, decl, name);
24596 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24597 Used to output the .cfi_b_key_frame directive when signing the current
24598 function with the B key. */
24600 void
24601 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
24603 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
24604 && aarch64_ra_sign_key == AARCH64_KEY_B)
24605 asm_fprintf (f, "\t.cfi_b_key_frame\n");
24608 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24610 static void
24611 aarch64_start_file (void)
24613 struct cl_target_option *default_options
24614 = TREE_TARGET_OPTION (target_option_default_node);
24616 const struct processor *default_arch
24617 = aarch64_get_arch (default_options->x_selected_arch);
24618 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
24619 std::string extension
24620 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
24621 default_arch->flags);
24623 aarch64_last_printed_arch_string = default_arch->name + extension;
24624 aarch64_last_printed_tune_string = "";
24625 asm_fprintf (asm_out_file, "\t.arch %s\n",
24626 aarch64_last_printed_arch_string.c_str ());
24628 default_file_start ();
24631 /* Emit load exclusive. */
24633 static void
24634 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
24635 rtx mem, rtx model_rtx)
24637 if (mode == TImode)
24638 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
24639 gen_highpart (DImode, rval),
24640 mem, model_rtx));
24641 else
24642 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
24645 /* Emit store exclusive. */
24647 static void
24648 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
24649 rtx mem, rtx rval, rtx model_rtx)
24651 if (mode == TImode)
24652 emit_insn (gen_aarch64_store_exclusive_pair
24653 (bval, mem, operand_subword (rval, 0, 0, TImode),
24654 operand_subword (rval, 1, 0, TImode), model_rtx));
24655 else
24656 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
24659 /* Emit jump pattern INSN and mark the resulting jump as unlikely to be taken. */
24661 static void
24662 aarch64_emit_unlikely_jump (rtx insn)
24664 rtx_insn *jump = emit_jump_insn (insn);
24665 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
24668 /* We store the names of the various atomic helpers in a 5x5 array.
24669 Return the libcall function given MODE, MODEL and NAMES. */
24672 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
24673 const atomic_ool_names *names)
24675 memmodel model = memmodel_from_int (INTVAL (model_rtx));
24676 int mode_idx, model_idx;
24678 switch (mode)
24680 case E_QImode:
24681 mode_idx = 0;
24682 break;
24683 case E_HImode:
24684 mode_idx = 1;
24685 break;
24686 case E_SImode:
24687 mode_idx = 2;
24688 break;
24689 case E_DImode:
24690 mode_idx = 3;
24691 break;
24692 case E_TImode:
24693 mode_idx = 4;
24694 break;
24695 default:
24696 gcc_unreachable ();
24699 switch (model)
24701 case MEMMODEL_RELAXED:
24702 model_idx = 0;
24703 break;
24704 case MEMMODEL_CONSUME:
24705 case MEMMODEL_ACQUIRE:
24706 model_idx = 1;
24707 break;
24708 case MEMMODEL_RELEASE:
24709 model_idx = 2;
24710 break;
24711 case MEMMODEL_ACQ_REL:
24712 case MEMMODEL_SEQ_CST:
24713 model_idx = 3;
24714 break;
24715 case MEMMODEL_SYNC_ACQUIRE:
24716 case MEMMODEL_SYNC_RELEASE:
24717 case MEMMODEL_SYNC_SEQ_CST:
24718 model_idx = 4;
24719 break;
24720 default:
24721 gcc_unreachable ();
24724 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
24725 VISIBILITY_HIDDEN);
24728 #define DEF0(B, N) \
24729 { "__aarch64_" #B #N "_relax", \
24730 "__aarch64_" #B #N "_acq", \
24731 "__aarch64_" #B #N "_rel", \
24732 "__aarch64_" #B #N "_acq_rel", \
24733 "__aarch64_" #B #N "_sync" }
24735 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24736 { NULL, NULL, NULL, NULL }
24737 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24739 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
24740 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
24741 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
24742 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
24743 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
24744 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
24746 #undef DEF0
24747 #undef DEF4
24748 #undef DEF5
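/* For example, aarch64_atomic_ool_func for E_SImode and an ACQUIRE model
   indexes str[2][1]; with aarch64_ool_swp_names that selects
   "__aarch64_swp4_acq".  */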
24750 /* Expand a compare and swap pattern. */
24752 void
24753 aarch64_expand_compare_and_swap (rtx operands[])
24755 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
24756 machine_mode mode, r_mode;
24758 bval = operands[0];
24759 rval = operands[1];
24760 mem = operands[2];
24761 oldval = operands[3];
24762 newval = operands[4];
24763 is_weak = operands[5];
24764 mod_s = operands[6];
24765 mod_f = operands[7];
24766 mode = GET_MODE (mem);
24768 /* Normally the succ memory model must be stronger than fail, but in the
24769 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24770 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24771 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
24772 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
24773 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
24775 r_mode = mode;
24776 if (mode == QImode || mode == HImode)
24778 r_mode = SImode;
24779 rval = gen_reg_rtx (r_mode);
24782 if (TARGET_LSE)
24784 /* The CAS insn requires oldval and rval overlap, but we need to
24785 have a copy of oldval saved across the operation to tell if
24786 the operation is successful. */
24787 if (reg_overlap_mentioned_p (rval, oldval))
24788 rval = copy_to_mode_reg (r_mode, oldval);
24789 else
24790 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
24791 if (mode == TImode)
24792 newval = force_reg (mode, newval);
24794 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
24795 newval, mod_s));
24796 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24798 else if (TARGET_OUTLINE_ATOMICS)
24800 /* Oldval must satisfy compare afterward. */
24801 if (!aarch64_plus_operand (oldval, mode))
24802 oldval = force_reg (mode, oldval);
24803 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
24804 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
24805 oldval, mode, newval, mode,
24806 XEXP (mem, 0), Pmode);
24807 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24809 else
24811 /* The oldval predicate varies by mode. Test it and force to reg. */
24812 insn_code code = code_for_aarch64_compare_and_swap (mode);
24813 if (!insn_data[code].operand[2].predicate (oldval, mode))
24814 oldval = force_reg (mode, oldval);
24816 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
24817 is_weak, mod_s, mod_f));
24818 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
24821 if (r_mode != mode)
24822 rval = gen_lowpart (mode, rval);
24823 emit_move_insn (operands[1], rval);
24825 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
24826 emit_insn (gen_rtx_SET (bval, x));
24829 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
24830 sequence implementing an atomic operation. */
24832 static void
24833 aarch64_emit_post_barrier (enum memmodel model)
24835 const enum memmodel base_model = memmodel_base (model);
24837 if (is_mm_sync (model)
24838 && (base_model == MEMMODEL_ACQUIRE
24839 || base_model == MEMMODEL_ACQ_REL
24840 || base_model == MEMMODEL_SEQ_CST))
24842 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
24846 /* Split a compare and swap pattern. */
24848 void
24849 aarch64_split_compare_and_swap (rtx operands[])
24851 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24852 gcc_assert (epilogue_completed);
24854 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
24855 machine_mode mode;
24856 bool is_weak;
24857 rtx_code_label *label1, *label2;
24858 enum memmodel model;
24860 rval = operands[0];
24861 mem = operands[1];
24862 oldval = operands[2];
24863 newval = operands[3];
24864 model_rtx = operands[5];
24865 scratch = operands[7];
24866 mode = GET_MODE (mem);
24867 model = memmodel_from_int (INTVAL (model_rtx));
24868 is_weak = operands[4] != const0_rtx && mode != TImode;
24870 /* When OLDVAL is zero and we want the strong version we can emit a tighter
24871 loop:
24872 .label1:
24873 LD[A]XR rval, [mem]
24874 CBNZ rval, .label2
24875 ST[L]XR scratch, newval, [mem]
24876 CBNZ scratch, .label1
24877 .label2:
24878 CMP rval, 0. */
24879 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
24880 oldval == const0_rtx && mode != TImode);
24882 label1 = NULL;
24883 if (!is_weak)
24885 label1 = gen_label_rtx ();
24886 emit_label (label1);
24888 label2 = gen_label_rtx ();
24890 /* The initial load can be relaxed for a __sync operation since a final
24891 barrier will be emitted to stop code hoisting. */
24892 if (is_mm_sync (model))
24893 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
24894 else
24895 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
24897 if (strong_zero_p)
24898 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
24899 else
24901 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24902 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
24904 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24905 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
24906 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24908 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
24910 if (!is_weak)
24912 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
24913 aarch64_emit_unlikely_jump (x);
24915 else
24916 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24918 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
24919 store the returned value and loop if the STLXP fails. */
24920 if (mode == TImode)
24922 rtx_code_label *label3 = gen_label_rtx ();
24923 emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
24924 emit_barrier ();
24926 emit_label (label2);
24927 aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
24929 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
24930 aarch64_emit_unlikely_jump (x);
24932 label2 = label3;
24935 emit_label (label2);
24937 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
24938 to set the condition flags. If this is not used, it will be removed by
24939 later passes. */
24940 if (strong_zero_p)
24941 aarch64_gen_compare_reg (NE, rval, const0_rtx);
24943 /* Emit any final barrier needed for a __sync operation. */
24944 if (is_mm_sync (model))
24945 aarch64_emit_post_barrier (model);
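/* Editorial illustration (not part of the original source): with LSE
   atomics unavailable, a strong compare-and-swap such as

     __atomic_compare_exchange_n (&x, &expected, desired, false,
				  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);

   on a 32-bit X reaches this splitter and is expected to become an
   LDAXR / CMP / B.NE / STLXR / CBNZ loop, analogous to the strong-zero
   sequence sketched in the comment above but with a full comparison
   against EXPECTED. */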
24948 /* Split an atomic operation. */
24950 void
24951 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
24952 rtx value, rtx model_rtx, rtx cond)
24954 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24955 gcc_assert (epilogue_completed);
24957 machine_mode mode = GET_MODE (mem);
24958 machine_mode wmode = (mode == DImode ? DImode : SImode);
24959 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
24960 const bool is_sync = is_mm_sync (model);
24961 rtx_code_label *label;
24962 rtx x;
24964 /* Split the atomic operation into a sequence. */
24965 label = gen_label_rtx ();
24966 emit_label (label);
24968 if (new_out)
24969 new_out = gen_lowpart (wmode, new_out);
24970 if (old_out)
24971 old_out = gen_lowpart (wmode, old_out);
24972 else
24973 old_out = new_out;
24974 value = simplify_gen_subreg (wmode, value, mode, 0);
24976 /* The initial load can be relaxed for a __sync operation since a final
24977 barrier will be emitted to stop code hoisting. */
24978 if (is_sync)
24979 aarch64_emit_load_exclusive (mode, old_out, mem,
24980 GEN_INT (MEMMODEL_RELAXED));
24981 else
24982 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
24984 switch (code)
24986 case SET:
24987 new_out = value;
24988 break;
24990 case NOT:
24991 x = gen_rtx_AND (wmode, old_out, value);
24992 emit_insn (gen_rtx_SET (new_out, x));
24993 x = gen_rtx_NOT (wmode, new_out);
24994 emit_insn (gen_rtx_SET (new_out, x));
24995 break;
24997 case MINUS:
24998 if (CONST_INT_P (value))
25000 value = GEN_INT (-UINTVAL (value));
25001 code = PLUS;
25003 /* Fall through. */
25005 default:
25006 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
25007 emit_insn (gen_rtx_SET (new_out, x));
25008 break;
25011 aarch64_emit_store_exclusive (mode, cond, mem,
25012 gen_lowpart (mode, new_out), model_rtx);
25014 x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
25015 aarch64_emit_unlikely_jump (x);
25017 /* Emit any final barrier needed for a __sync operation. */
25018 if (is_sync)
25019 aarch64_emit_post_barrier (model);
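/* Editorial illustration (not part of the original source): without LSE
   atomics, a fetch-and-add such as

     __atomic_fetch_add (&counter, 1, __ATOMIC_RELAXED);

   on a 32-bit COUNTER is split by the code above into a loop of roughly

     .L1: ldxr  w0, [x1]
	  add   w2, w0, #1
	  stxr  w3, w2, [x1]
	  cbnz  w3, .L1

   with acquire/release forms of the exclusives chosen from MODEL_RTX. */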
25022 static void
25023 aarch64_init_libfuncs (void)
25025 /* Half-precision float operations. The compiler handles all operations
25026 with NULL libfuncs by converting to SFmode. */
25028 /* Conversions. */
25029 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
25030 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
25032 /* Arithmetic. */
25033 set_optab_libfunc (add_optab, HFmode, NULL);
25034 set_optab_libfunc (sdiv_optab, HFmode, NULL);
25035 set_optab_libfunc (smul_optab, HFmode, NULL);
25036 set_optab_libfunc (neg_optab, HFmode, NULL);
25037 set_optab_libfunc (sub_optab, HFmode, NULL);
25039 /* Comparisons. */
25040 set_optab_libfunc (eq_optab, HFmode, NULL);
25041 set_optab_libfunc (ne_optab, HFmode, NULL);
25042 set_optab_libfunc (lt_optab, HFmode, NULL);
25043 set_optab_libfunc (le_optab, HFmode, NULL);
25044 set_optab_libfunc (ge_optab, HFmode, NULL);
25045 set_optab_libfunc (gt_optab, HFmode, NULL);
25046 set_optab_libfunc (unord_optab, HFmode, NULL);
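/* Editorial illustration (not part of the original source): because the
   arithmetic and comparison optabs above are registered as NULL, an
   expression such as

     __fp16 f (__fp16 a, __fp16 b) { return a + b; }

   is expected (without native FP16 arithmetic) to be lowered to a call to
   __gnu_h2f_ieee on each operand, a single-precision add, and a call to
   __gnu_f2h_ieee on the result. */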
25049 /* Target hook for c_mode_for_suffix. */
25050 static machine_mode
25051 aarch64_c_mode_for_suffix (char suffix)
25053 if (suffix == 'q')
25054 return TFmode;
25056 return VOIDmode;
25059 /* We can only represent floating point constants which will fit in
25060 "quarter-precision" values. These values are characterised by
25061 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
25064 (-1)^s * (n/16) * 2^r
25066 Where:
25067 's' is the sign bit.
25068 'n' is an integer in the range 16 <= n <= 31.
25069 'r' is an integer in the range -3 <= r <= 4. */
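/* Editorial worked example (not part of the original source): 1.25 is
   representable as (-1)^0 * (20/16) * 2^0 (s = 0, n = 20, r = 0), and 0.5
   as (-1)^0 * (16/16) * 2^-1 (n = 16, r = -1).  The largest representable
   magnitude is 31/16 * 2^4 = 31.0 and the smallest non-zero magnitude is
   16/16 * 2^-3 = 0.125. */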
25071 /* Return true iff X can be represented by a quarter-precision
25072 floating point immediate operand. Note, we cannot represent 0.0. */
25073 bool
25074 aarch64_float_const_representable_p (rtx x)
25076 /* This represents our current view of how many bits
25077 make up the mantissa. */
25078 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
25079 int exponent;
25080 unsigned HOST_WIDE_INT mantissa, mask;
25081 REAL_VALUE_TYPE r, m;
25082 bool fail;
25084 x = unwrap_const_vec_duplicate (x);
25085 if (!CONST_DOUBLE_P (x))
25086 return false;
25088 if (GET_MODE (x) == VOIDmode
25089 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
25090 return false;
25092 r = *CONST_DOUBLE_REAL_VALUE (x);
25094 /* We cannot represent infinities, NaNs or +/-zero. We won't
25095 know if we have +zero until we analyse the mantissa, but we
25096 can reject the other invalid values. */
25097 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
25098 || REAL_VALUE_MINUS_ZERO (r))
25099 return false;
25101 /* For BFmode, only handle 0.0. */
25102 if (GET_MODE (x) == BFmode)
25103 return real_iszero (&r, false);
25105 /* Extract exponent. */
25106 r = real_value_abs (&r);
25107 exponent = REAL_EXP (&r);
25109 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
25110 highest (sign) bit, with a fixed binary point at bit point_pos.
25111 W holds the low part of the mantissa in its first element and the high part in its second.
25112 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
25113 bits for the mantissa, this can fail (low bits will be lost). */
25114 real_ldexp (&m, &r, point_pos - exponent);
25115 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
25117 /* If the low part of the mantissa has bits set we cannot represent
25118 the value. */
25119 if (w.ulow () != 0)
25120 return false;
25121 /* We have rejected the lower HOST_WIDE_INT, so update our
25122 understanding of how many bits lie in the mantissa and
25123 look only at the high HOST_WIDE_INT. */
25124 mantissa = w.elt (1);
25125 point_pos -= HOST_BITS_PER_WIDE_INT;
25127 /* We can only represent values with a mantissa of the form 1.xxxx. */
25128 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
25129 if ((mantissa & mask) != 0)
25130 return false;
25132 /* Having filtered unrepresentable values, we may now remove all
25133 but the highest 5 bits. */
25134 mantissa >>= point_pos - 5;
25136 /* We cannot represent the value 0.0, so reject it. This is handled
25137 elsewhere. */
25138 if (mantissa == 0)
25139 return false;
25141 /* Then, as bit 4 is always set, we can mask it off, leaving
25142 the mantissa in the range [0, 15]. */
25143 mantissa &= ~(1 << 4);
25144 gcc_assert (mantissa <= 15);
25146 /* GCC internally does not use IEEE754-like encoding (where normalized
25147 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
25148 Our mantissa values are shifted 4 places to the left relative to
25149 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
25150 by 5 places to correct for GCC's representation. */
25151 exponent = 5 - exponent;
25153 return (exponent >= 0 && exponent <= 7);
25156 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
25157 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
25158 output MOVI/MVNI, ORR or BIC immediate. */
25159 char*
25160 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
25161 enum simd_immediate_check which)
25163 bool is_valid;
25164 static char templ[40];
25165 const char *mnemonic;
25166 const char *shift_op;
25167 unsigned int lane_count = 0;
25168 char element_char;
25170 struct simd_immediate_info info;
25172 /* This will return true to show const_vector is legal for use as either
25173 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
25174 It will also update INFO to show how the immediate should be generated.
25175 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
25176 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
25177 gcc_assert (is_valid);
25179 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25180 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
25182 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25184 gcc_assert (info.insn == simd_immediate_info::MOV
25185 && info.u.mov.shift == 0);
25186 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25187 move immediate path. */
25188 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25189 info.u.mov.value = GEN_INT (0);
25190 else
25192 const unsigned int buf_size = 20;
25193 char float_buf[buf_size] = {'\0'};
25194 real_to_decimal_for_mode (float_buf,
25195 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25196 buf_size, buf_size, 1, info.elt_mode);
25198 if (lane_count == 1)
25199 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
25200 else
25201 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
25202 lane_count, element_char, float_buf);
25203 return templ;
25207 gcc_assert (CONST_INT_P (info.u.mov.value));
25209 if (which == AARCH64_CHECK_MOV)
25211 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
25212 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
25213 ? "msl" : "lsl");
25214 if (lane_count == 1)
25215 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
25216 mnemonic, UINTVAL (info.u.mov.value));
25217 else if (info.u.mov.shift)
25218 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25219 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
25220 element_char, UINTVAL (info.u.mov.value), shift_op,
25221 info.u.mov.shift);
25222 else
25223 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25224 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
25225 element_char, UINTVAL (info.u.mov.value));
25227 else
25229 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25230 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
25231 if (info.u.mov.shift)
25232 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25233 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
25234 element_char, UINTVAL (info.u.mov.value), "lsl",
25235 info.u.mov.shift);
25236 else
25237 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25238 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
25239 element_char, UINTVAL (info.u.mov.value));
25241 return templ;
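/* Editorial illustration (not part of the original source): for a V16QI
   constant with every byte equal to 0xab and WHICH == AARCH64_CHECK_MOV,
   the routine above is expected to return a template along the lines of
   "movi\t%0.16b, 0xab"; a vector of single-precision 1.0 values would
   instead take the FP path and produce "fmov\t%0.4s, 1.0e+0"-style
   output. */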
25244 char*
25245 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
25248 /* If a floating point number was passed and we desire to use it in an
25249 integer mode, do the conversion to integer. */
25250 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
25252 unsigned HOST_WIDE_INT ival;
25253 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
25254 gcc_unreachable ();
25255 immediate = gen_int_mode (ival, mode);
25258 machine_mode vmode;
25259 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we use
25260 a 128-bit vector mode. */
25261 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
25263 vmode = aarch64_simd_container_mode (mode, width);
25264 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
25265 return aarch64_output_simd_mov_immediate (v_op, width);
25268 /* Return the output string to use for moving immediate CONST_VECTOR
25269 into an SVE register. */
25271 char *
25272 aarch64_output_sve_mov_immediate (rtx const_vector)
25274 static char templ[40];
25275 struct simd_immediate_info info;
25276 char element_char;
25278 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
25279 gcc_assert (is_valid);
25281 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25283 machine_mode vec_mode = GET_MODE (const_vector);
25284 if (aarch64_sve_pred_mode_p (vec_mode))
25286 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
25287 if (info.insn == simd_immediate_info::MOV)
25289 gcc_assert (info.u.mov.value == const0_rtx);
25290 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
25292 else
25294 gcc_assert (info.insn == simd_immediate_info::PTRUE);
25295 unsigned int total_bytes;
25296 if (info.u.pattern == AARCH64_SV_ALL
25297 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
25298 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
25299 total_bytes / GET_MODE_SIZE (info.elt_mode));
25300 else
25301 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
25302 svpattern_token (info.u.pattern));
25304 return buf;
25307 if (info.insn == simd_immediate_info::INDEX)
25309 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
25310 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25311 element_char, INTVAL (info.u.index.base),
25312 INTVAL (info.u.index.step));
25313 return templ;
25316 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25318 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25319 info.u.mov.value = GEN_INT (0);
25320 else
25322 const int buf_size = 20;
25323 char float_buf[buf_size] = {};
25324 real_to_decimal_for_mode (float_buf,
25325 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25326 buf_size, buf_size, 1, info.elt_mode);
25328 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
25329 element_char, float_buf);
25330 return templ;
25334 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25335 element_char, INTVAL (info.u.mov.value));
25336 return templ;
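/* Editorial illustration (not part of the original source): a VNx4SI
   constant with every element equal to 3 is expected to come back from the
   routine above as "mov\t%0.s, #3", while the stepped constant
   { 0, 1, 2, ... } would be recognised as an INDEX immediate and produce
   "index\t%0.s, #0, #1". */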
25339 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25340 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25341 pattern. */
25343 char *
25344 aarch64_output_sve_ptrues (rtx const_unspec)
25346 static char templ[40];
25348 struct simd_immediate_info info;
25349 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
25350 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
25352 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25353 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
25354 svpattern_token (info.u.pattern));
25355 return templ;
25358 /* Split operands into moves from op[1] + op[2] into op[0]. */
25360 void
25361 aarch64_split_combinev16qi (rtx operands[3])
25363 machine_mode halfmode = GET_MODE (operands[1]);
25365 gcc_assert (halfmode == V16QImode);
25367 rtx destlo = simplify_gen_subreg (halfmode, operands[0],
25368 GET_MODE (operands[0]), 0);
25369 rtx desthi = simplify_gen_subreg (halfmode, operands[0],
25370 GET_MODE (operands[0]),
25371 GET_MODE_SIZE (halfmode));
25373 bool skiplo = rtx_equal_p (destlo, operands[1]);
25374 bool skiphi = rtx_equal_p (desthi, operands[2]);
25376 if (skiplo && skiphi)
25378 /* No-op move. Can't split to nothing; emit something. */
25379 emit_note (NOTE_INSN_DELETED);
25380 return;
25383 /* Special case of reversed high/low parts. */
25384 if (reg_overlap_mentioned_p (operands[2], destlo)
25385 && reg_overlap_mentioned_p (operands[1], desthi))
25387 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25388 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
25389 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25391 else if (!reg_overlap_mentioned_p (operands[2], destlo))
25393 /* Try to avoid unnecessary moves if part of the result
25394 is in the right place already. */
25395 if (!skiplo)
25396 emit_move_insn (destlo, operands[1]);
25397 if (!skiphi)
25398 emit_move_insn (desthi, operands[2]);
25400 else
25402 if (!skiphi)
25403 emit_move_insn (desthi, operands[2]);
25404 if (!skiplo)
25405 emit_move_insn (destlo, operands[1]);
25409 /* vec_perm support. */
25411 struct expand_vec_perm_d
25413 rtx target, op0, op1;
25414 vec_perm_indices perm;
25415 machine_mode vmode;
25416 machine_mode op_mode;
25417 unsigned int vec_flags;
25418 unsigned int op_vec_flags;
25419 bool one_vector_p;
25420 bool zero_op0_p, zero_op1_p;
25421 bool testing_p;
25424 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25426 /* Generate a variable permutation. */
25428 static void
25429 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
25431 machine_mode vmode = GET_MODE (target);
25432 bool one_vector_p = rtx_equal_p (op0, op1);
25434 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
25435 gcc_checking_assert (GET_MODE (op0) == vmode);
25436 gcc_checking_assert (GET_MODE (op1) == vmode);
25437 gcc_checking_assert (GET_MODE (sel) == vmode);
25438 gcc_checking_assert (TARGET_SIMD);
25440 if (one_vector_p)
25442 if (vmode == V8QImode)
25444 /* Expand the argument to a V16QI mode by duplicating it. */
25445 rtx pair = gen_reg_rtx (V16QImode);
25446 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
25447 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25449 else
25451 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
25454 else
25456 rtx pair;
25458 if (vmode == V8QImode)
25460 pair = gen_reg_rtx (V16QImode);
25461 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
25462 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25464 else
25466 pair = gen_reg_rtx (V2x16QImode);
25467 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
25468 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
25473 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25474 NELT is the number of elements in the vector. */
25476 void
25477 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
25478 unsigned int nelt)
25480 machine_mode vmode = GET_MODE (target);
25481 bool one_vector_p = rtx_equal_p (op0, op1);
25482 rtx mask;
25484 /* The TBL instruction does not use a modulo index, so we must take care
25485 of that ourselves. */
25486 mask = aarch64_simd_gen_const_vector_dup (vmode,
25487 one_vector_p ? nelt - 1 : 2 * nelt - 1);
25488 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
25490 /* For big-endian, we also need to reverse the index within the vector
25491 (but not which vector). */
25492 if (BYTES_BIG_ENDIAN)
25494 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25495 if (!one_vector_p)
25496 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
25497 sel = expand_simple_binop (vmode, XOR, sel, mask,
25498 NULL, 0, OPTAB_LIB_WIDEN);
25500 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
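/* Editorial worked example (not part of the original source): for a
   two-input V8QI permute (NELT == 8) the selector is ANDed with 15, so an
   out-of-range index such as 19 wraps to 3 and (before any big-endian
   index adjustment) selects element 3 of the first input, matching the
   vec_perm wrapping rule that TBL itself does not provide. */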
25503 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
25505 static void
25506 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
25508 emit_insn (gen_rtx_SET (target,
25509 gen_rtx_UNSPEC (GET_MODE (target),
25510 gen_rtvec (2, op0, op1), code)));
25513 /* Expand an SVE vec_perm with the given operands. */
25515 void
25516 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
25518 machine_mode data_mode = GET_MODE (target);
25519 machine_mode sel_mode = GET_MODE (sel);
25520 /* Enforced by the pattern condition. */
25521 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
25523 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25524 size of the two value vectors, i.e. the upper bits of the indices
25525 are effectively ignored. SVE TBL instead produces 0 for any
25526 out-of-range indices, so we need to modulo all the vec_perm indices
25527 to ensure they are all in range. */
25528 rtx sel_reg = force_reg (sel_mode, sel);
25530 /* Check if the sel only references the first values vector. */
25531 if (CONST_VECTOR_P (sel)
25532 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
25534 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
25535 return;
25538 /* Check if the two values vectors are the same. */
25539 if (rtx_equal_p (op0, op1))
25541 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
25542 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25543 NULL, 0, OPTAB_DIRECT);
25544 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
25545 return;
25548 /* Run TBL on each value vector and combine the results. */
25550 rtx res0 = gen_reg_rtx (data_mode);
25551 rtx res1 = gen_reg_rtx (data_mode);
25552 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
25553 if (!CONST_VECTOR_P (sel)
25554 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
25556 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
25557 2 * nunits - 1);
25558 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25559 NULL, 0, OPTAB_DIRECT);
25561 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
25562 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
25563 NULL, 0, OPTAB_DIRECT);
25564 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
25565 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
25566 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
25567 else
25568 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
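/* Editorial worked example (not part of the original source): with
   NUNITS == 4 and SEL == { 1, 4, 6, 3 }, the first TBL applied to OP0
   yields { op0[1], 0, 0, op0[3] } because indices 4 and 6 are out of
   range, the second TBL uses SEL - 4 == { -3, 0, 2, -1 } and yields
   { 0, op1[0], op1[2], 0 }, and the final OR (or UNSPEC_IORF) merges the
   two partial results. */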
25571 /* Recognize patterns suitable for the TRN instructions. */
25572 static bool
25573 aarch64_evpc_trn (struct expand_vec_perm_d *d)
25575 HOST_WIDE_INT odd;
25576 poly_uint64 nelt = d->perm.length ();
25577 rtx out, in0, in1;
25578 machine_mode vmode = d->vmode;
25580 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25581 return false;
25583 /* Note that these are little-endian tests.
25584 We correct for big-endian later. */
25585 if (!d->perm[0].is_constant (&odd)
25586 || (odd != 0 && odd != 1)
25587 || !d->perm.series_p (0, 2, odd, 2)
25588 || !d->perm.series_p (1, 2, nelt + odd, 2))
25589 return false;
25591 /* Success! */
25592 if (d->testing_p)
25593 return true;
25595 in0 = d->op0;
25596 in1 = d->op1;
25597 /* We don't need a big-endian lane correction for SVE; see the comment
25598 at the head of aarch64-sve.md for details. */
25599 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25601 std::swap (in0, in1);
25602 odd = !odd;
25604 out = d->target;
25606 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25607 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
25608 return true;
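/* Editorial worked example (not part of the original source): on
   little-endian, the V4SI permute { 0, 4, 2, 6 } satisfies the series
   checks above with ODD == 0 and is emitted as TRN1, while { 1, 5, 3, 7 }
   selects TRN2. */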
25611 /* Try to re-encode the PERM constant so it combines odd and even elements.
25612 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25613 We retry with this new constant with the full suite of patterns. */
25614 static bool
25615 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
25617 expand_vec_perm_d newd;
25619 if (d->vec_flags != VEC_ADVSIMD)
25620 return false;
25622 /* Get the new mode. Always twice the size of the inner
25623 and half the elements. */
25624 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
25625 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
25626 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
25627 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
25629 if (new_mode == word_mode)
25630 return false;
25632 vec_perm_indices newpermindices;
25634 if (!newpermindices.new_shrunk_vector (d->perm, 2))
25635 return false;
25637 newd.vmode = new_mode;
25638 newd.vec_flags = VEC_ADVSIMD;
25639 newd.op_mode = newd.vmode;
25640 newd.op_vec_flags = newd.vec_flags;
25641 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
25642 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
25643 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
25644 newd.testing_p = d->testing_p;
25645 newd.one_vector_p = d->one_vector_p;
25647 newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
25648 newpermindices.nelts_per_input ());
25649 return aarch64_expand_vec_perm_const_1 (&newd);
25652 /* Recognize patterns suitable for the UZP instructions. */
25653 static bool
25654 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
25656 HOST_WIDE_INT odd;
25657 rtx out, in0, in1;
25658 machine_mode vmode = d->vmode;
25660 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25661 return false;
25663 /* Note that these are little-endian tests.
25664 We correct for big-endian later. */
25665 if (!d->perm[0].is_constant (&odd)
25666 || (odd != 0 && odd != 1)
25667 || !d->perm.series_p (0, 1, odd, 2))
25668 return false;
25670 /* Success! */
25671 if (d->testing_p)
25672 return true;
25674 in0 = d->op0;
25675 in1 = d->op1;
25676 /* We don't need a big-endian lane correction for SVE; see the comment
25677 at the head of aarch64-sve.md for details. */
25678 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25680 std::swap (in0, in1);
25681 odd = !odd;
25683 out = d->target;
25685 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25686 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
25687 return true;
25690 /* Recognize patterns suitable for the ZIP instructions. */
25691 static bool
25692 aarch64_evpc_zip (struct expand_vec_perm_d *d)
25694 unsigned int high;
25695 poly_uint64 nelt = d->perm.length ();
25696 rtx out, in0, in1;
25697 machine_mode vmode = d->vmode;
25699 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25700 return false;
25702 /* Note that these are little-endian tests.
25703 We correct for big-endian later. */
25704 poly_uint64 first = d->perm[0];
25705 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
25706 || !d->perm.series_p (0, 2, first, 1)
25707 || !d->perm.series_p (1, 2, first + nelt, 1))
25708 return false;
25709 high = maybe_ne (first, 0U);
25711 /* Success! */
25712 if (d->testing_p)
25713 return true;
25715 in0 = d->op0;
25716 in1 = d->op1;
25717 /* We don't need a big-endian lane correction for SVE; see the comment
25718 at the head of aarch64-sve.md for details. */
25719 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25721 std::swap (in0, in1);
25722 high = !high;
25724 out = d->target;
25726 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25727 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
25728 return true;
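/* Editorial worked example (not part of the original source): on
   little-endian, the V4SI permute { 0, 4, 1, 5 } interleaves the low
   halves of the two inputs and is emitted as ZIP1; { 2, 6, 3, 7 } starts
   at NELT / 2 and selects ZIP2. */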
25731 /* Recognize patterns for the EXT insn. */
25733 static bool
25734 aarch64_evpc_ext (struct expand_vec_perm_d *d)
25736 HOST_WIDE_INT location;
25737 rtx offset;
25739 /* The first element always refers to the first vector.
25740 Check if the extracted indices are increasing by one. */
25741 if ((d->vec_flags & VEC_SVE_PRED)
25742 || !d->perm[0].is_constant (&location)
25743 || !d->perm.series_p (0, 1, location, 1))
25744 return false;
25746 /* Success! */
25747 if (d->testing_p)
25748 return true;
25750 /* The case where (location == 0) is a no-op for both big- and little-endian,
25751 and is removed by the mid-end at optimization levels -O1 and higher.
25753 We don't need a big-endian lane correction for SVE; see the comment
25754 at the head of aarch64-sve.md for details. */
25755 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
25757 /* After setup, we want the high elements of the first vector (stored
25758 at the LSB end of the register), and the low elements of the second
25759 vector (stored at the MSB end of the register). So swap. */
25760 std::swap (d->op0, d->op1);
25761 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25762 to_constant () is safe since this is restricted to Advanced SIMD
25763 vectors. */
25764 location = d->perm.length ().to_constant () - location;
25767 offset = GEN_INT (location);
25768 emit_set_insn (d->target,
25769 gen_rtx_UNSPEC (d->vmode,
25770 gen_rtvec (3, d->op0, d->op1, offset),
25771 UNSPEC_EXT));
25772 return true;
25775 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25776 within each 64-bit, 32-bit or 16-bit granule. */
25778 static bool
25779 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
25781 HOST_WIDE_INT diff;
25782 unsigned int i, size, unspec;
25783 machine_mode pred_mode;
25785 if ((d->vec_flags & VEC_SVE_PRED)
25786 || !d->one_vector_p
25787 || !d->perm[0].is_constant (&diff)
25788 || !diff)
25789 return false;
25791 if (d->vec_flags & VEC_SVE_DATA)
25792 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
25793 else
25794 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
25795 if (size == 64)
25797 unspec = UNSPEC_REV64;
25798 pred_mode = VNx2BImode;
25800 else if (size == 32)
25802 unspec = UNSPEC_REV32;
25803 pred_mode = VNx4BImode;
25805 else if (size == 16)
25807 unspec = UNSPEC_REV16;
25808 pred_mode = VNx8BImode;
25810 else
25811 return false;
25813 unsigned int step = diff + 1;
25814 for (i = 0; i < step; ++i)
25815 if (!d->perm.series_p (i, step, diff - i, step))
25816 return false;
25818 /* Success! */
25819 if (d->testing_p)
25820 return true;
25822 if (d->vec_flags & VEC_SVE_DATA)
25824 rtx pred = aarch64_ptrue_reg (pred_mode);
25825 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
25826 d->target, pred, d->op0));
25827 return true;
25829 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
25830 emit_set_insn (d->target, src);
25831 return true;
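/* Editorial worked example (not part of the original source): the
   single-input V4SI permute { 1, 0, 3, 2 } has DIFF == 1, giving a 64-bit
   granule, and so is emitted as REV64; the V8HI permute
   { 3, 2, 1, 0, 7, 6, 5, 4 } has DIFF == 3 and also maps to REV64. */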
25834 /* Recognize patterns for the REV insn, which reverses elements within
25835 a full vector. */
25837 static bool
25838 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
25840 poly_uint64 nelt = d->perm.length ();
25842 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
25843 return false;
25845 if (!d->perm.series_p (0, 1, nelt - 1, -1))
25846 return false;
25848 /* Success! */
25849 if (d->testing_p)
25850 return true;
25852 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
25853 emit_set_insn (d->target, src);
25854 return true;
25857 static bool
25858 aarch64_evpc_dup (struct expand_vec_perm_d *d)
25860 rtx out = d->target;
25861 rtx in0;
25862 HOST_WIDE_INT elt;
25863 machine_mode vmode = d->vmode;
25864 rtx lane;
25866 if ((d->vec_flags & VEC_SVE_PRED)
25867 || d->perm.encoding ().encoded_nelts () != 1
25868 || !d->perm[0].is_constant (&elt))
25869 return false;
25871 if ((d->vec_flags & VEC_SVE_DATA)
25872 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
25873 return false;
25875 /* Success! */
25876 if (d->testing_p)
25877 return true;
25879 /* The generic preparation in aarch64_expand_vec_perm_const_1
25880 swaps the operand order and the permute indices if it finds
25881 d->perm[0] to be in the second operand. Thus, we can always
25882 use d->op0 and need not do any extra arithmetic to get the
25883 correct lane number. */
25884 in0 = d->op0;
25885 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
25887 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
25888 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
25889 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
25890 return true;
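/* Editorial worked example (not part of the original source): a V4SI
   permute whose encoding is the single repeated element { 2, 2, 2, 2 }
   broadcasts lane 2 of the first input and is emitted as a vec_duplicate
   of that lane (a "dup v0.4s, v1.s[2]" on Advanced SIMD). */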
25893 static bool
25894 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
25896 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
25897 machine_mode vmode = d->vmode;
25899 /* Make sure that the indices are constant. */
25900 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
25901 for (unsigned int i = 0; i < encoded_nelts; ++i)
25902 if (!d->perm[i].is_constant ())
25903 return false;
25905 if (d->testing_p)
25906 return true;
25908 /* Generic code will try constant permutation twice: once with the
25909 original mode and again with the elements lowered to QImode.
25910 So wait and don't do the selector expansion ourselves. */
25911 if (vmode != V8QImode && vmode != V16QImode)
25912 return false;
25914 /* to_constant is safe since this routine is specific to Advanced SIMD
25915 vectors. */
25916 unsigned int nelt = d->perm.length ().to_constant ();
25918 /* If one register is the constant zero vector then we only need
25919 a one-register TBL, and we map any accesses to the zero vector to -1. We can't
25920 do this earlier since vec_perm_indices clamps elements to within range, so
25921 we can only do it during codegen. */
25922 if (d->zero_op0_p)
25923 d->op0 = d->op1;
25924 else if (d->zero_op1_p)
25925 d->op1 = d->op0;
25927 for (unsigned int i = 0; i < nelt; ++i)
25929 auto val = d->perm[i].to_constant ();
25931 /* If we're selecting from a 0 vector, we can just use an out of range
25932 index instead. */
25933 if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
25934 rperm[i] = constm1_rtx;
25935 else
25937 /* If we are remapping a zero register as the first parameter we need
25938 to adjust the indices of the non-zero register. */
25939 if (d->zero_op0_p)
25940 val = val % nelt;
25942 /* If big-endian and two vectors we end up with a weird mixed-endian
25943 mode on NEON. Reverse the index within each word but not the word
25944 itself. to_constant is safe because we checked is_constant
25945 above. */
25946 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
25950 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
25951 sel = force_reg (vmode, sel);
25953 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
25954 return true;
25957 /* Try to implement D using an SVE TBL instruction. */
25959 static bool
25960 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
25962 unsigned HOST_WIDE_INT nelt;
25964 /* Permuting two variable-length vectors could overflow the
25965 index range. */
25966 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
25967 return false;
25969 if (d->testing_p)
25970 return true;
25972 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
25973 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
25974 if (d->one_vector_p)
25975 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
25976 else
25977 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
25978 return true;
25981 /* Try to implement D using SVE dup instruction. */
25983 static bool
25984 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
25986 if (BYTES_BIG_ENDIAN
25987 || !d->one_vector_p
25988 || d->vec_flags != VEC_SVE_DATA
25989 || d->op_vec_flags != VEC_ADVSIMD
25990 || d->perm.encoding ().nelts_per_pattern () != 1
25991 || !known_eq (d->perm.encoding ().npatterns (),
25992 GET_MODE_NUNITS (d->op_mode))
25993 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
25994 return false;
25996 int npatterns = d->perm.encoding ().npatterns ();
25997 for (int i = 0; i < npatterns; i++)
25998 if (!known_eq (d->perm[i], i))
25999 return false;
26001 if (d->testing_p)
26002 return true;
26004 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
26005 return true;
26008 /* Try to implement D using SVE SEL instruction. */
26010 static bool
26011 aarch64_evpc_sel (struct expand_vec_perm_d *d)
26013 machine_mode vmode = d->vmode;
26014 int unit_size = GET_MODE_UNIT_SIZE (vmode);
26016 if (d->vec_flags != VEC_SVE_DATA
26017 || unit_size > 8)
26018 return false;
26020 int n_patterns = d->perm.encoding ().npatterns ();
26021 poly_int64 vec_len = d->perm.length ();
26023 for (int i = 0; i < n_patterns; ++i)
26024 if (!known_eq (d->perm[i], i)
26025 && !known_eq (d->perm[i], vec_len + i))
26026 return false;
26028 for (int i = n_patterns; i < n_patterns * 2; i++)
26029 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
26030 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
26031 return false;
26033 if (d->testing_p)
26034 return true;
26036 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
26038 /* Build a predicate that is true when op0 elements should be used. */
26039 rtx_vector_builder builder (pred_mode, n_patterns, 2);
26040 for (int i = 0; i < n_patterns * 2; i++)
26042 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
26043 : CONST0_RTX (BImode);
26044 builder.quick_push (elem);
26047 rtx const_vec = builder.build ();
26048 rtx pred = force_reg (pred_mode, const_vec);
26049 /* TARGET = PRED ? OP0 : OP1. */
26050 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
26051 return true;
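/* Editorial worked example (not part of the original source): for VNx4SI,
   a permute of the form { 0, N+1, 2, N+3, ... } (even elements from OP0,
   odd elements from OP1, where N is the number of elements) is matched
   above and becomes a predicated SEL whose governing predicate is the
   repeating pattern { 1, 0 }. */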
26054 /* Recognize patterns suitable for the INS instructions. */
26055 static bool
26056 aarch64_evpc_ins (struct expand_vec_perm_d *d)
26058 machine_mode mode = d->vmode;
26059 unsigned HOST_WIDE_INT nelt;
26061 if (d->vec_flags != VEC_ADVSIMD)
26062 return false;
26064 /* to_constant is safe since this routine is specific to Advanced SIMD
26065 vectors. */
26066 nelt = d->perm.length ().to_constant ();
26067 rtx insv = d->op0;
26069 HOST_WIDE_INT idx = -1;
26071 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26073 HOST_WIDE_INT elt;
26074 if (!d->perm[i].is_constant (&elt))
26075 return false;
26076 if (elt == (HOST_WIDE_INT) i)
26077 continue;
26078 if (idx != -1)
26080 idx = -1;
26081 break;
26083 idx = i;
26086 if (idx == -1)
26088 insv = d->op1;
26089 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26091 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
26092 continue;
26093 if (idx != -1)
26094 return false;
26095 idx = i;
26098 if (idx == -1)
26099 return false;
26102 if (d->testing_p)
26103 return true;
26105 gcc_assert (idx != -1);
26107 unsigned extractindex = d->perm[idx].to_constant ();
26108 rtx extractv = d->op0;
26109 if (extractindex >= nelt)
26111 extractv = d->op1;
26112 extractindex -= nelt;
26114 gcc_assert (extractindex < nelt);
26116 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
26117 expand_operand ops[5];
26118 create_output_operand (&ops[0], d->target, mode);
26119 create_input_operand (&ops[1], insv, mode);
26120 create_integer_operand (&ops[2], 1 << idx);
26121 create_input_operand (&ops[3], extractv, mode);
26122 create_integer_operand (&ops[4], extractindex);
26123 expand_insn (icode, 5, ops);
26125 return true;
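/* Editorial worked example (not part of the original source): the V4SI
   permute { 0, 1, 6, 3 } keeps elements 0, 1 and 3 of OP0 and replaces
   element 2 with element 6 - NELT == 2 of OP1, so it is expanded above as
   a single lane-to-lane INS into lane 2 of the result. */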
26128 static bool
26129 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
26131 gcc_assert (d->op_mode != E_VOIDmode);
26133 /* The pattern matching functions above are written to look for a small
26134 number to begin the sequence (0, 1, N/2). If we begin with an index
26135 from the second operand, we can swap the operands. */
26136 poly_int64 nelt = d->perm.length ();
26137 if (known_ge (d->perm[0], nelt))
26139 d->perm.rotate_inputs (1);
26140 std::swap (d->op0, d->op1);
26143 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
26144 || d->vec_flags == VEC_SVE_DATA
26145 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
26146 || d->vec_flags == VEC_SVE_PRED)
26147 && known_gt (nelt, 1))
26149 if (d->vmode == d->op_mode)
26151 if (aarch64_evpc_rev_local (d))
26152 return true;
26153 else if (aarch64_evpc_rev_global (d))
26154 return true;
26155 else if (aarch64_evpc_ext (d))
26156 return true;
26157 else if (aarch64_evpc_dup (d))
26158 return true;
26159 else if (aarch64_evpc_zip (d))
26160 return true;
26161 else if (aarch64_evpc_uzp (d))
26162 return true;
26163 else if (aarch64_evpc_trn (d))
26164 return true;
26165 else if (aarch64_evpc_sel (d))
26166 return true;
26167 else if (aarch64_evpc_ins (d))
26168 return true;
26169 else if (aarch64_evpc_reencode (d))
26170 return true;
26172 if (d->vec_flags == VEC_SVE_DATA)
26173 return aarch64_evpc_sve_tbl (d);
26174 else if (d->vec_flags == VEC_ADVSIMD)
26175 return aarch64_evpc_tbl (d);
26177 else
26179 if (aarch64_evpc_sve_dup (d))
26180 return true;
26183 return false;
26186 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26188 static bool
26189 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
26190 rtx target, rtx op0, rtx op1,
26191 const vec_perm_indices &sel)
26193 struct expand_vec_perm_d d;
26195 /* Check whether the mask can be applied to a single vector. */
26196 if (sel.ninputs () == 1
26197 || (op0 && rtx_equal_p (op0, op1)))
26198 d.one_vector_p = true;
26199 else if (sel.all_from_input_p (0))
26201 d.one_vector_p = true;
26202 op1 = op0;
26204 else if (sel.all_from_input_p (1))
26206 d.one_vector_p = true;
26207 op0 = op1;
26209 else
26210 d.one_vector_p = false;
26212 d.zero_op0_p = op0 == CONST0_RTX (op_mode);
26213 d.zero_op1_p = op1 == CONST0_RTX (op_mode);
26214 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
26215 sel.nelts_per_input ());
26216 d.vmode = vmode;
26217 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
26218 d.op_mode = op_mode;
26219 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
26220 d.target = target;
26221 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
26222 if (op0 == op1)
26223 d.op1 = d.op0;
26224 else
26225 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
26226 d.testing_p = !target;
26228 if (!d.testing_p)
26229 return aarch64_expand_vec_perm_const_1 (&d);
26231 rtx_insn *last = get_last_insn ();
26232 bool ret = aarch64_expand_vec_perm_const_1 (&d);
26233 gcc_assert (last == get_last_insn ());
26235 return ret;
26237 /* Generate a byte permute mask for a register of mode MODE,
26238 which has NUNITS units. */
26240 rtx
26241 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
26243 /* We have to reverse each vector because we don't have
26244 a permuted load that can reverse-load according to ABI rules. */
26245 rtx mask;
26246 rtvec v = rtvec_alloc (16);
26247 unsigned int i, j;
26248 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
26250 gcc_assert (BYTES_BIG_ENDIAN);
26251 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
26253 for (i = 0; i < nunits; i++)
26254 for (j = 0; j < usize; j++)
26255 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
26256 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
26257 return force_reg (V16QImode, mask);
26260 /* Expand an SVE integer comparison using the SVE equivalent of:
26262 (set TARGET (CODE OP0 OP1)). */
26264 void
26265 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
26267 machine_mode pred_mode = GET_MODE (target);
26268 machine_mode data_mode = GET_MODE (op0);
26269 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
26270 op0, op1);
26271 if (!rtx_equal_p (target, res))
26272 emit_move_insn (target, res);
26275 /* Return the UNSPEC_COND_* code for comparison CODE. */
26277 static unsigned int
26278 aarch64_unspec_cond_code (rtx_code code)
26280 switch (code)
26282 case NE:
26283 return UNSPEC_COND_FCMNE;
26284 case EQ:
26285 return UNSPEC_COND_FCMEQ;
26286 case LT:
26287 return UNSPEC_COND_FCMLT;
26288 case GT:
26289 return UNSPEC_COND_FCMGT;
26290 case LE:
26291 return UNSPEC_COND_FCMLE;
26292 case GE:
26293 return UNSPEC_COND_FCMGE;
26294 case UNORDERED:
26295 return UNSPEC_COND_FCMUO;
26296 default:
26297 gcc_unreachable ();
26301 /* Emit:
26303 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26305 where <X> is the operation associated with comparison CODE.
26306 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26308 static void
26309 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
26310 bool known_ptrue_p, rtx op0, rtx op1)
26312 rtx flag = gen_int_mode (known_ptrue_p, SImode);
26313 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
26314 gen_rtvec (4, pred, flag, op0, op1),
26315 aarch64_unspec_cond_code (code));
26316 emit_set_insn (target, unspec);
26319 /* Emit the SVE equivalent of:
26321 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26322 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26323 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26325 where <Xi> is the operation associated with comparison CODEi.
26326 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26328 static void
26329 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
26330 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
26332 machine_mode pred_mode = GET_MODE (pred);
26333 rtx tmp1 = gen_reg_rtx (pred_mode);
26334 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
26335 rtx tmp2 = gen_reg_rtx (pred_mode);
26336 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
26337 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
26340 /* Emit the SVE equivalent of:
26342 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26343 (set TARGET (not TMP))
26345 where <X> is the operation associated with comparison CODE.
26346 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26348 static void
26349 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
26350 bool known_ptrue_p, rtx op0, rtx op1)
26352 machine_mode pred_mode = GET_MODE (pred);
26353 rtx tmp = gen_reg_rtx (pred_mode);
26354 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
26355 aarch64_emit_unop (target, one_cmpl_optab, tmp);
26358 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26360 (set TARGET (CODE OP0 OP1))
26362 If CAN_INVERT_P is true, the caller can also handle inverted results;
26363 return true if the result is in fact inverted. */
26365 bool
26366 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
26367 rtx op0, rtx op1, bool can_invert_p)
26369 machine_mode pred_mode = GET_MODE (target);
26370 machine_mode data_mode = GET_MODE (op0);
26372 rtx ptrue = aarch64_ptrue_reg (pred_mode);
26373 switch (code)
26375 case UNORDERED:
26376 /* UNORDERED has no immediate form. */
26377 op1 = force_reg (data_mode, op1);
26378 /* fall through */
26379 case LT:
26380 case LE:
26381 case GT:
26382 case GE:
26383 case EQ:
26384 case NE:
26386 /* There is native support for the comparison. */
26387 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26388 return false;
26391 case LTGT:
26392 /* This is a trapping operation (LT or GT). */
26393 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
26394 return false;
26396 case UNEQ:
26397 if (!flag_trapping_math)
26399 /* This would trap for signaling NaNs. */
26400 op1 = force_reg (data_mode, op1);
26401 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
26402 ptrue, true, op0, op1);
26403 return false;
26405 /* fall through */
26406 case UNLT:
26407 case UNLE:
26408 case UNGT:
26409 case UNGE:
26410 if (flag_trapping_math)
26412 /* Work out which elements are ordered. */
26413 rtx ordered = gen_reg_rtx (pred_mode);
26414 op1 = force_reg (data_mode, op1);
26415 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
26416 ptrue, true, op0, op1);
26418 /* Test the opposite condition for the ordered elements,
26419 then invert the result. */
26420 if (code == UNEQ)
26421 code = NE;
26422 else
26423 code = reverse_condition_maybe_unordered (code);
26424 if (can_invert_p)
26426 aarch64_emit_sve_fp_cond (target, code,
26427 ordered, false, op0, op1);
26428 return true;
26430 aarch64_emit_sve_invert_fp_cond (target, code,
26431 ordered, false, op0, op1);
26432 return false;
26434 break;
26436 case ORDERED:
26437 /* ORDERED has no immediate form. */
26438 op1 = force_reg (data_mode, op1);
26439 break;
26441 default:
26442 gcc_unreachable ();
26445 /* There is native support for the inverse comparison. */
26446 code = reverse_condition_maybe_unordered (code);
26447 if (can_invert_p)
26449 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26450 return true;
26452 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
26453 return false;
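/* Editorial worked example (not part of the original source): with
   -ftrapping-math, an UNLT comparison is handled by the unordered cases
   above roughly as follows: ORDERED := ~(OP0 unordered OP1) under a PTRUE,
   then OP0 >= OP1 is tested only for the ORDERED elements; the complement
   of that predicate is UNLT (when CAN_INVERT_P, the uncomplemented
   predicate is returned instead and the caller is told the result is
   inverted).  Unordered lanes therefore satisfy UNLT, as required. */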
26456 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26457 of the data being selected and CMP_MODE is the mode of the values being
26458 compared. */
26460 void
26461 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
26462 rtx *ops)
26464 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
26465 rtx pred = gen_reg_rtx (pred_mode);
26466 if (FLOAT_MODE_P (cmp_mode))
26468 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
26469 ops[4], ops[5], true))
26470 std::swap (ops[1], ops[2]);
26472 else
26473 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
26475 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
26476 ops[1] = force_reg (data_mode, ops[1]);
26477 /* The "false" value can only be zero if the "true" value is a constant. */
26478 if (register_operand (ops[1], data_mode)
26479 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
26480 ops[2] = force_reg (data_mode, ops[2]);
26482 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
26483 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
26486 /* Return true if:
26488 (a) MODE1 and MODE2 use the same layout for bytes that are common
26489 to both modes;
26491 (b) subregs involving the two modes behave as the target-independent
26492 subreg rules require; and
26494 (c) there is at least one register that can hold both modes.
26496 Return false otherwise. */
26498 static bool
26499 aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
26501 unsigned int flags1 = aarch64_classify_vector_mode (mode1);
26502 unsigned int flags2 = aarch64_classify_vector_mode (mode2);
26504 bool sve1_p = (flags1 & VEC_ANY_SVE);
26505 bool sve2_p = (flags2 & VEC_ANY_SVE);
26507 bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
26508 bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
26510 bool pred1_p = (flags1 & VEC_SVE_PRED);
26511 bool pred2_p = (flags2 & VEC_SVE_PRED);
26513 bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
26514 | VEC_PARTIAL));
26515 bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
26516 | VEC_PARTIAL));
26518 /* Don't allow changes between predicate modes and other modes.
26519 Only predicate registers can hold predicate modes and only
26520 non-predicate registers can hold non-predicate modes, so any
26521 attempt to mix them would require a round trip through memory. */
26522 if (pred1_p != pred2_p)
26523 return false;
26525 /* The contents of partial SVE modes are distributed evenly across
26526 the register, whereas GCC expects them to be clustered together.
26527 We therefore need to be careful about mode changes involving them. */
26528 if (partial_sve1_p && partial_sve2_p)
26530 /* Reject changes between partial SVE modes that have different
26531 patterns of significant and insignificant bits. */
26532 if ((aarch64_sve_container_bits (mode1)
26533 != aarch64_sve_container_bits (mode2))
26534 || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26535 return false;
26537 else if (partial_sve1_p)
26539 /* The first lane of MODE1 is where GCC expects it, but anything
26540 bigger than that is not. */
26541 if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
26542 return false;
26544 else if (partial_sve2_p)
26546 /* Similarly in reverse. */
26547 if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
26548 return false;
26551 /* Don't allow changes between partial Advanced SIMD structure modes
26552 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26553 are the same size, but the former occupies one Q register while the
26554 latter occupies two D registers. */
26555 if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
26556 && maybe_gt (GET_MODE_SIZE (mode1), 8)
26557 && maybe_gt (GET_MODE_SIZE (mode2), 8))
26558 return false;
26560 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26562 /* Don't allow changes between SVE modes and other modes that might
26563 be bigger than 128 bits. In particular, OImode, CImode and XImode
26564 divide into 128-bit quantities while SVE modes divide into
26565 BITS_PER_SVE_VECTOR quantities. */
26566 if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
26567 return false;
26568 if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
26569 return false;
26572 if (BYTES_BIG_ENDIAN)
26574 /* Don't allow changes between SVE data modes and non-SVE modes.
26575 See the comment at the head of aarch64-sve.md for details. */
26576 if (sve1_p != sve2_p)
26577 return false;
26579 /* Don't allow changes in element size: lane 0 of the new vector
26580 would not then be lane 0 of the old vector. See the comment
26581 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26582 description.
26584 In the worst case, this forces a register to be spilled in
26585 one mode and reloaded in the other, which handles the
26586 endianness correctly. */
26587 if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26588 return false;
26590 return true;
26593 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
26594 to aarch64_modes_compatible_p. However, due to issues with register
26595 allocation it is preferable to avoid tying integer scalar and FP
26596 scalar modes. Executing integer operations in general registers is
26597 better than treating them as scalar vector operations. This reduces
26598 latency and avoids redundant int<->FP moves. So tie modes if they
26599 are either the same class, or one of them is a vector mode. */
26601 static bool
26602 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
26604 if (aarch64_modes_compatible_p (mode1, mode2))
26606 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
26607 return true;
26608 if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
26609 return true;
26611 return false;
26614 /* Return a new RTX holding the result of moving POINTER forward by
26615 AMOUNT bytes. */
26617 static rtx
26618 aarch64_move_pointer (rtx pointer, poly_int64 amount)
26620 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
26622 return adjust_automodify_address (pointer, GET_MODE (pointer),
26623 next, amount);
26626 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
26627 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
26628 rather than memcpy. Return true iff we succeeded. */
26629 bool
26630 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
26632 if (!TARGET_MOPS)
26633 return false;
26635 /* All three registers are changed by the instruction, so each one
26636 must be a fresh pseudo. */
26637 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26638 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
26639 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26640 rtx src_mem = replace_equiv_address (operands[1], src_addr);
26641 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
26642 if (is_memmove)
26643 emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
26644 else
26645 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
26646 return true;
26649 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26650 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26651 if this is a memmove rather than memcpy. Return true if we succeed,
26652 otherwise return false, indicating that a libcall should be emitted. */
26653 bool
26654 aarch64_expand_cpymem (rtx *operands, bool is_memmove)
26656 int mode_bytes;
26657 rtx dst = operands[0];
26658 rtx src = operands[1];
26659 unsigned align = UINTVAL (operands[3]);
26660 rtx base;
26661 machine_mode mode = BLKmode, next_mode;
26663 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26664 if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
26665 return aarch64_expand_cpymem_mops (operands, is_memmove);
26667 unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
26669 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26670 unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
26671 unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
26672 : aarch64_mops_memcpy_size_threshold;
26674 /* Reduce the maximum size with -Os. */
26675 if (optimize_function_for_size_p (cfun))
26676 max_copy_size /= 4;
26678 /* Large copies use MOPS when available or a library call. */
26679 if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
26680 return aarch64_expand_cpymem_mops (operands, is_memmove);
26682 /* Default to 32-byte LDP/STP on large copies; small copies or targets
26683 without SIMD support fall back to 16-byte chunks.
26684 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26685 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26686 whether that would improve performance. */
26687 bool use_qregs = size > 24 && TARGET_SIMD;
26689 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26690 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26692 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
26693 src = adjust_automodify_address (src, VOIDmode, base, 0);
26695 auto_vec<std::pair<rtx, rtx>, 16> ops;
26696 int offset = 0;
26698 while (size > 0)
26700 /* Find the largest mode in which to do the copy without over-reading
26701 or over-writing. */
26702 opt_scalar_int_mode mode_iter;
26703 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26704 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
26705 mode = mode_iter.require ();
26707 gcc_assert (mode != BLKmode);
26709 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
26711 /* Prefer Q-register accesses. */
26712 if (mode_bytes == 16 && use_qregs)
26713 mode = V4SImode;
26715 rtx reg = gen_reg_rtx (mode);
26716 rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
26717 rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
26718 ops.safe_push ({ load, store });
26719 size -= mode_bytes;
26720 offset += mode_bytes;
26722 /* Emit trailing copies using overlapping unaligned accesses
26723 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26724 if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
26726 next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
26727 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26728 gcc_assert (n_bytes <= mode_bytes);
26729 offset -= n_bytes - size;
26730 size = n_bytes;
26734 	  /* Memcpy interleaves loads with stores; memmove emits all loads first.  */
26735 	  int nops = ops.length ();
26736 int inc = is_memmove || nops <= 8 ? nops : 6;
26738 for (int i = 0; i < nops; i += inc)
26740 int m = MIN (nops, i + inc);
26741 /* Emit loads. */
26742 for (int j = i; j < m; j++)
26743 emit_insn (ops[j].first);
26744 /* Emit stores. */
26745 for (int j = i; j < m; j++)
26746 emit_insn (ops[j].second);
26748 return true;
26751 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
26752 as for the setmem pattern. Return true iff we succeed. */
26753 static bool
26754 aarch64_expand_setmem_mops (rtx *operands)
26756 if (!TARGET_MOPS)
26757 return false;
26759 /* The first two registers are changed by the instruction, so both
26760 	     of them must be fresh pseudos.  */
26761 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26762 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26763 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
26764 rtx val = operands[2];
26765 if (val != CONST0_RTX (QImode))
26766 val = force_reg (QImode, val);
26767 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
26768 return true;
26771 /* Expand setmem, as if from a __builtin_memset. Return true if
26772 we succeed, otherwise return false. */
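/* Illustrative example (schematic only; register numbers are arbitrary and
   the actual code generation depends on tuning and the MOPS threshold
   handled below): a constant 35-byte memset of value C is expanded as a
   byte broadcast followed by two 16-byte stores at offsets 0 and 16 and a
   4-byte store at offset 31 that overlaps the second chunk by one byte,
   roughly:

       dup  v0.16b, wC
       str  q0, [x0]
       str  q0, [x0, 16]
       str  s0, [x0, 31]  */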
26774 bool
26775 aarch64_expand_setmem (rtx *operands)
26777 int mode_bytes;
26778 unsigned HOST_WIDE_INT len;
26779 rtx dst = operands[0];
26780 rtx val = operands[2], src;
26781 unsigned align = UINTVAL (operands[3]);
26782 rtx base;
26783 machine_mode mode = BLKmode, next_mode;
26785 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26786 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
26787 || (STRICT_ALIGNMENT && align < 16))
26788 return aarch64_expand_setmem_mops (operands);
26790 /* Set inline limits for memset. MOPS has a separate threshold. */
26791 unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
26792 unsigned mops_threshold = aarch64_mops_memset_size_threshold;
26794 len = UINTVAL (operands[1]);
26796 /* Large memset uses MOPS when available or a library call. */
26797 if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
26798 return aarch64_expand_setmem_mops (operands);
26800 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26801 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26803 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26804 val = expand_vector_broadcast (V16QImode, val);
26805 val = force_reg (V16QImode, val);
26807 int offset = 0;
26808 while (len > 0)
26810 /* Find the largest mode in which to do the copy without
26811 	 over-writing.  */
26812 opt_scalar_int_mode mode_iter;
26813 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26814 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
26815 mode = mode_iter.require ();
26817 gcc_assert (mode != BLKmode);
26819 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
26821 src = val;
26823 /* Prefer Q-register accesses. */
26824 if (mode_bytes == 16)
26825 mode = V16QImode;
26826 else
26827 src = lowpart_subreg (mode, src, GET_MODE (val));
26829 emit_move_insn (adjust_address (dst, mode, offset), src);
26830 len -= mode_bytes;
26831 offset += mode_bytes;
26833 /* Emit trailing writes using overlapping unaligned accesses
26834 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26835 if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
26837 next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
26838 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26839 gcc_assert (n_bytes <= mode_bytes);
26840 offset -= n_bytes - len;
26841 len = n_bytes;
26845 return true;
26849 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
26850 SImode stores. Handle the case when the constant has identical
26851 bottom and top halves. This is beneficial when the two stores can be
26852 merged into an STP and we avoid synthesising potentially expensive
26853 immediates twice. Return true if such a split is possible. */
26855 bool
26856 aarch64_split_dimode_const_store (rtx dst, rtx src)
26858 rtx lo = gen_lowpart (SImode, src);
26859 rtx hi = gen_highpart_mode (SImode, DImode, src);
26861 if (!rtx_equal_p (lo, hi))
26862 return false;
26864 unsigned int orig_cost
26865 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
26866 unsigned int lo_cost
26867 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
26869 /* We want to transform:
26870 MOV x1, 49370
26871 MOVK x1, 0x140, lsl 16
26872 MOVK x1, 0xc0da, lsl 32
26873 MOVK x1, 0x140, lsl 48
26874 STR x1, [x0]
26875 into:
26876 MOV w1, 49370
26877 MOVK w1, 0x140, lsl 16
26878 STP w1, w1, [x0]
26879 So we want to perform this when we save at least one instruction. */
26880 if (orig_cost <= lo_cost)
26881 return false;
26883 rtx mem_lo = adjust_address (dst, SImode, 0);
26884 if (!aarch64_mem_pair_operand (mem_lo, SImode))
26885 return false;
26887 rtx tmp_reg = gen_reg_rtx (SImode);
26888 aarch64_expand_mov_immediate (tmp_reg, lo);
26889 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
26890   /* Don't emit an explicit store pair as this may not always be profitable.
26891 Let the sched-fusion logic decide whether to merge them. */
26892 emit_move_insn (mem_lo, tmp_reg);
26893 emit_move_insn (mem_hi, tmp_reg);
26895 return true;
26898 /* Generate RTL for a conditional branch with rtx comparison CODE in
26899 mode CC_MODE. The destination of the unlikely conditional branch
26900 is LABEL_REF. */
26902 void
26903 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
26904 rtx label_ref)
26906 rtx x;
26907 x = gen_rtx_fmt_ee (code, VOIDmode,
26908 gen_rtx_REG (cc_mode, CC_REGNUM),
26909 const0_rtx);
26911 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
26912 gen_rtx_LABEL_REF (VOIDmode, label_ref),
26913 pc_rtx);
26914 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
26917 /* Generate DImode scratch registers for 128-bit (TImode) addition.
26919    OP1 represents TImode source operand 1
26920    OP2 represents TImode source operand 2
26921 LOW_DEST represents the low half (DImode) of TImode operand 0
26922 LOW_IN1 represents the low half (DImode) of TImode operand 1
26923 LOW_IN2 represents the low half (DImode) of TImode operand 2
26924 HIGH_DEST represents the high half (DImode) of TImode operand 0
26925 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26926 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26928 void
26929 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26930 rtx *low_in1, rtx *low_in2,
26931 rtx *high_dest, rtx *high_in1,
26932 rtx *high_in2)
26934 *low_dest = gen_reg_rtx (DImode);
26935 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
26936 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
26937 *high_dest = gen_reg_rtx (DImode);
26938 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
26939 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
26942 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
26944    OP1 represents TImode source operand 1
26945    OP2 represents TImode source operand 2
26946 LOW_DEST represents the low half (DImode) of TImode operand 0
26947 LOW_IN1 represents the low half (DImode) of TImode operand 1
26948 LOW_IN2 represents the low half (DImode) of TImode operand 2
26949 HIGH_DEST represents the high half (DImode) of TImode operand 0
26950 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26951 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26954 void
26955 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26956 rtx *low_in1, rtx *low_in2,
26957 rtx *high_dest, rtx *high_in1,
26958 rtx *high_in2)
26960 *low_dest = gen_reg_rtx (DImode);
26961 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
26962 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
26963 *high_dest = gen_reg_rtx (DImode);
26965 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
26966 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
26969 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
26971 OP0 represents the TImode destination operand 0
26972 LOW_DEST represents the low half (DImode) of TImode operand 0
26973 LOW_IN1 represents the low half (DImode) of TImode operand 1
26974 LOW_IN2 represents the low half (DImode) of TImode operand 2
26975 HIGH_DEST represents the high half (DImode) of TImode operand 0
26976 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26977 HIGH_IN2 represents the high half (DImode) of TImode operand 2
26978 UNSIGNED_P is true if the operation is being performed on unsigned
26979 values. */
26980 void
26981 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
26982 rtx low_in2, rtx high_dest, rtx high_in1,
26983 rtx high_in2, bool unsigned_p)
26985 if (low_in2 == const0_rtx)
26987 low_dest = low_in1;
26988 high_in2 = force_reg (DImode, high_in2);
26989 if (unsigned_p)
26990 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
26991 else
26992 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
26994 else
26996 if (aarch64_plus_immediate (low_in2, DImode))
26997 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
26998 GEN_INT (-UINTVAL (low_in2))));
26999 else
27001 low_in2 = force_reg (DImode, low_in2);
27002 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
27004 high_in2 = force_reg (DImode, high_in2);
27006 if (unsigned_p)
27007 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
27008 else
27009 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
27012 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
27013 emit_move_insn (gen_highpart (DImode, op0), high_dest);
27017 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
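/* As an illustrative summary (assuming the default ASan shadow scale of 8,
   i.e. ASAN_SHADOW_SHIFT == 3): the sanitizer computes
   shadow = (addr >> 3) + offset, so the offsets returned below give
   + (1 << 36) for LP64 and + (1 << 29) for ILP32.  */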
27019 static unsigned HOST_WIDE_INT
27020 aarch64_asan_shadow_offset (void)
27022 if (TARGET_ILP32)
27023 return (HOST_WIDE_INT_1 << 29);
27024 else
27025 return (HOST_WIDE_INT_1 << 36);
27028 static rtx
27029 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
27030 rtx_code code, tree treeop0, tree treeop1)
27032 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27033 rtx op0, op1;
27034 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27035 insn_code icode;
27036 struct expand_operand ops[4];
27038 start_sequence ();
27039 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27041 op_mode = GET_MODE (op0);
27042 if (op_mode == VOIDmode)
27043 op_mode = GET_MODE (op1);
27045 switch (op_mode)
27047 case E_QImode:
27048 case E_HImode:
27049 case E_SImode:
27050 cmp_mode = SImode;
27051 icode = CODE_FOR_cmpsi;
27052 break;
27054 case E_DImode:
27055 cmp_mode = DImode;
27056 icode = CODE_FOR_cmpdi;
27057 break;
27059 case E_SFmode:
27060 cmp_mode = SFmode;
27061 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
27062 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
27063 break;
27065 case E_DFmode:
27066 cmp_mode = DFmode;
27067 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
27068 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
27069 break;
27071 default:
27072 end_sequence ();
27073 return NULL_RTX;
27076 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
27077 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
27078 if (!op0 || !op1)
27080 end_sequence ();
27081 return NULL_RTX;
27083 *prep_seq = get_insns ();
27084 end_sequence ();
27086 create_fixed_operand (&ops[0], op0);
27087 create_fixed_operand (&ops[1], op1);
27089 start_sequence ();
27090 if (!maybe_expand_insn (icode, 2, ops))
27092 end_sequence ();
27093 return NULL_RTX;
27095 *gen_seq = get_insns ();
27096 end_sequence ();
27098 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
27099 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
27102 static rtx
27103 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27104 rtx_code cmp_code, tree treeop0, tree treeop1,
27105 rtx_code bit_code)
27107 rtx op0, op1, target;
27108 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27109 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27110 insn_code icode;
27111 struct expand_operand ops[6];
27112 int aarch64_cond;
27114 push_to_sequence (*prep_seq);
27115 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27117 op_mode = GET_MODE (op0);
27118 if (op_mode == VOIDmode)
27119 op_mode = GET_MODE (op1);
27121 switch (op_mode)
27123 case E_QImode:
27124 case E_HImode:
27125 case E_SImode:
27126 cmp_mode = SImode;
27127 break;
27129 case E_DImode:
27130 cmp_mode = DImode;
27131 break;
27133 case E_SFmode:
27134 cmp_mode = SFmode;
27135 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27136 break;
27138 case E_DFmode:
27139 cmp_mode = DFmode;
27140 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27141 break;
27143 default:
27144 end_sequence ();
27145 return NULL_RTX;
27148 icode = code_for_ccmp (cc_mode, cmp_mode);
27150 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27151 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27152 if (!op0 || !op1)
27154 end_sequence ();
27155 return NULL_RTX;
27157 *prep_seq = get_insns ();
27158 end_sequence ();
27160 target = gen_rtx_REG (cc_mode, CC_REGNUM);
27161 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
27163 if (bit_code != AND)
27165 /* Treat the ccmp patterns as canonical and use them where possible,
27166 but fall back to ccmp_rev patterns if there's no other option. */
27167 rtx_code prev_code = GET_CODE (prev);
27168 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
27169 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
27170 && !(prev_code == EQ
27171 || prev_code == NE
27172 || prev_code == ORDERED
27173 || prev_code == UNORDERED))
27174 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
27175 else
27177 rtx_code code = reverse_condition (prev_code);
27178 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
27180 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
27183 create_fixed_operand (&ops[0], XEXP (prev, 0));
27184 create_fixed_operand (&ops[1], target);
27185 create_fixed_operand (&ops[2], op0);
27186 create_fixed_operand (&ops[3], op1);
27187 create_fixed_operand (&ops[4], prev);
27188 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
27190 push_to_sequence (*gen_seq);
27191 if (!maybe_expand_insn (icode, 6, ops))
27193 end_sequence ();
27194 return NULL_RTX;
27197 *gen_seq = get_insns ();
27198 end_sequence ();
27200 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
27203 #undef TARGET_GEN_CCMP_FIRST
27204 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27206 #undef TARGET_GEN_CCMP_NEXT
27207 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27209 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27210 instruction fusion of some sort. */
27212 static bool
27213 aarch64_macro_fusion_p (void)
27215 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
27219 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27220 should be kept together during scheduling. */
27222 static bool
27223 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
27225 rtx set_dest;
27226 rtx prev_set = single_set (prev);
27227 rtx curr_set = single_set (curr);
27228 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27229 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
27231 if (!aarch64_macro_fusion_p ())
27232 return false;
27234 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
27236 /* We are trying to match:
27237 prev (mov) == (set (reg r0) (const_int imm16))
27238 curr (movk) == (set (zero_extract (reg r0)
27239 (const_int 16)
27240 (const_int 16))
27241 (const_int imm16_1)) */
27243 set_dest = SET_DEST (curr_set);
27245 if (GET_CODE (set_dest) == ZERO_EXTRACT
27246 && CONST_INT_P (SET_SRC (curr_set))
27247 && CONST_INT_P (SET_SRC (prev_set))
27248 && CONST_INT_P (XEXP (set_dest, 2))
27249 && INTVAL (XEXP (set_dest, 2)) == 16
27250 && REG_P (XEXP (set_dest, 0))
27251 && REG_P (SET_DEST (prev_set))
27252 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
27254 return true;
27258 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
27261 /* We're trying to match:
27262 prev (adrp) == (set (reg r1)
27263 (high (symbol_ref ("SYM"))))
27264 curr (add) == (set (reg r0)
27265 (lo_sum (reg r1)
27266 (symbol_ref ("SYM"))))
27267 Note that r0 need not necessarily be the same as r1, especially
27268 during pre-regalloc scheduling. */
27270 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27271 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27273 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
27274 && REG_P (XEXP (SET_SRC (curr_set), 0))
27275 && REGNO (XEXP (SET_SRC (curr_set), 0))
27276 == REGNO (SET_DEST (prev_set))
27277 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
27278 XEXP (SET_SRC (curr_set), 1)))
27279 return true;
27283 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
27286 /* We're trying to match:
27287 prev (movk) == (set (zero_extract (reg r0)
27288 (const_int 16)
27289 (const_int 32))
27290 (const_int imm16_1))
27291 curr (movk) == (set (zero_extract (reg r0)
27292 (const_int 16)
27293 (const_int 48))
27294 (const_int imm16_2)) */
27296 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
27297 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
27298 && REG_P (XEXP (SET_DEST (prev_set), 0))
27299 && REG_P (XEXP (SET_DEST (curr_set), 0))
27300 && REGNO (XEXP (SET_DEST (prev_set), 0))
27301 == REGNO (XEXP (SET_DEST (curr_set), 0))
27302 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
27303 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
27304 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
27305 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
27306 && CONST_INT_P (SET_SRC (prev_set))
27307 && CONST_INT_P (SET_SRC (curr_set)))
27308 return true;
27311 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
27313 /* We're trying to match:
27314 prev (adrp) == (set (reg r0)
27315 (high (symbol_ref ("SYM"))))
27316 curr (ldr) == (set (reg r1)
27317 (mem (lo_sum (reg r0)
27318 (symbol_ref ("SYM")))))
27320 curr (ldr) == (set (reg r1)
27321 (zero_extend (mem
27322 (lo_sum (reg r0)
27323 (symbol_ref ("SYM")))))) */
27324 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27325 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27327 rtx curr_src = SET_SRC (curr_set);
27329 if (GET_CODE (curr_src) == ZERO_EXTEND)
27330 curr_src = XEXP (curr_src, 0);
27332 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
27333 && REG_P (XEXP (XEXP (curr_src, 0), 0))
27334 && REGNO (XEXP (XEXP (curr_src, 0), 0))
27335 == REGNO (SET_DEST (prev_set))
27336 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
27337 XEXP (SET_SRC (prev_set), 0)))
27338 return true;
27342 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27343 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
27344 && prev_set && curr_set && any_condjump_p (curr)
27345 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27346 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27347 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27348 return true;
27350 /* Fuse flag-setting ALU instructions and conditional branch. */
27351 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
27352 && any_condjump_p (curr))
27354 unsigned int condreg1, condreg2;
27355 rtx cc_reg_1;
27356 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
27357 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
27359 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
27360 && prev
27361 && modified_in_p (cc_reg_1, prev))
27363 enum attr_type prev_type = get_attr_type (prev);
27365 	  /* FIXME: this misses some instructions that are considered simple
27366 	     arithmetic on ThunderX; simple shifts are missed here too.  */
27367 if (prev_type == TYPE_ALUS_SREG
27368 || prev_type == TYPE_ALUS_IMM
27369 || prev_type == TYPE_LOGICS_REG
27370 || prev_type == TYPE_LOGICS_IMM)
27371 return true;
27375 /* Fuse ALU instructions and CBZ/CBNZ. */
27376 if (prev_set
27377 && curr_set
27378 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
27379 && any_condjump_p (curr))
27381 /* We're trying to match:
27382 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27383 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27384 (const_int 0))
27385 (label_ref ("SYM"))
27386 (pc)) */
27387 if (SET_DEST (curr_set) == (pc_rtx)
27388 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27389 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
27390 && REG_P (SET_DEST (prev_set))
27391 && REGNO (SET_DEST (prev_set))
27392 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
27394 /* Fuse ALU operations followed by conditional branch instruction. */
27395 switch (get_attr_type (prev))
27397 case TYPE_ALU_IMM:
27398 case TYPE_ALU_SREG:
27399 case TYPE_ADC_REG:
27400 case TYPE_ADC_IMM:
27401 case TYPE_ADCS_REG:
27402 case TYPE_ADCS_IMM:
27403 case TYPE_LOGIC_REG:
27404 case TYPE_LOGIC_IMM:
27405 case TYPE_CSEL:
27406 case TYPE_ADR:
27407 case TYPE_MOV_IMM:
27408 case TYPE_SHIFT_REG:
27409 case TYPE_SHIFT_IMM:
27410 case TYPE_BFM:
27411 case TYPE_RBIT:
27412 case TYPE_REV:
27413 case TYPE_EXTEND:
27414 return true;
27416 default:;
27421 /* Fuse A+B+1 and A-B-1 */
27422 if (simple_sets_p
27423 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
27425 /* We're trying to match:
27426 prev == (set (r0) (plus (r0) (r1)))
27427 curr == (set (r0) (plus (r0) (const_int 1)))
27429 prev == (set (r0) (minus (r0) (r1)))
27430 curr == (set (r0) (plus (r0) (const_int -1))) */
27432 rtx prev_src = SET_SRC (prev_set);
27433 rtx curr_src = SET_SRC (curr_set);
27435 int polarity = 1;
27436 if (GET_CODE (prev_src) == MINUS)
27437 polarity = -1;
27439 if (GET_CODE (curr_src) == PLUS
27440 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
27441 && CONST_INT_P (XEXP (curr_src, 1))
27442 && INTVAL (XEXP (curr_src, 1)) == polarity
27443 && REG_P (XEXP (curr_src, 0))
27444 && REG_P (SET_DEST (prev_set))
27445 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
27446 return true;
27449 return false;
27452 /* Return true iff the instruction fusion described by OP is enabled. */
27454 bool
27455 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
27457 return (aarch64_tune_params.fusible_ops & op) != 0;
27460 /* If MEM is in the form of [base+offset], extract the two parts of the
27461    address and store them in BASE and OFFSET; otherwise return false
27462    after clearing BASE and OFFSET.  */
27464 bool
27465 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
27467 rtx addr;
27469 gcc_assert (MEM_P (mem));
27471 addr = XEXP (mem, 0);
27473 if (REG_P (addr))
27475 *base = addr;
27476 *offset = const0_rtx;
27477 return true;
27480 if (GET_CODE (addr) == PLUS
27481 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
27483 *base = XEXP (addr, 0);
27484 *offset = XEXP (addr, 1);
27485 return true;
27488 *base = NULL_RTX;
27489 *offset = NULL_RTX;
27491 return false;
27494 /* Types for scheduling fusion. */
27495 enum sched_fusion_type
27497 SCHED_FUSION_NONE = 0,
27498 SCHED_FUSION_LD_SIGN_EXTEND,
27499 SCHED_FUSION_LD_ZERO_EXTEND,
27500 SCHED_FUSION_LD,
27501 SCHED_FUSION_ST,
27502 SCHED_FUSION_NUM
27505 /* If INSN is a load or store whose address is in the form [base+offset],
27506    extract the two parts and store them in BASE and OFFSET.  Return the
27507    scheduling fusion type of this INSN.  */
27509 static enum sched_fusion_type
27510 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
27512 rtx x, dest, src;
27513 enum sched_fusion_type fusion = SCHED_FUSION_LD;
27515 gcc_assert (INSN_P (insn));
27516 x = PATTERN (insn);
27517 if (GET_CODE (x) != SET)
27518 return SCHED_FUSION_NONE;
27520 src = SET_SRC (x);
27521 dest = SET_DEST (x);
27523 machine_mode dest_mode = GET_MODE (dest);
27525 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
27526 return SCHED_FUSION_NONE;
27528 if (GET_CODE (src) == SIGN_EXTEND)
27530 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
27531 src = XEXP (src, 0);
27532 if (!MEM_P (src) || GET_MODE (src) != SImode)
27533 return SCHED_FUSION_NONE;
27535 else if (GET_CODE (src) == ZERO_EXTEND)
27537 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
27538 src = XEXP (src, 0);
27539 if (!MEM_P (src) || GET_MODE (src) != SImode)
27540 return SCHED_FUSION_NONE;
27543 if (MEM_P (src) && REG_P (dest))
27544 extract_base_offset_in_addr (src, base, offset);
27545 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
27547 fusion = SCHED_FUSION_ST;
27548 extract_base_offset_in_addr (dest, base, offset);
27550 else
27551 return SCHED_FUSION_NONE;
27553 if (*base == NULL_RTX || *offset == NULL_RTX)
27554 fusion = SCHED_FUSION_NONE;
27556 return fusion;
27559 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
27561    Currently we only support fusing ldr and str instructions, so FUSION_PRI
27562    and PRI are only calculated for these instructions.  For other instructions,
27563    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
27564    types of instruction fusion can be added by returning different priorities.
27566 It's important that irrelevant instructions get the largest FUSION_PRI. */
27568 static void
27569 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
27570 int *fusion_pri, int *pri)
27572 int tmp, off_val;
27573 rtx base, offset;
27574 enum sched_fusion_type fusion;
27576 gcc_assert (INSN_P (insn));
27578 tmp = max_pri - 1;
27579 fusion = fusion_load_store (insn, &base, &offset);
27580 if (fusion == SCHED_FUSION_NONE)
27582 *pri = tmp;
27583 *fusion_pri = tmp;
27584 return;
27587 /* Set FUSION_PRI according to fusion type and base register. */
27588 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
27590 /* Calculate PRI. */
27591 tmp /= 2;
27593   /* The INSN with the smaller offset goes first.  */
27594 off_val = (int)(INTVAL (offset));
27595 if (off_val >= 0)
27596 tmp -= (off_val & 0xfffff);
27597 else
27598 tmp += ((- off_val) & 0xfffff);
27600 *pri = tmp;
27601 return;
27604 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27605 Adjust priority of sha1h instructions so they are scheduled before
27606 other SHA1 instructions. */
27608 static int
27609 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
27611 rtx x = PATTERN (insn);
27613 if (GET_CODE (x) == SET)
27615 x = SET_SRC (x);
27617 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
27618 return priority + 10;
27621 return priority;
27624 /* If REVERSED is null, return true if memory reference *MEM2 comes
27625 immediately after memory reference *MEM1. Do not change the references
27626 in this case.
27628 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27629 if they are, try to make them use constant offsets from the same base
27630 register. Return true on success. When returning true, set *REVERSED
27631 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27632 static bool
27633 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
27635 if (reversed)
27636 *reversed = false;
27638 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
27639 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
27640 return false;
27642 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
27643 return false;
27645 auto size1 = MEM_SIZE (*mem1);
27646 auto size2 = MEM_SIZE (*mem2);
27648 rtx base1, base2, offset1, offset2;
27649 extract_base_offset_in_addr (*mem1, &base1, &offset1);
27650 extract_base_offset_in_addr (*mem2, &base2, &offset2);
27652 /* Make sure at least one memory is in base+offset form. */
27653 if (!(base1 && offset1) && !(base2 && offset2))
27654 return false;
27656 /* If both mems already use the same base register, just check the
27657 offsets. */
27658 if (base1 && base2 && rtx_equal_p (base1, base2))
27660 if (!offset1 || !offset2)
27661 return false;
27663 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
27664 return true;
27666 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
27668 *reversed = true;
27669 return true;
27672 return false;
27675 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27676 guarantee that the values are consecutive. */
27677 if (MEM_EXPR (*mem1)
27678 && MEM_EXPR (*mem2)
27679 && MEM_OFFSET_KNOWN_P (*mem1)
27680 && MEM_OFFSET_KNOWN_P (*mem2))
27682 poly_int64 expr_offset1;
27683 poly_int64 expr_offset2;
27684 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
27685 &expr_offset1);
27686 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
27687 &expr_offset2);
27688 if (!expr_base1
27689 || !expr_base2
27690 || !DECL_P (expr_base1)
27691 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
27692 return false;
27694 expr_offset1 += MEM_OFFSET (*mem1);
27695 expr_offset2 += MEM_OFFSET (*mem2);
27697 if (known_eq (expr_offset1 + size1, expr_offset2))
27699 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
27700 *reversed = true;
27701 else
27702 return false;
27704 if (reversed)
27706 if (base2)
27708 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
27709 expr_offset1 - expr_offset2);
27710 *mem1 = replace_equiv_address_nv (*mem1, addr1);
27712 else
27714 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
27715 expr_offset2 - expr_offset1);
27716 *mem2 = replace_equiv_address_nv (*mem2, addr2);
27719 return true;
27722 return false;
27725 /* Test if MODE is suitable for a single transfer register in an ldp or stp
27726 instruction. */
27728 bool
27729 aarch64_ldpstp_operand_mode_p (machine_mode mode)
27731 if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
27732 || hard_regno_nregs (V0_REGNUM, mode) > 1)
27733 return false;
27735 const auto size = GET_MODE_SIZE (mode);
27736 return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
27739 /* Return true if MEM1 and MEM2 can be combined into a single access
27740 of mode MODE, with the combined access having the same address as MEM1. */
27742 bool
27743 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
27745 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
27746 return false;
27747 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
27750 /* Return true if MEM agrees with the ldp-stp policy model.
27751 Otherwise, false. */
27753 bool
27754 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
27756 auto policy = (load
27757 ? aarch64_tune_params.ldp_policy_model
27758 : aarch64_tune_params.stp_policy_model);
27760 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
27761 if (policy == AARCH64_LDP_STP_POLICY_NEVER)
27762 return false;
27764 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
27765 do not emit the load pair unless the alignment is checked to be
27766 at least double the alignment of the type. */
27767 if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
27768 && !optimize_function_for_size_p (cfun)
27769 && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
27770 return false;
27772 return true;
27775 /* Given OPERANDS of consecutive load/store, check if we can merge
27776 them into ldp/stp. LOAD is true if they are load instructions. */
27778 bool
27779 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load)
27781 enum reg_class rclass_1, rclass_2;
27782 rtx mem_1, mem_2, reg_1, reg_2;
27784 if (load)
27786 mem_1 = operands[1];
27787 mem_2 = operands[3];
27788 reg_1 = operands[0];
27789 reg_2 = operands[2];
27790 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
27791 if (REGNO (reg_1) == REGNO (reg_2))
27792 return false;
27793 if (reg_overlap_mentioned_p (reg_1, mem_2))
27794 return false;
27796 else
27798 mem_1 = operands[0];
27799 mem_2 = operands[2];
27800 reg_1 = operands[1];
27801 reg_2 = operands[3];
27804 /* The mems cannot be volatile. */
27805 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
27806 return false;
27808 /* Check if the addresses are in the form of [base+offset]. */
27809 bool reversed = false;
27810 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
27811 return false;
27813 /* The operands must be of the same size. */
27814 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
27815 GET_MODE_SIZE (GET_MODE (mem_2))));
27817 /* The lower memory access must be a mem-pair operand. */
27818 rtx lower_mem = reversed ? mem_2 : mem_1;
27819 machine_mode lower_mem_mode = GET_MODE (lower_mem);
27820 if (!aarch64_mem_pair_operand (lower_mem, lower_mem_mode))
27821 return false;
27823 /* Check if lower_mem is ok with the ldp-stp policy model. */
27824 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem, load,
27825 lower_mem_mode))
27826 return false;
27828 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
27829 rclass_1 = FP_REGS;
27830 else
27831 rclass_1 = GENERAL_REGS;
27833 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
27834 rclass_2 = FP_REGS;
27835 else
27836 rclass_2 = GENERAL_REGS;
27838   /* Check if the registers are of the same class.  */
27839 if (rclass_1 != rclass_2)
27840 return false;
27842 return true;
27845 /* Given OPERANDS of consecutive load/store that can be merged,
27846 swap them if they are not in ascending order. */
27847 void
27848 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
27850 int mem_op = load ? 1 : 0;
27851 bool reversed = false;
27852 if (!aarch64_check_consecutive_mems (operands + mem_op,
27853 operands + mem_op + 2, &reversed))
27854 gcc_unreachable ();
27856 if (reversed)
27858 /* Irrespective of whether this is a load or a store,
27859 we do the same swap. */
27860 std::swap (operands[0], operands[2]);
27861 std::swap (operands[1], operands[3]);
27865 /* Helper function used for generation of load/store pair instructions, called
27866 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
27867 operands as matched by the peepholes in that file. LOAD_P is true if we're
27868 generating a load pair, otherwise we're generating a store pair. CODE is
27869 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
27870 standard load/store pair. */
27872 void
27873 aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
27875 aarch64_swap_ldrstr_operands (operands, load_p);
27877 if (load_p)
27878 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
27879 operands[1], code));
27880 else
27882 gcc_assert (code == UNKNOWN);
27883 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
27884 operands[3]));
27888 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
27889 comparison between the two. */
27891 aarch64_host_wide_int_compare (const void *x, const void *y)
27893 return wi::cmps (* ((const HOST_WIDE_INT *) x),
27894 * ((const HOST_WIDE_INT *) y));
27897 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
27898 other pointing to a REG rtx containing an offset, compare the offsets
27899 of the two pairs.
27901 Return:
27903 1 iff offset (X) > offset (Y)
27904 0 iff offset (X) == offset (Y)
27905 -1 iff offset (X) < offset (Y) */
27907 aarch64_ldrstr_offset_compare (const void *x, const void *y)
27909 const rtx * operands_1 = (const rtx *) x;
27910 const rtx * operands_2 = (const rtx *) y;
27911 rtx mem_1, mem_2, base, offset_1, offset_2;
27913 if (MEM_P (operands_1[0]))
27914 mem_1 = operands_1[0];
27915 else
27916 mem_1 = operands_1[1];
27918 if (MEM_P (operands_2[0]))
27919 mem_2 = operands_2[0];
27920 else
27921 mem_2 = operands_2[1];
27923 /* Extract the offsets. */
27924 extract_base_offset_in_addr (mem_1, &base, &offset_1);
27925 extract_base_offset_in_addr (mem_2, &base, &offset_2);
27927 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
27929 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
27932 /* Given OPERANDS of consecutive load/store, check if we can merge
27933 them into ldp/stp by adjusting the offset. LOAD is true if they
27934 are load instructions. MODE is the mode of memory operands.
27936 Given below consecutive stores:
27938 str w1, [xb, 0x100]
27939 str w1, [xb, 0x104]
27940 str w1, [xb, 0x108]
27941 str w1, [xb, 0x10c]
27943 Though the offsets are out of the range supported by stp, we can
27944 still pair them after adjusting the offset, like:
27946 add scratch, xb, 0x100
27947 stp w1, w1, [scratch]
27948 stp w1, w1, [scratch, 0x8]
27950 The peephole patterns detecting this opportunity should guarantee
27951    the scratch register is available.  */
27953 bool
27954 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
27955 machine_mode mode)
27957 const int num_insns = 4;
27958 enum reg_class rclass;
27959 HOST_WIDE_INT offvals[num_insns], msize;
27960 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
27962 if (load)
27964 for (int i = 0; i < num_insns; i++)
27966 reg[i] = operands[2 * i];
27967 mem[i] = operands[2 * i + 1];
27969 gcc_assert (REG_P (reg[i]));
27972 /* Do not attempt to merge the loads if the loads clobber each other. */
27973 for (int i = 0; i < 8; i += 2)
27974 for (int j = i + 2; j < 8; j += 2)
27975 if (reg_overlap_mentioned_p (operands[i], operands[j]))
27976 return false;
27978 else
27979 for (int i = 0; i < num_insns; i++)
27981 mem[i] = operands[2 * i];
27982 reg[i] = operands[2 * i + 1];
27985 /* Skip if memory operand is by itself valid for ldp/stp. */
27986 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
27987 return false;
27989 for (int i = 0; i < num_insns; i++)
27991 /* The mems cannot be volatile. */
27992 if (MEM_VOLATILE_P (mem[i]))
27993 return false;
27995 /* Check if the addresses are in the form of [base+offset]. */
27996 extract_base_offset_in_addr (mem[i], base + i, offset + i);
27997 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
27998 return false;
28001   /* Check if the registers are of the same class.  */
28002 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
28003 ? FP_REGS : GENERAL_REGS;
28005 for (int i = 1; i < num_insns; i++)
28006 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
28008 if (rclass != FP_REGS)
28009 return false;
28011 else
28013 if (rclass != GENERAL_REGS)
28014 return false;
28017 /* Only the last register in the order in which they occur
28018 may be clobbered by the load. */
28019 if (rclass == GENERAL_REGS && load)
28020 for (int i = 0; i < num_insns - 1; i++)
28021 if (reg_mentioned_p (reg[i], mem[i]))
28022 return false;
28024   /* Check if the bases are the same.  */
28025 for (int i = 0; i < num_insns - 1; i++)
28026 if (!rtx_equal_p (base[i], base[i + 1]))
28027 return false;
28029 for (int i = 0; i < num_insns; i++)
28030 offvals[i] = INTVAL (offset[i]);
28032 msize = GET_MODE_SIZE (mode).to_constant ();
28034 /* Check if the offsets can be put in the right order to do a ldp/stp. */
28035 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
28036 aarch64_host_wide_int_compare);
28038 if (!(offvals[1] == offvals[0] + msize
28039 && offvals[3] == offvals[2] + msize))
28040 return false;
28042 /* Check that offsets are within range of each other. The ldp/stp
28043      instructions have 7-bit immediate offsets, so use 0x80.  */
28044 if (offvals[2] - offvals[0] >= msize * 0x80)
28045 return false;
28047 /* The offsets must be aligned with respect to each other. */
28048 if (offvals[0] % msize != offvals[2] % msize)
28049 return false;
28051 /* Check if mem[0] is ok with the ldp-stp policy model. */
28052 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem[0], load, mode))
28053 return false;
28055 return true;
28058 /* Given OPERANDS of consecutive load/store, this function pairs them
28059 into LDP/STP after adjusting the offset. It depends on the fact
28060 that the operands can be sorted so the offsets are correct for STP.
28061 MODE is the mode of memory operands. CODE is the rtl operator
28062    which should be applied to all memory operands; it is SIGN_EXTEND,
28063 ZERO_EXTEND or UNKNOWN. */
28065 bool
28066 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
28067 machine_mode mode, RTX_CODE code)
28069 rtx base, offset_1, offset_2;
28070 rtx mem_1, mem_2;
28071 rtx temp_operands[8];
28072 HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
28073 stp_off_upper_limit, stp_off_lower_limit, msize;
28075 /* We make changes on a copy as we may still bail out. */
28076 for (int i = 0; i < 8; i ++)
28077 temp_operands[i] = operands[i];
28079 /* Sort the operands. Note for cases as below:
28080 [base + 0x310] = A
28081 [base + 0x320] = B
28082 [base + 0x330] = C
28083 [base + 0x320] = D
28084      We need a stable sort, otherwise wrong data may be stored to offset 0x320.
28085      Also note that the dead store in the above case should be optimized away,
28086      but there are no guarantees here.  */
28087   gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
28088 aarch64_ldrstr_offset_compare);
28090 /* Copy the memory operands so that if we have to bail for some
28091 reason the original addresses are unchanged. */
28092 if (load)
28094 mem_1 = copy_rtx (temp_operands[1]);
28095 mem_2 = copy_rtx (temp_operands[5]);
28097 else
28099 mem_1 = copy_rtx (temp_operands[0]);
28100 mem_2 = copy_rtx (temp_operands[4]);
28101 gcc_assert (code == UNKNOWN);
28104 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28105 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28106 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
28107 && offset_2 != NULL_RTX);
28109 /* Adjust offset so it can fit in LDP/STP instruction. */
28110   msize = GET_MODE_SIZE (mode).to_constant ();
28111 stp_off_upper_limit = msize * (0x40 - 1);
28112 stp_off_lower_limit = - msize * 0x40;
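  /* For example, with 4-byte accesses the limits computed above give the
     LDP/STP signed immediate range [-256, 252], i.e. the 7-bit signed
     immediate scaled by the access size (worked numbers for illustration).  */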
28114 off_val_1 = INTVAL (offset_1);
28115 off_val_2 = INTVAL (offset_2);
28117   /* The base offset is optimally halfway between the two STP/LDP offsets.  */
28118 if (msize <= 4)
28119 base_off = (off_val_1 + off_val_2) / 2;
28120 else
28121     /* However, due to issues with negative LDP/STP offset generation for
28122        larger modes (DF, DD, DI and vector modes), we must not use addresses
28123        more negative than what 9 signed unadjusted bits can store.  This
28124        provides the most range in this case.  */
28125 base_off = off_val_1;
28127 /* Adjust the base so that it is aligned with the addresses but still
28128 optimal. */
28129 if (base_off % msize != off_val_1 % msize)
28130 /* Fix the offset, bearing in mind we want to make it bigger not
28131 smaller. */
28132 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28133 else if (msize <= 4)
28134 /* The negative range of LDP/STP is one larger than the positive range. */
28135 base_off += msize;
28137 /* Check if base offset is too big or too small. We can attempt to resolve
28138 this issue by setting it to the maximum value and seeing if the offsets
28139 still fit. */
28140 if (base_off >= 0x1000)
28142 base_off = 0x1000 - 1;
28143 /* We must still make sure that the base offset is aligned with respect
28144 to the address. But it may not be made any bigger. */
28145 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28148 /* Likewise for the case where the base is too small. */
28149 if (base_off <= -0x1000)
28151 base_off = -0x1000 + 1;
28152 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28155 /* Offset of the first STP/LDP. */
28156 new_off_1 = off_val_1 - base_off;
28158 /* Offset of the second STP/LDP. */
28159 new_off_2 = off_val_2 - base_off;
28161 /* The offsets must be within the range of the LDP/STP instructions. */
28162 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
28163 || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
28164 return false;
28166 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
28167 new_off_1), true);
28168 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
28169 new_off_2), true);
28171 if (!aarch64_mem_pair_operand (mem_1, mode)
28172 || !aarch64_mem_pair_operand (mem_2, mode))
28173 return false;
28175 if (load)
28177 operands[0] = temp_operands[0];
28178 operands[1] = mem_1;
28179 operands[2] = temp_operands[2];
28180 operands[4] = temp_operands[4];
28181 operands[5] = mem_2;
28182 operands[6] = temp_operands[6];
28184 else
28186 operands[0] = mem_1;
28187 operands[1] = temp_operands[1];
28188 operands[3] = temp_operands[3];
28189 operands[4] = mem_2;
28190 operands[5] = temp_operands[5];
28191 operands[7] = temp_operands[7];
28194 /* Emit adjusting instruction. */
28195 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
28196 /* Emit ldp/stp instructions. */
28197 if (load)
28199 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28200 operands[1], code));
28201 emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
28202 operands[5], code));
28204 else
28206 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28207 operands[3]));
28208 emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
28209 operands[7]));
28211 return true;
28214 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28215 it isn't worth branching around empty masked ops (including masked
28216 stores). */
28218 static bool
28219 aarch64_empty_mask_is_expensive (unsigned)
28221 return false;
28224 /* Return 1 if pseudo register should be created and used to hold
28225 GOT address for PIC code. */
28227 bool
28228 aarch64_use_pseudo_pic_reg (void)
28230 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
28233 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28235 static int
28236 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
28238 switch (XINT (x, 1))
28240 case UNSPEC_GOTSMALLPIC:
28241 case UNSPEC_GOTSMALLPIC28K:
28242 case UNSPEC_GOTTINYPIC:
28243 return 0;
28244 default:
28245 break;
28248 return default_unspec_may_trap_p (x, flags);
28252 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28253 return the log2 of that value. Otherwise return -1. */
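/* For example (illustrative worked cases): 4.0 yields 2 and 1.0 yields 0,
   while 3.0, -2.0, NaN and infinity all yield -1.  */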
28256 aarch64_fpconst_pow_of_2 (rtx x)
28258 const REAL_VALUE_TYPE *r;
28260 if (!CONST_DOUBLE_P (x))
28261 return -1;
28263 r = CONST_DOUBLE_REAL_VALUE (x);
28265 if (REAL_VALUE_NEGATIVE (*r)
28266 || REAL_VALUE_ISNAN (*r)
28267 || REAL_VALUE_ISINF (*r)
28268 || !real_isinteger (r, DFmode))
28269 return -1;
28271 return exact_log2 (real_to_integer (r));
28274 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28275    power of 2 (i.e. 1/2^n), return the number of float bits; e.g. for
28276    x == 1/2^n return n.  Otherwise return -1.  */
28279 aarch64_fpconst_pow2_recip (rtx x)
28281 REAL_VALUE_TYPE r0;
28283 if (!CONST_DOUBLE_P (x))
28284 return -1;
28286 r0 = *CONST_DOUBLE_REAL_VALUE (x);
28287 if (exact_real_inverse (DFmode, &r0)
28288 && !REAL_VALUE_NEGATIVE (r0))
28290 int ret = exact_log2 (real_to_integer (&r0));
28291 if (ret >= 1 && ret <= 32)
28292 return ret;
28294 return -1;
28297 /* If X is a vector of equal CONST_DOUBLE values and that value is
28298 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28301 aarch64_vec_fpconst_pow_of_2 (rtx x)
28303 int nelts;
28304 if (!CONST_VECTOR_P (x)
28305 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
28306 return -1;
28308 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
28309 return -1;
28311 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
28312 if (firstval <= 0)
28313 return -1;
28315 for (int i = 1; i < nelts; i++)
28316 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
28317 return -1;
28319 return firstval;
28322 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28323 to float.
28325 __fp16 always promotes through this hook.
28326 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28327 through the generic excess precision logic rather than here. */
28329 static tree
28330 aarch64_promoted_type (const_tree t)
28332 if (SCALAR_FLOAT_TYPE_P (t)
28333 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
28334 return float_type_node;
28336 return NULL_TREE;
28339 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28341 static bool
28342 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
28343 optimization_type opt_type)
28345 switch (op)
28347 case rsqrt_optab:
28348 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
28350 default:
28351 return true;
28355 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28357 static unsigned int
28358 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
28359 int *offset)
28361 /* Polynomial invariant 1 == (VG / 2) - 1. */
28362 gcc_assert (i == 1);
28363 *factor = 2;
28364 *offset = 1;
28365 return AARCH64_DWARF_VG;
28368 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28369 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28371 static bool
28372 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
28374 return ((mode == HFmode || mode == BFmode)
28375 ? true
28376 : default_libgcc_floating_mode_supported_p (mode));
28379 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28380 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28382 static bool
28383 aarch64_scalar_mode_supported_p (scalar_mode mode)
28385 if (DECIMAL_FLOAT_MODE_P (mode))
28386 return default_decimal_float_supported_p ();
28388 return ((mode == HFmode || mode == BFmode)
28389 ? true
28390 : default_scalar_mode_supported_p (mode));
28393 /* Set the value of FLT_EVAL_METHOD.
28394 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28396 0: evaluate all operations and constants, whose semantic type has at
28397 most the range and precision of type float, to the range and
28398 precision of float; evaluate all other operations and constants to
28399 the range and precision of the semantic type;
28401 N, where _FloatN is a supported interchange floating type
28402 evaluate all operations and constants, whose semantic type has at
28403 most the range and precision of _FloatN type, to the range and
28404 precision of the _FloatN type; evaluate all other operations and
28405 constants to the range and precision of the semantic type;
28407 If we have the ARMv8.2-A extensions then we support _Float16 in native
28408 precision, so we should set this to 16. Otherwise, we support the type,
28409 but want to evaluate expressions in float precision, so set this to
28410 0. */
28412 static enum flt_eval_method
28413 aarch64_excess_precision (enum excess_precision_type type)
28415 switch (type)
28417 case EXCESS_PRECISION_TYPE_FAST:
28418 case EXCESS_PRECISION_TYPE_STANDARD:
28419 /* We can calculate either in 16-bit range and precision or
28420 32-bit range and precision. Make that decision based on whether
28421 we have native support for the ARMv8.2-A 16-bit floating-point
28422 instructions or not. */
28423 return (TARGET_FP_F16INST
28424 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28425 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
28426 case EXCESS_PRECISION_TYPE_IMPLICIT:
28427 case EXCESS_PRECISION_TYPE_FLOAT16:
28428 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
28429 default:
28430 gcc_unreachable ();
28432 return FLT_EVAL_METHOD_UNPREDICTABLE;
28435 /* Implement TARGET_C_BITINT_TYPE_INFO.
28436 Return true if _BitInt(N) is supported and fill its details into *INFO. */
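/* For example (illustrative), on little-endian AArch64 _BitInt(200) gets
   limb_mode == DImode internally but abi_limb_mode == TImode, so for ABI
   purposes it is treated as unsigned __int128[2] per the AAPCS64 rule
   described below.  */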
28437 bool
28438 aarch64_bitint_type_info (int n, struct bitint_info *info)
28440 if (TARGET_BIG_END)
28441 return false;
28443 if (n <= 8)
28444 info->limb_mode = QImode;
28445 else if (n <= 16)
28446 info->limb_mode = HImode;
28447 else if (n <= 32)
28448 info->limb_mode = SImode;
28449 else if (n <= 64)
28450 info->limb_mode = DImode;
28451 else if (n <= 128)
28452 info->limb_mode = TImode;
28453 else
28454 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28455 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28456      able to use libgcc's implementation to support large _BitInts, we need
28457 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28458 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28459 be TImode to ensure we are ABI compliant. */
28460 info->limb_mode = DImode;
28462 if (n > 128)
28463 info->abi_limb_mode = TImode;
28464 else
28465 info->abi_limb_mode = info->limb_mode;
28466 info->big_endian = TARGET_BIG_END;
28467 info->extended = false;
28468 return true;
28471 /* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. Return TFmode for
28472    TI_LONG_DOUBLE_TYPE, which is the long double type, and go with the
28473    default for the others.  */
28475 static machine_mode
28476 aarch64_c_mode_for_floating_type (enum tree_index ti)
28478 if (ti == TI_LONG_DOUBLE_TYPE)
28479 return TFmode;
28480 return default_mode_for_floating_type (ti);
28483 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28484 scheduled for speculative execution. Reject the long-running division
28485 and square-root instructions. */
28487 static bool
28488 aarch64_sched_can_speculate_insn (rtx_insn *insn)
28490 switch (get_attr_type (insn))
28492 case TYPE_SDIV:
28493 case TYPE_UDIV:
28494 case TYPE_FDIVS:
28495 case TYPE_FDIVD:
28496 case TYPE_FSQRTS:
28497 case TYPE_FSQRTD:
28498 case TYPE_NEON_FP_SQRT_S:
28499 case TYPE_NEON_FP_SQRT_D:
28500 case TYPE_NEON_FP_SQRT_S_Q:
28501 case TYPE_NEON_FP_SQRT_D_Q:
28502 case TYPE_NEON_FP_DIV_S:
28503 case TYPE_NEON_FP_DIV_D:
28504 case TYPE_NEON_FP_DIV_S_Q:
28505 case TYPE_NEON_FP_DIV_D_Q:
28506 return false;
28507 default:
28508 return true;
28512 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28514 static int
28515 aarch64_compute_pressure_classes (reg_class *classes)
28517 int i = 0;
28518 classes[i++] = GENERAL_REGS;
28519 classes[i++] = FP_REGS;
28520 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28521 registers need to go in PR_LO_REGS at some point during their
28522 lifetime. Splitting it into two halves has the effect of making
28523 all predicates count against PR_LO_REGS, so that we try whenever
28524 possible to restrict the number of live predicates to 8. This
28525 greatly reduces the amount of spilling in certain loops. */
28526 classes[i++] = PR_LO_REGS;
28527 classes[i++] = PR_HI_REGS;
28528 return i;
28531 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28533 static bool
28534 aarch64_can_change_mode_class (machine_mode from,
28535 machine_mode to, reg_class_t)
28537 return aarch64_modes_compatible_p (from, to);
28540 /* Implement TARGET_EARLY_REMAT_MODES. */
28542 static void
28543 aarch64_select_early_remat_modes (sbitmap modes)
28545 /* SVE values are not normally live across a call, so it should be
28546 worth doing early rematerialization even in VL-specific mode. */
28547 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
28548 if (aarch64_sve_mode_p ((machine_mode) i))
28549 bitmap_set_bit (modes, i);
28552 /* Override the default target speculation_safe_value. */
28553 static rtx
28554 aarch64_speculation_safe_value (machine_mode mode,
28555 rtx result, rtx val, rtx failval)
28557 /* Maybe we should warn if falling back to hard barriers. They are
28558      likely to be noticeably more expensive than the alternative below.  */
28559 if (!aarch64_track_speculation)
28560 return default_speculation_safe_value (mode, result, val, failval);
28562 if (!REG_P (val))
28563 val = copy_to_mode_reg (mode, val);
28565 if (!aarch64_reg_or_zero (failval, mode))
28566 failval = copy_to_mode_reg (mode, failval);
28568 emit_insn (gen_despeculate_copy (mode, result, val, failval));
28569 return result;
28572 /* Implement TARGET_ESTIMATED_POLY_VALUE.
28573 Look into the tuning structure for an estimate.
28574 KIND specifies the type of requested estimate: min, max or likely.
28575 For cores with a known SVE width all three estimates are the same.
28576 For generic SVE tuning we want to distinguish the maximum estimate from
28577 the minimum and likely ones.
28578 The likely estimate is the same as the minimum in that case to give a
28579 conservative behavior of auto-vectorizing with SVE when it is a win
28580 even for 128-bit SVE.
28581 When SVE width information is available VAL.coeffs[1] is multiplied by
28582 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
28584 static HOST_WIDE_INT
28585 aarch64_estimated_poly_value (poly_int64 val,
28586 poly_value_estimate_kind kind
28587 = POLY_VALUE_LIKELY)
28589 unsigned int width_source = aarch64_tune_params.sve_width;
28591 /* If there is no core-specific information then the minimum and likely
28592 values are based on 128-bit vectors and the maximum is based on
28593 the architectural maximum of 2048 bits. */
28594 if (width_source == SVE_SCALABLE)
28595 switch (kind)
28597 case POLY_VALUE_MIN:
28598 case POLY_VALUE_LIKELY:
28599 return val.coeffs[0];
28600 case POLY_VALUE_MAX:
28601 return val.coeffs[0] + val.coeffs[1] * 15;
28604 /* Allow sve_width to be a bitmask of different VL, treating the lowest
28605 as likely. This could be made more general if future -mtune options
28606 need it to be. */
28607 if (kind == POLY_VALUE_MAX)
28608 width_source = 1 << floor_log2 (width_source);
28609 else
28610 width_source = least_bit_hwi (width_source);
28612 /* If the core provides width information, use that. */
28613 HOST_WIDE_INT over_128 = width_source - 128;
28614 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
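/* For example, for a poly_int64 of 2 + 2x (coeffs[0] == 2, coeffs[1] == 2):
   - generic SVE tuning (SVE_SCALABLE): the min/likely estimate is 2 and the
     max estimate is 2 + 2 * 15 == 32;
   - a core tuned for 256-bit SVE: all three estimates are
     2 + 2 * (256 - 128) / 128 == 4.  */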
28618 /* Return true for types that could be supported as SIMD return or
28619 argument types. */
28621 static bool
28622 supported_simd_type (tree t)
28624 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
28626 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
28627 return s == 1 || s == 2 || s == 4 || s == 8;
28629 return false;
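/* For example, int, double and void * qualify (1, 2, 4 or 8 bytes), whereas
   __int128 and long double (16 bytes each) and aggregate types do not.  */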
28632 /* Determine the lane size for the clone argument/return type. This follows
28633 the LS(P) rule in the VFABIA64. */
28635 static unsigned
28636 lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
28638 gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
28640 /* For non map-to-vector arguments that are pointers we use the type that
28641 the pointer points to. */
28642 if (POINTER_TYPE_P (type))
28643 switch (clone_arg_type)
28645 default:
28646 break;
28647 case SIMD_CLONE_ARG_TYPE_UNIFORM:
28648 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
28649 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
28650 type = TREE_TYPE (type);
28651 break;
28654 /* For types (or, for non map-to-vector pointers, the types they point to)
28655 that are integers or floating point, we use their size if it is 1, 2, 4 or 8 bytes. */
28657 if (INTEGRAL_TYPE_P (type)
28658 || SCALAR_FLOAT_TYPE_P (type))
28659 switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
28661 default:
28662 break;
28663 case 1:
28664 case 2:
28665 case 4:
28666 case 8:
28667 return TYPE_PRECISION (type);
28669 /* For any other type we use the size of uintptr_t. For map-to-vector types
28670 that are pointers, using the size of uintptr_t is the same as using the
28671 size of their own type, since all pointers are the same size as uintptr_t. */
28672 return POINTER_SIZE;
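/* As an illustration: a vector double argument gives a lane size of 64 bits;
   a uniform or linear int * argument uses the pointed-to int and gives
   32 bits; a vector (map-to-vector) pointer argument falls through to
   POINTER_SIZE.  */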
28676 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28678 static int
28679 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
28680 struct cgraph_simd_clone *clonei,
28681 tree base_type ATTRIBUTE_UNUSED,
28682 int num, bool explicit_p)
28684 tree t, ret_type;
28685 unsigned int nds_elt_bits;
28686 unsigned HOST_WIDE_INT const_simdlen;
28688 if (!TARGET_SIMD)
28689 return 0;
28691 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
28692 constant simdlens here. */
28693 if (maybe_ne (clonei->simdlen, 0U)
28694 && clonei->simdlen.is_constant (&const_simdlen)
28695 && (const_simdlen < 2
28696 || const_simdlen > 1024
28697 || (const_simdlen & (const_simdlen - 1)) != 0))
28699 if (explicit_p)
28700 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28701 "unsupported simdlen %wd", const_simdlen);
28702 return 0;
28705 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
28706 /* According to AArch64's Vector ABI the type that determines the simdlen is
28707 the narrowest of the return and argument types, so we ignore base_type for AArch64. */
28708 if (TREE_CODE (ret_type) != VOID_TYPE
28709 && !supported_simd_type (ret_type))
28711 if (!explicit_p)
28713 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
28714 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28715 "GCC does not currently support return type %qT "
28716 "for simd", ret_type);
28717 else
28718 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28719 "unsupported return type %qT for simd",
28720 ret_type);
28721 return 0;
28724 auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
28726 /* We are looking for the NDS type here according to the VFABIA64. */
28727 if (TREE_CODE (ret_type) != VOID_TYPE)
28729 nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
28730 vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
28732 else
28733 nds_elt_bits = POINTER_SIZE;
28735 int i;
28736 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
28737 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
28738 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
28739 t && t != void_list_node; t = TREE_CHAIN (t), i++)
28741 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
28742 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
28743 && !supported_simd_type (arg_type))
28745 if (!explicit_p)
28747 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
28748 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28749 "GCC does not currently support argument type %qT "
28750 "for simd", arg_type);
28751 else
28752 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28753 "unsupported argument type %qT for simd",
28754 arg_type);
28755 return 0;
28757 unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
28758 if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
28759 vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
28760 if (nds_elt_bits > lane_bits)
28761 nds_elt_bits = lane_bits;
28764 clonei->vecsize_mangle = 'n';
28765 clonei->mask_mode = VOIDmode;
28766 poly_uint64 simdlen;
28767 auto_vec<poly_uint64> simdlens (2);
28768 /* Keep track of the possible simdlens the clones of this function can have,
28769 and check them later to see if we support them. */
28770 if (known_eq (clonei->simdlen, 0U))
28772 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28773 if (maybe_ne (simdlen, 1U))
28774 simdlens.safe_push (simdlen);
28775 simdlens.safe_push (simdlen * 2);
28777 else
28778 simdlens.safe_push (clonei->simdlen);
28780 clonei->vecsize_int = 0;
28781 clonei->vecsize_float = 0;
28783 /* We currently do not support generating simdclones where vector arguments
28784 do not fit into a single vector register, i.e. vector types that are more
28785 than 128 bits wide. This is because of how we currently represent such
28786 types in ACLE, where we use a struct to allow us to pass them as arguments
28787 and return values.
28788 Hence we have to check whether the simdlens available for this
28789 simdclone would cause a vector type to be wider than 128 bits, and reject
28790 such a clone. */
28791 unsigned j = 0;
28792 while (j < simdlens.length ())
28794 bool remove_simdlen = false;
28795 for (auto elt : vec_elts)
28796 if (known_gt (simdlens[j] * elt.second, 128U))
28798 /* Don't issue a warning for every simdclone when there is no
28799 specific simdlen clause. */
28800 if (explicit_p && maybe_ne (clonei->simdlen, 0U))
28801 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28802 "GCC does not currently support simdlen %wd for "
28803 "type %qT",
28804 constant_lower_bound (simdlens[j]), elt.first);
28805 remove_simdlen = true;
28806 break;
28808 if (remove_simdlen)
28809 simdlens.ordered_remove (j);
28810 else
28811 j++;
28815 int count = simdlens.length ();
28816 if (count == 0)
28818 if (explicit_p && known_eq (clonei->simdlen, 0U))
28820 /* Warn the user if we can't generate any simdclone. */
28821 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28822 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28823 "GCC does not currently support a simdclone with simdlens"
28824 " %wd and %wd for these types.",
28825 constant_lower_bound (simdlen),
28826 constant_lower_bound (simdlen*2));
28828 return 0;
28831 gcc_assert (num < count);
28832 clonei->simdlen = simdlens[num];
28833 return count;
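/* For example, for a declare-simd function float f (float) with no explicit
   simdlen, the NDS is 32 bits, so the candidate simdlens are 64/32 == 2 and
   2 * 2 == 4; both keep every vector within 128 bits, so two Advanced SIMD
   clones are advertised.  For double f (double) the NDS is 64 bits, the
   simdlen-1 candidate is dropped and only the simdlen-2 clone remains.  */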
28836 /* Implement TARGET_SIMD_CLONE_ADJUST. */
28838 static void
28839 aarch64_simd_clone_adjust (struct cgraph_node *node)
28841 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
28842 use the correct ABI. */
28844 tree t = TREE_TYPE (node->decl);
28845 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
28846 TYPE_ATTRIBUTES (t));
28849 /* Implement TARGET_SIMD_CLONE_USABLE. */
28851 static int
28852 aarch64_simd_clone_usable (struct cgraph_node *node)
28854 switch (node->simdclone->vecsize_mangle)
28856 case 'n':
28857 if (!TARGET_SIMD)
28858 return -1;
28859 return 0;
28860 default:
28861 gcc_unreachable ();
28865 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
28867 static int
28868 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
28870 auto check_attr = [&](const char *ns, const char *name) {
28871 tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
28872 tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
28873 if (!attr1 && !attr2)
28874 return true;
28876 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
28879 if (!check_attr ("gnu", "aarch64_vector_pcs"))
28880 return 0;
28881 if (!check_attr ("gnu", "Advanced SIMD type"))
28882 return 0;
28883 if (!check_attr ("gnu", "SVE type"))
28884 return 0;
28885 if (!check_attr ("gnu", "SVE sizeless type"))
28886 return 0;
28887 if (!check_attr ("arm", "streaming"))
28888 return 0;
28889 if (!check_attr ("arm", "streaming_compatible"))
28890 return 0;
28891 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
28892 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
28893 return 0;
28894 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
28895 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
28896 return 0;
28897 return 1;
28900 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
28902 static tree
28903 aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
28905 tree old_attrs = DECL_ATTRIBUTES (olddecl);
28906 tree old_new = lookup_attribute ("arm", "new", old_attrs);
28908 tree new_attrs = DECL_ATTRIBUTES (newdecl);
28909 tree new_new = lookup_attribute ("arm", "new", new_attrs);
28911 if (DECL_INITIAL (olddecl) && new_new)
28913 error ("cannot apply attribute %qs to %q+D after the function"
28914 " has been defined", "new", newdecl);
28915 inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
28916 newdecl);
28918 else
28920 if (old_new && new_new)
28922 old_attrs = remove_attribute ("arm", "new", old_attrs);
28923 TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
28924 TREE_VALUE (old_new));
28926 if (new_new)
28927 aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
28930 return merge_attributes (old_attrs, new_attrs);
28933 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
28935 static const char *
28936 aarch64_get_multilib_abi_name (void)
28938 if (TARGET_BIG_END)
28939 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
28940 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
28943 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
28944 global-variable-based guard, use the default; otherwise
28945 return a null tree. */
28946 static tree
28947 aarch64_stack_protect_guard (void)
28949 if (aarch64_stack_protector_guard == SSP_GLOBAL)
28950 return default_stack_protect_guard ();
28952 return NULL_TREE;
28955 /* Return the diagnostic message string if the binary operation OP is
28956 not permitted on TYPE1 and TYPE2, NULL otherwise. */
28958 static const char *
28959 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
28960 const_tree type2)
28962 if (VECTOR_TYPE_P (type1)
28963 && VECTOR_TYPE_P (type2)
28964 && !TYPE_INDIVISIBLE_P (type1)
28965 && !TYPE_INDIVISIBLE_P (type2)
28966 && (aarch64_sve::builtin_type_p (type1)
28967 != aarch64_sve::builtin_type_p (type2)))
28968 return N_("cannot combine GNU and SVE vectors in a binary operation");
28970 /* Operation allowed. */
28971 return NULL;
28974 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
28975 compiler that we automatically ignore the top byte of our pointers, which
28976 allows using -fsanitize=hwaddress. */
28977 bool
28978 aarch64_can_tag_addresses ()
28980 return !TARGET_ILP32;
28983 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
28984 section at the end if needed. */
28985 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
28986 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
28987 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
28988 void
28989 aarch64_file_end_indicate_exec_stack ()
28991 file_end_indicate_exec_stack ();
28993 unsigned feature_1_and = 0;
28994 if (aarch_bti_enabled ())
28995 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
28997 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
28998 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
29000 if (feature_1_and)
29002 /* Generate .note.gnu.property section. */
29003 switch_to_section (get_section (".note.gnu.property",
29004 SECTION_NOTYPE, NULL));
29006 /* PT_NOTE header: namesz, descsz, type.
29007 namesz = 4 ("GNU\0")
29008 descsz = 16 (Size of the program property array)
29009 [(12 + padding) * Number of array elements]
29010 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
29011 assemble_align (POINTER_SIZE);
29012 assemble_integer (GEN_INT (4), 4, 32, 1);
29013 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
29014 assemble_integer (GEN_INT (5), 4, 32, 1);
29016 /* PT_NOTE name. */
29017 assemble_string ("GNU", 4);
29019 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
29020 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
29021 datasz = 4
29022 data = feature_1_and. */
29023 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
29024 assemble_integer (GEN_INT (4), 4, 32, 1);
29025 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
29027 /* Pad the size of the note to the required alignment. */
29028 assemble_align (POINTER_SIZE);
29031 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
29032 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
29033 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
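/* When both BTI and return-address signing are enabled, the note emitted
   above looks roughly like this on a 64-bit little-endian ELF target:

	.section	.note.gnu.property
	.align	3
	.word	4		// namesz ("GNU\0")
	.word	16		// descsz (12 rounded up to 16)
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.align	3  */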
29035 /* Helper function for straight line speculation.
29036 Return what barrier should be emitted for straight line speculation
29037 mitigation.
29038 When not mitigating against straight line speculation this function returns
29039 an empty string.
29040 When mitigating against straight line speculation, use:
29041 * SB when the v8.5-A SB extension is enabled.
29042 * DSB+ISB otherwise. */
29043 const char *
29044 aarch64_sls_barrier (int mitigation_required)
29046 return mitigation_required
29047 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
29048 : "";
29051 static GTY (()) tree aarch64_sls_shared_thunks[30];
29052 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
29053 const char *indirect_symbol_names[30] = {
29054 "__call_indirect_x0",
29055 "__call_indirect_x1",
29056 "__call_indirect_x2",
29057 "__call_indirect_x3",
29058 "__call_indirect_x4",
29059 "__call_indirect_x5",
29060 "__call_indirect_x6",
29061 "__call_indirect_x7",
29062 "__call_indirect_x8",
29063 "__call_indirect_x9",
29064 "__call_indirect_x10",
29065 "__call_indirect_x11",
29066 "__call_indirect_x12",
29067 "__call_indirect_x13",
29068 "__call_indirect_x14",
29069 "__call_indirect_x15",
29070 "", /* "__call_indirect_x16", */
29071 "", /* "__call_indirect_x17", */
29072 "__call_indirect_x18",
29073 "__call_indirect_x19",
29074 "__call_indirect_x20",
29075 "__call_indirect_x21",
29076 "__call_indirect_x22",
29077 "__call_indirect_x23",
29078 "__call_indirect_x24",
29079 "__call_indirect_x25",
29080 "__call_indirect_x26",
29081 "__call_indirect_x27",
29082 "__call_indirect_x28",
29083 "__call_indirect_x29",
29086 /* Function to create a BLR thunk. This thunk is used to mitigate straight
29087 line speculation. Instead of a simple BLR that can be speculated past,
29088 we emit a BL to this thunk, and this thunk contains a BR to the relevant
29089 register. These thunks have the relevant speculation barriers put after
29090 their indirect branch so that speculation is blocked.
29092 We use such a thunk so the speculation barriers are kept off the
29093 architecturally executed path in order to reduce the performance overhead.
29095 When optimizing for size we use stubs shared by the linked object.
29096 When optimizing for performance we emit stubs for each function in the hope
29097 that the branch predictor can better train on jumps specific for a given
29098 function. */
29100 aarch64_sls_create_blr_label (int regnum)
29102 gcc_assert (STUB_REGNUM_P (regnum));
29103 if (optimize_function_for_size_p (cfun))
29105 /* For the thunks shared between different functions in this compilation
29106 unit we use a named symbol -- this is just for users to more easily
29107 understand the generated assembly. */
29108 aarch64_sls_shared_thunks_needed = true;
29109 const char *thunk_name = indirect_symbol_names[regnum];
29110 if (aarch64_sls_shared_thunks[regnum] == NULL)
29112 /* Build a decl representing this function stub and record it for
29113 later. We build a decl here so we can use the GCC machinery for
29114 handling sections automatically (through `get_named_section` and
29115 `make_decl_one_only`). That saves us a lot of trouble handling
29116 the specifics of different output file formats. */
29117 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
29118 get_identifier (thunk_name),
29119 build_function_type_list (void_type_node,
29120 NULL_TREE));
29121 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
29122 NULL_TREE, void_type_node);
29123 TREE_PUBLIC (decl) = 1;
29124 TREE_STATIC (decl) = 1;
29125 DECL_IGNORED_P (decl) = 1;
29126 DECL_ARTIFICIAL (decl) = 1;
29127 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29128 resolve_unique_section (decl, 0, false);
29129 aarch64_sls_shared_thunks[regnum] = decl;
29132 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
29135 if (cfun->machine->call_via[regnum] == NULL)
29136 cfun->machine->call_via[regnum]
29137 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
29138 return cfun->machine->call_via[regnum];
29141 /* Helper function for aarch64_sls_emit_blr_function_thunks and
29142 aarch64_sls_emit_shared_blr_thunks below. */
29143 static void
29144 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
29146 /* Save in x16 and branch to that function so this transformation does
29147 not prevent jumping to `BTI c` instructions. */
29148 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
29149 asm_fprintf (out_file, "\tbr\tx16\n");
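/* A shared stub for x1, as emitted by aarch64_sls_emit_shared_blr_thunks
   below, therefore looks like:

	__call_indirect_x1:
		mov	x16, x1
		br	x16
		dsb	sy
		isb  */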
29152 /* Emit all BLR stubs for this particular function.
29153 Here we emit all the BLR stubs needed for the current function. Since we
29154 emit these stubs in a consecutive block we know there will be no speculation
29155 gadgets between each stub, and hence we only emit a speculation barrier at
29156 the end of the stub sequences.
29158 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29159 void
29160 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
29162 if (! aarch64_harden_sls_blr_p ())
29163 return;
29165 bool any_functions_emitted = false;
29166 /* We must save and restore the current function section since this assembly
29167 is emitted at the end of the function. This means it can be emitted *just
29168 after* the cold section of a function. That cold part would be emitted in
29169 a different section. That switch would trigger a `.cfi_endproc` directive
29170 to be emitted in the original section and a `.cfi_startproc` directive to
29171 be emitted in the new section. Switching to the original section without
29172 restoring would mean that the `.cfi_endproc` emitted as a function ends
29173 would happen in a different section -- leaving an unmatched
29174 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29175 in the standard text section. */
29176 section *save_text_section = in_section;
29177 switch_to_section (function_section (current_function_decl));
29178 for (int regnum = 0; regnum < 30; ++regnum)
29180 rtx specu_label = cfun->machine->call_via[regnum];
29181 if (specu_label == NULL)
29182 continue;
29184 targetm.asm_out.print_operand (out_file, specu_label, 0);
29185 asm_fprintf (out_file, ":\n");
29186 aarch64_sls_emit_function_stub (out_file, regnum);
29187 any_functions_emitted = true;
29189 if (any_functions_emitted)
29190 /* Can use the SB if need be here, since this stub will only be used
29191 by the current function, and hence for the current target. */
29192 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
29193 switch_to_section (save_text_section);
29196 /* Emit shared BLR stubs for the current compilation unit.
29197 Over the course of compiling this unit we may have converted some BLR
29198 instructions to a BL to a shared stub function. This is where we emit those
29199 stub functions.
29200 This function is for the stubs shared between different functions in this
29201 compilation unit. We share when optimizing for size instead of speed.
29203 This function is called through the TARGET_ASM_FILE_END hook. */
29204 void
29205 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
29207 if (! aarch64_sls_shared_thunks_needed)
29208 return;
29210 for (int regnum = 0; regnum < 30; ++regnum)
29212 tree decl = aarch64_sls_shared_thunks[regnum];
29213 if (!decl)
29214 continue;
29216 const char *name = indirect_symbol_names[regnum];
29217 switch_to_section (get_named_section (decl, NULL, 0));
29218 ASM_OUTPUT_ALIGN (out_file, 2);
29219 targetm.asm_out.globalize_label (out_file, name);
29220 /* This only emits a directive if the compiler is configured for an
29221 assembler that can handle visibility directives. */
29222 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
29223 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
29224 ASM_OUTPUT_LABEL (out_file, name);
29225 aarch64_sls_emit_function_stub (out_file, regnum);
29226 /* Use the most conservative target to ensure it can always be used by any
29227 function in the translation unit. */
29228 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
29229 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
29233 /* Implement TARGET_ASM_FILE_END. */
29234 void
29235 aarch64_asm_file_end ()
29237 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
29238 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29239 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29240 for FreeBSD) still gets called. */
29241 #ifdef TARGET_ASM_FILE_END
29242 TARGET_ASM_FILE_END ();
29243 #endif
29246 const char *
29247 aarch64_indirect_call_asm (rtx addr)
29249 gcc_assert (REG_P (addr));
29250 if (aarch64_harden_sls_blr_p ())
29252 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
29253 output_asm_insn ("bl\t%0", &stub_label);
29255 else
29256 output_asm_insn ("blr\t%0", &addr);
29257 return "";
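/* So, when hardening against SLS, an indirect call through x1 is emitted as
   "bl __call_indirect_x1" (or a BL to a per-function local label when
   optimizing for speed) instead of "blr x1".  */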
29260 /* Emit the assembly instruction to load the thread pointer into DEST.
29261 Select between different tpidr_elN registers depending on -mtp= setting. */
29263 const char *
29264 aarch64_output_load_tp (rtx dest)
29266 const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29267 "tpidr_el3", "tpidrro_el0"};
29268 char buffer[64];
29269 snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
29270 tpidrs[aarch64_tpidr_register]);
29271 output_asm_insn (buffer, &dest);
29272 return "";
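/* For example, with the usual default selection of tpidr_el0 this prints
   "mrs	<dest>, tpidr_el0"; the last table entry selects the read-only
   tpidrro_el0 instead.  */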
29275 /* Set up the value of REG_ALLOC_ORDER from scratch.
29277 It was previously good practice to put call-clobbered registers ahead
29278 of call-preserved registers, but that isn't necessary these days.
29279 IRA's model of register save/restore costs is much more sophisticated
29280 than the model that a simple ordering could provide. We leave
29281 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29282 of IRA's model.
29284 However, it is still useful to list registers that are members of
29285 multiple classes after registers that are members of fewer classes.
29286 For example, we have:
29288 - FP_LO8_REGS: v0-v7
29289 - FP_LO_REGS: v0-v15
29290 - FP_REGS: v0-v31
29292 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29293 we run the risk of starving other (lower-priority) pseudos that
29294 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29295 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29296 Allocating downwards rather than upwards avoids this problem, at least
29297 in code that has reasonable register pressure.
29299 The situation for predicate registers is similar. */
29301 void
29302 aarch64_adjust_reg_alloc_order ()
29304 for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
29305 if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
29306 reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
29307 else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
29308 reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
29309 else
29310 reg_alloc_order[i] = i;
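/* The effect is that reg_alloc_order lists v31, v30, ..., v0 and
   p15, p14, ..., p0, so pseudos that can live in any FP or predicate
   register are steered away from the registers that the more restricted
   classes (FP_LO8_REGS, FP_LO_REGS, PR_LO_REGS) need.  */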
29313 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29314 of vector mode MODE to select half the elements of that vector.
29315 Allow any combination of indices except duplicates (or out of range of
29316 the mode units). */
29318 bool
29319 aarch64_parallel_select_half_p (machine_mode mode, rtx par)
29321 int nunits = XVECLEN (par, 0);
29322 if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
29323 return false;
29324 int mode_nunits = nunits * 2;
29325 /* Put all the elements of PAR into a hash_set and use its
29326 uniqueness guarantees to check that we don't try to insert the same
29327 element twice. */
29328 hash_set<rtx> parset;
29329 for (int i = 0; i < nunits; ++i)
29331 rtx elt = XVECEXP (par, 0, i);
29332 if (!CONST_INT_P (elt)
29333 || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
29334 || parset.add (elt))
29335 return false;
29337 return true;
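/* For V4SImode, for example, a PARALLEL of two distinct in-range indices
   such as [0 2] or [3 1] is accepted, whereas [0 0] (duplicate), [0 4]
   (out of range) or a three-element PARALLEL (not half the vector) is
   rejected.  */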
29340 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29341 contain any common elements. */
29343 bool
29344 aarch64_pars_overlap_p (rtx par1, rtx par2)
29346 int len1 = XVECLEN (par1, 0);
29347 int len2 = XVECLEN (par2, 0);
29348 hash_set<rtx> parset;
29349 for (int i = 0; i < len1; ++i)
29350 parset.add (XVECEXP (par1, 0, i));
29351 for (int i = 0; i < len2; ++i)
29352 if (parset.contains (XVECEXP (par2, 0, i)))
29353 return true;
29354 return false;
29357 /* Implement OPTIMIZE_MODE_SWITCHING. */
29359 bool
29360 aarch64_optimize_mode_switching (aarch64_mode_entity entity)
29362 bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
29363 || (aarch64_cfun_has_new_state ("za")
29364 && df_regs_ever_live_p (ZA_REGNUM))
29365 || (aarch64_cfun_has_new_state ("zt0")
29366 && df_regs_ever_live_p (ZT0_REGNUM)));
29368 if (have_sme_state && nonlocal_goto_handler_labels)
29370 static bool reported;
29371 if (!reported)
29373 sorry ("non-local gotos in functions with SME state");
29374 reported = true;
29378 switch (entity)
29380 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29381 case aarch64_mode_entity::LOCAL_SME_STATE:
29382 return have_sme_state && !nonlocal_goto_handler_labels;
29384 gcc_unreachable ();
29387 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29389 static void
29390 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
29391 aarch64_tristate_mode prev_mode)
29393 if (mode == aarch64_tristate_mode::YES)
29395 gcc_assert (prev_mode == aarch64_tristate_mode::NO);
29396 aarch64_init_tpidr2_block ();
29398 else
29399 gcc_unreachable ();
29402 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29404 static void
29405 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
29406 aarch64_local_sme_state prev_mode)
29408 /* Back-propagation should ensure that we're always starting from
29409 a known mode. */
29410 gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
29412 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29414 /* Commit any uncommitted lazy save. This leaves ZA either active
29415 and zero (lazy save case) or off (normal case).
29417 The sequence is:
29419 mrs <temp>, tpidr2_el0
29420 cbz <temp>, no_save
29421 bl __arm_tpidr2_save
29422 msr tpidr2_el0, xzr
29423 zero { za } // Only if ZA is live
29424 zero { zt0 } // Only if ZT0 is live
29425 no_save: */
29426 auto tmp_reg = gen_reg_rtx (DImode);
29427 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
29428 auto label = gen_label_rtx ();
29429 rtx branch = aarch64_gen_compare_zero_and_branch (EQ, tmp_reg, label);
29430 auto jump = emit_jump_insn (branch);
29431 JUMP_LABEL (jump) = label;
29432 emit_insn (gen_aarch64_tpidr2_save ());
29433 emit_insn (gen_aarch64_clear_tpidr2 ());
29434 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29435 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29437 if (aarch64_cfun_has_state ("za"))
29438 emit_insn (gen_aarch64_initial_zero_za ());
29439 if (aarch64_cfun_has_state ("zt0"))
29440 emit_insn (gen_aarch64_sme_zero_zt0 ());
29442 emit_label (label);
29445 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29446 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29448 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29450 /* Make ZA active after being inactive.
29452 First handle the case in which the lazy save we set up was
29453 committed by a callee. If the function's source-level ZA state
29454 is live then we must conditionally restore it from the lazy
29455 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29456 if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
29457 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29458 else
29459 emit_insn (gen_aarch64_smstart_za ());
29461 /* Now handle the case in which the lazy save was not committed.
29462 In that case, ZA still contains the current function's ZA state,
29463 and we just need to cancel the lazy save. */
29464 emit_insn (gen_aarch64_clear_tpidr2 ());
29466 /* Restore the ZT0 state, if we have some. */
29467 if (aarch64_cfun_has_state ("zt0"))
29468 aarch64_restore_zt0 (true);
29470 return;
29473 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
29475 /* Retrieve the current function's ZA state from the lazy save
29476 buffer. */
29477 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29479 /* Restore the ZT0 state, if we have some. */
29480 if (aarch64_cfun_has_state ("zt0"))
29481 aarch64_restore_zt0 (true);
29482 return;
29485 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
29486 || prev_mode == aarch64_local_sme_state::OFF)
29488 /* INACTIVE_CALLER means that we are enabling ZA for the first
29489 time in this function. The code above means that ZA is either
29490 active and zero (if we committed a lazy save) or off. Handle
29491 the latter case by forcing ZA on.
29493 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29494 to force it to 1.
29496 Both cases leave ZA zeroed. */
29497 emit_insn (gen_aarch64_smstart_za ());
29499 /* Restore the ZT0 state, if we have some. */
29500 if (prev_mode == aarch64_local_sme_state::OFF
29501 && aarch64_cfun_has_state ("zt0"))
29502 aarch64_restore_zt0 (true);
29503 return;
29506 if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29507 || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
29508 /* A simple change in liveness, such as in a CFG structure where
29509 ZA is only conditionally defined. No code is needed. */
29510 return;
29512 gcc_unreachable ();
29515 if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29517 if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29518 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29519 || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29521 /* Save the ZT0 state, if we have some. */
29522 if (aarch64_cfun_has_state ("zt0"))
29523 aarch64_save_zt0 ();
29525 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29526 case of setting up a lazy save buffer before a call.
29527 A transition from INACTIVE_CALLER is similar, except that
29528 the contents of ZA are known to be zero.
29530 A transition from ACTIVE_DEAD means that ZA is live at the
29531 point of the transition, but is dead on at least one incoming
29532 edge. (That is, ZA is only conditionally initialized.)
29533 For efficiency, we want to set up a lazy save even for
29534 dead contents, since forcing ZA off would make later code
29535 restore ZA from the lazy save buffer. */
29536 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29537 return;
29540 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
29541 || prev_mode == aarch64_local_sme_state::OFF)
29542 /* We're simply discarding the information about which inactive
29543 state applies. */
29544 return;
29546 gcc_unreachable ();
29549 if (mode == aarch64_local_sme_state::INACTIVE_CALLER
29550 || mode == aarch64_local_sme_state::OFF)
29552 /* Save the ZT0 state, if we have some. */
29553 if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29554 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
29555 && mode == aarch64_local_sme_state::OFF
29556 && aarch64_cfun_has_state ("zt0"))
29557 aarch64_save_zt0 ();
29559 /* The transition to INACTIVE_CALLER is used before returning from
29560 new("za") functions. Any state in ZA belongs to the current
29561 function rather than a caller, but that state is no longer
29562 needed. Clear any pending lazy save and turn ZA off.
29564 The transition to OFF is used before calling a private-ZA function.
29565 We committed any incoming lazy save above, so at this point any
29566 contents in ZA belong to the current function. */
29567 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29568 emit_insn (gen_aarch64_clear_tpidr2 ());
29570 if (prev_mode != aarch64_local_sme_state::OFF
29571 && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
29572 emit_insn (gen_aarch64_smstop_za ());
29574 return;
29577 if (mode == aarch64_local_sme_state::SAVED_LOCAL)
29579 /* This is a transition to an exception handler. */
29580 gcc_assert (prev_mode == aarch64_local_sme_state::OFF
29581 || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
29582 return;
29585 gcc_unreachable ();
29588 /* Implement TARGET_MODE_EMIT. */
29590 static void
29591 aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
29593 if (mode == prev_mode)
29594 return;
29596 start_sequence ();
29597 switch (aarch64_mode_entity (entity))
29599 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29600 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
29601 aarch64_tristate_mode (prev_mode));
29602 break;
29604 case aarch64_mode_entity::LOCAL_SME_STATE:
29605 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
29606 aarch64_local_sme_state (prev_mode));
29607 break;
29609 rtx_insn *seq = get_insns ();
29610 end_sequence ();
29612 /* Get the set of clobbered registers that are currently live. */
29613 HARD_REG_SET clobbers = {};
29614 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
29616 if (!NONDEBUG_INSN_P (insn))
29617 continue;
29618 vec_rtx_properties properties;
29619 properties.add_insn (insn, false);
29620 for (rtx_obj_reference ref : properties.refs ())
29621 if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
29622 SET_HARD_REG_BIT (clobbers, ref.regno);
29624 clobbers &= live;
29626 /* Emit instructions to save clobbered registers to pseudos. Queue
29627 instructions to restore the registers afterwards.
29629 This should only be needed in rare situations. */
29630 auto_vec<rtx, 33> after;
29631 for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
29632 if (TEST_HARD_REG_BIT (clobbers, regno))
29634 rtx hard_reg = gen_rtx_REG (DImode, regno);
29635 rtx pseudo_reg = gen_reg_rtx (DImode);
29636 emit_move_insn (pseudo_reg, hard_reg);
29637 after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
29639 if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
29641 rtx pseudo_reg = gen_reg_rtx (DImode);
29642 emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
29643 after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
29646 /* Emit the transition instructions themselves. */
29647 emit_insn (seq);
29649 /* Restore the clobbered registers. */
29650 for (auto *insn : after)
29651 emit_insn (insn);
29654 /* Return true if INSN references the SME state represented by hard register
29655 REGNO. */
29657 static bool
29658 aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
29660 df_ref ref;
29661 FOR_EACH_INSN_DEF (ref, insn)
29662 if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
29663 && DF_REF_REGNO (ref) == regno)
29664 return true;
29665 FOR_EACH_INSN_USE (ref, insn)
29666 if (DF_REF_REGNO (ref) == regno)
29667 return true;
29668 return false;
29671 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29673 static aarch64_local_sme_state
29674 aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
29676 if (!CALL_P (insn)
29677 && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29679 static bool reported;
29680 if (!reported)
29682 sorry ("catching non-call exceptions in functions with SME state");
29683 reported = true;
29685 /* Aim for graceful error recovery by picking the value that is
29686 least likely to generate an ICE. */
29687 return aarch64_local_sme_state::INACTIVE_LOCAL;
29690 /* A non-local goto is equivalent to a return. We disallow non-local
29691 receivers in functions with SME state, so we know that the target
29692 expects ZA to be dormant or off. */
29693 if (JUMP_P (insn)
29694 && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
29695 return aarch64_local_sme_state::INACTIVE_CALLER;
29697 /* start_private_za_call and end_private_za_call bracket a sequence
29698 that calls a private-ZA function. Force ZA to be turned off if the
29699 function doesn't have any live ZA state, otherwise require ZA to be
29700 inactive. */
29701 auto icode = recog_memoized (insn);
29702 if (icode == CODE_FOR_aarch64_start_private_za_call
29703 || icode == CODE_FOR_aarch64_end_private_za_call)
29704 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29705 ? aarch64_local_sme_state::INACTIVE_LOCAL
29706 : aarch64_local_sme_state::OFF);
29708 /* Force ZA to contain the current function's ZA state if INSN wants
29709 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29710 are both controlled by PSTATE.ZA. */
29711 if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
29712 || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
29713 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29714 ? aarch64_local_sme_state::ACTIVE_LIVE
29715 : aarch64_local_sme_state::ACTIVE_DEAD);
29717 return aarch64_local_sme_state::ANY;
29720 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29722 static aarch64_tristate_mode
29723 aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
29725 /* We need to set up a lazy save buffer no later than the first
29726 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29727 if (aarch64_mode_needed_local_sme_state (insn, live)
29728 == aarch64_local_sme_state::INACTIVE_LOCAL)
29729 return aarch64_tristate_mode::YES;
29731 /* Also make sure that the lazy save buffer is set up before the first
29732 insn that throws internally. The exception handler will sometimes
29733 load from it. */
29734 if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29735 return aarch64_tristate_mode::YES;
29737 return aarch64_tristate_mode::MAYBE;
29740 /* Implement TARGET_MODE_NEEDED. */
29742 static int
29743 aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
29745 switch (aarch64_mode_entity (entity))
29747 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29748 return int (aarch64_mode_needed_za_save_buffer (insn, live));
29750 case aarch64_mode_entity::LOCAL_SME_STATE:
29751 return int (aarch64_mode_needed_local_sme_state (insn, live));
29753 gcc_unreachable ();
29756 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29758 static aarch64_local_sme_state
29759 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
29760 HARD_REG_SET live)
29762 /* Note places where ZA dies, so that we can try to avoid saving and
29763 restoring state that isn't needed. */
29764 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29765 && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
29766 return aarch64_local_sme_state::ACTIVE_DEAD;
29768 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29769 function. */
29770 if (mode == aarch64_local_sme_state::ACTIVE_DEAD
29771 && TEST_HARD_REG_BIT (live, ZA_REGNUM))
29772 return aarch64_local_sme_state::ACTIVE_LIVE;
29774 return mode;
29777 /* Implement TARGET_MODE_AFTER. */
29779 static int
29780 aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
29782 switch (aarch64_mode_entity (entity))
29784 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29785 return mode;
29787 case aarch64_mode_entity::LOCAL_SME_STATE:
29788 return int (aarch64_mode_after_local_sme_state
29789 (aarch64_local_sme_state (mode), live));
29791 gcc_unreachable ();
29794 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
29796 static aarch64_local_sme_state
29797 aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
29798 aarch64_local_sme_state mode2)
29800 /* Perform a symmetrical check for two values. */
29801 auto is_pair = [&](aarch64_local_sme_state val1,
29802 aarch64_local_sme_state val2)
29804 return ((mode1 == val1 && mode2 == val2)
29805 || (mode1 == val2 && mode2 == val1));
29808 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
29809 to a caller. OFF is one of the options. */
29810 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
29811 aarch64_local_sme_state::OFF))
29812 return aarch64_local_sme_state::INACTIVE_CALLER;
29814 /* Similarly for dormant contents belonging to the current function. */
29815 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
29816 aarch64_local_sme_state::OFF))
29817 return aarch64_local_sme_state::INACTIVE_LOCAL;
29819 /* Treat a conditionally-initialized value as a fully-initialized value. */
29820 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
29821 aarch64_local_sme_state::ACTIVE_DEAD))
29822 return aarch64_local_sme_state::ACTIVE_LIVE;
29824 return aarch64_local_sme_state::ANY;
29827 /* Implement TARGET_MODE_CONFLUENCE. */
29829 static int
29830 aarch64_mode_confluence (int entity, int mode1, int mode2)
29832 gcc_assert (mode1 != mode2);
29833 switch (aarch64_mode_entity (entity))
29835 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29836 return int (aarch64_tristate_mode::MAYBE);
29838 case aarch64_mode_entity::LOCAL_SME_STATE:
29839 return int (aarch64_local_sme_confluence
29840 (aarch64_local_sme_state (mode1),
29841 aarch64_local_sme_state (mode2)));
29843 gcc_unreachable ();
29846 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
29847 NO throughout, or makes one transition from NO to YES. */
29849 static aarch64_tristate_mode
29850 aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
29851 aarch64_tristate_mode mode2)
29853 /* Keep bringing the transition forward until it starts from NO. */
29854 if (mode1 == aarch64_tristate_mode::MAYBE
29855 && mode2 == aarch64_tristate_mode::YES)
29856 return mode2;
29858 return aarch64_tristate_mode::MAYBE;
29861 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
29863 static aarch64_local_sme_state
29864 aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
29865 aarch64_local_sme_state mode2)
29867 /* We always need to know what the current state is when transitioning
29868 to a new state. Force any location with indeterminate starting state
29869 to be active. */
29870 if (mode1 == aarch64_local_sme_state::ANY)
29871 switch (mode2)
29873 case aarch64_local_sme_state::INACTIVE_CALLER:
29874 case aarch64_local_sme_state::OFF:
29875 case aarch64_local_sme_state::ACTIVE_DEAD:
29876 /* The current function's ZA state is not live. */
29877 return aarch64_local_sme_state::ACTIVE_DEAD;
29879 case aarch64_local_sme_state::INACTIVE_LOCAL:
29880 case aarch64_local_sme_state::ACTIVE_LIVE:
29881 /* The current function's ZA state is live. */
29882 return aarch64_local_sme_state::ACTIVE_LIVE;
29884 case aarch64_local_sme_state::SAVED_LOCAL:
29885 /* This is a transition to an exception handler. Since we don't
29886 support non-call exceptions for SME functions, the source of
29887 the transition must be known. We'll assert later if that's
29888 not the case. */
29889 return aarch64_local_sme_state::ANY;
29891 case aarch64_local_sme_state::ANY:
29892 return aarch64_local_sme_state::ANY;
29895 return aarch64_local_sme_state::ANY;
29898 /* Implement TARGET_MODE_BACKPROP. */
29900 static int
29901 aarch64_mode_backprop (int entity, int mode1, int mode2)
29903 switch (aarch64_mode_entity (entity))
29905 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29906 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
29907 aarch64_tristate_mode (mode2)));
29909 case aarch64_mode_entity::LOCAL_SME_STATE:
29910 return int (aarch64_local_sme_backprop
29911 (aarch64_local_sme_state (mode1),
29912 aarch64_local_sme_state (mode2)));
29914 gcc_unreachable ();
29917 /* Implement TARGET_MODE_ENTRY. */
29919 static int
29920 aarch64_mode_entry (int entity)
29922 switch (aarch64_mode_entity (entity))
29924 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29925 return int (aarch64_tristate_mode::NO);
29927 case aarch64_mode_entity::LOCAL_SME_STATE:
29928 return int (aarch64_cfun_shared_flags ("za") != 0
29929 ? aarch64_local_sme_state::ACTIVE_LIVE
29930 : aarch64_cfun_incoming_pstate_za () != 0
29931 ? aarch64_local_sme_state::ACTIVE_DEAD
29932 : aarch64_local_sme_state::INACTIVE_CALLER);
29934 gcc_unreachable ();
29937 /* Implement TARGET_MODE_EXIT. */
29939 static int
29940 aarch64_mode_exit (int entity)
29942 switch (aarch64_mode_entity (entity))
29944 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29945 return int (aarch64_tristate_mode::MAYBE);
29947 case aarch64_mode_entity::LOCAL_SME_STATE:
29948 return int (aarch64_cfun_shared_flags ("za") != 0
29949 ? aarch64_local_sme_state::ACTIVE_LIVE
29950 : aarch64_cfun_incoming_pstate_za () != 0
29951 ? aarch64_local_sme_state::ACTIVE_DEAD
29952 : aarch64_local_sme_state::INACTIVE_CALLER);
29954 gcc_unreachable ();
29957 /* Implement TARGET_MODE_EH_HANDLER. */
29959 static int
29960 aarch64_mode_eh_handler (int entity)
29962 switch (aarch64_mode_entity (entity))
29964 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29965 /* Require a lazy save buffer to be allocated before the first
29966 insn that can throw. */
29967 return int (aarch64_tristate_mode::YES);
29969 case aarch64_mode_entity::LOCAL_SME_STATE:
29970 return int (aarch64_local_sme_state::SAVED_LOCAL);
29972 gcc_unreachable ();
29975 /* Implement TARGET_MODE_PRIORITY. */
29977 static int
29978 aarch64_mode_priority (int, int n)
29980 return n;
29983 /* Implement TARGET_MD_ASM_ADJUST. */
29985 static rtx_insn *
29986 aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
29987 vec<machine_mode> &input_modes,
29988 vec<const char *> &constraints,
29989 vec<rtx> &uses, vec<rtx> &clobbers,
29990 HARD_REG_SET &clobbered_regs, location_t loc)
29992 rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
29993 uses, clobbers, clobbered_regs, loc);
29995 /* "za" in the clobber list of a function with ZA state is defined to
29996 mean that the asm can read from and write to ZA. We can model the
29997 read using a USE, but unfortunately, it's not possible to model the
29998 write directly. Use a separate insn to model the effect.
30000 We must ensure that ZA is active on entry, which is enforced by using
30001 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
30003 The same thing applies to ZT0. */
30004 if (TARGET_ZA)
30005 for (unsigned int i = clobbers.length (); i-- > 0; )
30007 rtx x = clobbers[i];
30008 if (REG_P (x)
30009 && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
30011 auto id = cfun->machine->next_asm_update_za_id++;
30013 start_sequence ();
30014 if (seq)
30015 emit_insn (seq);
30016 rtx id_rtx = gen_int_mode (id, SImode);
30017 emit_insn (REGNO (x) == ZA_REGNUM
30018 ? gen_aarch64_asm_update_za (id_rtx)
30019 : gen_aarch64_asm_update_zt0 (id_rtx));
30020 seq = get_insns ();
30021 end_sequence ();
30023 auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
30024 uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
30025 uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
30027 clobbers.ordered_remove (i);
30028 CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
30031 return seq;
30034 /* BB is the target of an exception or nonlocal goto edge, which means
30035 that PSTATE.SM is known to be 0 on entry. Put it into the state that
30036 the current function requires. */
30038 static bool
30039 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb)
30041 if (TARGET_NON_STREAMING)
30042 return false;
30044 start_sequence ();
30045 rtx_insn *guard_label = nullptr;
30046 if (TARGET_STREAMING_COMPATIBLE)
30047 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30048 AARCH64_FL_SM_OFF);
30049 aarch64_sme_mode_switch_regs args_switch;
30050 args_switch.add_call_preserved_regs (df_get_live_in (bb));
30051 args_switch.emit_prologue ();
30052 aarch64_switch_pstate_sm (AARCH64_FL_SM_OFF, AARCH64_FL_SM_ON);
30053 args_switch.emit_epilogue ();
30054 if (guard_label)
30055 emit_label (guard_label);
30056 auto seq = get_insns ();
30057 end_sequence ();
30059 emit_insn_after (seq, bb_note (bb));
30060 return true;
30063 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
30064 so arrange to make it so. */
30066 static bool
30067 aarch64_switch_pstate_sm_for_jump (rtx_insn *jump)
30069 if (TARGET_NON_STREAMING)
30070 return false;
30072 start_sequence ();
30073 rtx_insn *guard_label = nullptr;
30074 if (TARGET_STREAMING_COMPATIBLE)
30075 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30076 AARCH64_FL_SM_OFF);
30077 aarch64_switch_pstate_sm (AARCH64_FL_SM_ON, AARCH64_FL_SM_OFF);
30078 if (guard_label)
30079 emit_label (guard_label);
30080 auto seq = get_insns ();
30081 end_sequence ();
30083 emit_insn_before (seq, jump);
30084 return true;
30087 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30088 to switch to the new mode and the instructions needed to restore the
30089 original mode. Return true if something changed. */
30090 static bool
30091 aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
30093 /* Mode switches for sibling calls are handled via the epilogue. */
30094 if (SIBLING_CALL_P (call))
30095 return false;
30097 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
30098 if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
30099 return false;
30101 /* Switch mode before the call, preserving any argument registers
30102 across the switch. */
30103 start_sequence ();
30104 rtx_insn *args_guard_label = nullptr;
30105 if (TARGET_STREAMING_COMPATIBLE)
30106 args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30107 callee_isa_mode);
30108 aarch64_sme_mode_switch_regs args_switch;
30109 args_switch.add_call_args (call);
30110 args_switch.emit_prologue ();
30111 aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
30112 args_switch.emit_epilogue ();
30113 if (args_guard_label)
30114 emit_label (args_guard_label);
30115 auto args_seq = get_insns ();
30116 end_sequence ();
30117 emit_insn_before (args_seq, call);
30119 if (find_reg_note (call, REG_NORETURN, NULL_RTX))
30120 return true;
30122 /* Switch mode after the call, preserving any return registers across
30123 the switch. */
30124 start_sequence ();
30125 rtx_insn *return_guard_label = nullptr;
30126 if (TARGET_STREAMING_COMPATIBLE)
30127 return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30128 callee_isa_mode);
30129 aarch64_sme_mode_switch_regs return_switch;
30130 return_switch.add_call_result (call);
30131 return_switch.emit_prologue ();
30132 aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
30133 return_switch.emit_epilogue ();
30134 if (return_guard_label)
30135 emit_label (return_guard_label);
30136 auto result_seq = get_insns ();
30137 end_sequence ();
30138 emit_insn_after (result_seq, call);
30139 return true;
30142 namespace {
30144 const pass_data pass_data_switch_pstate_sm =
30146 RTL_PASS, // type
30147 "smstarts", // name
30148 OPTGROUP_NONE, // optinfo_flags
30149 TV_NONE, // tv_id
30150 0, // properties_required
30151 0, // properties_provided
30152 0, // properties_destroyed
30153 0, // todo_flags_start
30154 TODO_df_finish, // todo_flags_finish
30157 class pass_switch_pstate_sm : public rtl_opt_pass
30159 public:
30160 pass_switch_pstate_sm (gcc::context *ctxt)
30161 : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
30164 // opt_pass methods:
30165 bool gate (function *) override final;
30166 unsigned int execute (function *) override final;
30169 bool
30170 pass_switch_pstate_sm::gate (function *fn)
30172 return (aarch64_fndecl_pstate_sm (fn->decl) != AARCH64_FL_SM_OFF
30173 || cfun->machine->call_switches_pstate_sm);
30176 /* Emit any instructions needed to switch PSTATE.SM. */
30177 unsigned int
30178 pass_switch_pstate_sm::execute (function *fn)
30180 basic_block bb;
30182 auto_sbitmap blocks (last_basic_block_for_fn (cfun));
30183 bitmap_clear (blocks);
30184 FOR_EACH_BB_FN (bb, fn)
30186 if (has_abnormal_call_or_eh_pred_edge_p (bb)
30187 && aarch64_switch_pstate_sm_for_landing_pad (bb))
30188 bitmap_set_bit (blocks, bb->index);
30190 if (cfun->machine->call_switches_pstate_sm)
30192 rtx_insn *insn;
30193 FOR_BB_INSNS (bb, insn)
30194 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
30195 if (aarch64_switch_pstate_sm_for_call (call))
30196 bitmap_set_bit (blocks, bb->index);
30199 auto end = BB_END (bb);
30200 if (JUMP_P (end)
30201 && find_reg_note (end, REG_NON_LOCAL_GOTO, NULL_RTX)
30202 && aarch64_switch_pstate_sm_for_jump (end))
30203 bitmap_set_bit (blocks, bb->index);
30205 find_many_sub_basic_blocks (blocks);
30206 clear_aux_for_blocks ();
30207 return 0;
30212 rtl_opt_pass *
30213 make_pass_switch_pstate_sm (gcc::context *ctxt)
30215 return new pass_switch_pstate_sm (ctxt);
30218 /* Parse an implementation-defined system register name of
30219 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30220 Return true if REGNAME matches the above pattern, false
30221 otherwise. */
30222 bool
30223 aarch64_is_implem_def_reg (const char *regname)
30225 unsigned pos = 0;
30226 unsigned name_len = strlen (regname);
30227 if (name_len < 12 || name_len > 14)
30228 return false;
30230 auto cterm_valid_p = [&]()
30232 bool leading_zero_p = false;
30233 unsigned i = 0;
30234 char n[3] = {0};
30236 if (regname[pos] != 'c')
30237 return false;
30238 pos++;
30239 while (regname[pos] != '_')
30241 if (leading_zero_p)
30242 return false;
30243 if (i == 0 && regname[pos] == '0')
30244 leading_zero_p = true;
30245 if (i > 2)
30246 return false;
30247 if (!ISDIGIT (regname[pos]))
30248 return false;
30249 n[i++] = regname[pos++];
30251 if (atoi (n) > 15)
30252 return false;
30253 return true;
30256 if (regname[pos] != 's')
30257 return false;
30258 pos++;
30259 if (regname[pos] < '0' || regname[pos] > '3')
30260 return false;
30261 pos++;
30262 if (regname[pos++] != '_')
30263 return false;
30264 if (regname[pos] < '0' || regname[pos] > '7')
30265 return false;
30266 pos++;
30267 if (regname[pos++] != '_')
30268 return false;
30269 if (!cterm_valid_p ())
30270 return false;
30271 if (regname[pos++] != '_')
30272 return false;
30273 if (!cterm_valid_p ())
30274 return false;
30275 if (regname[pos++] != '_')
30276 return false;
30277 if (regname[pos] < '0' || regname[pos] > '7')
30278 return false;
30279 return true;
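/* For example, "s3_0_c13_c0_2" is accepted, while "s4_0_c13_c0_2" (op0 out
   of range), "s3_0_c16_c0_2" (CRn greater than 15) and "s3_0_c013_c0_2"
   (leading zero) are all rejected.  */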
30282 /* Return true if REGNAME matches either a known permitted system
30283 register name, or a generic sysreg specification. For use in
30284 back-end predicate `aarch64_sysreg_string'. */
30285 bool
30286 aarch64_valid_sysreg_name_p (const char *regname)
30288 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30289 if (sysreg == NULL)
30290 return aarch64_is_implem_def_reg (regname);
30291 if (sysreg->arch_reqs)
30292 return (aarch64_isa_flags & sysreg->arch_reqs);
30293 return true;
30296 /* Return the generic sysreg specification for a valid system register
30297 name, otherwise NULL. WRITE_P is true iff the register is being
30298 written to. IS128OP indicates the requested system register should
30299 be checked for a 128-bit implementation. */
30300 const char *
30301 aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
30302 {
30303   const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30304   if (sysreg == NULL)
30305     {
30306       if (aarch64_is_implem_def_reg (regname))
30307         return regname;
30308       else
30309         return NULL;
30310     }
30311   if (is128op && !(sysreg->properties & F_REG_128))
30312     return NULL;
30313   if ((write_p && (sysreg->properties & F_REG_READ))
30314       || (!write_p && (sysreg->properties & F_REG_WRITE)))
30315     return NULL;
30316   if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
30317     return NULL;
30318   return sysreg->encoding;
30319 }
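/* A sketch of how the checks above combine, assuming F_REG_READ and
   F_REG_WRITE mark read-only and write-only registers respectively:

     - IS128OP requested but the register lacks F_REG_128        -> NULL
     - write access (WRITE_P) to an F_REG_READ register          -> NULL
     - read access (!WRITE_P) of an F_REG_WRITE register         -> NULL
     - architecture requirements missing from aarch64_isa_flags  -> NULL
     - otherwise the table encoding is returned, or REGNAME itself when it
       is an implementation-defined name accepted by
       aarch64_is_implem_def_reg.  */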
30321 /* Target-specific selftests. */
30323 #if CHECKING_P
30325 namespace selftest {
30327 /* Selftest for the RTL loader.
30328 Verify that the RTL loader copes with a dump from
30329 print_rtx_function. This is essentially just a test that class
30330 function_reader can handle a real dump, but it also verifies
30331 that lookup_reg_by_dump_name correctly handles hard regs.
30332 The presence of hard reg names in the dump means that the test is
30333 target-specific, hence it is in this file. */
30335 static void
30336 aarch64_test_loading_full_dump ()
30337 {
30338   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
30340   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
30342   rtx_insn *insn_1 = get_insn_by_uid (1);
30343   ASSERT_EQ (NOTE, GET_CODE (insn_1));
30345   rtx_insn *insn_15 = get_insn_by_uid (15);
30346   ASSERT_EQ (INSN, GET_CODE (insn_15));
30347   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
30349   /* Verify crtl->return_rtx. */
30350   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
30351   ASSERT_EQ (0, REGNO (crtl->return_rtx));
30352   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
30353 }
30355 /* Test the fractional_cost class. */
30357 static void
30358 aarch64_test_fractional_cost ()
30359 {
30360 using cf = fractional_cost;
30362 ASSERT_EQ (cf (0, 20), 0);
30364 ASSERT_EQ (cf (4, 2), 2);
30365 ASSERT_EQ (3, cf (9, 3));
30367 ASSERT_NE (cf (5, 2), 2);
30368 ASSERT_NE (3, cf (8, 3));
30370 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30371 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30372 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30374 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30375 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30376 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30377 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30378 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30379 ASSERT_EQ (3 - cf (10, 3), 0);
30381 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30382 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30384 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30385 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30386 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30387 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30388 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30389 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30390 ASSERT_TRUE (cf (239, 240) <= 1);
30391 ASSERT_TRUE (cf (240, 240) <= 1);
30392 ASSERT_FALSE (cf (241, 240) <= 1);
30393 ASSERT_FALSE (2 <= cf (207, 104));
30394 ASSERT_TRUE (2 <= cf (208, 104));
30395 ASSERT_TRUE (2 <= cf (209, 104));
30397 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30398 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30399 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30400 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30401 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30402 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30403 ASSERT_TRUE (cf (239, 240) < 1);
30404 ASSERT_FALSE (cf (240, 240) < 1);
30405 ASSERT_FALSE (cf (241, 240) < 1);
30406 ASSERT_FALSE (2 < cf (207, 104));
30407 ASSERT_FALSE (2 < cf (208, 104));
30408 ASSERT_TRUE (2 < cf (209, 104));
30410 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30411 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30412 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30413 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30414 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30415 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30416 ASSERT_FALSE (cf (239, 240) >= 1);
30417 ASSERT_TRUE (cf (240, 240) >= 1);
30418 ASSERT_TRUE (cf (241, 240) >= 1);
30419 ASSERT_TRUE (2 >= cf (207, 104));
30420 ASSERT_TRUE (2 >= cf (208, 104));
30421 ASSERT_FALSE (2 >= cf (209, 104));
30423 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30424 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30425 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30426 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30427 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30428 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30429 ASSERT_FALSE (cf (239, 240) > 1);
30430 ASSERT_FALSE (cf (240, 240) > 1);
30431 ASSERT_TRUE (cf (241, 240) > 1);
30432 ASSERT_TRUE (2 > cf (207, 104));
30433 ASSERT_FALSE (2 > cf (208, 104));
30434 ASSERT_FALSE (2 > cf (209, 104));
30436 ASSERT_EQ (cf (1, 2).ceil (), 1);
30437 ASSERT_EQ (cf (11, 7).ceil (), 2);
30438 ASSERT_EQ (cf (20, 1).ceil (), 20);
30439 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30440 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30441 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30442 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30443 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30445 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
30446 }
30448 /* Check whether our system register data, as imported from
30449    `aarch64-sys-reg.def', has any duplicate entries.  */
30450 static void
30451 aarch64_test_sysreg_encoding_clashes (void)
30452 {
30453   using dup_instances_t = hash_map<nofree_string_hash,
30454                                    std::vector<const sysreg_t*>>;
30456   dup_instances_t duplicate_instances;
30458   /* Every time an encoding is found to occur more than once, we add it
30459      to a "clash-analysis queue", which is then used to extract the
30460      necessary information from our hash map when deciding whether
30461      repeated encodings are valid.  */
30463   /* 1) Collect recurrence information.  */
30464   for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
30465     {
30466       const sysreg_t *reg = aarch64_sysregs + i;
30468       std::vector<const sysreg_t*> *tmp
30469         = &duplicate_instances.get_or_insert (reg->encoding);
30471       tmp->push_back (reg);
30472     }
30474   /* 2) Carry out analysis on collected data.  */
30475   for (auto instance : duplicate_instances)
30476     {
30477       unsigned nrep = instance.second.size ();
30478       if (nrep > 1)
30479         for (unsigned i = 0; i < nrep; i++)
30480           for (unsigned j = i + 1; j < nrep; j++)
30481             {
30482               const sysreg_t *a = instance.second[i];
30483               const sysreg_t *b = instance.second[j];
30484               ASSERT_TRUE ((a->properties != b->properties)
30485                            || (a->arch_reqs != b->arch_reqs));
30486             }
30487     }
30488 }
30490 /* Run all target-specific selftests. */
30492 static void
30493 aarch64_run_selftests (void)
30494 {
30495   aarch64_test_loading_full_dump ();
30496   aarch64_test_fractional_cost ();
30497   aarch64_test_sysreg_encoding_clashes ();
30498 }
30500 } // namespace selftest
30502 #endif /* #if CHECKING_P */
30504 #undef TARGET_STACK_PROTECT_GUARD
30505 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30507 #undef TARGET_ADDRESS_COST
30508 #define TARGET_ADDRESS_COST aarch64_address_cost
30510 /* This hook determines whether unnamed bitfields affect the alignment
30511    of the containing structure.  The hook returns true if the structure
30512    should inherit the alignment requirements of an unnamed bitfield's
30513    type.  */
30514 #undef TARGET_ALIGN_ANON_BITFIELD
30515 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
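/* For illustration (a sketch assuming the AAPCS64 rule that a bit-field's
   declared type contributes to the alignment of its enclosing aggregate):
   with this hook returning true, a type such as

     struct s { char c; long long : 0; };

   picks up the 8-byte alignment of the unnamed bit-field's type rather
   than the 1-byte alignment of its only named member.  */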
30517 #undef TARGET_ASM_ALIGNED_DI_OP
30518 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30520 #undef TARGET_ASM_ALIGNED_HI_OP
30521 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30523 #undef TARGET_ASM_ALIGNED_SI_OP
30524 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30526 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30527 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30528 hook_bool_const_tree_hwi_hwi_const_tree_true
30530 #undef TARGET_ASM_FILE_START
30531 #define TARGET_ASM_FILE_START aarch64_start_file
30533 #undef TARGET_ASM_OUTPUT_MI_THUNK
30534 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30536 #undef TARGET_ASM_SELECT_RTX_SECTION
30537 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30539 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30540 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30542 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30543 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30545 #undef TARGET_BUILD_BUILTIN_VA_LIST
30546 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30548 #undef TARGET_CALLEE_COPIES
30549 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30551 #undef TARGET_FRAME_POINTER_REQUIRED
30552 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30554 #undef TARGET_CAN_ELIMINATE
30555 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30557 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30558 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30559 aarch64_function_attribute_inlinable_p
30561 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30562 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30564 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30565 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30567 #undef TARGET_CAN_INLINE_P
30568 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30570 #undef TARGET_CANNOT_FORCE_CONST_MEM
30571 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30573 #undef TARGET_CASE_VALUES_THRESHOLD
30574 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30576 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30577 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30579 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30580 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30582 /* Only the least significant bit is used for initialization guard
30583 variables. */
30584 #undef TARGET_CXX_GUARD_MASK_BIT
30585 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
30587 #undef TARGET_C_MODE_FOR_SUFFIX
30588 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30590 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30591 #undef TARGET_DEFAULT_TARGET_FLAGS
30592 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30593 #endif
30595 #undef TARGET_CLASS_MAX_NREGS
30596 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30598 #undef TARGET_BUILTIN_DECL
30599 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30601 #undef TARGET_BUILTIN_RECIPROCAL
30602 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30604 #undef TARGET_C_EXCESS_PRECISION
30605 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30607 #undef TARGET_C_BITINT_TYPE_INFO
30608 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
30610 #undef TARGET_C_MODE_FOR_FLOATING_TYPE
30611 #define TARGET_C_MODE_FOR_FLOATING_TYPE aarch64_c_mode_for_floating_type
30613 #undef TARGET_EXPAND_BUILTIN
30614 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30616 #undef TARGET_EXPAND_BUILTIN_VA_START
30617 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30619 #undef TARGET_FOLD_BUILTIN
30620 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30622 #undef TARGET_FUNCTION_ARG
30623 #define TARGET_FUNCTION_ARG aarch64_function_arg
30625 #undef TARGET_FUNCTION_ARG_ADVANCE
30626 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30628 #undef TARGET_FUNCTION_ARG_BOUNDARY
30629 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30631 #undef TARGET_FUNCTION_ARG_PADDING
30632 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30634 #undef TARGET_GET_RAW_RESULT_MODE
30635 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30636 #undef TARGET_GET_RAW_ARG_MODE
30637 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30639 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30640 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30642 #undef TARGET_FUNCTION_VALUE
30643 #define TARGET_FUNCTION_VALUE aarch64_function_value
30645 #undef TARGET_FUNCTION_VALUE_REGNO_P
30646 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30648 #undef TARGET_START_CALL_ARGS
30649 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30651 #undef TARGET_END_CALL_ARGS
30652 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30654 #undef TARGET_GIMPLE_FOLD_BUILTIN
30655 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30657 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30658 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30660 #undef TARGET_INIT_BUILTINS
30661 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30663 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30664 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30665 aarch64_ira_change_pseudo_allocno_class
30667 #undef TARGET_LEGITIMATE_ADDRESS_P
30668 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30670 #undef TARGET_LEGITIMATE_CONSTANT_P
30671 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30673 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30674 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30675 aarch64_legitimize_address_displacement
30677 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30678 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30680 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30681 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30682 aarch64_libgcc_floating_mode_supported_p
30684 #undef TARGET_MANGLE_TYPE
30685 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30687 #undef TARGET_INVALID_BINARY_OP
30688 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30690 #undef TARGET_VERIFY_TYPE_CONTEXT
30691 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30693 #undef TARGET_MEMORY_MOVE_COST
30694 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30696 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30697 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30699 #undef TARGET_MUST_PASS_IN_STACK
30700 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30702 /* This target hook should return true if accesses to volatile bitfields
30703 should use the narrowest mode possible. It should return false if these
30704 accesses should use the bitfield container type. */
30705 #undef TARGET_NARROW_VOLATILE_BITFIELD
30706 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
30708 #undef TARGET_OPTION_OVERRIDE
30709 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30711 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30712 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30713 aarch64_override_options_after_change
30715 #undef TARGET_OFFLOAD_OPTIONS
30716 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30718 #undef TARGET_OPTION_RESTORE
30719 #define TARGET_OPTION_RESTORE aarch64_option_restore
30721 #undef TARGET_OPTION_PRINT
30722 #define TARGET_OPTION_PRINT aarch64_option_print
30724 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30725 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30727 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30728 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30729 aarch64_option_valid_version_attribute_p
30731 #undef TARGET_SET_CURRENT_FUNCTION
30732 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30734 #undef TARGET_PASS_BY_REFERENCE
30735 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30737 #undef TARGET_PREFERRED_RELOAD_CLASS
30738 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30740 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30741 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30743 #undef TARGET_DWARF_FRAME_REG_MODE
30744 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30746 #undef TARGET_PROMOTED_TYPE
30747 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30749 #undef TARGET_SECONDARY_RELOAD
30750 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30752 #undef TARGET_SECONDARY_MEMORY_NEEDED
30753 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30755 #undef TARGET_SHIFT_TRUNCATION_MASK
30756 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30758 #undef TARGET_SETUP_INCOMING_VARARGS
30759 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30761 #undef TARGET_STRUCT_VALUE_RTX
30762 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30764 #undef TARGET_REGISTER_MOVE_COST
30765 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30767 #undef TARGET_RETURN_IN_MEMORY
30768 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30770 #undef TARGET_RETURN_IN_MSB
30771 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30773 #undef TARGET_RTX_COSTS
30774 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30776 #undef TARGET_INSN_COST
30777 #define TARGET_INSN_COST aarch64_insn_cost
30779 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30780 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30782 #undef TARGET_SCHED_ISSUE_RATE
30783 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
30785 #undef TARGET_SCHED_VARIABLE_ISSUE
30786 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
30788 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
30789 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
30790 aarch64_sched_first_cycle_multipass_dfa_lookahead
30792 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
30793 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
30794 aarch64_first_cycle_multipass_dfa_lookahead_guard
30796 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
30797 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
30798 aarch64_get_separate_components
30800 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
30801 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
30802 aarch64_components_for_bb
30804 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
30805 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
30806 aarch64_disqualify_components
30808 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
30809 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
30810 aarch64_emit_prologue_components
30812 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
30813 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
30814 aarch64_emit_epilogue_components
30816 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
30817 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
30818 aarch64_set_handled_components
30820 #undef TARGET_TRAMPOLINE_INIT
30821 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
30823 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
30824 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
30826 #undef TARGET_VECTOR_MODE_SUPPORTED_P
30827 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
30829 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
30830 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
30832 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
30833 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
30835 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
30836 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
30837 aarch64_builtin_support_vector_misalignment
30839 #undef TARGET_ARRAY_MODE
30840 #define TARGET_ARRAY_MODE aarch64_array_mode
30842 #undef TARGET_ARRAY_MODE_SUPPORTED_P
30843 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
30845 #undef TARGET_VECTORIZE_CREATE_COSTS
30846 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
30848 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
30849 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
30850 aarch64_builtin_vectorization_cost
30852 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
30853 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
30855 #undef TARGET_VECTORIZE_BUILTINS
30856 #define TARGET_VECTORIZE_BUILTINS
30858 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
30859 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
30860 aarch64_autovectorize_vector_modes
30862 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
30863 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
30864 aarch64_atomic_assign_expand_fenv
30866 /* Section anchor support. */
30868 #undef TARGET_MIN_ANCHOR_OFFSET
30869 #define TARGET_MIN_ANCHOR_OFFSET -256
30871 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
30872 byte offset; we can do much more for larger data types, but have no way
30873 to determine the size of the access. We assume accesses are aligned. */
30874 #undef TARGET_MAX_ANCHOR_OFFSET
30875 #define TARGET_MAX_ANCHOR_OFFSET 4095
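/* Taken together (an illustrative reading of the two limits), a section
   anchor at address A is usable for objects starting anywhere in
   [A - 256, A + 4095]: the signed 9-bit unscaled and unsigned 12-bit
   scaled byte-offset ranges of AArch64 load/store immediates.  */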
30877 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
30878 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
30879 aarch64_vectorize_preferred_div_as_shifts_over_mult
30881 #undef TARGET_VECTOR_ALIGNMENT
30882 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
30884 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
30885 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
30886 aarch64_vectorize_preferred_vector_alignment
30887 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
30888 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
30889 aarch64_simd_vector_alignment_reachable
30891 /* vec_perm support. */
30893 #undef TARGET_VECTORIZE_VEC_PERM_CONST
30894 #define TARGET_VECTORIZE_VEC_PERM_CONST \
30895 aarch64_vectorize_vec_perm_const
30897 #undef TARGET_VECTORIZE_RELATED_MODE
30898 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
30899 #undef TARGET_VECTORIZE_GET_MASK_MODE
30900 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
30901 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
30902 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
30903 aarch64_empty_mask_is_expensive
30904 #undef TARGET_PREFERRED_ELSE_VALUE
30905 #define TARGET_PREFERRED_ELSE_VALUE \
30906 aarch64_preferred_else_value
30908 #undef TARGET_INIT_LIBFUNCS
30909 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
30911 #undef TARGET_FIXED_CONDITION_CODE_REGS
30912 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
30914 #undef TARGET_FLAGS_REGNUM
30915 #define TARGET_FLAGS_REGNUM CC_REGNUM
30917 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
30918 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
30920 #undef TARGET_ASAN_SHADOW_OFFSET
30921 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
30923 #undef TARGET_LEGITIMIZE_ADDRESS
30924 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
30926 #undef TARGET_SCHED_CAN_SPECULATE_INSN
30927 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
30929 #undef TARGET_CAN_USE_DOLOOP_P
30930 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
30932 #undef TARGET_SCHED_ADJUST_PRIORITY
30933 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
30935 #undef TARGET_SCHED_MACRO_FUSION_P
30936 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
30938 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
30939 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
30941 #undef TARGET_SCHED_FUSION_PRIORITY
30942 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
30944 #undef TARGET_UNSPEC_MAY_TRAP_P
30945 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
30947 #undef TARGET_USE_PSEUDO_PIC_REG
30948 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
30950 #undef TARGET_PRINT_OPERAND
30951 #define TARGET_PRINT_OPERAND aarch64_print_operand
30953 #undef TARGET_PRINT_OPERAND_ADDRESS
30954 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
30956 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
30957 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
30959 #undef TARGET_OPTAB_SUPPORTED_P
30960 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
30962 #undef TARGET_OMIT_STRUCT_RETURN_REG
30963 #define TARGET_OMIT_STRUCT_RETURN_REG true
30965 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
30966 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
30967 aarch64_dwarf_poly_indeterminate_value
30969 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors.  */
30970 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
30971 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
30973 #undef TARGET_HARD_REGNO_NREGS
30974 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
30975 #undef TARGET_HARD_REGNO_MODE_OK
30976 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
30978 #undef TARGET_MODES_TIEABLE_P
30979 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
30981 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
30982 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
30983 aarch64_hard_regno_call_part_clobbered
30985 #undef TARGET_INSN_CALLEE_ABI
30986 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
30988 #undef TARGET_CONSTANT_ALIGNMENT
30989 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
30991 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
30992 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
30993 aarch64_stack_clash_protection_alloca_probe_range
30995 #undef TARGET_COMPUTE_PRESSURE_CLASSES
30996 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
30998 #undef TARGET_CAN_CHANGE_MODE_CLASS
30999 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
31001 #undef TARGET_SELECT_EARLY_REMAT_MODES
31002 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
31004 #undef TARGET_SPECULATION_SAFE_VALUE
31005 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
31007 #undef TARGET_ESTIMATED_POLY_VALUE
31008 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
31010 #undef TARGET_ATTRIBUTE_TABLE
31011 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
31013 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
31014 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
31015 aarch64_simd_clone_compute_vecsize_and_simdlen
31017 #undef TARGET_SIMD_CLONE_ADJUST
31018 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
31020 #undef TARGET_SIMD_CLONE_USABLE
31021 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
31023 #undef TARGET_COMP_TYPE_ATTRIBUTES
31024 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
31026 #undef TARGET_MERGE_DECL_ATTRIBUTES
31027 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
31029 #undef TARGET_GET_MULTILIB_ABI_NAME
31030 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
31032 #undef TARGET_FNTYPE_ABI
31033 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
31035 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
31036 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
31038 #if CHECKING_P
31039 #undef TARGET_RUN_TARGET_SELFTESTS
31040 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
31041 #endif /* #if CHECKING_P */
31043 #undef TARGET_ASM_POST_CFI_STARTPROC
31044 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
31046 #undef TARGET_STRICT_ARGUMENT_NAMING
31047 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
31049 #undef TARGET_MODE_EMIT
31050 #define TARGET_MODE_EMIT aarch64_mode_emit
31052 #undef TARGET_MODE_NEEDED
31053 #define TARGET_MODE_NEEDED aarch64_mode_needed
31055 #undef TARGET_MODE_AFTER
31056 #define TARGET_MODE_AFTER aarch64_mode_after
31058 #undef TARGET_MODE_CONFLUENCE
31059 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
31061 #undef TARGET_MODE_BACKPROP
31062 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
31064 #undef TARGET_MODE_ENTRY
31065 #define TARGET_MODE_ENTRY aarch64_mode_entry
31067 #undef TARGET_MODE_EXIT
31068 #define TARGET_MODE_EXIT aarch64_mode_exit
31070 #undef TARGET_MODE_EH_HANDLER
31071 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
31073 #undef TARGET_MODE_PRIORITY
31074 #define TARGET_MODE_PRIORITY aarch64_mode_priority
31076 #undef TARGET_MD_ASM_ADJUST
31077 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31079 #undef TARGET_ASM_FILE_END
31080 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31082 #undef TARGET_ASM_FUNCTION_EPILOGUE
31083 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31085 #undef TARGET_HAVE_SHADOW_CALL_STACK
31086 #define TARGET_HAVE_SHADOW_CALL_STACK true
31088 #undef TARGET_CONST_ANCHOR
31089 #define TARGET_CONST_ANCHOR 0x1000000
31091 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31092 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31094 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31095 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31097 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31098 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31100 #undef TARGET_OPTION_FUNCTION_VERSIONS
31101 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31103 #undef TARGET_COMPARE_VERSION_PRIORITY
31104 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31106 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31107 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31108 aarch64_generate_version_dispatcher_body
31110 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31111 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31112 aarch64_get_function_versions_dispatcher
31114 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31115 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
31117 struct gcc_target targetm = TARGET_INITIALIZER;
31119 #include "gt-aarch64.h"