1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2024 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #define INCLUDE_VECTOR
26 #include "config.h"
27 #include "system.h"
28 #include "coretypes.h"
29 #include "backend.h"
30 #include "target.h"
31 #include "rtl.h"
32 #include "tree.h"
33 #include "memmodel.h"
34 #include "gimple.h"
35 #include "cfghooks.h"
36 #include "cfgloop.h"
37 #include "df.h"
38 #include "tm_p.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "optabs.h"
42 #include "regs.h"
43 #include "emit-rtl.h"
44 #include "recog.h"
45 #include "cgraph.h"
46 #include "diagnostic.h"
47 #include "insn-attr.h"
48 #include "alias.h"
49 #include "fold-const.h"
50 #include "stor-layout.h"
51 #include "calls.h"
52 #include "varasm.h"
53 #include "output.h"
54 #include "flags.h"
55 #include "explow.h"
56 #include "expr.h"
57 #include "reload.h"
58 #include "langhooks.h"
59 #include "opts.h"
60 #include "gimplify.h"
61 #include "dwarf2.h"
62 #include "gimple-iterator.h"
63 #include "tree-vectorizer.h"
64 #include "aarch64-cost-tables.h"
65 #include "dumpfile.h"
66 #include "builtins.h"
67 #include "rtl-iter.h"
68 #include "tm-constrs.h"
69 #include "sched-int.h"
70 #include "target-globals.h"
71 #include "common/common-target.h"
72 #include "cfgrtl.h"
73 #include "selftest.h"
74 #include "selftest-rtl.h"
75 #include "rtx-vector-builder.h"
76 #include "intl.h"
77 #include "expmed.h"
78 #include "function-abi.h"
79 #include "gimple-pretty-print.h"
80 #include "tree-ssa-loop-niter.h"
81 #include "fractional-cost.h"
82 #include "rtlanal.h"
83 #include "tree-dfa.h"
84 #include "asan.h"
85 #include "aarch64-feature-deps.h"
86 #include "config/arm/aarch-common.h"
87 #include "config/arm/aarch-common-protos.h"
88 #include "common/config/aarch64/cpuinfo.h"
89 #include "ssa.h"
90 #include "except.h"
91 #include "tree-pass.h"
92 #include "cfgbuild.h"
93 #include "symbol-summary.h"
94 #include "sreal.h"
95 #include "ipa-cp.h"
96 #include "ipa-prop.h"
97 #include "ipa-fnsummary.h"
98 #include "hash-map.h"
100 /* This file should be included last. */
101 #include "target-def.h"
103 /* Defined for convenience. */
104 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
106 /* Maximum bytes set for an inline memset expansion. With -Os use 3 STP
107 and 1 MOVI/DUP (same size as a call). */
108 #define MAX_SET_SIZE(speed) (speed ? 256 : 96)
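/* Editorial note, not part of the original file: a worked example of the
   -Os limit above.  Each STP of two Q registers stores 32 bytes, so the
   96-byte limit corresponds to 3 such STPs (3 * 32 = 96) plus one MOVI/DUP
   to materialize the value, which is roughly the same code size as a call
   to memset.  */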
110 /* Flags that describe how a function shares certain architectural state
111 with its callers.
113 - AARCH64_STATE_SHARED indicates that the function does share the state
114 with callers.
116 - AARCH64_STATE_IN indicates that the function reads (or might read) the
117 incoming state. The converse is that the function ignores the incoming
118 state.
120 - AARCH64_STATE_OUT indicates that the function returns new state.
121 The converse is that the state on return is the same as it was on entry.
123 A function that partially modifies the state treats it as both IN
124 and OUT (because the value on return depends to some extent on the
125 value on input). */
126 constexpr auto AARCH64_STATE_SHARED = 1U << 0;
127 constexpr auto AARCH64_STATE_IN = 1U << 1;
128 constexpr auto AARCH64_STATE_OUT = 1U << 2;
130 /* Information about a legitimate vector immediate operand. */
131 struct simd_immediate_info
133 enum insn_type { MOV, MVN, INDEX, PTRUE };
134 enum modifier_type { LSL, MSL };
136 simd_immediate_info () {}
137 simd_immediate_info (scalar_float_mode, rtx);
138 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
139 insn_type = MOV, modifier_type = LSL,
140 unsigned int = 0);
141 simd_immediate_info (scalar_mode, rtx, rtx);
142 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
144 /* The mode of the elements. */
145 scalar_mode elt_mode;
147 /* The instruction to use to move the immediate into a vector. */
148 insn_type insn;
150 union
152 /* For MOV and MVN. */
153 struct
155 /* The value of each element. */
156 rtx value;
158 /* The kind of shift modifier to use, and the number of bits to shift.
159 This is (LSL, 0) if no shift is needed. */
160 modifier_type modifier;
161 unsigned int shift;
162 } mov;
164 /* For INDEX. */
165 struct
167 /* The value of the first element and the step to be added for each
168 subsequent element. */
169 rtx base, step;
170 } index;
172 /* For PTRUE. */
173 aarch64_svpattern pattern;
174 } u;
177 /* Construct a floating-point immediate in which each element has mode
178 ELT_MODE_IN and value VALUE_IN. */
179 inline simd_immediate_info
180 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
181 : elt_mode (elt_mode_in), insn (MOV)
183 u.mov.value = value_in;
184 u.mov.modifier = LSL;
185 u.mov.shift = 0;
188 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
189 and value VALUE_IN. The other parameters are as for the structure
190 fields. */
191 inline simd_immediate_info
192 ::simd_immediate_info (scalar_int_mode elt_mode_in,
193 unsigned HOST_WIDE_INT value_in,
194 insn_type insn_in, modifier_type modifier_in,
195 unsigned int shift_in)
196 : elt_mode (elt_mode_in), insn (insn_in)
198 u.mov.value = gen_int_mode (value_in, elt_mode_in);
199 u.mov.modifier = modifier_in;
200 u.mov.shift = shift_in;
203 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
204 and where element I is equal to BASE_IN + I * STEP_IN. */
205 inline simd_immediate_info
206 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
207 : elt_mode (elt_mode_in), insn (INDEX)
209 u.index.base = base_in;
210 u.index.step = step_in;
213 /* Construct a predicate that controls elements of mode ELT_MODE_IN
214 and has PTRUE pattern PATTERN_IN. */
215 inline simd_immediate_info
216 ::simd_immediate_info (scalar_int_mode elt_mode_in,
217 aarch64_svpattern pattern_in)
218 : elt_mode (elt_mode_in), insn (PTRUE)
220 u.pattern = pattern_in;
223 namespace {
225 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
226 class pure_scalable_type_info
228 public:
229 /* Represents the result of analyzing a type. All values are nonzero,
230 in the possibly forlorn hope that accidental conversions to bool
231 trigger a warning. */
232 enum analysis_result
234 /* The type does not have an ABI identity; i.e. it doesn't contain
235 at least one object whose type is a Fundamental Data Type. */
236 NO_ABI_IDENTITY = 1,
238 /* The type is definitely a Pure Scalable Type. */
239 IS_PST,
241 /* The type is definitely not a Pure Scalable Type. */
242 ISNT_PST,
244 /* It doesn't matter for PCS purposes whether the type is a Pure
245 Scalable Type or not, since the type will be handled the same
246 way regardless.
248 Specifically, this means that if the type is a Pure Scalable Type,
249 there aren't enough argument registers to hold it, and so it will
250 need to be passed or returned in memory. If the type isn't a
251 Pure Scalable Type, it's too big to be passed or returned in core
252 or SIMD&FP registers, and so again will need to go in memory. */
253 DOESNT_MATTER
256 /* Aggregates of 17 bytes or more are normally passed and returned
257 in memory, so aggregates of that size can safely be analyzed as
258 DOESNT_MATTER. We need to be able to collect enough pieces to
259 represent a PST that is smaller than that. Since predicates are
260 2 bytes in size for -msve-vector-bits=128, that means we need to be
261 able to store at least 8 pieces.
263 We also need to be able to store enough pieces to represent
264 a single vector in each vector argument register and a single
265 predicate in each predicate argument register. This means that
266 we need at least 12 pieces. */
267 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
268 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
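/* Editorial note, not part of the original file: with the AAPCS64 vector
   argument registers z0-z7 and predicate argument registers p0-p3,
   NUM_FP_ARG_REGS is 8 and NUM_PR_ARG_REGS is 4, so MAX_PIECES evaluates
   to 12, which satisfies both requirements above (at least 8 predicate
   pieces, and one piece per argument register).  */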
270 /* Describes one piece of a PST. Each piece is one of:
272 - a single Scalable Vector Type (SVT)
273 - a single Scalable Predicate Type (SPT)
274 - a PST containing 2, 3 or 4 SVTs, with no padding
276 It either represents a single built-in type or a PST formed from
277 multiple homogeneous built-in types. */
278 struct piece
280 rtx get_rtx (unsigned int, unsigned int) const;
282 /* The number of vector and predicate registers that the piece
283 occupies. One of the two is always zero. */
284 unsigned int num_zr;
285 unsigned int num_pr;
287 /* The mode of the registers described above. */
288 machine_mode mode;
290 /* If this piece is formed from multiple homogeneous built-in types,
291 this is the mode of the built-in types, otherwise it is MODE. */
292 machine_mode orig_mode;
294 /* The offset in bytes of the piece from the start of the type. */
295 poly_uint64 offset;
298 /* Divides types analyzed as IS_PST into individual pieces. The pieces
299 are in memory order. */
300 auto_vec<piece, MAX_PIECES> pieces;
302 unsigned int num_zr () const;
303 unsigned int num_pr () const;
305 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
307 analysis_result analyze (const_tree);
308 bool analyze_registers (const_tree);
310 private:
311 analysis_result analyze_array (const_tree);
312 analysis_result analyze_record (const_tree);
313 void add_piece (const piece &);
317 /* The current code model. */
318 enum aarch64_code_model aarch64_cmodel;
320 enum aarch64_tp_reg aarch64_tpidr_register;
322 /* The number of 64-bit elements in an SVE vector. */
323 poly_uint16 aarch64_sve_vg;
325 #ifdef HAVE_AS_TLS
326 #undef TARGET_HAVE_TLS
327 #define TARGET_HAVE_TLS 1
328 #endif
330 static bool aarch64_composite_type_p (const_tree, machine_mode);
331 static bool aarch64_return_in_memory_1 (const_tree);
332 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
333 const_tree,
334 machine_mode *, int *,
335 bool *, bool);
336 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
337 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
338 static void aarch64_override_options_after_change (void);
339 static bool aarch64_vector_mode_supported_p (machine_mode);
340 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
341 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
342 const_tree type,
343 int misalignment,
344 bool is_packed);
345 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
346 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
347 aarch64_addr_query_type);
349 /* The processor for which instructions should be scheduled. */
350 enum aarch64_processor aarch64_tune = cortexa53;
352 /* Mask to specify which instruction scheduling options should be used. */
353 uint64_t aarch64_tune_flags = 0;
355 /* Global flag for PC relative loads. */
356 bool aarch64_pcrelative_literal_loads;
358 /* Global flag for whether frame pointer is enabled. */
359 bool aarch64_use_frame_pointer;
361 /* Support for command line parsing of boolean flags in the tuning
362 structures. */
363 struct aarch64_flag_desc
365 const char* name;
366 unsigned int flag;
369 #define AARCH64_FUSION_PAIR(name, internal_name) \
370 { name, AARCH64_FUSE_##internal_name },
371 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
373 { "none", AARCH64_FUSE_NOTHING },
374 #include "aarch64-fusion-pairs.def"
375 { "all", AARCH64_FUSE_ALL },
376 { NULL, AARCH64_FUSE_NOTHING }
379 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
380 { name, AARCH64_EXTRA_TUNE_##internal_name },
381 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
383 { "none", AARCH64_EXTRA_TUNE_NONE },
384 #include "aarch64-tuning-flags.def"
385 { "all", AARCH64_EXTRA_TUNE_ALL },
386 { NULL, AARCH64_EXTRA_TUNE_NONE }
389 /* Tuning parameters. */
390 #include "tuning_models/generic.h"
391 #include "tuning_models/generic_armv8_a.h"
392 #include "tuning_models/generic_armv9_a.h"
393 #include "tuning_models/cortexa35.h"
394 #include "tuning_models/cortexa53.h"
395 #include "tuning_models/cortexa57.h"
396 #include "tuning_models/cortexa72.h"
397 #include "tuning_models/cortexa73.h"
398 #include "tuning_models/exynosm1.h"
399 #include "tuning_models/thunderxt88.h"
400 #include "tuning_models/thunderx.h"
401 #include "tuning_models/tsv110.h"
402 #include "tuning_models/xgene1.h"
403 #include "tuning_models/emag.h"
404 #include "tuning_models/qdf24xx.h"
405 #include "tuning_models/saphira.h"
406 #include "tuning_models/thunderx2t99.h"
407 #include "tuning_models/thunderx3t110.h"
408 #include "tuning_models/neoversen1.h"
409 #include "tuning_models/ampere1.h"
410 #include "tuning_models/ampere1a.h"
411 #include "tuning_models/ampere1b.h"
412 #include "tuning_models/neoversev1.h"
413 #include "tuning_models/neoverse512tvb.h"
414 #include "tuning_models/neoversen2.h"
415 #include "tuning_models/neoversev2.h"
416 #include "tuning_models/a64fx.h"
418 /* Support for fine-grained override of the tuning structures. */
419 struct aarch64_tuning_override_function
421 const char* name;
422 void (*parse_override)(const char*, struct tune_params*);
425 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
426 static void aarch64_parse_tune_string (const char*, struct tune_params*);
427 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
429 static const struct aarch64_tuning_override_function
430 aarch64_tuning_override_functions[] =
432 { "fuse", aarch64_parse_fuse_string },
433 { "tune", aarch64_parse_tune_string },
434 { "sve_width", aarch64_parse_sve_width_string },
435 { NULL, NULL }
438 /* A processor implementing AArch64. */
439 struct processor
441 const char *name;
442 aarch64_processor ident;
443 aarch64_processor sched_core;
444 aarch64_arch arch;
445 aarch64_feature_flags flags;
446 const tune_params *tune;
449 /* Architectures implementing AArch64. */
450 static CONSTEXPR const processor all_architectures[] =
452 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
453 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
454 feature_deps::ARCH_IDENT ().enable, NULL},
455 #include "aarch64-arches.def"
456 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
459 /* Processor cores implementing AArch64. */
460 static const struct processor all_cores[] =
462 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
463 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
464 feature_deps::cpu_##IDENT, &COSTS##_tunings},
465 #include "aarch64-cores.def"
466 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
468 /* Internal representation of system registers. */
469 typedef struct {
470 const char *name;
471 /* Stringified sysreg encoding values, represented as
472 s<sn>_<op1>_c<cn>_c<cm>_<op2>. */
473 const char *encoding;
474 /* Flags affecting sysreg usage, such as read/write-only. */
475 unsigned properties;
476 /* Architectural features implied by sysreg. */
477 aarch64_feature_flags arch_reqs;
478 } sysreg_t;
480 /* An aarch64_feature_set initializer for a single feature,
481 AARCH64_FEATURE_<FEAT>. */
482 #define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
484 /* Used by AARCH64_FEATURES. */
485 #define AARCH64_OR_FEATURES_1(X, F1) \
486 AARCH64_FEATURE (F1)
487 #define AARCH64_OR_FEATURES_2(X, F1, F2) \
488 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
489 #define AARCH64_OR_FEATURES_3(X, F1, ...) \
490 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
492 /* An aarch64_feature_set initializer for the N features listed in "...". */
493 #define AARCH64_FEATURES(N, ...) \
494 AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
496 #define AARCH64_NO_FEATURES 0
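/* Editorial sketch, not part of the original file; the feature names are
   purely illustrative.  The macros above simply OR together the
   corresponding AARCH64_FL_* flags, e.g.:

     AARCH64_FEATURES (2, SVE, SVE2)
       => AARCH64_OR_FEATURES_2 (0, SVE, SVE2)
       => (AARCH64_FEATURE (SVE) | AARCH64_OR_FEATURES_1 (0, SVE2))
       => (AARCH64_FL_SVE | AARCH64_FL_SVE2)  */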
498 /* Flags associated with the properties of system registers. It mainly serves
499 to mark particular registers as read or write only. */
500 #define F_DEPRECATED (1 << 1)
501 #define F_REG_READ (1 << 2)
502 #define F_REG_WRITE (1 << 3)
503 #define F_ARCHEXT (1 << 4)
504 /* Flag indicating register name is alias for another system register. */
505 #define F_REG_ALIAS (1 << 5)
506 /* Flag indicating registers which may be implemented with 128 bits. */
507 #define F_REG_128 (1 << 6)
509 /* Database of system registers, their encodings and architectural
510 requirements. */
511 const sysreg_t aarch64_sysregs[] =
513 #define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
514 #define SYSREG(NAME, ENC, FLAGS, ARCH) \
515 { NAME, ENC, FLAGS, ARCH },
516 #include "aarch64-sys-regs.def"
517 #undef CPENC
520 #undef AARCH64_NO_FEATURES
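/* Editorial sketch, not part of the original file: the CPENC macro used
   above stringizes a system register encoding, for example

     CPENC (3, 3, 13, 0, 2)  =>  "s3_3_c13_c0_2"

   which is the generic s<op0>_<op1>_c<Cn>_c<Cm>_<op2> spelling accepted by
   the assembler for system registers.  */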
522 using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
523 static sysreg_map_t *sysreg_map = nullptr;
525 /* Map system register names to their hardware metadata: encoding,
526 feature flags and architectural feature requirements, all of which
527 are encoded in a sysreg_t struct. */
528 void
529 aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
531 bool dup = sysreg_map->put (name, metadata);
532 gcc_checking_assert (!dup);
535 /* Lazily initialize hash table for system register validation,
536 checking the validity of supplied register name and returning
537 register's associated metadata. */
538 static void
539 aarch64_init_sysregs (void)
541 gcc_assert (!sysreg_map);
542 sysreg_map = new sysreg_map_t;
545 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
547 const sysreg_t *reg = aarch64_sysregs + i;
548 aarch64_register_sysreg (reg->name, reg);
552 /* No direct access to the sysreg hash-map should be made. Doing so
553 risks trying to access an uninitialized hash-map, and dereferencing the
554 returned double pointer without due care risks dereferencing a
555 null pointer. */
556 const sysreg_t *
557 aarch64_lookup_sysreg_map (const char *regname)
559 if (!sysreg_map)
560 aarch64_init_sysregs ();
562 const sysreg_t **sysreg_entry = sysreg_map->get (regname);
563 if (sysreg_entry != NULL)
564 return *sysreg_entry;
565 return NULL;
568 /* The current tuning set. */
569 struct tune_params aarch64_tune_params = generic_tunings;
571 /* If NAME is the name of an arm:: attribute that describes shared state,
572 return its associated AARCH64_STATE_* flags, otherwise return 0. */
573 static unsigned int
574 aarch64_attribute_shared_state_flags (const char *name)
576 if (strcmp (name, "in") == 0)
577 return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
578 if (strcmp (name, "inout") == 0)
579 return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
580 if (strcmp (name, "out") == 0)
581 return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
582 if (strcmp (name, "preserves") == 0)
583 return AARCH64_STATE_SHARED;
584 return 0;
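/* Editorial examples, not part of the original file, of the mapping
   implemented above for the [[arm::...]] attributes handled later in this
   file:

     [[arm::in("za")]]        -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
     [[arm::out("za")]]       -> AARCH64_STATE_SHARED | AARCH64_STATE_OUT
     [[arm::inout("za")]]     -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
                                 | AARCH64_STATE_OUT
     [[arm::preserves("za")]] -> AARCH64_STATE_SHARED  */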
587 /* See whether attribute list ATTRS has any sharing information
588 for state STATE_NAME. Return the associated state flags if so,
589 otherwise return 0. */
590 static unsigned int
591 aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
593 for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
595 if (!cxx11_attribute_p (attr))
596 continue;
598 auto ns = IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr)));
599 if (strcmp (ns, "arm") != 0)
600 continue;
602 auto attr_name = IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr)));
603 auto flags = aarch64_attribute_shared_state_flags (attr_name);
604 if (!flags)
605 continue;
607 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
609 tree value = TREE_VALUE (arg);
610 if (TREE_CODE (value) == STRING_CST
611 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
612 return flags;
615 return 0;
618 /* Return true if DECL creates a new scope for state STATE_STRING. */
619 static bool
620 aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
622 if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
623 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
625 tree value = TREE_VALUE (arg);
626 if (TREE_CODE (value) == STRING_CST
627 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
628 return true;
630 return false;
633 /* Return true if attribute argument VALUE is a recognized state string,
634 otherwise report an error. NAME is the name of the attribute to which
635 VALUE is being passed. */
636 static bool
637 aarch64_check_state_string (tree name, tree value)
639 if (TREE_CODE (value) != STRING_CST)
641 error ("the arguments to %qE must be constant strings", name);
642 return false;
645 const char *state_name = TREE_STRING_POINTER (value);
646 if (strcmp (state_name, "za") != 0
647 && strcmp (state_name, "zt0") != 0)
649 error ("unrecognized state string %qs", state_name);
650 return false;
653 return true;
656 /* qsort callback to compare two STRING_CSTs. */
657 static int
658 cmp_string_csts (const void *a, const void *b)
660 return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
661 TREE_STRING_POINTER (*(const_tree const *) b));
664 /* Canonicalize a list of state strings. ARGS contains the arguments to
665 a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
666 of the same type. If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
667 arguments and drop the new attribute. Otherwise, the new attribute must
668 be kept and ARGS must include the information in OLD_ATTR.
670 In both cases, the new arguments must be a sorted list of state strings
671 with duplicates removed.
673 Return true if new attribute should be kept, false if it should be
674 dropped. */
675 static bool
676 aarch64_merge_string_arguments (tree args, tree old_attr,
677 bool can_merge_in_place)
679 /* Get a sorted list of all state strings (including duplicates). */
680 auto add_args = [](vec<tree> &strings, const_tree args)
682 for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
683 if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
684 strings.safe_push (TREE_VALUE (arg));
686 auto_vec<tree, 16> strings;
687 add_args (strings, args);
688 if (old_attr)
689 add_args (strings, TREE_VALUE (old_attr));
690 strings.qsort (cmp_string_csts);
692 /* The list can be empty if there was no previous attribute and if all
693 the new arguments are erroneous. Drop the attribute in that case. */
694 if (strings.is_empty ())
695 return false;
697 /* Destructively modify one of the argument lists, removing duplicates
698 on the fly. */
699 bool use_old_attr = old_attr && can_merge_in_place;
700 tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
701 tree prev = NULL_TREE;
702 for (tree arg : strings)
704 if (prev && simple_cst_equal (arg, prev))
705 continue;
706 prev = arg;
707 if (!*end)
708 *end = tree_cons (NULL_TREE, arg, NULL_TREE);
709 else
710 TREE_VALUE (*end) = arg;
711 end = &TREE_CHAIN (*end);
713 *end = NULL_TREE;
714 return !use_old_attr;
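/* Editorial example, not part of the original file: if a declaration
   already carries [[arm::new("za")]] and a later [[arm::new("zt0", "za")]]
   is seen, the merged argument list is the sorted, deduplicated
   { "za", "zt0" }.  With CAN_MERGE_IN_PLACE the old attribute is updated
   in place and the function returns false, so the new attribute is
   dropped.  */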
717 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
719 static tree
720 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
721 int, bool *no_add_attrs)
723 /* Since we set fn_type_req to true, the caller should have checked
724 this for us. */
725 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
726 switch ((arm_pcs) fntype_abi (*node).id ())
728 case ARM_PCS_AAPCS64:
729 case ARM_PCS_SIMD:
730 return NULL_TREE;
732 case ARM_PCS_SVE:
733 error ("the %qE attribute cannot be applied to an SVE function type",
734 name);
735 *no_add_attrs = true;
736 return NULL_TREE;
738 case ARM_PCS_TLSDESC:
739 case ARM_PCS_UNKNOWN:
740 break;
742 gcc_unreachable ();
745 /* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
746 otherwise report an error. */
747 static bool
748 aarch64_check_arm_new_against_type (tree args, tree decl)
750 tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
751 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
753 tree value = TREE_VALUE (arg);
754 if (TREE_CODE (value) == STRING_CST)
756 const char *state_name = TREE_STRING_POINTER (value);
757 if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
759 error_at (DECL_SOURCE_LOCATION (decl),
760 "cannot create a new %qs scope since %qs is shared"
761 " with callers", state_name, state_name);
762 return false;
766 return true;
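/* Editorial example, not part of the original file: if a function type
   already carries [[arm::inout("za")]], then adding [[arm::new("za")]] to
   its definition is rejected here, since "za" is shared with callers and
   therefore cannot also begin life as a new private scope.  */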
769 /* Callback for arm::new attributes. */
770 static tree
771 handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
773 tree decl = *node;
774 if (TREE_CODE (decl) != FUNCTION_DECL)
776 error ("%qE attribute applies only to function definitions", name);
777 *no_add_attrs = true;
778 return NULL_TREE;
780 if (TREE_TYPE (decl) == error_mark_node)
782 *no_add_attrs = true;
783 return NULL_TREE;
786 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
787 aarch64_check_state_string (name, TREE_VALUE (arg));
789 if (!aarch64_check_arm_new_against_type (args, decl))
791 *no_add_attrs = true;
792 return NULL_TREE;
795 /* If there is an old attribute, we should try to update it in-place,
796 so that there is only one (definitive) arm::new attribute on the decl. */
797 tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
798 if (!aarch64_merge_string_arguments (args, old_attr, true))
799 *no_add_attrs = true;
801 return NULL_TREE;
804 /* Callback for arm::{in,out,inout,preserves} attributes. */
805 static tree
806 handle_arm_shared (tree *node, tree name, tree args,
807 int, bool *no_add_attrs)
809 tree type = *node;
810 tree old_attrs = TYPE_ATTRIBUTES (type);
811 auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
812 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
814 tree value = TREE_VALUE (arg);
815 if (aarch64_check_state_string (name, value))
817 const char *state_name = TREE_STRING_POINTER (value);
818 auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
819 state_name);
820 if (old_flags && old_flags != flags)
822 error ("inconsistent attributes for state %qs", state_name);
823 *no_add_attrs = true;
824 return NULL_TREE;
829 /* We can't update an old attribute in-place, since types are shared.
830 Instead make sure that this new attribute contains all the
831 information, so that the old attribute becomes redundant. */
832 tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
833 old_attrs);
834 if (!aarch64_merge_string_arguments (args, old_attr, false))
835 *no_add_attrs = true;
837 return NULL_TREE;
840 /* Mutually-exclusive function type attributes for controlling PSTATE.SM. */
841 static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
843 /* Attribute name exclusion applies to:
844 function, type, variable */
845 { "streaming", false, true, false },
846 { "streaming_compatible", false, true, false },
847 { NULL, false, false, false }
850 /* Table of machine attributes. */
851 static const attribute_spec aarch64_gnu_attributes[] =
853 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
854 affects_type_identity, handler, exclude } */
855 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
856 handle_aarch64_vector_pcs_attribute, NULL },
857 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
858 aarch64_sve::handle_arm_sve_vector_bits_attribute,
859 NULL },
860 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
861 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
862 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL }
865 static const scoped_attribute_specs aarch64_gnu_attribute_table =
867 "gnu", { aarch64_gnu_attributes }
870 static const attribute_spec aarch64_arm_attributes[] =
872 { "streaming", 0, 0, false, true, true, true,
873 NULL, attr_streaming_exclusions },
874 { "streaming_compatible", 0, 0, false, true, true, true,
875 NULL, attr_streaming_exclusions },
876 { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
877 { "new", 1, -1, true, false, false, false,
878 handle_arm_new, NULL },
879 { "preserves", 1, -1, false, true, true, true,
880 handle_arm_shared, NULL },
881 { "in", 1, -1, false, true, true, true,
882 handle_arm_shared, NULL },
883 { "out", 1, -1, false, true, true, true,
884 handle_arm_shared, NULL },
885 { "inout", 1, -1, false, true, true, true,
886 handle_arm_shared, NULL }
889 static const scoped_attribute_specs aarch64_arm_attribute_table =
891 "arm", { aarch64_arm_attributes }
894 static const scoped_attribute_specs *const aarch64_attribute_table[] =
896 &aarch64_gnu_attribute_table,
897 &aarch64_arm_attribute_table
900 typedef enum aarch64_cond_code
902 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
903 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
904 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
906 aarch64_cc;
908 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
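/* Editorial examples, not part of the original file: the enum above lists
   the codes in inverse pairs (eq/ne, cs/cc, ..., gt/le), so flipping the
   low bit inverts a condition:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */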
911 /* The condition codes of the processor, and the inverse function. */
912 static const char * const aarch64_condition_codes[] =
914 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
915 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
918 /* The preferred condition codes for SVE conditions. */
919 static const char *const aarch64_sve_condition_codes[] =
921 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
922 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
925 /* Return the assembly token for svpattern value PATTERN. */
927 static const char *
928 svpattern_token (enum aarch64_svpattern pattern)
930 switch (pattern)
932 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
933 AARCH64_FOR_SVPATTERN (CASE)
934 #undef CASE
935 case AARCH64_NUM_SVPATTERNS:
936 break;
938 gcc_unreachable ();
941 /* Return the location of a piece that is known to be passed or returned
942 in registers. FIRST_ZR is the first unused vector argument register
943 and FIRST_PR is the first unused predicate argument register. */
946 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
947 unsigned int first_pr) const
949 gcc_assert (VECTOR_MODE_P (mode)
950 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
951 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
953 if (num_zr > 0 && num_pr == 0)
954 return gen_rtx_REG (mode, first_zr);
956 if (num_zr == 0 && num_pr <= 2)
957 return gen_rtx_REG (mode, first_pr);
959 gcc_unreachable ();
962 /* Return the total number of vector registers required by the PST. */
964 unsigned int
965 pure_scalable_type_info::num_zr () const
967 unsigned int res = 0;
968 for (unsigned int i = 0; i < pieces.length (); ++i)
969 res += pieces[i].num_zr;
970 return res;
973 /* Return the total number of predicate registers required by the PST. */
975 unsigned int
976 pure_scalable_type_info::num_pr () const
978 unsigned int res = 0;
979 for (unsigned int i = 0; i < pieces.length (); ++i)
980 res += pieces[i].num_pr;
981 return res;
984 /* Return the location of a PST that is known to be passed or returned
985 in registers. FIRST_ZR is the first unused vector argument register
986 and FIRST_PR is the first unused predicate argument register. */
989 pure_scalable_type_info::get_rtx (machine_mode mode,
990 unsigned int first_zr,
991 unsigned int first_pr) const
993 /* Try to return a single REG if possible. This leads to better
994 code generation; it isn't required for correctness. */
995 if (mode == pieces[0].mode)
997 gcc_assert (pieces.length () == 1);
998 return pieces[0].get_rtx (first_zr, first_pr);
1001 /* Build up a PARALLEL that contains the individual pieces. */
1002 rtvec rtxes = rtvec_alloc (pieces.length ());
1003 for (unsigned int i = 0; i < pieces.length (); ++i)
1005 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1006 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1007 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1008 first_zr += pieces[i].num_zr;
1009 first_pr += pieces[i].num_pr;
1011 return gen_rtx_PARALLEL (mode, rtxes);
1014 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1015 in the AAPCS64. */
1017 pure_scalable_type_info::analysis_result
1018 pure_scalable_type_info::analyze (const_tree type)
1020 /* Prevent accidental reuse. */
1021 gcc_assert (pieces.is_empty ());
1023 /* No code will be generated for erroneous types, so we won't establish
1024 an ABI mapping. */
1025 if (type == error_mark_node)
1026 return NO_ABI_IDENTITY;
1028 /* Zero-sized types disappear in the language->ABI mapping. */
1029 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1030 return NO_ABI_IDENTITY;
1032 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1033 piece p = {};
1034 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1036 machine_mode mode = TYPE_MODE_RAW (type);
1037 gcc_assert (VECTOR_MODE_P (mode)
1038 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1040 p.mode = p.orig_mode = mode;
1041 add_piece (p);
1042 return IS_PST;
1045 /* Check for user-defined PSTs. */
1046 if (TREE_CODE (type) == ARRAY_TYPE)
1047 return analyze_array (type);
1048 if (TREE_CODE (type) == RECORD_TYPE)
1049 return analyze_record (type);
1051 return ISNT_PST;
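/* Editorial examples, not part of the original file: the ACLE types
   svfloat32_t and svbool_t analyze as IS_PST via the built-in check above
   (one vector piece and one predicate piece respectively), whereas a plain
   "double", or a struct containing only scalar fields, analyzes as
   ISNT_PST.  */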
1054 /* Analyze a type that is known not to be passed or returned in memory.
1055 Return true if it has an ABI identity and is a Pure Scalable Type. */
1057 bool
1058 pure_scalable_type_info::analyze_registers (const_tree type)
1060 analysis_result result = analyze (type);
1061 gcc_assert (result != DOESNT_MATTER);
1062 return result == IS_PST;
1065 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1067 pure_scalable_type_info::analysis_result
1068 pure_scalable_type_info::analyze_array (const_tree type)
1070 /* Analyze the element type. */
1071 pure_scalable_type_info element_info;
1072 analysis_result result = element_info.analyze (TREE_TYPE (type));
1073 if (result != IS_PST)
1074 return result;
1076 /* An array of unknown, flexible or variable length will be passed and
1077 returned by reference whatever we do. */
1078 tree nelts_minus_one = array_type_nelts (type);
1079 if (!tree_fits_uhwi_p (nelts_minus_one))
1080 return DOESNT_MATTER;
1082 /* Likewise if the array is constant-sized but too big to be interesting.
1083 The double checks against MAX_PIECES are to protect against overflow. */
1084 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1085 if (count > MAX_PIECES)
1086 return DOESNT_MATTER;
1087 count += 1;
1088 if (count * element_info.pieces.length () > MAX_PIECES)
1089 return DOESNT_MATTER;
1091 /* The above checks should have weeded out elements of unknown size. */
1092 poly_uint64 element_bytes;
1093 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1094 gcc_unreachable ();
1096 /* Build up the list of individual vectors and predicates. */
1097 gcc_assert (!element_info.pieces.is_empty ());
1098 for (unsigned int i = 0; i < count; ++i)
1099 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1101 piece p = element_info.pieces[j];
1102 p.offset += i * element_bytes;
1103 add_piece (p);
1105 return IS_PST;
1108 /* Subroutine of analyze for handling RECORD_TYPEs. */
1110 pure_scalable_type_info::analysis_result
1111 pure_scalable_type_info::analyze_record (const_tree type)
1113 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1115 if (TREE_CODE (field) != FIELD_DECL)
1116 continue;
1118 /* Zero-sized fields disappear in the language->ABI mapping. */
1119 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1120 continue;
1122 /* All fields with an ABI identity must be PSTs for the record as
1123 a whole to be a PST. If any individual field is too big to be
1124 interesting then the record is too. */
1125 pure_scalable_type_info field_info;
1126 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1127 if (subresult == NO_ABI_IDENTITY)
1128 continue;
1129 if (subresult != IS_PST)
1130 return subresult;
1132 /* Since all previous fields are PSTs, we ought to be able to track
1133 the field offset using poly_ints. */
1134 tree bitpos = bit_position (field);
1135 gcc_assert (poly_int_tree_p (bitpos));
1137 /* For the same reason, it shouldn't be possible to create a PST field
1138 whose offset isn't byte-aligned. */
1139 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1140 BITS_PER_UNIT);
1142 /* Punt if the record is too big to be interesting. */
1143 poly_uint64 bytepos;
1144 if (!wide_bytepos.to_uhwi (&bytepos)
1145 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1146 return DOESNT_MATTER;
1148 /* Add the individual vectors and predicates in the field to the
1149 record's list. */
1150 gcc_assert (!field_info.pieces.is_empty ());
1151 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1153 piece p = field_info.pieces[i];
1154 p.offset += bytepos;
1155 add_piece (p);
1158 /* Empty structures disappear in the language->ABI mapping. */
1159 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1162 /* Add P to the list of pieces in the type. */
1164 void
1165 pure_scalable_type_info::add_piece (const piece &p)
1167 /* Try to fold the new piece into the previous one to form a
1168 single-mode PST. For example, if we see three consecutive vectors
1169 of the same mode, we can represent them using the corresponding
1170 3-tuple mode.
1172 This is purely an optimization. */
1173 if (!pieces.is_empty ())
1175 piece &prev = pieces.last ();
1176 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1177 unsigned int nelems1, nelems2;
1178 if (prev.orig_mode == p.orig_mode
1179 && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
1180 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1181 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1182 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1183 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1184 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1185 && targetm.array_mode (p.orig_mode,
1186 nelems1 + nelems2).exists (&prev.mode))
1188 prev.num_zr += p.num_zr;
1189 prev.num_pr += p.num_pr;
1190 return;
1193 pieces.quick_push (p);
1196 /* Return true if at least one possible value of type TYPE includes at
1197 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1199 This is a relatively expensive test for some types, so it should
1200 generally be made as late as possible. */
1202 static bool
1203 aarch64_some_values_include_pst_objects_p (const_tree type)
1205 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1206 return false;
1208 if (aarch64_sve::builtin_type_p (type))
1209 return true;
1211 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1212 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1214 if (RECORD_OR_UNION_TYPE_P (type))
1215 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1216 if (TREE_CODE (field) == FIELD_DECL
1217 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1218 return true;
1220 return false;
1223 /* Return the descriptor of the SIMD ABI. */
1225 static const predefined_function_abi &
1226 aarch64_simd_abi (void)
1228 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1229 if (!simd_abi.initialized_p ())
1231 HARD_REG_SET full_reg_clobbers
1232 = default_function_abi.full_reg_clobbers ();
1233 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1234 if (FP_SIMD_SAVED_REGNUM_P (regno))
1235 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1236 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1238 return simd_abi;
1241 /* Return the descriptor of the SVE PCS. */
1243 static const predefined_function_abi &
1244 aarch64_sve_abi (void)
1246 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1247 if (!sve_abi.initialized_p ())
1249 HARD_REG_SET full_reg_clobbers
1250 = default_function_abi.full_reg_clobbers ();
1251 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1252 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1253 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1254 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1255 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1257 return sve_abi;
1260 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
1261 wraps, otherwise return X itself. */
1263 static rtx
1264 strip_salt (rtx x)
1266 rtx search = x;
1267 if (GET_CODE (search) == CONST)
1268 search = XEXP (search, 0);
1269 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
1270 x = XVECEXP (search, 0, 0);
1271 return x;
1274 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
1275 expression. */
1277 static rtx
1278 strip_offset_and_salt (rtx addr, poly_int64 *offset)
1280 return strip_salt (strip_offset (addr, offset));
1283 /* Generate code to enable conditional branches in functions over 1 MiB. */
1284 const char *
1285 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1286 const char * branch_format)
1288 rtx_code_label * tmp_label = gen_label_rtx ();
1289 char label_buf[256];
1290 char buffer[128];
1291 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1292 CODE_LABEL_NUMBER (tmp_label));
1293 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1294 rtx dest_label = operands[pos_label];
1295 operands[pos_label] = tmp_label;
1297 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1298 output_asm_insn (buffer, operands);
1300 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1301 operands[pos_label] = dest_label;
1302 output_asm_insn (buffer, operands);
1303 return "";
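/* Editorial sketch, not part of the original file, of the sequence the
   function above emits.  Callers typically pass BRANCH_FORMAT with the
   condition already inverted, so a conditional branch whose target lies
   beyond the +/-1 MiB range of B.cond becomes roughly:

     b.ne    .Lbcond7        // inverted condition skips the long branch
     b       far_target      // unconditional B has a +/-128 MiB range
   .Lbcond7:

   where the label name, condition and target are only illustrative.  */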
1306 void
1307 aarch64_err_no_fpadvsimd (machine_mode mode)
1309 if (TARGET_GENERAL_REGS_ONLY)
1310 if (FLOAT_MODE_P (mode))
1311 error ("%qs is incompatible with the use of floating-point types",
1312 "-mgeneral-regs-only");
1313 else
1314 error ("%qs is incompatible with the use of vector types",
1315 "-mgeneral-regs-only");
1316 else
1317 if (FLOAT_MODE_P (mode))
1318 error ("%qs feature modifier is incompatible with the use of"
1319 " floating-point types", "+nofp");
1320 else
1321 error ("%qs feature modifier is incompatible with the use of"
1322 " vector types", "+nofp");
1325 /* Report when we try to do something that requires SVE when SVE is disabled.
1326 This is an error of last resort and isn't very high-quality. It usually
1327 involves attempts to measure the vector length in some way. */
1328 static void
1329 aarch64_report_sve_required (void)
1331 static bool reported_p = false;
1333 /* Avoid reporting a slew of messages for a single oversight. */
1334 if (reported_p)
1335 return;
1337 error ("this operation requires the SVE ISA extension");
1338 inform (input_location, "you can enable SVE using the command-line"
1339 " option %<-march%>, or by using the %<target%>"
1340 " attribute or pragma");
1341 reported_p = true;
1344 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1345 registers. */
1346 inline bool
1347 pr_or_ffr_regnum_p (unsigned int regno)
1349 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1352 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1353 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1354 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1355 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1356 and GENERAL_REGS is lower than the memory cost (in this case the best class
1357 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1358 cost results in bad allocations with many redundant int<->FP moves which
1359 are expensive on various cores.
1360 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1361 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1362 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1363 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1364 The result of this is that it is no longer inefficient to have a higher
1365 memory move cost than the register move cost.
1368 static reg_class_t
1369 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1370 reg_class_t best_class)
1372 machine_mode mode;
1374 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1375 || !reg_class_subset_p (FP_REGS, allocno_class))
1376 return allocno_class;
1378 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1379 || !reg_class_subset_p (FP_REGS, best_class))
1380 return best_class;
1382 mode = PSEUDO_REGNO_MODE (regno);
1383 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1386 static unsigned int
1387 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1389 if (GET_MODE_UNIT_SIZE (mode) == 4)
1390 return aarch64_tune_params.min_div_recip_mul_sf;
1391 return aarch64_tune_params.min_div_recip_mul_df;
1394 /* Return the reassociation width of treeop OPC with mode MODE. */
1395 static int
1396 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1398 if (VECTOR_MODE_P (mode))
1399 return aarch64_tune_params.vec_reassoc_width;
1400 if (INTEGRAL_MODE_P (mode))
1401 return aarch64_tune_params.int_reassoc_width;
1402 /* Reassociation reduces the number of FMAs which may result in worse
1403 performance. Use a per-CPU setting for FMA reassociation which allows
1404 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
1405 CPUs with many FP pipes to enable reassociation.
1406 Since the reassociation pass doesn't understand FMA at all, assume
1407 that any FP addition might turn into FMA. */
1408 if (FLOAT_MODE_P (mode))
1409 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
1410 : aarch64_tune_params.fp_reassoc_width;
1411 return 1;
1414 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1415 unsigned
1416 aarch64_debugger_regno (unsigned regno)
1418 if (GP_REGNUM_P (regno))
1419 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1420 else if (regno == SP_REGNUM)
1421 return AARCH64_DWARF_SP;
1422 else if (FP_REGNUM_P (regno))
1423 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1424 else if (PR_REGNUM_P (regno))
1425 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1426 else if (regno == VG_REGNUM)
1427 return AARCH64_DWARF_VG;
1429 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1430 equivalent DWARF register. */
1431 return DWARF_FRAME_REGISTERS;
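/* Editorial examples, not part of the original file, assuming the usual
   AArch64 DWARF numbering (AARCH64_DWARF_R0 == 0, AARCH64_DWARF_SP == 31,
   AARCH64_DWARF_V0 == 64, AARCH64_DWARF_P0 == 48, AARCH64_DWARF_VG == 46):

     x0 -> 0,  x29 -> 29,  sp -> 31,  v0 -> 64,  p0 -> 48,  vg -> 46

   Registers with no DWARF equivalent map to DWARF_FRAME_REGISTERS.  */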
1434 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
1435 static machine_mode
1436 aarch64_dwarf_frame_reg_mode (int regno)
1438 /* Predicate registers are call-clobbered in the EH ABI (which is
1439 ARM_PCS_AAPCS64), so they should not be described by CFI.
1440 Their size changes as VL changes, so any values computed by
1441 __builtin_init_dwarf_reg_size_table might not be valid for
1442 all frames. */
1443 if (PR_REGNUM_P (regno))
1444 return VOIDmode;
1445 return default_dwarf_frame_reg_mode (regno);
1448 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1449 integer, otherwise return X unmodified. */
1450 static rtx
1451 aarch64_bit_representation (rtx x)
1453 if (CONST_DOUBLE_P (x))
1454 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1455 return x;
1458 /* Return an estimate for the number of quadwords in an SVE vector. This is
1459 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
1460 static unsigned int
1461 aarch64_estimated_sve_vq ()
1463 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
1466 /* Return true if MODE is an SVE predicate mode. */
1467 static bool
1468 aarch64_sve_pred_mode_p (machine_mode mode)
1470 return (TARGET_SVE
1471 && (mode == VNx16BImode
1472 || mode == VNx8BImode
1473 || mode == VNx4BImode
1474 || mode == VNx2BImode));
1477 /* Three mutually-exclusive flags describing a vector or predicate type. */
1478 const unsigned int VEC_ADVSIMD = 1;
1479 const unsigned int VEC_SVE_DATA = 2;
1480 const unsigned int VEC_SVE_PRED = 4;
1481 /* Indicates a structure of 2, 3 or 4 vectors or predicates. */
1482 const unsigned int VEC_STRUCT = 8;
1483 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1484 vector has fewer significant bytes than a full SVE vector. */
1485 const unsigned int VEC_PARTIAL = 16;
1486 /* Useful combinations of the above. */
1487 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1488 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
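/* Editorial examples, not part of the original file, of the classification
   performed by aarch64_classify_vector_mode below on a target with SVE
   enabled:

     V4SImode     -> VEC_ADVSIMD                 (128-bit Advanced SIMD)
     V2x16QImode  -> VEC_ADVSIMD | VEC_STRUCT    (pair of Q registers)
     VNx4SImode   -> VEC_SVE_DATA                (full SVE vector)
     VNx2SImode   -> VEC_SVE_DATA | VEC_PARTIAL  (32-bit elements in
                                                  64-bit containers)
     VNx32QImode  -> VEC_SVE_DATA | VEC_STRUCT   (pair of SVE vectors)
     VNx4BImode   -> VEC_SVE_PRED  */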
1490 /* Return a set of flags describing the vector properties of mode MODE.
1491 If ANY_TARGET_P is false (the default), ignore modes that are not supported
1492 by the current target. Otherwise categorize the modes that can be used
1493 with the set of all targets supported by the port. */
1495 static unsigned int
1496 aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
1498 if (aarch64_sve_pred_mode_p (mode))
1499 return VEC_SVE_PRED;
1501 /* Make the decision based on the mode's enum value rather than its
1502 properties, so that we keep the correct classification regardless
1503 of -msve-vector-bits. */
1504 switch (mode)
1506 /* Partial SVE QI vectors. */
1507 case E_VNx2QImode:
1508 case E_VNx4QImode:
1509 case E_VNx8QImode:
1510 /* Partial SVE HI vectors. */
1511 case E_VNx2HImode:
1512 case E_VNx4HImode:
1513 /* Partial SVE SI vector. */
1514 case E_VNx2SImode:
1515 /* Partial SVE HF vectors. */
1516 case E_VNx2HFmode:
1517 case E_VNx4HFmode:
1518 /* Partial SVE BF vectors. */
1519 case E_VNx2BFmode:
1520 case E_VNx4BFmode:
1521 /* Partial SVE SF vector. */
1522 case E_VNx2SFmode:
1523 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1525 case E_VNx16QImode:
1526 case E_VNx8HImode:
1527 case E_VNx4SImode:
1528 case E_VNx2DImode:
1529 case E_VNx8BFmode:
1530 case E_VNx8HFmode:
1531 case E_VNx4SFmode:
1532 case E_VNx2DFmode:
1533 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;
1535 /* x2 SVE vectors. */
1536 case E_VNx32QImode:
1537 case E_VNx16HImode:
1538 case E_VNx8SImode:
1539 case E_VNx4DImode:
1540 case E_VNx16BFmode:
1541 case E_VNx16HFmode:
1542 case E_VNx8SFmode:
1543 case E_VNx4DFmode:
1544 /* x3 SVE vectors. */
1545 case E_VNx48QImode:
1546 case E_VNx24HImode:
1547 case E_VNx12SImode:
1548 case E_VNx6DImode:
1549 case E_VNx24BFmode:
1550 case E_VNx24HFmode:
1551 case E_VNx12SFmode:
1552 case E_VNx6DFmode:
1553 /* x4 SVE vectors. */
1554 case E_VNx64QImode:
1555 case E_VNx32HImode:
1556 case E_VNx16SImode:
1557 case E_VNx8DImode:
1558 case E_VNx32BFmode:
1559 case E_VNx32HFmode:
1560 case E_VNx16SFmode:
1561 case E_VNx8DFmode:
1562 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;
1564 case E_OImode:
1565 case E_CImode:
1566 case E_XImode:
1567 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1569 /* Structures of 64-bit Advanced SIMD vectors. */
1570 case E_V2x8QImode:
1571 case E_V2x4HImode:
1572 case E_V2x2SImode:
1573 case E_V2x1DImode:
1574 case E_V2x4BFmode:
1575 case E_V2x4HFmode:
1576 case E_V2x2SFmode:
1577 case E_V2x1DFmode:
1578 case E_V3x8QImode:
1579 case E_V3x4HImode:
1580 case E_V3x2SImode:
1581 case E_V3x1DImode:
1582 case E_V3x4BFmode:
1583 case E_V3x4HFmode:
1584 case E_V3x2SFmode:
1585 case E_V3x1DFmode:
1586 case E_V4x8QImode:
1587 case E_V4x4HImode:
1588 case E_V4x2SImode:
1589 case E_V4x1DImode:
1590 case E_V4x4BFmode:
1591 case E_V4x4HFmode:
1592 case E_V4x2SFmode:
1593 case E_V4x1DFmode:
1594 return (TARGET_FLOAT || any_target_p)
1595 ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
1597 /* Structures of 128-bit Advanced SIMD vectors. */
1598 case E_V2x16QImode:
1599 case E_V2x8HImode:
1600 case E_V2x4SImode:
1601 case E_V2x2DImode:
1602 case E_V2x8BFmode:
1603 case E_V2x8HFmode:
1604 case E_V2x4SFmode:
1605 case E_V2x2DFmode:
1606 case E_V3x16QImode:
1607 case E_V3x8HImode:
1608 case E_V3x4SImode:
1609 case E_V3x2DImode:
1610 case E_V3x8BFmode:
1611 case E_V3x8HFmode:
1612 case E_V3x4SFmode:
1613 case E_V3x2DFmode:
1614 case E_V4x16QImode:
1615 case E_V4x8HImode:
1616 case E_V4x4SImode:
1617 case E_V4x2DImode:
1618 case E_V4x8BFmode:
1619 case E_V4x8HFmode:
1620 case E_V4x4SFmode:
1621 case E_V4x2DFmode:
1622 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1624 /* 64-bit Advanced SIMD vectors. */
1625 case E_V8QImode:
1626 case E_V4HImode:
1627 case E_V2SImode:
1628 case E_V1DImode:
1629 case E_V4HFmode:
1630 case E_V4BFmode:
1631 case E_V2SFmode:
1632 case E_V1DFmode:
1633 /* 128-bit Advanced SIMD vectors. */
1634 case E_V16QImode:
1635 case E_V8HImode:
1636 case E_V4SImode:
1637 case E_V2DImode:
1638 case E_V8HFmode:
1639 case E_V8BFmode:
1640 case E_V4SFmode:
1641 case E_V2DFmode:
1642 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;
1644 case E_VNx32BImode:
1645 return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;
1647 default:
1648 return 0;
1652 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1653 bool
1654 aarch64_advsimd_struct_mode_p (machine_mode mode)
1656 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1657 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
1660 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1661 static bool
1662 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
1664 return (aarch64_classify_vector_mode (mode)
1665 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
1668 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1669 static bool
1670 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
1672 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
1675 /* Return true if MODE is any of the data vector modes, including
1676 structure modes. */
1677 static bool
1678 aarch64_vector_data_mode_p (machine_mode mode)
1680 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1683 /* Return true if MODE is any form of SVE mode, including predicates,
1684 vectors and structures. */
1685 bool
1686 aarch64_sve_mode_p (machine_mode mode)
1688 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1691 /* Return true if MODE is an SVE data vector mode; either a single vector
1692 or a structure of vectors. */
1693 static bool
1694 aarch64_sve_data_mode_p (machine_mode mode)
1696 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1699 /* Return the number of defined bytes in one constituent vector of
1700 SVE mode MODE, which has vector flags VEC_FLAGS. */
1701 static poly_int64
1702 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1704 if (vec_flags & VEC_PARTIAL)
1705 /* A single partial vector. */
1706 return GET_MODE_SIZE (mode);
1708 if (vec_flags & VEC_SVE_DATA)
1709 /* A single vector or a tuple. */
1710 return BYTES_PER_SVE_VECTOR;
1712 /* A single predicate. */
1713 gcc_assert (vec_flags & VEC_SVE_PRED);
1714 return BYTES_PER_SVE_PRED;
1717 /* If MODE holds an array of vectors, return the number of vectors
1718 in the array, otherwise return 1. */
1720 static unsigned int
1721 aarch64_ldn_stn_vectors (machine_mode mode)
1723 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1724 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
1725 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
1726 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
1727 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
1728 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
1729 return exact_div (GET_MODE_SIZE (mode),
1730 BYTES_PER_SVE_VECTOR).to_constant ();
1731 return 1;
1734 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1735 corresponding vector structure mode. */
1736 static opt_machine_mode
1737 aarch64_advsimd_vector_array_mode (machine_mode mode,
1738 unsigned HOST_WIDE_INT nelems)
1740 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
1741 if (known_eq (GET_MODE_SIZE (mode), 8))
1742 flags |= VEC_PARTIAL;
1744 machine_mode struct_mode;
1745 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
1746 if (aarch64_classify_vector_mode (struct_mode) == flags
1747 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
1748 && known_eq (GET_MODE_NUNITS (struct_mode),
1749 GET_MODE_NUNITS (mode) * nelems))
1750 return struct_mode;
1751 return opt_machine_mode ();
1754 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1756 opt_machine_mode
1757 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1759 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1760 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1761 machine_mode mode;
1762 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1763 if (inner_mode == GET_MODE_INNER (mode)
1764 && known_eq (nunits, GET_MODE_NUNITS (mode))
1765 && aarch64_sve_data_mode_p (mode))
1766 return mode;
1767 return opt_machine_mode ();
1770 /* Implement target hook TARGET_ARRAY_MODE. */
1771 static opt_machine_mode
1772 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1774 if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1776 /* Use VNx32BI for pairs of predicates, but explicitly reject giving
1777 a mode to other array sizes. Using integer modes requires a round
1778 trip through memory and generates terrible code. */
1779 if (nelems == 1)
1780 return mode;
1781 if (mode == VNx16BImode && nelems == 2)
1782 return VNx32BImode;
1783 return BLKmode;
1786 auto flags = aarch64_classify_vector_mode (mode);
1787 if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
1788 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
1789 GET_MODE_NUNITS (mode) * nelems);
1791 if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
1792 return aarch64_advsimd_vector_array_mode (mode, nelems);
1794 return opt_machine_mode ();
1797 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1798 static bool
1799 aarch64_array_mode_supported_p (machine_mode mode,
1800 unsigned HOST_WIDE_INT nelems)
1802 if (TARGET_BASE_SIMD
1803 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1804 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1805 && (nelems >= 2 && nelems <= 4))
1806 return true;
1808 return false;
1811 /* MODE is some form of SVE vector mode. For data modes, return the number
1812 of vector register bits that each element of MODE occupies, such as 64
1813 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1814 in a 64-bit container). For predicate modes, return the number of
1815 data bits controlled by each significant predicate bit. */
1817 static unsigned int
1818 aarch64_sve_container_bits (machine_mode mode)
1820 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1821 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1822 ? BITS_PER_SVE_VECTOR
1823 : GET_MODE_BITSIZE (mode));
1824 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1827 /* Return the SVE predicate mode to use for elements that have
1828 ELEM_NBYTES bytes, if such a mode exists. */
1830 opt_machine_mode
1831 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1833 if (TARGET_SVE)
1835 if (elem_nbytes == 1)
1836 return VNx16BImode;
1837 if (elem_nbytes == 2)
1838 return VNx8BImode;
1839 if (elem_nbytes == 4)
1840 return VNx4BImode;
1841 if (elem_nbytes == 8)
1842 return VNx2BImode;
1844 return opt_machine_mode ();
1847 /* Return the SVE predicate mode that should be used to control
1848 SVE mode MODE. */
1850 machine_mode
1851 aarch64_sve_pred_mode (machine_mode mode)
1853 unsigned int bits = aarch64_sve_container_bits (mode);
1854 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
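/* For example, following the container-size rule above: VNx2DImode and
   VNx2SImode both use 64-bit containers and are therefore governed by
   VNx2BImode, whereas VNx16QImode uses 8-bit containers and is governed
   by VNx16BImode.  */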
1857 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1859 static opt_machine_mode
1860 aarch64_get_mask_mode (machine_mode mode)
1862 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1863 if (vec_flags & VEC_SVE_DATA)
1864 return aarch64_sve_pred_mode (mode);
1866 return default_get_mask_mode (mode);
1869 /* Return the integer element mode associated with SVE mode MODE. */
1871 static scalar_int_mode
1872 aarch64_sve_element_int_mode (machine_mode mode)
1874 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1875 ? BITS_PER_SVE_VECTOR
1876 : GET_MODE_BITSIZE (mode));
1877 unsigned int elt_bits = vector_element_size (vector_bits,
1878 GET_MODE_NUNITS (mode));
1879 return int_mode_for_size (elt_bits, 0).require ();
1882 /* Return an integer element mode that contains exactly
1883 aarch64_sve_container_bits (MODE) bits. This is wider than
1884 aarch64_sve_element_int_mode if MODE is a partial vector,
1885 otherwise it's the same. */
1887 static scalar_int_mode
1888 aarch64_sve_container_int_mode (machine_mode mode)
1890 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1893 /* Return the integer vector mode associated with SVE mode MODE.
1894 Unlike related_int_vector_mode, this can handle the case in which
1895 MODE is a predicate (and thus has a different total size). */
1897 machine_mode
1898 aarch64_sve_int_mode (machine_mode mode)
1900 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1901 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1904 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1906 static opt_machine_mode
1907 aarch64_vectorize_related_mode (machine_mode vector_mode,
1908 scalar_mode element_mode,
1909 poly_uint64 nunits)
1911 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1913 /* If we're operating on SVE vectors, try to return an SVE mode. */
1914 poly_uint64 sve_nunits;
1915 if ((vec_flags & VEC_SVE_DATA)
1916 && multiple_p (BYTES_PER_SVE_VECTOR,
1917 GET_MODE_SIZE (element_mode), &sve_nunits))
1919 machine_mode sve_mode;
1920 if (maybe_ne (nunits, 0U))
1922 /* Try to find a full or partial SVE mode with exactly
1923 NUNITS units. */
1924 if (multiple_p (sve_nunits, nunits)
1925 && aarch64_sve_data_mode (element_mode,
1926 nunits).exists (&sve_mode))
1927 return sve_mode;
1929 else
1931 /* Take the preferred number of units from the number of bytes
1932 that fit in VECTOR_MODE. We always start by "autodetecting"
1933 a full vector mode with preferred_simd_mode, so vectors
1934 chosen here will also be full vector modes. Then
1935 autovectorize_vector_modes tries smaller starting modes
1936 and thus smaller preferred numbers of units. */
1937 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1938 if (aarch64_sve_data_mode (element_mode,
1939 sve_nunits).exists (&sve_mode))
1940 return sve_mode;
1944 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1945 if (TARGET_SIMD
1946 && (vec_flags & VEC_ADVSIMD)
1947 && known_eq (nunits, 0U)
1948 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1949 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1950 * GET_MODE_NUNITS (vector_mode), 128U))
1952 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1953 if (VECTOR_MODE_P (res))
1954 return res;
1957 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1960 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
1962 static bool
1963 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
1965 machine_mode mode = TYPE_MODE (type);
1966 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1967 bool sve_p = (vec_flags & VEC_ANY_SVE);
1968 bool simd_p = (vec_flags & VEC_ADVSIMD);
1970 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
1973 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1974 prefer to use the first arithmetic operand as the else value if
1975 the else value doesn't matter, since that exactly matches the SVE
1976 destructive merging form. For ternary operations we could either
1977 pick the first operand and use FMAD-like instructions or the last
1978 operand and use FMLA-like instructions; the latter seems more
1979 natural. */
1981 static tree
1982 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1984 return nops == 3 ? ops[2] : ops[0];
1987 /* Implement TARGET_HARD_REGNO_NREGS. */
1989 static unsigned int
1990 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1992 /* ??? Logically we should only need to provide a value when
1993 HARD_REGNO_MODE_OK says that the combination is valid,
1994 but at the moment we need to handle all modes. Just ignore
1995 any runtime parts for registers that can't store them. */
1996 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1997 switch (aarch64_regno_regclass (regno))
1999 case FP_REGS:
2000 case FP_LO_REGS:
2001 case FP_LO8_REGS:
2003 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2004 if (vec_flags & VEC_SVE_DATA)
2005 return exact_div (GET_MODE_SIZE (mode),
2006 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2007 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
2008 return GET_MODE_SIZE (mode).to_constant () / 8;
2009 return CEIL (lowest_size, UNITS_PER_VREG);
2012 case PR_REGS:
2013 case PR_LO_REGS:
2014 case PR_HI_REGS:
2015 return mode == VNx32BImode ? 2 : 1;
2017 case FFR_REGS:
2018 case PR_AND_FFR_REGS:
2019 case FAKE_REGS:
2020 return 1;
2022 default:
2023 return CEIL (lowest_size, UNITS_PER_WORD);
2025 gcc_unreachable ();
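/* Some illustrative values for the cases above: a single SVE data vector
   occupies one FP register regardless of vector length, a structure of
   three SVE vectors occupies three, VNx32BImode occupies two predicate
   registers, and V8DImode falls through to the default case and occupies
   eight X registers.  */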
2028 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2030 static bool
2031 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2033 if (mode == V8DImode)
2034 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
2035 && multiple_p (regno - R0_REGNUM, 2);
2037 if (GET_MODE_CLASS (mode) == MODE_CC)
2038 return regno == CC_REGNUM;
2040 if (regno == VG_REGNUM)
2041 /* This must have the same size as _Unwind_Word. */
2042 return mode == DImode;
2044 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2045 if (vec_flags == VEC_SVE_PRED)
2046 return pr_or_ffr_regnum_p (regno);
2048 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
2049 return PR_REGNUM_P (regno);
2051 if (pr_or_ffr_regnum_p (regno))
2052 return false;
2054 /* These registers are abstract; their modes don't matter. */
2055 if (FAKE_REGNUM_P (regno))
2056 return true;
2058 if (regno == SP_REGNUM)
2059 /* The purpose of comparing with ptr_mode is to support the
2060 global register variable associated with the stack pointer
2061 register via the syntax of asm ("wsp") in ILP32. */
2062 return mode == Pmode || mode == ptr_mode;
2064 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2065 return mode == Pmode;
2067 if (GP_REGNUM_P (regno))
2069 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
2070 return false;
2071 if (known_le (GET_MODE_SIZE (mode), 8))
2072 return true;
2073 if (known_le (GET_MODE_SIZE (mode), 16))
2074 return (regno & 1) == 0;
2076 else if (FP_REGNUM_P (regno))
2078 if (vec_flags & VEC_STRUCT)
2079 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2080 else
2081 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2084 return false;
2087 /* Return true if a function with type FNTYPE returns its value in
2088 SVE vector or predicate registers. */
2090 static bool
2091 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2093 tree return_type = TREE_TYPE (fntype);
2095 pure_scalable_type_info pst_info;
2096 switch (pst_info.analyze (return_type))
2098 case pure_scalable_type_info::IS_PST:
2099 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2100 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2102 case pure_scalable_type_info::DOESNT_MATTER:
2103 gcc_assert (aarch64_return_in_memory_1 (return_type));
2104 return false;
2106 case pure_scalable_type_info::NO_ABI_IDENTITY:
2107 case pure_scalable_type_info::ISNT_PST:
2108 return false;
2110 gcc_unreachable ();
2113 /* Return true if a function with type FNTYPE takes arguments in
2114 SVE vector or predicate registers. */
2116 static bool
2117 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2119 CUMULATIVE_ARGS args_so_far_v;
2120 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2121 NULL_TREE, 0, true);
2122 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2124 for (tree chain = TYPE_ARG_TYPES (fntype);
2125 chain && chain != void_list_node;
2126 chain = TREE_CHAIN (chain))
2128 tree arg_type = TREE_VALUE (chain);
2129 if (arg_type == error_mark_node)
2130 return false;
2132 function_arg_info arg (arg_type, /*named=*/true);
2133 apply_pass_by_reference_rules (&args_so_far_v, arg);
2134 pure_scalable_type_info pst_info;
2135 if (pst_info.analyze_registers (arg.type))
2137 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2138 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2139 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2140 return true;
2143 targetm.calls.function_arg_advance (args_so_far, arg);
2145 return false;
2148 /* Implement TARGET_FNTYPE_ABI. */
2150 static const predefined_function_abi &
2151 aarch64_fntype_abi (const_tree fntype)
2153 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2154 return aarch64_simd_abi ();
2156 if (aarch64_returns_value_in_sve_regs_p (fntype)
2157 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2158 return aarch64_sve_abi ();
2160 return default_function_abi;
2163 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2165 static aarch64_feature_flags
2166 aarch64_fntype_pstate_sm (const_tree fntype)
2168 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
2169 return AARCH64_FL_SM_ON;
2171 if (lookup_attribute ("arm", "streaming_compatible",
2172 TYPE_ATTRIBUTES (fntype)))
2173 return 0;
2175 return AARCH64_FL_SM_OFF;
2178 /* Return state flags that describe whether and how functions of type
2179 FNTYPE share state STATE_NAME with their callers. */
2181 static unsigned int
2182 aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
2184 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
2185 state_name);
2188 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2190 static aarch64_feature_flags
2191 aarch64_fntype_pstate_za (const_tree fntype)
2193 if (aarch64_fntype_shared_flags (fntype, "za")
2194 || aarch64_fntype_shared_flags (fntype, "zt0"))
2195 return AARCH64_FL_ZA_ON;
2197 return 0;
2200 /* Return the ISA mode on entry to functions of type FNTYPE. */
2202 static aarch64_feature_flags
2203 aarch64_fntype_isa_mode (const_tree fntype)
2205 return (aarch64_fntype_pstate_sm (fntype)
2206 | aarch64_fntype_pstate_za (fntype));
2209 /* Return true if FNDECL uses streaming mode internally, as an
2210 implementation choice. */
2212 static bool
2213 aarch64_fndecl_is_locally_streaming (const_tree fndecl)
2215 return lookup_attribute ("arm", "locally_streaming",
2216 DECL_ATTRIBUTES (fndecl));
2219 /* Return the state of PSTATE.SM when compiling the body of
2220 function FNDECL. This might be different from the state of
2221 PSTATE.SM on entry. */
2223 static aarch64_feature_flags
2224 aarch64_fndecl_pstate_sm (const_tree fndecl)
2226 if (aarch64_fndecl_is_locally_streaming (fndecl))
2227 return AARCH64_FL_SM_ON;
2229 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
2232 /* Return true if function FNDECL has state STATE_NAME, either by creating
2233 new state itself or by sharing state with callers. */
2235 static bool
2236 aarch64_fndecl_has_state (tree fndecl, const char *state_name)
2238 return (aarch64_fndecl_has_new_state (fndecl, state_name)
2239 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
2240 state_name) != 0);
2243 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2244 This might be different from the state of PSTATE.ZA on entry. */
2246 static aarch64_feature_flags
2247 aarch64_fndecl_pstate_za (const_tree fndecl)
2249 if (aarch64_fndecl_has_new_state (fndecl, "za")
2250 || aarch64_fndecl_has_new_state (fndecl, "zt0"))
2251 return AARCH64_FL_ZA_ON;
2253 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
2256 /* Return the ISA mode that should be used to compile the body of
2257 function FNDECL. */
2259 static aarch64_feature_flags
2260 aarch64_fndecl_isa_mode (const_tree fndecl)
2262 return (aarch64_fndecl_pstate_sm (fndecl)
2263 | aarch64_fndecl_pstate_za (fndecl));
2266 /* Return the state of PSTATE.SM on entry to the current function.
2267 This might be different from the state of PSTATE.SM in the function
2268 body. */
2270 static aarch64_feature_flags
2271 aarch64_cfun_incoming_pstate_sm ()
2273 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
2276 /* Return the state of PSTATE.ZA on entry to the current function.
2277 This might be different from the state of PSTATE.ZA in the function
2278 body. */
2280 static aarch64_feature_flags
2281 aarch64_cfun_incoming_pstate_za ()
2283 return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
2286 /* Return state flags that describe whether and how the current function shares
2287 state STATE_NAME with callers. */
2289 static unsigned int
2290 aarch64_cfun_shared_flags (const char *state_name)
2292 return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
2295 /* Return true if the current function creates new state of type STATE_NAME
2296 (as opposed to sharing the state with its callers or ignoring the state
2297 altogether). */
2299 static bool
2300 aarch64_cfun_has_new_state (const char *state_name)
2302 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
2305 /* Return true if PSTATE.SM is 1 in the body of the current function,
2306 but is not guaranteed to be 1 on entry. */
2308 static bool
2309 aarch64_cfun_enables_pstate_sm ()
2311 return (aarch64_fndecl_is_locally_streaming (cfun->decl)
2312 && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
2315 /* Return true if the current function has state STATE_NAME, either by
2316 creating new state itself or by sharing state with callers. */
2318 static bool
2319 aarch64_cfun_has_state (const char *state_name)
2321 return aarch64_fndecl_has_state (cfun->decl, state_name);
2324 /* Return true if a call from the current function to a function with
2325 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2326 the BL instruction. */
2328 static bool
2329 aarch64_call_switches_pstate_sm (aarch64_feature_flags callee_mode)
2331 return (callee_mode & ~AARCH64_ISA_MODE & AARCH64_FL_SM_STATE) != 0;
2334 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2336 static bool
2337 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2339 return (aarch64_sve::builtin_type_p (type1)
2340 == aarch64_sve::builtin_type_p (type2));
2343 /* Return true if we should emit CFI for register REGNO. */
2345 static bool
2346 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2348 return (GP_REGNUM_P (regno)
2349 || !default_function_abi.clobbers_full_reg_p (regno));
2352 /* Return the mode we should use to save and restore register REGNO. */
2354 static machine_mode
2355 aarch64_reg_save_mode (unsigned int regno)
2357 if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
2358 return DImode;
2360 if (FP_REGNUM_P (regno))
2361 switch (crtl->abi->id ())
2363 case ARM_PCS_AAPCS64:
2364 /* Only the low 64 bits are saved by the base PCS. */
2365 return DFmode;
2367 case ARM_PCS_SIMD:
2368 /* The vector PCS saves the low 128 bits (which is the full
2369 register on non-SVE targets). */
2370 return V16QImode;
2372 case ARM_PCS_SVE:
2373 /* Use vectors of DImode for registers that need frame
2374 information, so that the first 64 bits of the save slot

2375 are always the equivalent of what storing D<n> would give. */
2376 if (aarch64_emit_cfi_for_reg_p (regno))
2377 return VNx2DImode;
2379 /* Use vectors of bytes otherwise, so that the layout is
2380 endian-agnostic, and so that we can use LDR and STR for
2381 big-endian targets. */
2382 return VNx16QImode;
2384 case ARM_PCS_TLSDESC:
2385 case ARM_PCS_UNKNOWN:
2386 break;
2389 if (PR_REGNUM_P (regno))
2390 /* Save the full predicate register. */
2391 return VNx16BImode;
2393 gcc_unreachable ();
2396 /* Given the ISA mode on entry to a callee and the ABI of the callee,
2397 return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx. */
2400 aarch64_gen_callee_cookie (aarch64_feature_flags isa_mode, arm_pcs pcs_variant)
2402 return gen_int_mode ((unsigned int) isa_mode
2403 | (unsigned int) pcs_variant << AARCH64_NUM_ISA_MODES,
2404 DImode);
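/* In other words, the low AARCH64_NUM_ISA_MODES bits of the cookie hold
   the ISA mode flags and the remaining bits hold the arm_pcs value;
   aarch64_callee_abi and aarch64_callee_isa_mode below recover the two
   fields by shifting and masking the same constant.  */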
2407 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2408 callee's ABI. */
2410 static const predefined_function_abi &
2411 aarch64_callee_abi (rtx cookie)
2413 return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
2416 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2417 required ISA mode on entry to the callee, which is also the ISA
2418 mode on return from the callee. */
2420 static aarch64_feature_flags
2421 aarch64_callee_isa_mode (rtx cookie)
2423 return UINTVAL (cookie) & AARCH64_FL_ISA_MODES;
2426 /* INSN is a call instruction. Return the CONST_INT stored in its
2427 UNSPEC_CALLEE_ABI rtx. */
2429 static rtx
2430 aarch64_insn_callee_cookie (const rtx_insn *insn)
2432 rtx pat = PATTERN (insn);
2433 gcc_assert (GET_CODE (pat) == PARALLEL);
2434 rtx unspec = XVECEXP (pat, 0, 1);
2435 gcc_assert (GET_CODE (unspec) == UNSPEC
2436 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2437 return XVECEXP (unspec, 0, 0);
2440 /* Implement TARGET_INSN_CALLEE_ABI. */
2442 const predefined_function_abi &
2443 aarch64_insn_callee_abi (const rtx_insn *insn)
2445 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
2448 /* INSN is a call instruction. Return the required ISA mode on entry to
2449 the callee, which is also the ISA mode on return from the callee. */
2451 static aarch64_feature_flags
2452 aarch64_insn_callee_isa_mode (const rtx_insn *insn)
2454 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
2457 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2458 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2459 clobbers the top 64 bits when restoring the bottom 64 bits. */
2461 static bool
2462 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2463 unsigned int regno,
2464 machine_mode mode)
2466 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2468 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2469 unsigned int nregs = hard_regno_nregs (regno, mode);
2470 if (nregs > 1)
2471 per_register_size = exact_div (per_register_size, nregs);
2472 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2473 return maybe_gt (per_register_size, 16);
2474 return maybe_gt (per_register_size, 8);
2476 return false;
2479 /* Implement REGMODE_NATURAL_SIZE. */
2480 poly_uint64
2481 aarch64_regmode_natural_size (machine_mode mode)
2483 /* The natural size for SVE data modes is one SVE data vector,
2484 and similarly for predicates. We can't independently modify
2485 anything smaller than that. */
2486 /* ??? For now, only do this for variable-width SVE registers.
2487 Doing it for constant-sized registers breaks lower-subreg.cc. */
2488 /* ??? And once that's fixed, we should probably have similar
2489 code for Advanced SIMD. */
2490 if (!aarch64_sve_vg.is_constant ())
2492 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2493 if (vec_flags & VEC_SVE_PRED)
2494 return BYTES_PER_SVE_PRED;
2495 if (vec_flags & VEC_SVE_DATA)
2496 return BYTES_PER_SVE_VECTOR;
2498 return UNITS_PER_WORD;
2501 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2502 machine_mode
2503 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2504 machine_mode mode)
2506 /* The predicate mode determines which bits are significant and
2507 which are "don't care". Decreasing the number of lanes would
2508 lose data while increasing the number of lanes would make bits
2509 unnecessarily significant. */
2510 if (PR_REGNUM_P (regno))
2511 return mode;
2512 if (known_ge (GET_MODE_SIZE (mode), 4))
2513 return mode;
2514 else
2515 return SImode;
2518 /* Return true if I's bits are consecutive ones from the MSB. */
2519 bool
2520 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2522 return exact_log2 (-i) != HOST_WIDE_INT_M1;
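/* Two illustrative values: for I == -256 (0xff...f00), -I is 256, a power
   of two, so the function returns true; for I == -255 (0xff...f01), -I is
   255, which is not a power of two, so it returns false.  */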
2525 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2526 that strcpy from constants will be faster. */
2528 static HOST_WIDE_INT
2529 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2531 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2532 return MAX (align, BITS_PER_WORD);
2533 return align;
2536 /* Return true if calls to DECL should be treated as
2537 long-calls (i.e. called via a register). */
2538 static bool
2539 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2541 return false;
2544 /* Return true if calls to symbol-ref SYM should be treated as
2545 long-calls (i.e. called via a register). */
2546 bool
2547 aarch64_is_long_call_p (rtx sym)
2549 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2552 /* Return true if calls to symbol-ref SYM should not go through
2553 plt stubs. */
2555 bool
2556 aarch64_is_noplt_call_p (rtx sym)
2558 const_tree decl = SYMBOL_REF_DECL (sym);
2560 if (flag_pic
2561 && decl
2562 && (!flag_plt
2563 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2564 && !targetm.binds_local_p (decl))
2565 return true;
2567 return false;
2570 /* Emit an insn that's a simple single-set. Both the operands must be
2571 known to be valid. */
2572 inline static rtx_insn *
2573 emit_set_insn (rtx x, rtx y)
2575 return emit_insn (gen_rtx_SET (x, y));
2578 /* X and Y are two things to compare using CODE. Emit the compare insn and
2579 return the rtx for the CC register in the proper mode. */
2581 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2583 machine_mode cmp_mode = GET_MODE (x);
2584 machine_mode cc_mode;
2585 rtx cc_reg;
2587 if (cmp_mode == TImode)
2589 gcc_assert (code == NE);
2591 cc_mode = CCmode;
2592 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2594 rtx x_lo = operand_subword (x, 0, 0, TImode);
2595 rtx y_lo = operand_subword (y, 0, 0, TImode);
2596 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2598 rtx x_hi = operand_subword (x, 1, 0, TImode);
2599 rtx y_hi = operand_subword (y, 1, 0, TImode);
2600 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2601 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2602 GEN_INT (AARCH64_EQ)));
2604 else
2606 cc_mode = SELECT_CC_MODE (code, x, y);
2607 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2608 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2610 return cc_reg;
2613 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2615 static rtx
2616 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2617 machine_mode y_mode)
2619 if (y_mode == E_QImode || y_mode == E_HImode)
2621 if (CONST_INT_P (y))
2623 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2624 y_mode = SImode;
2626 else
2628 rtx t, cc_reg;
2629 machine_mode cc_mode;
2631 t = gen_rtx_ZERO_EXTEND (SImode, y);
2632 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2633 cc_mode = CC_SWPmode;
2634 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2635 emit_set_insn (cc_reg, t);
2636 return cc_reg;
2640 if (!aarch64_plus_operand (y, y_mode))
2641 y = force_reg (y_mode, y);
2643 return aarch64_gen_compare_reg (code, x, y);
2646 /* Generate conditional branch to LABEL, comparing X to 0 using CODE.
2647 Return the jump instruction. */
2649 static rtx
2650 aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
2651 rtx_code_label *label)
2653 if (aarch64_track_speculation)
2655 /* Emit an explicit compare instruction, so that we can correctly
2656 track the condition codes. */
2657 rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
2658 x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
2660 else
2661 x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);
2663 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
2664 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
2665 return gen_rtx_SET (pc_rtx, x);
2668 /* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
2669 If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
2670 it branches to LABEL when the bit is clear. */
2672 static rtx
2673 aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
2674 rtx_code_label *label)
2676 auto mode = GET_MODE (x);
2677 if (aarch64_track_speculation)
2679 auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
2680 emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
2681 rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
2682 rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
2683 return gen_condjump (x, cc_reg, label);
2685 return gen_aarch64_tb (code, mode, mode,
2686 x, gen_int_mode (bitnum, mode), label);
2689 /* Consider the operation:
2691 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2693 where:
2695 - CODE is [SU]MAX or [SU]MIN
2696 - OPERANDS[2] and OPERANDS[3] are constant integers
2697 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2698 - all operands have mode MODE
2700 Decide whether it is possible to implement the operation using:
2702 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2704 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2706 followed by:
2708 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2710 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2711 If GENERATE_P is true, also update OPERANDS as follows:
2713 OPERANDS[4] = -OPERANDS[3]
2714 OPERANDS[5] = the rtl condition representing <cond>
2715 OPERANDS[6] = <tmp>
2716 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2717 bool
2718 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
2720 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
2721 rtx dst = operands[0];
2722 rtx maxmin_op = operands[2];
2723 rtx add_op = operands[3];
2724 machine_mode mode = GET_MODE (dst);
2726 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2727 == (x >= y ? x : y) - z
2728 == (x > y ? x : y) - z
2729 == (x > y - 1 ? x : y) - z
2731 min (x, y) - z == (x <= y - 1 ? x : y) - z
2732 == (x <= y ? x : y) - z
2733 == (x < y ? x : y) - z
2734 == (x < y + 1 ? x : y) - z
2736 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2737 which x is compared with z. Set DIFF to y - z. Thus the supported
2738 combinations are as follows, with DIFF being the value after the ":":
2740 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2741 == x >= y ? x - y : 0 [z == y]
2742 == x > y ? x - y : 0 [z == y]
2743 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2745 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2746 == x <= y ? x - y : 0 [z == y]
2747 == x < y ? x - y : 0 [z == y]
2748 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2749 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
2750 auto add_val = rtx_mode_t (add_op, mode);
2751 auto sub_val = wi::neg (add_val);
2752 auto diff = wi::sub (maxmin_val, sub_val);
2753 if (!(diff == 0
2754 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
2755 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
2756 return false;
2758 if (!generate_p)
2759 return true;
2761 rtx_code cmp;
2762 switch (code)
2764 case SMAX:
2765 cmp = diff == 1 ? GT : GE;
2766 break;
2767 case UMAX:
2768 cmp = diff == 1 ? GTU : GEU;
2769 break;
2770 case SMIN:
2771 cmp = diff == -1 ? LT : LE;
2772 break;
2773 case UMIN:
2774 cmp = diff == -1 ? LTU : LEU;
2775 break;
2776 default:
2777 gcc_unreachable ();
2779 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
2781 operands[4] = immed_wide_int_const (sub_val, mode);
2782 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
2783 if (can_create_pseudo_p ())
2784 operands[6] = gen_reg_rtx (mode);
2785 else
2786 operands[6] = dst;
2787 operands[7] = immed_wide_int_const (diff, mode);
2789 return true;
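/* An illustrative instance of the scheme above: smax (x, 5) - 5 matches
   the "z == y" rows with DIFF == 0, so it can be emitted as

     subs    <tmp>, x, #5
     csel    <dst>, <tmp>, wzr, ge

   with OPERANDS[4] = 5, OPERANDS[5] = the GE condition on the CC register
   and OPERANDS[7] = 0 (CSEL).  */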
2793 /* Build the SYMBOL_REF for __tls_get_addr. */
2795 static GTY(()) rtx tls_get_addr_libfunc;
2798 aarch64_tls_get_addr (void)
2800 if (!tls_get_addr_libfunc)
2801 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2802 return tls_get_addr_libfunc;
2805 /* Return the TLS model to use for ADDR. */
2807 static enum tls_model
2808 tls_symbolic_operand_type (rtx addr)
2810 enum tls_model tls_kind = TLS_MODEL_NONE;
2811 poly_int64 offset;
2812 addr = strip_offset_and_salt (addr, &offset);
2813 if (SYMBOL_REF_P (addr))
2814 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2816 return tls_kind;
2819 /* We allow lo_sum expressions in our legitimate addresses
2820 so that combine can take care of combining addresses where
2821 necessary, but for generation purposes we generate the address
2822 as:
2823 RTL Absolute
2824 tmp = hi (symbol_ref); adrp x1, foo
2825 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
2828 PIC TLS
2829 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2830 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2831 bl __tls_get_addr
2834 Load TLS symbol, depending on TLS mechanism and TLS access model.
2836 Global Dynamic - Traditional TLS:
2837 adrp tmp, :tlsgd:imm
2838 add dest, tmp, #:tlsgd_lo12:imm
2839 bl __tls_get_addr
2841 Global Dynamic - TLS Descriptors:
2842 adrp dest, :tlsdesc:imm
2843 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2844 add dest, dest, #:tlsdesc_lo12:imm
2845 blr tmp
2846 mrs tp, tpidr_el0
2847 add dest, dest, tp
2849 Initial Exec:
2850 mrs tp, tpidr_el0
2851 adrp tmp, :gottprel:imm
2852 ldr dest, [tmp, #:gottprel_lo12:imm]
2853 add dest, dest, tp
2855 Local Exec:
2856 mrs tp, tpidr_el0
2857 add t0, tp, #:tprel_hi12:imm, lsl #12
2858 add t0, t0, #:tprel_lo12_nc:imm
2861 static void
2862 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2863 enum aarch64_symbol_type type)
2865 switch (type)
2867 case SYMBOL_SMALL_ABSOLUTE:
2869 /* In ILP32, the mode of dest can be either SImode or DImode. */
2870 rtx tmp_reg = dest;
2871 machine_mode mode = GET_MODE (dest);
2873 gcc_assert (mode == Pmode || mode == ptr_mode);
2875 if (can_create_pseudo_p ())
2876 tmp_reg = gen_reg_rtx (mode);
2878 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
2879 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2880 return;
2883 case SYMBOL_TINY_ABSOLUTE:
2884 emit_insn (gen_rtx_SET (dest, imm));
2885 return;
2887 case SYMBOL_SMALL_GOT_28K:
2889 machine_mode mode = GET_MODE (dest);
2890 rtx gp_rtx = pic_offset_table_rtx;
2891 rtx insn;
2892 rtx mem;
2894 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2895 here before RTL expansion. The tree IVOPTs pass generates RTL
2896 patterns to decide rtx costs, in which case pic_offset_table_rtx is
2897 not initialized. In that case there is no need to generate the first
2898 adrp instruction, as the final cost for global variable access is
2899 one instruction. */
2900 if (gp_rtx != NULL)
2902 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2903 use the page base as the GOT base, the first page may be wasted;
2904 in the worst case there is only 28K of space for the GOT).
2906 The generated instruction sequence for accessing a global variable is:
2909 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2911 Only one instruction is needed, but we must initialize
2912 pic_offset_table_rtx properly. We generate an initialization insn for
2913 every global access and allow CSE to remove all redundant ones.
2915 The final instruction sequence will look like the following
2916 for multiple global variable accesses:
2918 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2920 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2921 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2922 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2923 ... */
2925 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2926 crtl->uses_pic_offset_table = 1;
2927 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2929 if (mode != GET_MODE (gp_rtx))
2930 gp_rtx = gen_lowpart (mode, gp_rtx);
2934 if (mode == ptr_mode)
2936 if (mode == DImode)
2937 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2938 else
2939 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2941 mem = XVECEXP (SET_SRC (insn), 0, 0);
2943 else
2945 gcc_assert (mode == Pmode);
2947 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2948 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2951 /* The operand is expected to be a MEM. Whenever the related insn
2952 pattern changes, the code above that calculates MEM should be
2953 updated. */
2954 gcc_assert (MEM_P (mem));
2955 MEM_READONLY_P (mem) = 1;
2956 MEM_NOTRAP_P (mem) = 1;
2957 emit_insn (insn);
2958 return;
2961 case SYMBOL_SMALL_GOT_4G:
2962 emit_insn (gen_rtx_SET (dest, imm));
2963 return;
2965 case SYMBOL_SMALL_TLSGD:
2967 rtx_insn *insns;
2968 /* The return type of __tls_get_addr is the C pointer type
2969 so use ptr_mode. */
2970 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
2971 rtx tmp_reg = dest;
2973 if (GET_MODE (dest) != ptr_mode)
2974 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
2976 start_sequence ();
2977 if (ptr_mode == SImode)
2978 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2979 else
2980 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2981 insns = get_insns ();
2982 end_sequence ();
2984 RTL_CONST_CALL_P (insns) = 1;
2985 emit_libcall_block (insns, tmp_reg, result, imm);
2986 /* Convert back to the mode of the dest adding a zero_extend
2987 from SImode (ptr_mode) to DImode (Pmode). */
2988 if (dest != tmp_reg)
2989 convert_move (dest, tmp_reg, true);
2990 return;
2993 case SYMBOL_SMALL_TLSDESC:
2995 machine_mode mode = GET_MODE (dest);
2996 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2997 rtx tp;
2999 gcc_assert (mode == Pmode || mode == ptr_mode);
3001 /* In ILP32, the got entry is always of SImode size. Unlike
3002 small GOT, the dest is fixed at reg 0. */
3003 if (TARGET_ILP32)
3004 emit_insn (gen_tlsdesc_small_si (imm));
3005 else
3006 emit_insn (gen_tlsdesc_small_di (imm));
3007 tp = aarch64_load_tp (NULL);
3009 if (mode != Pmode)
3010 tp = gen_lowpart (mode, tp);
3012 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3013 if (REG_P (dest))
3014 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3015 return;
3018 case SYMBOL_SMALL_TLSIE:
3020 /* In ILP32, the mode of dest can be either SImode or DImode,
3021 while the got entry is always of SImode size. The mode of
3022 dest depends on how dest is used: if dest is assigned to a
3023 pointer (e.g. in the memory), it has SImode; it may have
3024 DImode if dest is dereferenced to access memory.
3025 This is why we have to handle three different tlsie_small
3026 patterns here (two patterns for ILP32). */
3027 machine_mode mode = GET_MODE (dest);
3028 rtx tmp_reg = gen_reg_rtx (mode);
3029 rtx tp = aarch64_load_tp (NULL);
3031 if (mode == ptr_mode)
3033 if (mode == DImode)
3034 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3035 else
3037 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3038 tp = gen_lowpart (mode, tp);
3041 else
3043 gcc_assert (mode == Pmode);
3044 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3047 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3048 if (REG_P (dest))
3049 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3050 return;
3053 case SYMBOL_TLSLE12:
3054 case SYMBOL_TLSLE24:
3055 case SYMBOL_TLSLE32:
3056 case SYMBOL_TLSLE48:
3058 machine_mode mode = GET_MODE (dest);
3059 rtx tp = aarch64_load_tp (NULL);
3061 if (mode != Pmode)
3062 tp = gen_lowpart (mode, tp);
3064 switch (type)
3066 case SYMBOL_TLSLE12:
3067 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3068 (dest, tp, imm));
3069 break;
3070 case SYMBOL_TLSLE24:
3071 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3072 (dest, tp, imm));
3073 break;
3074 case SYMBOL_TLSLE32:
3075 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3076 (dest, imm));
3077 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3078 (dest, dest, tp));
3079 break;
3080 case SYMBOL_TLSLE48:
3081 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3082 (dest, imm));
3083 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3084 (dest, dest, tp));
3085 break;
3086 default:
3087 gcc_unreachable ();
3090 if (REG_P (dest))
3091 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3092 return;
3095 case SYMBOL_TINY_GOT:
3097 rtx insn;
3098 machine_mode mode = GET_MODE (dest);
3100 if (mode == ptr_mode)
3101 insn = gen_ldr_got_tiny (mode, dest, imm);
3102 else
3104 gcc_assert (mode == Pmode);
3105 insn = gen_ldr_got_tiny_sidi (dest, imm);
3108 emit_insn (insn);
3109 return;
3112 case SYMBOL_TINY_TLSIE:
3114 machine_mode mode = GET_MODE (dest);
3115 rtx tp = aarch64_load_tp (NULL);
3117 if (mode == ptr_mode)
3119 if (mode == DImode)
3120 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3121 else
3123 tp = gen_lowpart (mode, tp);
3124 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3127 else
3129 gcc_assert (mode == Pmode);
3130 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3133 if (REG_P (dest))
3134 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3135 return;
3138 default:
3139 gcc_unreachable ();
3143 /* Emit a move from SRC to DEST. Assume that the move expanders can
3144 handle all moves if !can_create_pseudo_p (). The distinction is
3145 important because, unlike emit_move_insn, the move expanders know
3146 how to force Pmode objects into the constant pool even when the
3147 constant pool address is not itself legitimate. */
3148 static rtx
3149 aarch64_emit_move (rtx dest, rtx src)
3151 return (can_create_pseudo_p ()
3152 ? emit_move_insn (dest, src)
3153 : emit_move_insn_1 (dest, src));
3156 /* Apply UNOPTAB to OP and store the result in DEST. */
3158 static void
3159 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3161 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3162 if (dest != tmp)
3163 emit_move_insn (dest, tmp);
3166 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3168 static void
3169 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3171 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3172 OPTAB_DIRECT);
3173 if (dest != tmp)
3174 emit_move_insn (dest, tmp);
3177 /* Split a move from SRC to DST into two moves of mode SINGLE_MODE. */
3179 void
3180 aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
3182 machine_mode mode = GET_MODE (dst);
3184 rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
3185 rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
3186 GET_MODE_SIZE (single_mode));
3187 rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
3188 rtx src1 = simplify_gen_subreg (single_mode, src, mode,
3189 GET_MODE_SIZE (single_mode));
3191 /* At most one pairing may overlap. */
3192 if (reg_overlap_mentioned_p (dst0, src1))
3194 aarch64_emit_move (dst1, src1);
3195 aarch64_emit_move (dst0, src0);
3197 else
3199 aarch64_emit_move (dst0, src0);
3200 aarch64_emit_move (dst1, src1);
3204 /* Split a 128-bit move operation into two 64-bit move operations,
3205 taking care to handle partial overlap of register to register
3206 copies. Special cases are needed when moving between GP regs and
3207 FP regs. SRC can be a register, constant or memory; DST a register
3208 or memory. If either operand is memory it must not have any side
3209 effects. */
3210 void
3211 aarch64_split_128bit_move (rtx dst, rtx src)
3213 machine_mode mode = GET_MODE (dst);
3215 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
3216 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3217 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3219 if (REG_P (dst) && REG_P (src))
3221 int src_regno = REGNO (src);
3222 int dst_regno = REGNO (dst);
3224 /* Handle FP <-> GP regs. */
3225 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3227 rtx src_lo = gen_lowpart (word_mode, src);
3228 rtx src_hi = gen_highpart (word_mode, src);
3230 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3231 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3232 return;
3234 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3236 rtx dst_lo = gen_lowpart (word_mode, dst);
3237 rtx dst_hi = gen_highpart (word_mode, dst);
3239 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3240 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3241 return;
3245 aarch64_split_double_move (dst, src, word_mode);
3248 /* Return true if we should split a move from 128-bit value SRC
3249 to 128-bit register DEST. */
3251 bool
3252 aarch64_split_128bit_move_p (rtx dst, rtx src)
3254 if (FP_REGNUM_P (REGNO (dst)))
3255 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3256 /* All moves to GPRs need to be split. */
3257 return true;
3260 /* Split a complex SIMD move. */
3262 void
3263 aarch64_split_simd_move (rtx dst, rtx src)
3265 machine_mode src_mode = GET_MODE (src);
3266 machine_mode dst_mode = GET_MODE (dst);
3268 gcc_assert (VECTOR_MODE_P (dst_mode));
3270 if (REG_P (dst) && REG_P (src))
3272 gcc_assert (VECTOR_MODE_P (src_mode));
3273 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3277 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3278 The semantics are those of svreinterpret rather than those of subregs;
3279 see the comment at the head of aarch64-sve.md for details about the
3280 difference. */
3283 aarch64_sve_reinterpret (machine_mode mode, rtx x)
3285 if (GET_MODE (x) == mode)
3286 return x;
3288 /* can_change_mode_class must only return true if subregs and svreinterprets
3289 have the same semantics. */
3290 if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
3291 return force_lowpart_subreg (mode, x, GET_MODE (x));
3293 rtx res = gen_reg_rtx (mode);
3294 x = force_reg (GET_MODE (x), x);
3295 emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
3296 return res;
3299 bool
3300 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3301 machine_mode ymode, rtx y)
3303 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3304 gcc_assert (r != NULL);
3305 return rtx_equal_p (x, r);
3308 /* Return TARGET if it is nonnull and a register of mode MODE.
3309 Otherwise, return a fresh register of mode MODE if we can,
3310 or TARGET reinterpreted as MODE if we can't. */
3312 static rtx
3313 aarch64_target_reg (rtx target, machine_mode mode)
3315 if (target && REG_P (target) && GET_MODE (target) == mode)
3316 return target;
3317 if (!can_create_pseudo_p ())
3319 gcc_assert (target);
3320 return gen_lowpart (mode, target);
3322 return gen_reg_rtx (mode);
3325 /* Return a register that contains the constant in BUILDER, given that
3326 the constant is a legitimate move operand. Use TARGET as the register
3327 if it is nonnull and convenient. */
3329 static rtx
3330 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3332 rtx src = builder.build ();
3333 target = aarch64_target_reg (target, GET_MODE (src));
3334 emit_insn (gen_rtx_SET (target, src));
3335 return target;
3338 static rtx
3339 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3341 if (can_create_pseudo_p ())
3342 return force_reg (mode, value);
3343 else
3345 gcc_assert (x);
3346 aarch64_emit_move (x, value);
3347 return x;
3351 /* Return true if predicate value X is a constant in which every element
3352 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3353 value, i.e. as a predicate in which all bits are significant. */
3355 static bool
3356 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3358 if (!CONST_VECTOR_P (x))
3359 return false;
3361 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3362 GET_MODE_NUNITS (GET_MODE (x)));
3363 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3364 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3365 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3367 unsigned int nelts = const_vector_encoded_nelts (x);
3368 for (unsigned int i = 0; i < nelts; ++i)
3370 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3371 if (!CONST_INT_P (elt))
3372 return false;
3374 builder.quick_push (elt);
3375 for (unsigned int j = 1; j < factor; ++j)
3376 builder.quick_push (const0_rtx);
3378 builder.finalize ();
3379 return true;
3382 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3383 widest predicate element size it can have (that is, the largest size
3384 for which each element would still be 0 or 1). */
3386 unsigned int
3387 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3389 /* Start with the most optimistic assumption: that we only need
3390 one bit per pattern. This is what we will use if only the first
3391 bit in each pattern is ever set. */
3392 unsigned int mask = GET_MODE_SIZE (DImode);
3393 mask |= builder.npatterns ();
3395 /* Look for set bits. */
3396 unsigned int nelts = builder.encoded_nelts ();
3397 for (unsigned int i = 1; i < nelts; ++i)
3398 if (INTVAL (builder.elt (i)) != 0)
3400 if (i & 1)
3401 return 1;
3402 mask |= i;
3404 return mask & -mask;
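/* For example, a constant in which only every fourth bit is set (the .S
   PTRUE built by aarch64_ptrue_all (4) below) gives 4, whereas any
   constant with a set bit at an odd index immediately gives 1.  */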
3407 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3408 return that predicate mode, otherwise return opt_machine_mode (). */
3410 opt_machine_mode
3411 aarch64_ptrue_all_mode (rtx x)
3413 gcc_assert (GET_MODE (x) == VNx16BImode);
3414 if (!CONST_VECTOR_P (x)
3415 || !CONST_VECTOR_DUPLICATE_P (x)
3416 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3417 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3418 return opt_machine_mode ();
3420 unsigned int nelts = const_vector_encoded_nelts (x);
3421 for (unsigned int i = 1; i < nelts; ++i)
3422 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3423 return opt_machine_mode ();
3425 return aarch64_sve_pred_mode (nelts);
3428 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3429 that the constant would have with predicate element size ELT_SIZE
3430 (ignoring the upper bits in each element) and return:
3432 * -1 if all bits are set
3433 * N if the predicate has N leading set bits followed by all clear bits
3434 * 0 if the predicate does not have any of these forms. */
3437 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3438 unsigned int elt_size)
3440 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3441 followed by set bits. */
3442 if (builder.nelts_per_pattern () == 3)
3443 return 0;
3445 /* Skip over leading set bits. */
3446 unsigned int nelts = builder.encoded_nelts ();
3447 unsigned int i = 0;
3448 for (; i < nelts; i += elt_size)
3449 if (INTVAL (builder.elt (i)) == 0)
3450 break;
3451 unsigned int vl = i / elt_size;
3453 /* Check for the all-true case. */
3454 if (i == nelts)
3455 return -1;
3457 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3458 repeating pattern of set bits followed by clear bits. */
3459 if (builder.nelts_per_pattern () != 2)
3460 return 0;
3462 /* We have a "foreground" value and a duplicated "background" value.
3463 If the background might repeat and the last set bit belongs to it,
3464 we might have set bits followed by clear bits followed by set bits. */
3465 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3466 return 0;
3468 /* Make sure that the rest are all clear. */
3469 for (; i < nelts; i += elt_size)
3470 if (INTVAL (builder.elt (i)) != 0)
3471 return 0;
3473 return vl;
3476 /* See if there is an svpattern that encodes an SVE predicate of mode
3477 PRED_MODE in which the first VL bits are set and the rest are clear.
3478 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3479 A VL of -1 indicates an all-true vector. */
3481 aarch64_svpattern
3482 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3484 if (vl < 0)
3485 return AARCH64_SV_ALL;
3487 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3488 return AARCH64_NUM_SVPATTERNS;
3490 if (vl >= 1 && vl <= 8)
3491 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3493 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3494 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3496 int max_vl;
3497 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3499 if (vl == (max_vl / 3) * 3)
3500 return AARCH64_SV_MUL3;
3501 /* These would only trigger for non-power-of-2 lengths. */
3502 if (vl == (max_vl & -4))
3503 return AARCH64_SV_MUL4;
3504 if (vl == (1 << floor_log2 (max_vl)))
3505 return AARCH64_SV_POW2;
3506 if (vl == max_vl)
3507 return AARCH64_SV_ALL;
3509 return AARCH64_NUM_SVPATTERNS;
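/* For example, a VL of 3 gives AARCH64_SV_VL3 and a VL of 64 gives
   AARCH64_SV_VL64 (provided the predicate mode has at least that many
   elements), a VL of -1 gives AARCH64_SV_ALL, and a VL known to exceed
   the number of elements gives AARCH64_NUM_SVPATTERNS.  */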
3512 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3513 bits has the lowest bit set and the upper bits clear. This is the
3514 VNx16BImode equivalent of a PTRUE for controlling elements of
3515 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3516 all bits are significant, even the upper zeros. */
3519 aarch64_ptrue_all (unsigned int elt_size)
3521 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3522 builder.quick_push (const1_rtx);
3523 for (unsigned int i = 1; i < elt_size; ++i)
3524 builder.quick_push (const0_rtx);
3525 return builder.build ();
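/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BImode
   constant { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the canonical PTRUE for
   .S elements viewed as a predicate of bytes.  */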
3528 /* Return an all-true predicate register of mode MODE. */
3531 aarch64_ptrue_reg (machine_mode mode)
3533 gcc_assert (aarch64_sve_pred_mode_p (mode));
3534 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3535 return gen_lowpart (mode, reg);
3538 /* Return an all-false predicate register of mode MODE. */
3541 aarch64_pfalse_reg (machine_mode mode)
3543 gcc_assert (aarch64_sve_pred_mode_p (mode));
3544 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3545 return gen_lowpart (mode, reg);
3548 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3549 for it. PRED2[0] is the predicate for the instruction whose result
3550 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3551 for it. Return true if we can prove that the two predicates are
3552 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3553 with PRED1[0] without changing behavior. */
3555 bool
3556 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3558 machine_mode mode = GET_MODE (pred1[0]);
3559 gcc_assert (aarch64_sve_pred_mode_p (mode)
3560 && mode == GET_MODE (pred2[0])
3561 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3562 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3564 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3565 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3566 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3567 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3568 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3571 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3572 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3573 Use TARGET as the target register if nonnull and convenient. */
3575 static rtx
3576 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3577 machine_mode data_mode, rtx op1, rtx op2)
3579 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3580 expand_operand ops[5];
3581 create_output_operand (&ops[0], target, pred_mode);
3582 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3583 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3584 create_input_operand (&ops[3], op1, data_mode);
3585 create_input_operand (&ops[4], op2, data_mode);
3586 expand_insn (icode, 5, ops);
3587 return ops[0].value;
3590 /* Use a comparison to convert integer vector SRC into MODE, which is
3591 the corresponding SVE predicate mode. Use TARGET for the result
3592 if it's nonnull and convenient. */
3595 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3597 machine_mode src_mode = GET_MODE (src);
3598 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3599 src, CONST0_RTX (src_mode));
3602 /* Return the assembly token for svprfop value PRFOP. */
3604 static const char *
3605 svprfop_token (enum aarch64_svprfop prfop)
3607 switch (prfop)
3609 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3610 AARCH64_FOR_SVPRFOP (CASE)
3611 #undef CASE
3612 case AARCH64_NUM_SVPRFOPS:
3613 break;
3615 gcc_unreachable ();
3618 /* Return the assembly string for an SVE prefetch operation with
3619 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3620 and that SUFFIX is the format for the remaining operands. */
3622 char *
3623 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3624 const char *suffix)
3626 static char buffer[128];
3627 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3628 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3629 mnemonic, svprfop_token (prfop), suffix);
3630 gcc_assert (written < sizeof (buffer));
3631 return buffer;
3634 /* Check whether we can calculate the number of elements in PATTERN
3635 at compile time, given that there are NELTS_PER_VQ elements per
3636 128-bit block. Return the value if so, otherwise return -1. */
3638 HOST_WIDE_INT
3639 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3641 unsigned int vl, const_vg;
3642 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3643 vl = 1 + (pattern - AARCH64_SV_VL1);
3644 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3645 vl = 16 << (pattern - AARCH64_SV_VL16);
3646 else if (aarch64_sve_vg.is_constant (&const_vg))
3648 /* There are two vector granules per quadword. */
3649 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3650 switch (pattern)
3652 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3653 case AARCH64_SV_MUL4: return nelts & -4;
3654 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3655 case AARCH64_SV_ALL: return nelts;
3656 default: gcc_unreachable ();
3659 else
3660 return -1;
3662 /* There are two vector granules per quadword. */
3663 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3664 if (known_le (vl, nelts_all))
3665 return vl;
3667 /* Requesting more elements than are available results in a PFALSE. */
3668 if (known_gt (vl, nelts_all))
3669 return 0;
3671 return -1;
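/* Worked example, assuming -msve-vector-bits=512 and .S elements
   (NELTS_PER_VQ == 4): const_vg == 8, so nelts == 16.  POW2 then folds
   to 16, MUL3 to 15, MUL4 to 16 and VL8 to 8, while VL32 asks for more
   elements than exist and folds to 0.  */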
3674 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3675 by the number of 128-bit quadwords in an SVE vector. */
3677 static bool
3678 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
3680 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3681 return (IN_RANGE (factor, 2, 16 * 16)
3682 && (factor & 1) == 0
3683 && factor <= 16 * (factor & -factor));
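/* For example, FACTOR == 48 passes: it is even, in range, and
   48 <= 16 * (48 & -48) == 256, corresponding to CNTB with MUL #3
   (or equivalently CNTW with MUL #12).  FACTOR == 34 fails because it
   would need a multiplier of 17 with .D elements.  */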
3686 /* Return true if we can move VALUE into a register using a single
3687 CNT[BHWD] instruction. */
3689 static bool
3690 aarch64_sve_cnt_immediate_p (poly_int64 value)
3692 HOST_WIDE_INT factor = value.coeffs[0];
3693 return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
3696 /* Likewise for rtx X. */
3698 bool
3699 aarch64_sve_cnt_immediate_p (rtx x)
3701 poly_int64 value;
3702 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3705 /* Return the asm string for an instruction with a CNT-like vector size
3706 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3707 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3708 first part of the operands template (the part that comes before the
3709 vector size itself). PATTERN is the pattern to use. FACTOR is the
3710 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3711 in each quadword. If it is zero, we can use any element size. */
3713 static char *
3714 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3715 aarch64_svpattern pattern,
3716 unsigned int factor,
3717 unsigned int nelts_per_vq)
3719 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3721 if (nelts_per_vq == 0)
3722 /* There is some overlap in the ranges of the four CNT instructions.
3723 Here we always use the smallest possible element size, so that the
3724 multiplier is 1 wherever possible. */
3725 nelts_per_vq = factor & -factor;
3726 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3727 gcc_assert (IN_RANGE (shift, 1, 4));
3728 char suffix = "dwhb"[shift - 1];
3730 factor >>= shift;
3731 unsigned int written;
3732 if (pattern == AARCH64_SV_ALL && factor == 1)
3733 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3734 prefix, suffix, operands);
3735 else if (factor == 1)
3736 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3737 prefix, suffix, operands, svpattern_token (pattern));
3738 else
3739 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3740 prefix, suffix, operands, svpattern_token (pattern),
3741 factor);
3742 gcc_assert (written < sizeof (buffer));
3743 return buffer;
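/* For instance, with PREFIX "cnt", OPERANDS "%x0" and PATTERN
   AARCH64_SV_ALL: FACTOR == 2 and NELTS_PER_VQ == 0 give "cntd\t%x0",
   while FACTOR == 32 gives "cntb\t%x0, all, mul #2" (the element size
   is chosen so that the multiplier stays as small as possible).  */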
3746 /* Return the asm string for an instruction with a CNT-like vector size
3747 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3748 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3749 first part of the operands template (the part that comes before the
3750 vector size itself). X is the value of the vector size operand,
3751 as a polynomial integer rtx; we need to convert this into an "all"
3752 pattern with a multiplier. */
3754 char *
3755 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3756 rtx x)
3758 poly_int64 value = rtx_to_poly_int64 (x);
3759 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3760 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3761 value.coeffs[1], 0);
3764 /* Return the asm string for an instruction with a CNT-like vector size
3765 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3766 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3767 first part of the operands template (the part that comes before the
3768 vector size itself). CNT_PAT[0..2] are the operands of the
3769 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3771 char *
3772 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3773 const char *operands, rtx *cnt_pat)
3775 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3776 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3777 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3778 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3779 factor, nelts_per_vq);
3782 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3784 bool
3785 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3787 poly_int64 value;
3788 return (poly_int_rtx_p (x, &value)
3789 && (aarch64_sve_cnt_immediate_p (value)
3790 || aarch64_sve_cnt_immediate_p (-value)));
3793 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3794 operand 0. */
3796 char *
3797 aarch64_output_sve_scalar_inc_dec (rtx offset)
3799 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3800 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3801 if (offset_value.coeffs[1] > 0)
3802 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3803 offset_value.coeffs[1], 0);
3804 else
3805 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3806 -offset_value.coeffs[1], 0);
3809 /* Return true if a single RDVL instruction can multiply FACTOR by the
3810 number of 128-bit quadwords in an SVE vector. This is also the
3811 range of ADDVL. */
3813 static bool
3814 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
3816 return (multiple_p (factor, 16)
3817 && IN_RANGE (factor, -32 * 16, 31 * 16));
3820 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
3821 of quadwords in an SVE vector. */
3823 static bool
3824 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
3826 return (multiple_p (factor, 2)
3827 && IN_RANGE (factor, -32 * 2, 31 * 2));
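/* For example, FACTOR == 48 is in ADDVL range (48 / 16 == 3),
   FACTOR == 10 is only in ADDPL range (10 / 2 == 5), and
   FACTOR == 1000 is out of range for both.  */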
3830 /* Return true if we can move VALUE into a register using a single
3831 RDVL instruction. */
3833 static bool
3834 aarch64_sve_rdvl_immediate_p (poly_int64 value)
3836 HOST_WIDE_INT factor = value.coeffs[0];
3837 return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
3840 /* Likewise for rtx X. */
3842 bool
3843 aarch64_sve_rdvl_immediate_p (rtx x)
3845 poly_int64 value;
3846 return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
3849 /* Return the asm string for moving RDVL immediate OFFSET into register
3850 operand 0. */
3852 char *
3853 aarch64_output_sve_rdvl (rtx offset)
3855 static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
3856 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3857 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
3859 int factor = offset_value.coeffs[1];
3860 snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
3861 return buffer;
3864 /* Return true if we can add VALUE to a register using a single ADDVL
3865 or ADDPL instruction. */
3867 static bool
3868 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3870 HOST_WIDE_INT factor = value.coeffs[0];
3871 if (factor == 0 || value.coeffs[1] != factor)
3872 return false;
3873 return (aarch64_sve_rdvl_addvl_factor_p (factor)
3874 || aarch64_sve_addpl_factor_p (factor));
3877 /* Likewise for rtx X. */
3879 bool
3880 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3882 poly_int64 value;
3883 return (poly_int_rtx_p (x, &value)
3884 && aarch64_sve_addvl_addpl_immediate_p (value));
3887 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3888 to operand 1 and storing the result in operand 0. */
3890 char *
3891 aarch64_output_sve_addvl_addpl (rtx offset)
3893 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3894 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3895 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3897 int factor = offset_value.coeffs[1];
3898 if ((factor & 15) == 0)
3899 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3900 else
3901 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3902 return buffer;
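/* For instance, a factor of 48 (three vector lengths) produces
   "addvl\t%x0, %x1, #3", while a factor of 10 (five predicate lengths)
   produces "addpl\t%x0, %x1, #5".  */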
3905 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3906 instruction. If it is, store the number of elements in each vector
3907 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3908 factor in *FACTOR_OUT (if nonnull). */
3910 bool
3911 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3912 unsigned int *nelts_per_vq_out)
3914 rtx elt;
3915 poly_int64 value;
3917 if (!const_vec_duplicate_p (x, &elt)
3918 || !poly_int_rtx_p (elt, &value))
3919 return false;
3921 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3922 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3923 /* There's no vector INCB. */
3924 return false;
3926 HOST_WIDE_INT factor = value.coeffs[0];
3927 if (value.coeffs[1] != factor)
3928 return false;
3930 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3931 if ((factor % nelts_per_vq) != 0
3932 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3933 return false;
3935 if (factor_out)
3936 *factor_out = factor;
3937 if (nelts_per_vq_out)
3938 *nelts_per_vq_out = nelts_per_vq;
3939 return true;
3942 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3943 instruction. */
3945 bool
3946 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3948 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3951 /* Return the asm template for an SVE vector INC or DEC instruction.
3952 OPERANDS gives the operands before the vector count and X is the
3953 value of the vector count operand itself. */
3955 char *
3956 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3958 int factor;
3959 unsigned int nelts_per_vq;
3960 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3961 gcc_unreachable ();
3962 if (factor < 0)
3963 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3964 -factor, nelts_per_vq);
3965 else
3966 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3967 factor, nelts_per_vq);
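/* For example, a VNx4SI duplicate of the value 2 * CNTW (FACTOR == 8,
   NELTS_PER_VQ == 4) produces "incw\t<operands>, all, mul #2", and its
   negation produces the corresponding "decw" form.  */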
3970 /* Return a constant that represents FACTOR multiplied by the
3971 number of 128-bit quadwords in an SME vector. ISA_MODE is the
3972 ISA mode in which the calculation is being performed. */
3974 rtx
3975 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
3976 aarch64_feature_flags isa_mode)
3978 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
3979 if (isa_mode & AARCH64_FL_SM_ON)
3980 /* We're in streaming mode, so we can use normal poly-int values. */
3981 return gen_int_mode ({ factor, factor }, mode);
3983 rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
3984 rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
3985 return gen_rtx_CONST (mode, unspec);
3988 /* Return true if X is a constant that represents some number Y
3989 multiplied by the number of quadwords in an SME vector. Store this Y
3990 in *FACTOR if so. */
3992 static bool
3993 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
3995 if (!TARGET_SME || GET_CODE (x) != CONST)
3996 return false;
3998 x = XEXP (x, 0);
3999 if (GET_CODE (x) != UNSPEC
4000 || XINT (x, 1) != UNSPEC_SME_VQ
4001 || XVECLEN (x, 0) != 1)
4002 return false;
4004 x = XVECEXP (x, 0, 0);
4005 if (!CONST_INT_P (x))
4006 return false;
4008 *factor = INTVAL (x);
4009 return true;
4012 /* Return true if X is a constant that represents some number Y
4013 multiplied by the number of quadwords in an SME vector, and if
4014 that Y is in the range of RDSVL. */
4016 bool
4017 aarch64_rdsvl_immediate_p (const_rtx x)
4019 HOST_WIDE_INT factor;
4020 return (aarch64_sme_vq_unspec_p (x, &factor)
4021 && aarch64_sve_rdvl_addvl_factor_p (factor));
4024 /* Return the asm string for an RDSVL instruction that calculates X,
4025 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
4027 char *
4028 aarch64_output_rdsvl (const_rtx x)
4030 gcc_assert (aarch64_rdsvl_immediate_p (x));
4031 static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4032 x = XVECEXP (XEXP (x, 0), 0, 0);
4033 snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
4034 (int) INTVAL (x) / 16);
4035 return buffer;
4038 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
4040 bool
4041 aarch64_addsvl_addspl_immediate_p (const_rtx x)
4043 HOST_WIDE_INT factor;
4044 return (aarch64_sme_vq_unspec_p (x, &factor)
4045 && (aarch64_sve_rdvl_addvl_factor_p (factor)
4046 || aarch64_sve_addpl_factor_p (factor)));
4049 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4050 Return the asm string for the associated instruction. */
4052 char *
4053 aarch64_output_addsvl_addspl (rtx x)
4055 static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4056 HOST_WIDE_INT factor;
4057 if (!aarch64_sme_vq_unspec_p (x, &factor))
4058 gcc_unreachable ();
4059 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4060 snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4061 (int) factor / 16);
4062 else if (aarch64_sve_addpl_factor_p (factor))
4063 snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4064 (int) factor / 2);
4065 else
4066 gcc_unreachable ();
4067 return buffer;
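/* For example, a factor of 32 gives "addsvl\t%x0, %x1, #2" and a factor
   of 10 gives "addspl\t%x0, %x1, #5".  */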
4070 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4072 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4074 0x0000000100000001ull,
4075 0x0001000100010001ull,
4076 0x0101010101010101ull,
4077 0x1111111111111111ull,
4078 0x5555555555555555ull,
4083 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4084 static bool
4085 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4087 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4088 int bits;
4090 /* Check for a single sequence of one bits and return quickly if so.
4091 The special cases of all ones and all zeroes return false. */
4092 tmp = val + (val & -val);
4094 if (tmp == (tmp & -tmp))
4095 return (val + 1) > 1;
4097 /* Invert if the immediate doesn't start with a zero bit - this means we
4098 only need to search for sequences of one bits. */
4099 if (val & 1)
4100 val = ~val;
4102 /* Find the first set bit and set tmp to val with the first sequence of one
4103 bits removed. Return success if there is a single sequence of ones. */
4104 first_one = val & -val;
4105 tmp = val & (val + first_one);
4107 if (tmp == 0)
4108 return true;
4110 /* Find the next set bit and compute the difference in bit position. */
4111 next_one = tmp & -tmp;
4112 bits = clz_hwi (first_one) - clz_hwi (next_one);
4113 mask = val ^ tmp;
4115 /* Check the bit position difference is a power of 2, and that the first
4116 sequence of one bits fits within 'bits' bits. */
4117 if ((mask >> bits) != 0 || bits != (bits & -bits))
4118 return false;
4120 /* Check the sequence of one bits is repeated 64/bits times. */
4121 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
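/* Worked example: VAL == 0xff00ff00ff00ff00.  There is more than one
   run of ones and the value already starts with a zero bit, so no
   inversion is needed.  The first run covers bits [8, 15] and the next
   run starts at bit 24, giving BITS == 16 and MASK == 0xff00.  Since
   0xff00 * 0x0001000100010001 == VAL, the value is a valid bitmask
   immediate and the function returns true.  */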
4125 /* Return true if VAL is a valid bitmask immediate for MODE. */
4126 bool
4127 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4129 if (mode == DImode)
4130 return aarch64_bitmask_imm (val);
4132 if (mode == SImode)
4133 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4135 /* Replicate small immediates to fit 64 bits. */
4136 int size = GET_MODE_UNIT_PRECISION (mode);
4137 val &= (HOST_WIDE_INT_1U << size) - 1;
4138 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4140 return aarch64_bitmask_imm (val);
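/* For instance, an HImode VAL of 0x00f0 is replicated to
   0x00f000f000f000f0 before the 64-bit check, which accepts it
   (a run of four ones repeated every 16 bits).  */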
4144 /* Return true if the immediate VAL can be a bitmask immediate
4145 by changing the given MASK bits in VAL to zeroes, ones or bits
4146 from the other half of VAL. Return the new immediate in VAL2. */
4147 static inline bool
4148 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4149 unsigned HOST_WIDE_INT &val2,
4150 unsigned HOST_WIDE_INT mask)
4152 val2 = val & ~mask;
4153 if (val2 != val && aarch64_bitmask_imm (val2))
4154 return true;
4155 val2 = val | mask;
4156 if (val2 != val && aarch64_bitmask_imm (val2))
4157 return true;
4158 val = val & ~mask;
4159 val2 = val | (((val >> 32) | (val << 32)) & mask);
4160 if (val2 != val && aarch64_bitmask_imm (val2))
4161 return true;
4162 val2 = val | (((val >> 16) | (val << 48)) & mask);
4163 if (val2 != val && aarch64_bitmask_imm (val2))
4164 return true;
4165 return false;
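/* For example, VAL == 0x00000000ffff1234 with MASK == 0xffff: clearing
   the masked bits gives 0xffff0000, a single run of ones and therefore
   a valid bitmask immediate, so VAL2 is set to 0xffff0000 and the
   function returns true.  */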
4169 /* Return true if VAL is a valid MOVZ immediate. */
4170 static inline bool
4171 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4173 return (val >> (ctz_hwi (val) & 48)) < 65536;
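/* For example, 0x0000123400000000 is a valid MOVZ immediate: the
   trailing zero count is 34, which is clamped down to the hword
   boundary 32, and 0x1234 < 65536 (MOVZ x0, #0x1234, lsl #32).  */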
4177 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4178 bool
4179 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4181 return aarch64_is_movz (val) || aarch64_is_movz (~val)
4182 || aarch64_bitmask_imm (val);
4186 /* Return true if VAL is an immediate that can be created by a single
4187 MOV instruction. */
4188 bool
4189 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4191 gcc_assert (mode == SImode || mode == DImode);
4193 if (val < 65536)
4194 return true;
4196 unsigned HOST_WIDE_INT mask =
4197 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4199 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4200 return true;
4202 val = (val & mask) | ((val << 32) & ~mask);
4203 return aarch64_bitmask_imm (val);
4207 static int
4208 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4209 machine_mode mode)
4211 int i;
4212 unsigned HOST_WIDE_INT val, val2, val3, mask;
4213 int one_match, zero_match;
4214 int num_insns;
4216 gcc_assert (mode == SImode || mode == DImode);
4218 val = INTVAL (imm);
4220 if (aarch64_move_imm (val, mode))
4222 if (generate)
4223 emit_insn (gen_rtx_SET (dest, imm));
4224 return 1;
4227 if ((val >> 32) == 0 || mode == SImode)
4229 if (generate)
4231 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4232 if (mode == SImode)
4233 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4234 GEN_INT ((val >> 16) & 0xffff)));
4235 else
4236 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4237 GEN_INT ((val >> 16) & 0xffff)));
4239 return 2;
4242 /* Remaining cases are all for DImode. */
4244 mask = 0xffff;
4245 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4246 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4247 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4248 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4250 /* Try a bitmask immediate and a movk to generate the immediate
4251 in 2 instructions. */
4253 if (zero_match < 2 && one_match < 2)
4255 for (i = 0; i < 64; i += 16)
4257 if (aarch64_check_bitmask (val, val2, mask << i))
4258 break;
4260 val2 = val & ~(mask << i);
4261 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4262 break;
4265 if (i != 64)
4267 if (generate)
4269 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4270 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4271 GEN_INT ((val >> i) & 0xffff)));
4273 return 2;
4276 /* Try 2 bitmask immediates which are xor'd together. */
4277 for (i = 0; i < 64; i += 16)
4279 val2 = (val >> i) & mask;
4280 val2 |= val2 << 16;
4281 val2 |= val2 << 32;
4282 if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4283 break;
4286 if (i != 64)
4288 if (generate)
4290 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4291 emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4293 return 2;
4297 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4298 if (zero_match + one_match == 0)
4300 for (i = 0; i < 48; i += 16)
4301 for (int j = i + 16; j < 64; j += 16)
4302 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4304 if (generate)
4306 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4307 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4308 GEN_INT ((val >> i) & 0xffff)));
4309 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4310 GEN_INT ((val >> j) & 0xffff)));
4312 return 3;
4315 /* Try shifting and inserting the bottom 32-bits into the top bits. */
4316 val2 = val & 0xffffffff;
4317 val3 = 0xffffffff;
4318 val3 = val2 | (val3 << 32);
4319 for (i = 17; i < 48; i++)
4320 if ((val2 | (val2 << i)) == val)
4322 if (generate)
4324 emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4325 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4326 GEN_INT (val2 >> 16)));
4327 emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4329 return 3;
4331 else if ((val3 & ~(val3 << i)) == val)
4333 if (generate)
4335 emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4336 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4337 GEN_INT (val2 >> 16)));
4338 emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4339 dest));
4341 return 3;
4345 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4346 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4347 otherwise skip zero bits. */
4349 num_insns = 1;
4350 mask = 0xffff;
4351 val2 = one_match > zero_match ? ~val : val;
4352 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4354 if (generate)
4355 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4356 ? (val | ~(mask << i))
4357 : (val & (mask << i)))));
4358 for (i += 16; i < 64; i += 16)
4360 if ((val2 & (mask << i)) == 0)
4361 continue;
4362 if (generate)
4363 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4364 GEN_INT ((val >> i) & 0xffff)));
4365 num_insns ++;
4368 return num_insns;
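/* As a rough example of the two-instruction path above: for
   0xff00ff00ff001234, aarch64_check_bitmask finds the bitmask immediate
   0xff00ff00ff00ff00, so the expansion is a MOV of that value followed
   by a MOVK of #0x1234 into the bottom 16 bits.  */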
4371 /* Return whether imm is a 128-bit immediate which is simple enough to
4372 expand inline. */
4373 bool
4374 aarch64_mov128_immediate (rtx imm)
4376 if (CONST_INT_P (imm))
4377 return true;
4379 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4381 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4382 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4384 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4385 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4389 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4390 a left shift of 0 or 12 bits. */
4391 bool
4392 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4394 return val < 4096 || (val & 0xfff000) == val;
4397 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
4398 that can be created with a left shift of 0 or 12. */
4399 static HOST_WIDE_INT
4400 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4402 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4403 handle correctly. */
4404 gcc_assert (val < 0x1000000);
4406 if (val < 4096)
4407 return val;
4409 return val & 0xfff000;
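/* For example, 0x123 and 0x123000 are valid as-is, 0x1001 is not (it
   has set bits both below bit 12 and at or above it), and
   aarch64_clamp_to_uimm12_shift (0x123456) returns 0x123000.  */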
4413 /* Test whether:
4415 X = (X & AND_VAL) | IOR_VAL;
4417 can be implemented using:
4419 MOVK X, #(IOR_VAL >> shift), LSL #shift
4421 Return the shift if so, otherwise return -1. */
4422 int
4423 aarch64_movk_shift (const wide_int_ref &and_val,
4424 const wide_int_ref &ior_val)
4426 unsigned int precision = and_val.get_precision ();
4427 unsigned HOST_WIDE_INT mask = 0xffff;
4428 for (unsigned int shift = 0; shift < precision; shift += 16)
4430 if (and_val == ~mask && (ior_val & mask) == ior_val)
4431 return shift;
4432 mask <<= 16;
4434 return -1;
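/* For example, AND_VAL == 0xffffffff0000ffff and IOR_VAL == 0x12340000
   return 16, matching "movk\tx0, #0x1234, lsl #16".  */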
4437 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4438 Assumed precondition: VAL_IN is not zero. */
4440 unsigned HOST_WIDE_INT
4441 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4443 int lowest_bit_set = ctz_hwi (val_in);
4444 int highest_bit_set = floor_log2 (val_in);
4445 gcc_assert (val_in != 0);
4447 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4448 (HOST_WIDE_INT_1U << lowest_bit_set));
4451 /* Create constant where bits outside of lowest bit set to highest bit set
4452 are set to 1. */
4454 unsigned HOST_WIDE_INT
4455 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4457 return val_in | ~aarch64_and_split_imm1 (val_in);
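/* Illustration: for VAL_IN == 0x990, aarch64_and_split_imm1 returns the
   contiguous hull 0xff0 and aarch64_and_split_imm2 returns
   0xfffffffffffff99f, so (x & 0xff0) & 0xfffffffffffff99f == x & 0x990.
   Whether the split is actually usable depends on the second constant
   being a valid bitmask immediate, which the function below checks.  */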
4460 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4462 bool
4463 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4465 scalar_int_mode int_mode;
4466 if (!is_a <scalar_int_mode> (mode, &int_mode))
4467 return false;
4469 if (aarch64_bitmask_imm (val_in, int_mode))
4470 return false;
4472 if (aarch64_move_imm (val_in, int_mode))
4473 return false;
4475 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4477 return aarch64_bitmask_imm (imm2, int_mode);
4480 /* Return the number of temporary registers that aarch64_add_offset_1
4481 would need to add OFFSET to a register. */
4483 static unsigned int
4484 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4486 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4489 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4490 a non-polynomial OFFSET. MODE is the mode of the addition.
4491 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4492 be set and CFA adjustments added to the generated instructions.
4494 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4495 temporary if register allocation is already complete. This temporary
4496 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4497 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4498 the immediate again.
4500 Since this function may be used to adjust the stack pointer, we must
4501 ensure that it cannot cause transient stack deallocation (for example
4502 by first incrementing SP and then decrementing when adjusting by a
4503 large immediate). */
4505 static void
4506 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4507 rtx src, HOST_WIDE_INT offset, rtx temp1,
4508 bool frame_related_p, bool emit_move_imm)
4510 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4511 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4513 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4514 rtx_insn *insn;
4516 if (!moffset)
4518 if (!rtx_equal_p (dest, src))
4520 insn = emit_insn (gen_rtx_SET (dest, src));
4521 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4523 return;
4526 /* Single instruction adjustment. */
4527 if (aarch64_uimm12_shift (moffset))
4529 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4530 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4531 return;
4534 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4535 and either:
4537 a) the offset cannot be loaded by a 16-bit move or
4538 b) there is no spare register into which we can move it. */
4539 if (moffset < 0x1000000
4540 && ((!temp1 && !can_create_pseudo_p ())
4541 || !aarch64_move_imm (moffset, mode)))
4543 HOST_WIDE_INT low_off = moffset & 0xfff;
4545 low_off = offset < 0 ? -low_off : low_off;
4546 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4547 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4548 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4549 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4550 return;
4553 /* Emit a move immediate if required and an addition/subtraction. */
4554 if (emit_move_imm)
4556 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4557 temp1 = aarch64_force_temporary (mode, temp1,
4558 gen_int_mode (moffset, mode));
4560 insn = emit_insn (offset < 0
4561 ? gen_sub3_insn (dest, src, temp1)
4562 : gen_add3_insn (dest, src, temp1));
4563 if (frame_related_p)
4565 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4566 rtx adj = plus_constant (mode, src, offset);
4567 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
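/* For example, adding OFFSET == 0x123456 (which is not a single MOV
   immediate) uses the two-addition path above: ADD #0x456 followed by
   ADD #0x123000.  */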
4571 /* Return the number of temporary registers that aarch64_add_offset
4572 would need to move OFFSET into a register or add OFFSET to a register;
4573 ADD_P is true if we want the latter rather than the former. */
4575 static unsigned int
4576 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4578 /* This follows the same structure as aarch64_add_offset. */
4579 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4580 return 0;
4582 unsigned int count = 0;
4583 HOST_WIDE_INT factor = offset.coeffs[1];
4584 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4585 poly_int64 poly_offset (factor, factor);
4586 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4587 /* Need one register for the ADDVL/ADDPL result. */
4588 count += 1;
4589 else if (factor != 0)
4591 factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4592 if (!IN_RANGE (factor, -32, 31))
4593 /* Need one register for the CNT or RDVL result and one for the
4594 multiplication factor. If necessary, the second temporary
4595 can be reused for the constant part of the offset. */
4596 return 2;
4597 /* Need one register for the CNT or RDVL result (which might then
4598 be shifted). */
4599 count += 1;
4601 return count + aarch64_add_offset_1_temporaries (constant);
4604 /* If X can be represented as a poly_int64, return the number
4605 of temporaries that are required to add it to a register.
4606 Return -1 otherwise. */
4608 int
4609 aarch64_add_offset_temporaries (rtx x)
4611 poly_int64 offset;
4612 if (!poly_int_rtx_p (x, &offset))
4613 return -1;
4614 return aarch64_offset_temporaries (true, offset);
4617 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4618 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4619 be set and CFA adjustments added to the generated instructions.
4621 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4622 temporary if register allocation is already complete. This temporary
4623 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4624 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4625 false to avoid emitting the immediate again.
4627 TEMP2, if nonnull, is a second temporary register that doesn't
4628 overlap either DEST or SRC.
4630 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
4631 is measured relative to the SME vector length instead of the current
4632 prevailing vector length. It is 0 otherwise.
4634 Since this function may be used to adjust the stack pointer, we must
4635 ensure that it cannot cause transient stack deallocation (for example
4636 by first incrementing SP and then decrementing when adjusting by a
4637 large immediate). */
4639 static void
4640 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4641 poly_int64 offset, rtx temp1, rtx temp2,
4642 aarch64_feature_flags force_isa_mode,
4643 bool frame_related_p, bool emit_move_imm = true)
4645 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4646 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4647 gcc_assert (temp1 == NULL_RTX
4648 || !frame_related_p
4649 || !reg_overlap_mentioned_p (temp1, dest));
4650 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4652 /* Try using ADDVL or ADDPL to add the whole value. */
4653 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4655 gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
4656 rtx offset_rtx;
4657 if (force_isa_mode == 0)
4658 offset_rtx = gen_int_mode (offset, mode);
4659 else
4660 offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
4661 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4662 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4663 if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
4664 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4665 gen_rtx_SET (dest, plus_constant (Pmode, src,
4666 offset)));
4667 return;
4670 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4671 SVE vector register, over and above the minimum size of 128 bits.
4672 This is equivalent to half the value returned by CNTD with a
4673 vector shape of ALL. */
4674 HOST_WIDE_INT factor = offset.coeffs[1];
4675 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4677 /* Try using ADDVL or ADDPL to add the VG-based part. */
4678 poly_int64 poly_offset (factor, factor);
4679 if (src != const0_rtx
4680 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4682 rtx offset_rtx;
4683 if (force_isa_mode == 0)
4684 offset_rtx = gen_int_mode (poly_offset, mode);
4685 else
4686 offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
4687 if (frame_related_p)
4689 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4690 RTX_FRAME_RELATED_P (insn) = true;
4691 if (force_isa_mode & AARCH64_FL_SM_ON)
4692 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4693 gen_rtx_SET (dest, plus_constant (Pmode, src,
4694 poly_offset)));
4695 src = dest;
4697 else
4699 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4700 src = aarch64_force_temporary (mode, temp1, addr);
4701 temp1 = temp2;
4702 temp2 = NULL_RTX;
4705 /* Otherwise use a CNT-based sequence. */
4706 else if (factor != 0)
4708 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4709 with negative shifts indicating a shift right. */
4710 HOST_WIDE_INT low_bit = least_bit_hwi (factor);
4711 HOST_WIDE_INT rel_factor = factor / low_bit;
4712 int shift = exact_log2 (low_bit) - 4;
4713 gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
4715 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4716 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4718 We can avoid a multiplication if REL_FACTOR is in the range
4719 of RDVL, although there are then various optimizations that
4720 we can try on top. */
4721 rtx_code code = PLUS;
4722 rtx val;
4723 if (IN_RANGE (rel_factor, -32, 31))
4725 if (force_isa_mode & AARCH64_FL_SM_ON)
4727 /* Try to use an unshifted RDSVL, otherwise fall back on
4728 a shifted RDSVL #1. */
4729 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4730 shift = 0;
4731 else
4732 factor = rel_factor * 16;
4733 val = aarch64_sme_vq_immediate (mode, factor, 0);
4735 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4736 else if (aarch64_sve_cnt_factor_p (factor)
4737 || aarch64_sve_rdvl_addvl_factor_p (factor))
4739 val = gen_int_mode (poly_int64 (factor, factor), mode);
4740 shift = 0;
4742 /* Try to subtract an unshifted CNT[BHWD]. */
4743 else if (aarch64_sve_cnt_factor_p (-factor))
4745 code = MINUS;
4746 val = gen_int_mode (poly_int64 (-factor, -factor), mode);
4747 shift = 0;
4749 /* If subtraction is free, prefer to load a positive constant.
4750 In the best case this will fit a shifted CNTB. */
4751 else if (src != const0_rtx && rel_factor < 0)
4753 code = MINUS;
4754 val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
4756 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4757 else
4758 val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
4760 else
4762 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4763 since it should increase the chances of being able to use
4764 a shift and add sequence for the multiplication.
4765 If CNTB << SHIFT is out of range, stick with the current
4766 shift factor. */
4767 if (force_isa_mode == 0
4768 && IN_RANGE (low_bit, 2, 16 * 16))
4770 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
4771 shift = 0;
4773 else if ((force_isa_mode & AARCH64_FL_SM_ON)
4774 && aarch64_sve_rdvl_addvl_factor_p (low_bit))
4776 val = aarch64_sme_vq_immediate (mode, low_bit, 0);
4777 shift = 0;
4779 else
4780 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
4782 val = aarch64_force_temporary (mode, temp1, val);
4784 /* Prefer to multiply by a positive factor and subtract rather
4785 than multiply by a negative factor and add, since positive
4786 values are usually easier to move. */
4787 if (rel_factor < 0 && src != const0_rtx)
4789 rel_factor = -rel_factor;
4790 code = MINUS;
4793 if (can_create_pseudo_p ())
4795 rtx coeff1 = gen_int_mode (rel_factor, mode);
4796 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
4798 else
4800 rtx coeff1 = gen_int_mode (rel_factor, mode);
4801 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4802 val = gen_rtx_MULT (mode, val, coeff1);
4806 /* Multiply by 2 ** SHIFT. */
4807 if (shift > 0)
4809 val = aarch64_force_temporary (mode, temp1, val);
4810 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4812 else if (shift < 0)
4814 val = aarch64_force_temporary (mode, temp1, val);
4815 val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
4818 /* Add the result to SRC or subtract the result from SRC. */
4819 if (src != const0_rtx)
4821 val = aarch64_force_temporary (mode, temp1, val);
4822 val = gen_rtx_fmt_ee (code, mode, src, val);
4824 else if (code == MINUS)
4826 val = aarch64_force_temporary (mode, temp1, val);
4827 val = gen_rtx_NEG (mode, val);
4830 if (constant == 0 || frame_related_p)
4832 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4833 if (frame_related_p)
4835 RTX_FRAME_RELATED_P (insn) = true;
4836 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4837 gen_rtx_SET (dest, plus_constant (Pmode, src,
4838 poly_offset)));
4840 src = dest;
4841 if (constant == 0)
4842 return;
4844 else
4846 src = aarch64_force_temporary (mode, temp1, val);
4847 temp1 = temp2;
4848 temp2 = NULL_RTX;
4851 emit_move_imm = true;
4854 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4855 frame_related_p, emit_move_imm);
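/* For example, if OFFSET is one SVE vector plus one byte and SRC is a
   register, the VG-based part is handled by "addvl\tdest, src, #1" and
   the remaining constant 1 by a final "add\tdest, dest, #1".  */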
4858 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4859 than a poly_int64. */
4861 void
4862 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4863 rtx offset_rtx, rtx temp1, rtx temp2)
4865 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4866 temp1, temp2, 0, false);
4869 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4870 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4871 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4872 contains abs (DELTA). */
4874 static inline void
4875 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
4876 aarch64_feature_flags force_isa_mode, bool emit_move_imm)
4878 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4879 temp1, temp2, force_isa_mode, true, emit_move_imm);
4882 /* Subtract DELTA from the stack pointer, marking the instructions
4883 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4884 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4886 static inline void
4887 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
4888 aarch64_feature_flags force_isa_mode,
4889 bool frame_related_p, bool emit_move_imm = true)
4891 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4892 temp1, temp2, force_isa_mode, frame_related_p,
4893 emit_move_imm);
4896 /* A streaming-compatible function needs to switch temporarily to the known
4897 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4898 the runtime state of PSTATE.SM in the streaming-compatible code, before
4899 the start of the switch to LOCAL_MODE.
4901 Emit instructions to branch around the mode switch if PSTATE.SM already
4902 matches LOCAL_MODE. Return the label that the branch jumps to. */
4904 static rtx_insn *
4905 aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
4907 local_mode &= AARCH64_FL_SM_STATE;
4908 gcc_assert (local_mode != 0);
4909 auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
4910 auto *label = gen_label_rtx ();
4911 auto branch = aarch64_gen_test_and_branch (already_ok_cond, old_svcr, 0,
4912 label);
4913 auto *jump = emit_jump_insn (branch);
4914 JUMP_LABEL (jump) = label;
4915 return label;
4918 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
4919 state in NEW_MODE. This is known to involve either an SMSTART SM or
4920 an SMSTOP SM. */
4922 static void
4923 aarch64_switch_pstate_sm (aarch64_feature_flags old_mode,
4924 aarch64_feature_flags new_mode)
4926 old_mode &= AARCH64_FL_SM_STATE;
4927 new_mode &= AARCH64_FL_SM_STATE;
4928 gcc_assert (old_mode != new_mode);
4930 if ((new_mode & AARCH64_FL_SM_ON)
4931 || (new_mode == 0 && (old_mode & AARCH64_FL_SM_OFF)))
4932 emit_insn (gen_aarch64_smstart_sm ());
4933 else
4934 emit_insn (gen_aarch64_smstop_sm ());
4937 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
4938 FP and predicate registers. This class emits code to preserve any
4939 necessary registers around the mode switch.
4941 The class uses four approaches to saving and restoring contents, enumerated
4942 by group_type:
4944 - GPR: save and restore the contents of FP registers using GPRs.
4945 This is used if the FP register contains no more than 64 significant
4946 bits. The registers used are FIRST_GPR onwards.
4948 - MEM_128: save and restore 128-bit SIMD registers using memory.
4950 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
4952 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
4954 The save slots within each memory group are consecutive, with the
4955 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
4957 There will only be two mode switches for each use of SME, so they should
4958 not be particularly performance-sensitive. It's also rare for SIMD, SVE
4959 or predicate registers to be live across mode switches. We therefore
4960 don't preallocate the save slots but instead allocate them locally on
4961 demand. This makes the code emitted by the class self-contained. */
4963 class aarch64_sme_mode_switch_regs
4965 public:
4966 static const unsigned int FIRST_GPR = R10_REGNUM;
4968 void add_reg (machine_mode, unsigned int);
4969 void add_call_args (rtx_call_insn *);
4970 void add_call_result (rtx_call_insn *);
4971 void add_call_preserved_reg (unsigned int);
4972 void add_call_preserved_regs (bitmap);
4974 void emit_prologue ();
4975 void emit_epilogue ();
4977 /* The number of GPRs needed to save FP registers, starting from
4978 FIRST_GPR. */
4979 unsigned int num_gprs () { return m_group_count[GPR]; }
4981 private:
4982 enum sequence { PROLOGUE, EPILOGUE };
4983 enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
4985 /* Information about the save location for one FP, SIMD, SVE data, or
4986 SVE predicate register. */
4987 struct save_location {
4988 /* The register to be saved. */
4989 rtx reg;
4991 /* Which group the save location belongs to. */
4992 group_type group;
4994 /* A zero-based index of the register within the group. */
4995 unsigned int index;
4998 unsigned int sve_data_headroom ();
4999 rtx get_slot_mem (machine_mode, poly_int64);
5000 void emit_stack_adjust (sequence, poly_int64);
5001 void emit_mem_move (sequence, const save_location &, poly_int64);
5003 void emit_gpr_moves (sequence);
5004 void emit_mem_128_moves (sequence);
5005 void emit_sve_sp_adjust (sequence);
5006 void emit_sve_pred_moves (sequence);
5007 void emit_sve_data_moves (sequence);
5009 /* All save locations, in no particular order. */
5010 auto_vec<save_location, 12> m_save_locations;
5012 /* The number of registers in each group. */
5013 unsigned int m_group_count[NUM_GROUPS] = {};
5016 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5017 switch. */
5019 void
5020 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
5022 if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
5023 return;
5025 unsigned int end_regno = end_hard_regno (mode, regno);
5026 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5027 gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
5028 for (; regno < end_regno; regno++)
5030 /* Force the mode of SVE saves and restores even for single registers.
5031 This is necessary because big-endian targets only allow LDR Z and
5032 STR Z to be used with byte modes. */
5033 machine_mode submode = mode;
5034 if (vec_flags & VEC_SVE_PRED)
5035 submode = VNx16BImode;
5036 else if (vec_flags & VEC_SVE_DATA)
5037 submode = SVE_BYTE_MODE;
5038 else if (vec_flags & VEC_STRUCT)
5040 if (vec_flags & VEC_PARTIAL)
5041 submode = V8QImode;
5042 else
5043 submode = V16QImode;
5045 save_location loc;
5046 loc.reg = gen_rtx_REG (submode, regno);
5047 if (vec_flags & VEC_SVE_PRED)
5049 gcc_assert (PR_REGNUM_P (regno));
5050 loc.group = MEM_SVE_PRED;
5052 else
5054 gcc_assert (FP_REGNUM_P (regno));
5055 if (known_le (GET_MODE_SIZE (submode), 8))
5056 loc.group = GPR;
5057 else if (known_eq (GET_MODE_SIZE (submode), 16))
5058 loc.group = MEM_128;
5059 else
5060 loc.group = MEM_SVE_DATA;
5062 loc.index = m_group_count[loc.group]++;
5063 m_save_locations.quick_push (loc);
5067 /* Record that the arguments to CALL_INSN need to be preserved around
5068 the mode switch. */
5070 void
5071 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
5073 for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
5074 node; node = XEXP (node, 1))
5076 rtx item = XEXP (node, 0);
5077 if (GET_CODE (item) != USE)
5078 continue;
5079 item = XEXP (item, 0);
5080 if (!REG_P (item))
5081 continue;
5082 add_reg (GET_MODE (item), REGNO (item));
5086 /* Record that the return value from CALL_INSN (if any) needs to be
5087 preserved around the mode switch. */
5089 void
5090 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
5092 rtx pat = PATTERN (call_insn);
5093 gcc_assert (GET_CODE (pat) == PARALLEL);
5094 pat = XVECEXP (pat, 0, 0);
5095 if (GET_CODE (pat) == CALL)
5096 return;
5097 rtx dest = SET_DEST (pat);
5098 if (GET_CODE (dest) == PARALLEL)
5099 for (int i = 0; i < XVECLEN (dest, 0); ++i)
5101 rtx x = XVECEXP (dest, 0, i);
5102 gcc_assert (GET_CODE (x) == EXPR_LIST);
5103 rtx reg = XEXP (x, 0);
5104 add_reg (GET_MODE (reg), REGNO (reg));
5106 else
5107 add_reg (GET_MODE (dest), REGNO (dest));
5110 /* REGNO is a register that is call-preserved under the current function's ABI.
5111 Record that it must be preserved around the mode switch. */
5113 void
5114 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno)
5116 if (FP_REGNUM_P (regno))
5117 switch (crtl->abi->id ())
5119 case ARM_PCS_SVE:
5120 add_reg (VNx16QImode, regno);
5121 break;
5122 case ARM_PCS_SIMD:
5123 add_reg (V16QImode, regno);
5124 break;
5125 case ARM_PCS_AAPCS64:
5126 add_reg (DImode, regno);
5127 break;
5128 default:
5129 gcc_unreachable ();
5131 else if (PR_REGNUM_P (regno))
5132 add_reg (VNx16BImode, regno);
5135 /* The hard registers in REGS are call-preserved under the current function's
5136 ABI. Record that they must be preserved around the mode switch. */
5138 void
5139 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs)
5141 bitmap_iterator bi;
5142 unsigned int regno;
5143 EXECUTE_IF_SET_IN_BITMAP (regs, 0, regno, bi)
5144 if (HARD_REGISTER_NUM_P (regno))
5145 add_call_preserved_reg (regno);
5146 else
5147 break;
5150 /* Emit code to save registers before the mode switch. */
5152 void
5153 aarch64_sme_mode_switch_regs::emit_prologue ()
5155 emit_sve_sp_adjust (PROLOGUE);
5156 emit_sve_pred_moves (PROLOGUE);
5157 emit_sve_data_moves (PROLOGUE);
5158 emit_mem_128_moves (PROLOGUE);
5159 emit_gpr_moves (PROLOGUE);
5162 /* Emit code to restore registers after the mode switch. */
5164 void
5165 aarch64_sme_mode_switch_regs::emit_epilogue ()
5167 emit_gpr_moves (EPILOGUE);
5168 emit_mem_128_moves (EPILOGUE);
5169 emit_sve_pred_moves (EPILOGUE);
5170 emit_sve_data_moves (EPILOGUE);
5171 emit_sve_sp_adjust (EPILOGUE);
5174 /* The SVE predicate registers are stored below the SVE data registers,
5175 with the predicate save area being padded to a data-register-sized
5176 boundary. Return the size of this padded area as a whole number
5177 of data register slots. */
5179 unsigned int
5180 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5182 return CEIL (m_group_count[MEM_SVE_PRED], 8);
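/* For example, three predicate saves need three PL-sized slots, which
   round up to CEIL (3, 8) == 1 vector-sized slot of headroom.  */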
5185 /* Return a memory reference of mode MODE to OFFSET bytes from the
5186 stack pointer. */
5188 rtx
5189 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
5190 poly_int64 offset)
5192 rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
5193 return gen_rtx_MEM (mode, addr);
5196 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5198 void
5199 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
5200 poly_int64 size)
5202 if (seq == PROLOGUE)
5203 size = -size;
5204 emit_insn (gen_rtx_SET (stack_pointer_rtx,
5205 plus_constant (Pmode, stack_pointer_rtx, size)));
5208 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5209 the stack pointer. SEQ chooses between saving and restoring. */
5211 void
5212 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
5213 const save_location &loc,
5214 poly_int64 offset)
5216 rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
5217 if (seq == PROLOGUE)
5218 emit_move_insn (mem, loc.reg);
5219 else
5220 emit_move_insn (loc.reg, mem);
5223 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5224 saving and restoring. */
5226 void
5227 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
5229 for (auto &loc : m_save_locations)
5230 if (loc.group == GPR)
5232 gcc_assert (loc.index < 8);
5233 rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
5234 if (seq == PROLOGUE)
5235 emit_move_insn (gpr, loc.reg);
5236 else
5237 emit_move_insn (loc.reg, gpr);
5241 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5242 between saving and restoring. */
5244 void
5245 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
5247 HOST_WIDE_INT count = m_group_count[MEM_128];
5248 if (count == 0)
5249 return;
5251 auto sp = stack_pointer_rtx;
5252 auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
5254 /* Pick a common mode that supports LDR & STR with pre/post-modification
5255 and LDP & STP with pre/post-modification. */
5256 auto mode = TFmode;
5258 /* An instruction pattern that should be emitted at the end. */
5259 rtx last_pat = NULL_RTX;
5261 /* A previous MEM_128 location that hasn't been handled yet. */
5262 save_location *prev_loc = nullptr;
5264 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5265 for (auto &loc : m_save_locations)
5266 if (loc.group == MEM_128)
5268 if (!prev_loc)
5270 prev_loc = &loc;
5271 continue;
5273 gcc_assert (loc.index == prev_loc->index + 1);
5275 /* The offset of the base of the save area from the current
5276 stack pointer. */
5277 HOST_WIDE_INT bias = 0;
5278 if (prev_loc->index == 0 && seq == PROLOGUE)
5279 bias = sp_adjust;
5281 /* Get the two sets in the LDP/STP. */
5282 rtx ops[] = {
5283 gen_rtx_REG (mode, REGNO (prev_loc->reg)),
5284 get_slot_mem (mode, prev_loc->index * 16 + bias),
5285 gen_rtx_REG (mode, REGNO (loc.reg)),
5286 get_slot_mem (mode, loc.index * 16 + bias)
5288 unsigned int lhs = (seq == PROLOGUE);
5289 rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
5290 rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
5292 /* Combine the sets with any stack allocation/deallocation. */
5293 rtx pat;
5294 if (prev_loc->index == 0)
5296 rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
5297 rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
5298 pat = gen_rtx_PARALLEL (VOIDmode, vec);
5300 else if (seq == PROLOGUE)
5301 pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
5302 else
5303 pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
5305 /* Queue a deallocation to the end, otherwise emit the
5306 instruction now. */
5307 if (seq == EPILOGUE && prev_loc->index == 0)
5308 last_pat = pat;
5309 else
5310 emit_insn (pat);
5311 prev_loc = nullptr;
5314 /* Handle any leftover LDR/STR. */
5315 if (prev_loc)
5317 rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
5318 rtx addr;
5319 if (prev_loc->index != 0)
5320 addr = plus_constant (Pmode, sp, prev_loc->index * 16);
5321 else if (seq == PROLOGUE)
5323 rtx allocate = plus_constant (Pmode, sp, -count * 16);
5324 addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
5326 else
5328 rtx deallocate = plus_constant (Pmode, sp, count * 16);
5329 addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
5331 rtx mem = gen_rtx_MEM (mode, addr);
5332 if (seq == PROLOGUE)
5333 emit_move_insn (mem, reg);
5334 else
5335 emit_move_insn (reg, mem);
5338 if (last_pat)
5339 emit_insn (last_pat);
5342 /* Allocate or deallocate the stack space needed by the SVE groups.
5343 SEQ chooses between allocating and deallocating. */
5345 void
5346 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
5348 if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
5349 emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
5352 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5353 and restoring. */
5355 void
5356 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
5358 for (auto &loc : m_save_locations)
5359 if (loc.group == MEM_SVE_DATA)
5361 auto index = loc.index + sve_data_headroom ();
5362 emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
5366 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5367 and restoring. */
5369 void
5370 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
5372 for (auto &loc : m_save_locations)
5373 if (loc.group == MEM_SVE_PRED)
5374 emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
5377 /* Set DEST to (vec_series BASE STEP). */
5379 static void
5380 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5382 machine_mode mode = GET_MODE (dest);
5383 scalar_mode inner = GET_MODE_INNER (mode);
5385 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5386 if (!aarch64_sve_index_immediate_p (base))
5387 base = force_reg (inner, base);
5388 if (!aarch64_sve_index_immediate_p (step))
5389 step = force_reg (inner, step);
5391 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
5394 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5395 register of mode MODE. Use TARGET for the result if it's nonnull
5396 and convenient.
5398 The two vector modes must have the same element mode. The behavior
5399 is to duplicate architectural lane N of SRC into architectural lanes
5400 N + I * STEP of the result. On big-endian targets, architectural
5401 lane 0 of an Advanced SIMD vector is the last element of the vector
5402 in memory layout, so for big-endian targets this operation has the
5403 effect of reversing SRC before duplicating it. Callers need to
5404 account for this. */
5406 rtx
5407 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5409 machine_mode src_mode = GET_MODE (src);
5410 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5411 insn_code icode = (BYTES_BIG_ENDIAN
5412 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5413 : code_for_aarch64_vec_duplicate_vq_le (mode));
5415 unsigned int i = 0;
5416 expand_operand ops[3];
5417 create_output_operand (&ops[i++], target, mode);
5418 create_input_operand (&ops[i++], src, src_mode);
5419 if (BYTES_BIG_ENDIAN)
5421 /* Create a PARALLEL describing the reversal of SRC. */
5422 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5423 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5424 nelts_per_vq - 1, -1);
5425 create_fixed_operand (&ops[i++], sel);
5427 expand_insn (icode, i, ops);
5428 return ops[0].value;
5431 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5432 the memory image into DEST. Return true on success. */
5434 static bool
5435 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5437 src = force_const_mem (GET_MODE (src), src);
5438 if (!src)
5439 return false;
5441 /* Make sure that the address is legitimate. */
5442 if (!aarch64_sve_ld1rq_operand_p (src))
5444 rtx addr = force_reg (Pmode, XEXP (src, 0));
5445 src = replace_equiv_address (src, addr);
5448 machine_mode mode = GET_MODE (dest);
5449 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5450 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5451 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5452 return true;
5455 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5456 by N "background" values. Try to move it into TARGET using:
5458 PTRUE PRED.<T>, VL<N>
5459 MOV TRUE.<T>, #<foreground>
5460 MOV FALSE.<T>, #<background>
5461 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5463 The PTRUE is always a single instruction but the MOVs might need a
5464 longer sequence. If the background value is zero (as it often is),
5465 the sequence can sometimes collapse to a PTRUE followed by a
5466 zero-predicated move.
5468 Return the target on success, otherwise return null. */
5470 static rtx
5471 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5473 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5475 /* Make sure that the PTRUE is valid. */
5476 machine_mode mode = GET_MODE (src);
5477 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5478 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5479 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5480 == AARCH64_NUM_SVPATTERNS)
5481 return NULL_RTX;
5483 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5484 rtx_vector_builder true_builder (mode, npatterns, 1);
5485 rtx_vector_builder false_builder (mode, npatterns, 1);
5486 for (unsigned int i = 0; i < npatterns; ++i)
5488 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5489 pred_builder.quick_push (CONST1_RTX (BImode));
5491 for (unsigned int i = 0; i < npatterns; ++i)
5493 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5494 pred_builder.quick_push (CONST0_RTX (BImode));
5496 expand_operand ops[4];
5497 create_output_operand (&ops[0], target, mode);
5498 create_input_operand (&ops[1], true_builder.build (), mode);
5499 create_input_operand (&ops[2], false_builder.build (), mode);
5500 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5501 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5502 return target;
5505 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5506 SVE data mode and isn't a legitimate constant. Use TARGET for the
5507 result if convenient.
5509 The returned register can have whatever mode seems most natural
5510 given the contents of SRC. */
5512 static rtx
5513 aarch64_expand_sve_const_vector (rtx target, rtx src)
5515 machine_mode mode = GET_MODE (src);
5516 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5517 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5518 scalar_mode elt_mode = GET_MODE_INNER (mode);
5519 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5520 unsigned int container_bits = aarch64_sve_container_bits (mode);
5521 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5523 if (nelts_per_pattern == 1
5524 && encoded_bits <= 128
5525 && container_bits != elt_bits)
5527 /* We have a partial vector mode and a constant whose full-vector
5528 equivalent would occupy a repeating 128-bit sequence. Build that
5529 full-vector equivalent instead, so that we have the option of
5530 using LD1RQ and Advanced SIMD operations. */
5531 unsigned int repeat = container_bits / elt_bits;
5532 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5533 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5534 for (unsigned int i = 0; i < npatterns; ++i)
5535 for (unsigned int j = 0; j < repeat; ++j)
5536 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5537 target = aarch64_target_reg (target, full_mode);
5538 return aarch64_expand_sve_const_vector (target, builder.build ());
5541 if (nelts_per_pattern == 1 && encoded_bits == 128)
5543 /* The constant is a duplicated quadword but can't be narrowed
5544 beyond a quadword. Get the memory image of the first quadword
5545 as a 128-bit vector and try using LD1RQ to load it from memory.
5547 The effect for both endiannesses is to load memory lane N into
5548 architectural lanes N + I * STEP of the result. On big-endian
5549 targets, the layout of the 128-bit vector in an Advanced SIMD
5550 register would be different from its layout in an SVE register,
5551 but this 128-bit vector is a memory value only. */
5552 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5553 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5554 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5555 return target;
5558 if (nelts_per_pattern == 1 && encoded_bits < 128)
5560 /* The vector is a repeating sequence of 64 bits or fewer.
5561 See if we can load them using an Advanced SIMD move and then
5562 duplicate it to fill a vector. This is better than using a GPR
5563 move because it keeps everything in the same register file. */
5564 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5565 rtx_vector_builder builder (vq_mode, npatterns, 1);
5566 for (unsigned int i = 0; i < npatterns; ++i)
5568 /* We want memory lane N to go into architectural lane N,
5569 so reverse for big-endian targets. The DUP .Q pattern
5570 has a compensating reverse built-in. */
5571 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5572 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5574 rtx vq_src = builder.build ();
5575 if (aarch64_simd_valid_immediate (vq_src, NULL))
5577 vq_src = force_reg (vq_mode, vq_src);
5578 return aarch64_expand_sve_dupq (target, mode, vq_src);
5581 /* Get an integer representation of the repeating part of Advanced
5582 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5583 which for big-endian targets is lane-swapped wrt a normal
5584 Advanced SIMD vector. This means that for both endiannesses,
5585 memory lane N of SVE vector SRC corresponds to architectural
5586 lane N of a register holding VQ_SRC. This in turn means that
5587 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5588 as a single 128-bit value) and thus that memory lane 0 of SRC is
5589 in the lsb of the integer. Duplicating the integer therefore
5590 ensures that memory lane N of SRC goes into architectural lane
5591 N + I * INDEX of the SVE register. */
5592 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5593 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5594 if (elt_value)
5596 /* Pretend that we had a vector of INT_MODE to start with. */
5597 elt_mode = int_mode;
5598 mode = aarch64_full_sve_mode (int_mode).require ();
5600 /* If the integer can be moved into a general register by a
5601 single instruction, do that and duplicate the result. */
5602 if (CONST_INT_P (elt_value)
5603 && aarch64_move_imm (INTVAL (elt_value),
5604 encoded_bits <= 32 ? SImode : DImode))
5606 elt_value = force_reg (elt_mode, elt_value);
5607 return expand_vector_broadcast (mode, elt_value);
5610 else if (npatterns == 1)
5611 /* We're duplicating a single value, but can't do better than
5612 force it to memory and load from there. This handles things
5613 like symbolic constants. */
5614 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5616 if (elt_value)
5618 /* Load the element from memory if we can, otherwise move it into
5619 a register and use a DUP. */
5620 rtx op = force_const_mem (elt_mode, elt_value);
5621 if (!op)
5622 op = force_reg (elt_mode, elt_value);
5623 return expand_vector_broadcast (mode, op);
5627 /* Try using INDEX. */
5628 rtx base, step;
5629 if (const_vec_series_p (src, &base, &step))
5631 aarch64_expand_vec_series (target, base, step);
5632 return target;
5635 /* From here on, it's better to force the whole constant to memory
5636 if we can. */
5637 if (GET_MODE_NUNITS (mode).is_constant ())
5638 return NULL_RTX;
5640 if (nelts_per_pattern == 2)
5641 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5642 return res;
5644 /* Expand each pattern individually. */
5645 gcc_assert (npatterns > 1);
5646 rtx_vector_builder builder;
5647 auto_vec<rtx, 16> vectors (npatterns);
5648 for (unsigned int i = 0; i < npatterns; ++i)
5650 builder.new_vector (mode, 1, nelts_per_pattern);
5651 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5652 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5653 vectors.quick_push (force_reg (mode, builder.build ()));
5656 /* Use permutes to interleave the separate vectors. */
5657 while (npatterns > 1)
5659 npatterns /= 2;
5660 for (unsigned int i = 0; i < npatterns; ++i)
5662 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5663 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5664 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5665 vectors[i] = tmp;
5668 gcc_assert (vectors[0] == target);
5669 return target;
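/* Sketch of the interleaving step, as an idealized host-side model only
   (an assumed helper, not used by the compiler).  ZIP1 interleaves the
   leading elements of its two inputs, so for NPATTERNS == 2:

	pattern 0: { a0, a1, a2, ... }
	pattern 1: { b0, b1, b2, ... }
	ZIP1:      { a0, b0, a1, b1, ... }   <-- the original encoding

	#include <cstddef>
	#include <vector>

	// Idealized model of interleaving the element streams of two
	// pattern vectors A and B, as one ZIP1 round does above.
	static std::vector<int>
	interleave (const std::vector<int> &a, const std::vector<int> &b)
	{
	  std::vector<int> r;
	  for (std::size_t i = 0; i < a.size (); ++i)
	    {
	      r.push_back (a[i]);
	      r.push_back (b[i]);
	    }
	  return r;
	}

   Halving NPATTERNS each round therefore terminates with VECTORS[0]
   holding the full constant.  */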
5672 /* Use WHILE to set a predicate register of mode MODE in which the first
5673 VL bits are set and the rest are clear. Use TARGET for the register
5674 if it's nonnull and convenient. */
5676 static rtx
5677 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5678 unsigned int vl)
5680 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5681 target = aarch64_target_reg (target, mode);
5682 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5683 target, const0_rtx, limit));
5684 return target;
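/* A hedged source-level analogue: the ACLE "while" intrinsics produce
   exactly this kind of leading-VL predicate, e.g.

	#include <arm_sve.h>

	svbool_t
	first_three (void)
	{
	  // First three .S lanes active, the rest clear; typically a
	  // single WHILELO (or a PTRUE with an explicit VL pattern).
	  return svwhilelt_b32 (0, 3);
	}

   The expander above is the constant-VL version of the same idea.  */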
5687 static rtx
5688 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5690 /* BUILDER is a constant predicate in which the index of every set bit
5691 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5692 by inverting every element at a multiple of ELT_SIZE and EORing the
5693 result with an ELT_SIZE PTRUE.
5695 Return a register that contains the constant on success, otherwise
5696 return null. Use TARGET as the register if it is nonnull and
5697 convenient. */
5699 static rtx
5700 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5701 unsigned int elt_size)
5703 /* Invert every element at a multiple of ELT_SIZE, keeping the
5704 other bits zero. */
5705 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5706 builder.nelts_per_pattern ());
5707 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5708 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5709 inv_builder.quick_push (const1_rtx);
5710 else
5711 inv_builder.quick_push (const0_rtx);
5712 inv_builder.finalize ();
5714 /* See if we can load the constant cheaply. */
5715 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5716 if (!inv)
5717 return NULL_RTX;
5719 /* EOR the result with an ELT_SIZE PTRUE. */
5720 rtx mask = aarch64_ptrue_all (elt_size);
5721 mask = force_reg (VNx16BImode, mask);
5722 inv = gen_lowpart (VNx16BImode, inv);
5723 target = aarch64_target_reg (target, VNx16BImode);
5724 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5725 return target;
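/* Worked example (assuming ELT_SIZE == 1): to build the predicate
   { 0, 1, 1, 1, ... } the code above first builds its inverse
   { 1, 0, 0, 0, ... }, which is simply PTRUE VL1, and then EORs it with
   PTRUE ALL:

	{ 1, 0, 0, 0, ... } ^ { 1, 1, 1, 1, ... } = { 0, 1, 1, 1, ... }

   so the whole constant costs two PTRUEs and an EOR.  */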
5728 /* BUILDER is a constant predicate in which the index of every set bit
5729 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5730 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5731 register on success, otherwise return null. Use TARGET as the register
5732 if nonnull and convenient. */
5734 static rtx
5735 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5736 unsigned int elt_size,
5737 unsigned int permute_size)
5739 /* We're going to split the constant into two new constants A and B,
5740 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5741 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5743 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5744 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5746 where _ indicates elements that will be discarded by the permute.
5748 First calculate the ELT_SIZEs for A and B. */
5749 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5750 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5751 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5752 if (INTVAL (builder.elt (i)) != 0)
5754 if (i & permute_size)
5755 b_elt_size |= i - permute_size;
5756 else
5757 a_elt_size |= i;
5759 a_elt_size &= -a_elt_size;
5760 b_elt_size &= -b_elt_size;
5762 /* Now construct the vectors themselves. */
5763 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5764 builder.nelts_per_pattern ());
5765 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5766 builder.nelts_per_pattern ());
5767 unsigned int nelts = builder.encoded_nelts ();
5768 for (unsigned int i = 0; i < nelts; ++i)
5769 if (i & (elt_size - 1))
5771 a_builder.quick_push (const0_rtx);
5772 b_builder.quick_push (const0_rtx);
5774 else if ((i & permute_size) == 0)
5776 /* The A and B elements are significant. */
5777 a_builder.quick_push (builder.elt (i));
5778 b_builder.quick_push (builder.elt (i + permute_size));
5780 else
5782 /* The A and B elements are going to be discarded, so pick whatever
5783 is likely to give a nice constant. We are targeting element
5784 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5785 with the aim of each being a sequence of ones followed by
5786 a sequence of zeros. So:
5788 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5789 duplicate the last X_ELT_SIZE element, to extend the
5790 current sequence of ones or zeros.
5792 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5793 zero, so that the constant really does have X_ELT_SIZE and
5794 not a smaller size. */
5795 if (a_elt_size > permute_size)
5796 a_builder.quick_push (const0_rtx);
5797 else
5798 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5799 if (b_elt_size > permute_size)
5800 b_builder.quick_push (const0_rtx);
5801 else
5802 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5804 a_builder.finalize ();
5805 b_builder.finalize ();
5807 /* Try loading A into a register. */
5808 rtx_insn *last = get_last_insn ();
5809 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5810 if (!a)
5811 return NULL_RTX;
5813 /* Try loading B into a register. */
5814 rtx b = a;
5815 if (a_builder != b_builder)
5817 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5818 if (!b)
5820 delete_insns_since (last);
5821 return NULL_RTX;
5825 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5826 operands but permutes them as though they had mode MODE. */
5827 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5828 target = aarch64_target_reg (target, GET_MODE (a));
5829 rtx type_reg = CONST0_RTX (mode);
5830 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5831 return target;
5834 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5835 constant in BUILDER into an SVE predicate register. Return the register
5836 on success, otherwise return null. Use TARGET for the register if
5837 nonnull and convenient.
5839 ALLOW_RECURSE_P is true if we can use methods that would call this
5840 function recursively. */
5842 static rtx
5843 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5844 bool allow_recurse_p)
5846 if (builder.encoded_nelts () == 1)
5847 /* A PFALSE or a PTRUE .B ALL. */
5848 return aarch64_emit_set_immediate (target, builder);
5850 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5851 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5853 /* If we can load the constant using PTRUE, use it as-is. */
5854 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5855 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5856 return aarch64_emit_set_immediate (target, builder);
5858 /* Otherwise use WHILE to set the first VL bits. */
5859 return aarch64_sve_move_pred_via_while (target, mode, vl);
5862 if (!allow_recurse_p)
5863 return NULL_RTX;
5865 /* Try inverting the vector in element size ELT_SIZE and then EORing
5866 the result with an ELT_SIZE PTRUE. */
5867 if (INTVAL (builder.elt (0)) == 0)
5868 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5869 elt_size))
5870 return res;
5872 /* Try using TRN1 to permute two simpler constants. */
5873 for (unsigned int i = elt_size; i <= 8; i *= 2)
5874 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5875 elt_size, i))
5876 return res;
5878 return NULL_RTX;
5881 /* Return an SVE predicate register that contains the VNx16BImode
5882 constant in BUILDER, without going through the move expanders.
5884 The returned register can have whatever mode seems most natural
5885 given the contents of BUILDER. Use TARGET for the result if
5886 convenient. */
5888 static rtx
5889 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5891 /* Try loading the constant using pure predicate operations. */
5892 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5893 return res;
5895 /* Try forcing the constant to memory. */
5896 if (builder.full_nelts ().is_constant ())
5897 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5899 target = aarch64_target_reg (target, VNx16BImode);
5900 emit_move_insn (target, mem);
5901 return target;
5904 /* The last resort is to load the constant as an integer and then
5905 compare it against zero. Use -1 for set bits in order to increase
5906 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5907 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5908 builder.nelts_per_pattern ());
5909 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5910 int_builder.quick_push (INTVAL (builder.elt (i))
5911 ? constm1_rtx : const0_rtx);
5912 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5913 int_builder.build ());
5916 /* Set DEST to immediate IMM. */
5918 void
5919 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5921 machine_mode mode = GET_MODE (dest);
5923 /* Check on what type of symbol it is. */
5924 scalar_int_mode int_mode;
5925 if ((SYMBOL_REF_P (imm)
5926 || LABEL_REF_P (imm)
5927 || GET_CODE (imm) == CONST
5928 || GET_CODE (imm) == CONST_POLY_INT)
5929 && is_a <scalar_int_mode> (mode, &int_mode))
5931 rtx mem;
5932 poly_int64 offset;
5933 HOST_WIDE_INT const_offset;
5934 enum aarch64_symbol_type sty;
5936 /* If we have (const (plus symbol offset)), separate out the offset
5937 before we start classifying the symbol. */
5938 rtx base = strip_offset (imm, &offset);
5940 /* We must always add an offset involving VL separately, rather than
5941 folding it into the relocation. */
5942 if (!offset.is_constant (&const_offset))
5944 if (!TARGET_SVE)
5946 aarch64_report_sve_required ();
5947 return;
5949 if (base == const0_rtx
5950 && (aarch64_sve_cnt_immediate_p (offset)
5951 || aarch64_sve_rdvl_immediate_p (offset)))
5952 emit_insn (gen_rtx_SET (dest, imm));
5953 else
5955 /* Do arithmetic on 32-bit values if the result is smaller
5956 than that. */
5957 if (partial_subreg_p (int_mode, SImode))
5959 /* It is invalid to do symbol calculations in modes
5960 narrower than SImode. */
5961 gcc_assert (base == const0_rtx);
5962 dest = gen_lowpart (SImode, dest);
5963 int_mode = SImode;
5965 if (base != const0_rtx)
5967 base = aarch64_force_temporary (int_mode, dest, base);
5968 aarch64_add_offset (int_mode, dest, base, offset,
5969 NULL_RTX, NULL_RTX, 0, false);
5971 else
5972 aarch64_add_offset (int_mode, dest, base, offset,
5973 dest, NULL_RTX, 0, false);
5975 return;
5978 if (aarch64_rdsvl_immediate_p (base))
5980 /* We could handle non-constant offsets if they are ever
5981 generated. */
5982 gcc_assert (const_offset == 0);
5983 emit_insn (gen_rtx_SET (dest, imm));
5984 return;
5987 sty = aarch64_classify_symbol (base, const_offset);
5988 switch (sty)
5990 case SYMBOL_FORCE_TO_MEM:
5991 if (int_mode != ptr_mode)
5992 imm = convert_memory_address (ptr_mode, imm);
5994 if (const_offset != 0
5995 && targetm.cannot_force_const_mem (ptr_mode, imm))
5997 gcc_assert (can_create_pseudo_p ());
5998 base = aarch64_force_temporary (int_mode, dest, base);
5999 aarch64_add_offset (int_mode, dest, base, const_offset,
6000 NULL_RTX, NULL_RTX, 0, false);
6001 return;
6004 mem = force_const_mem (ptr_mode, imm);
6005 gcc_assert (mem);
6007 /* If we aren't generating PC relative literals, then
6008 we need to expand the literal pool access carefully.
6009 This is something that needs to be done in a number
6010 of places, so could well live as a separate function. */
6011 if (!aarch64_pcrelative_literal_loads)
6013 gcc_assert (can_create_pseudo_p ());
6014 base = gen_reg_rtx (ptr_mode);
6015 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6016 if (ptr_mode != Pmode)
6017 base = convert_memory_address (Pmode, base);
6018 mem = gen_rtx_MEM (ptr_mode, base);
6021 if (int_mode != ptr_mode)
6022 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6024 emit_insn (gen_rtx_SET (dest, mem));
6026 return;
6028 case SYMBOL_SMALL_TLSGD:
6029 case SYMBOL_SMALL_TLSDESC:
6030 case SYMBOL_SMALL_TLSIE:
6031 case SYMBOL_SMALL_GOT_28K:
6032 case SYMBOL_SMALL_GOT_4G:
6033 case SYMBOL_TINY_GOT:
6034 case SYMBOL_TINY_TLSIE:
6035 if (const_offset != 0)
6037 gcc_assert (can_create_pseudo_p ());
6038 base = aarch64_force_temporary (int_mode, dest, base);
6039 aarch64_add_offset (int_mode, dest, base, const_offset,
6040 NULL_RTX, NULL_RTX, 0, false);
6041 return;
6043 /* FALLTHRU */
6045 case SYMBOL_SMALL_ABSOLUTE:
6046 case SYMBOL_TINY_ABSOLUTE:
6047 case SYMBOL_TLSLE12:
6048 case SYMBOL_TLSLE24:
6049 case SYMBOL_TLSLE32:
6050 case SYMBOL_TLSLE48:
6051 aarch64_load_symref_appropriately (dest, imm, sty);
6052 return;
6054 default:
6055 gcc_unreachable ();
6059 if (!CONST_INT_P (imm))
6061 if (aarch64_sve_pred_mode_p (mode))
6063 /* Only the low bit of each .H, .S and .D element is defined,
6064 so we can set the upper bits to whatever we like. If the
6065 predicate is all-true in MODE, prefer to set all the undefined
6066 bits as well, so that we can share a single .B predicate for
6067 all modes. */
6068 if (imm == CONSTM1_RTX (mode))
6069 imm = CONSTM1_RTX (VNx16BImode);
6071 /* All methods for constructing predicate modes wider than VNx16BI
6072 will set the upper bits of each element to zero. Expose this
6073 by moving such constants as a VNx16BI, so that all bits are
6074 significant and so that constants for different modes can be
6075 shared. The wider constant will still be available as a
6076 REG_EQUAL note. */
6077 rtx_vector_builder builder;
6078 if (aarch64_get_sve_pred_bits (builder, imm))
6080 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6081 if (dest != res)
6082 emit_move_insn (dest, gen_lowpart (mode, res));
6083 return;
6087 if (GET_CODE (imm) == HIGH
6088 || aarch64_simd_valid_immediate (imm, NULL))
6090 emit_insn (gen_rtx_SET (dest, imm));
6091 return;
6094 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6095 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6097 if (dest != res)
6098 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6099 return;
6102 rtx mem = force_const_mem (mode, imm);
6103 gcc_assert (mem);
6104 emit_move_insn (dest, mem);
6105 return;
6108 aarch64_internal_mov_immediate (dest, imm, true, mode);
6111 /* Return the MEM rtx that provides the canary value that should be used
6112 for stack-smashing protection. MODE is the mode of the memory.
6113 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6114 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6115 indicates whether the caller is performing a SET or a TEST operation. */
6118 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6119 aarch64_salt_type salt_type)
6121 rtx addr;
6122 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6124 gcc_assert (MEM_P (decl_rtl));
6125 addr = XEXP (decl_rtl, 0);
6126 poly_int64 offset;
6127 rtx base = strip_offset_and_salt (addr, &offset);
6128 if (!SYMBOL_REF_P (base))
6129 return decl_rtl;
6131 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6132 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6133 addr = gen_rtx_CONST (Pmode, addr);
6134 addr = plus_constant (Pmode, addr, offset);
6136 else
6138 /* Calculate the address from the system register. */
6139 rtx salt = GEN_INT (salt_type);
6140 addr = gen_reg_rtx (mode);
6141 if (mode == DImode)
6142 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6143 else
6145 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6146 addr = convert_memory_address (Pmode, addr);
6148 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6150 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
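/* For reference, a hedged usage sketch of the SSP_SYSREG path (option
   spellings as per the aarch64 -mstack-protector-guard* options):

	gcc -fstack-protector-strong \
	    -mstack-protector-guard=sysreg \
	    -mstack-protector-guard-reg=sp_el0 \
	    -mstack-protector-guard-offset=0

   With these options the canary is read from *(sp_el0 + offset) via the
   system-register patterns above, rather than from the global
   __stack_chk_guard variable.  */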
6153 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6154 that is known to contain PTRUE. */
6156 void
6157 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6159 expand_operand ops[3];
6160 machine_mode mode = GET_MODE (dest);
6161 create_output_operand (&ops[0], dest, mode);
6162 create_input_operand (&ops[1], pred, GET_MODE (pred));
6163 create_input_operand (&ops[2], src, mode);
6164 temporary_volatile_ok v (true);
6165 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6168 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6169 operand is in memory. In this case we need to use the predicated LD1
6170 and ST1 instead of LDR and STR, both for correctness on big-endian
6171 targets and because LD1 and ST1 support a wider range of addressing modes.
6172 PRED_MODE is the mode of the predicate.
6174 See the comment at the head of aarch64-sve.md for details about the
6175 big-endian handling. */
6177 void
6178 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6180 machine_mode mode = GET_MODE (dest);
6181 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6182 if (!register_operand (src, mode)
6183 && !register_operand (dest, mode))
6185 rtx tmp = gen_reg_rtx (mode);
6186 if (MEM_P (src))
6187 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6188 else
6189 emit_move_insn (tmp, src);
6190 src = tmp;
6192 aarch64_emit_sve_pred_move (dest, ptrue, src);
6195 /* Called only on big-endian targets. See whether an SVE vector move
6196 from SRC to DEST is effectively a REV[BHW] instruction, because at
6197 least one operand is a subreg of an SVE vector that has wider or
6198 narrower elements. Return true and emit the instruction if so.
6200 For example:
6202 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6204 represents a VIEW_CONVERT between the following vectors, viewed
6205 in memory order:
6207 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6208 R1: { [0], [1], [2], [3], ... }
6210 The high part of lane X in R2 should therefore correspond to lane X*2
6211 of R1, but the register representations are:
6213 msb lsb
6214 R2: ...... [1].high [1].low [0].high [0].low
6215 R1: ...... [3] [2] [1] [0]
6217 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6218 We therefore need a reverse operation to swap the high and low values
6219 around.
6221 This is purely an optimization. Without it we would spill the
6222 subreg operand to the stack in one mode and reload it in the
6223 other mode, which has the same effect as the REV. */
6225 bool
6226 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6228 gcc_assert (BYTES_BIG_ENDIAN);
6230 /* Do not try to optimize subregs that LRA has created for matched
6231 reloads. These subregs only exist as a temporary measure to make
6232 the RTL well-formed, but they are exempt from the usual
6233 TARGET_CAN_CHANGE_MODE_CLASS rules.
6235 For example, if we have:
6237 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6239 and the constraints require R1 and R2 to be in the same register,
6240 LRA may need to create RTL such as:
6242 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6243 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6244 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6246 which forces both the input and output of the original instruction
6247 to use the same hard register. But for this to work, the normal
6248 rules have to be suppressed on the subreg input, otherwise LRA
6249 would need to reload that input too, meaning that the process
6250 would never terminate. To compensate for this, the normal rules
6251 are also suppressed for the subreg output of the first move.
6252 Ignoring the special case and handling the first move normally
6253 would therefore generate wrong code: we would reverse the elements
6254 for the first subreg but not reverse them back for the second subreg. */
6255 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6256 dest = SUBREG_REG (dest);
6257 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6258 src = SUBREG_REG (src);
6260 /* The optimization handles two single SVE REGs with different element
6261 sizes. */
6262 if (!REG_P (dest)
6263 || !REG_P (src)
6264 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6265 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6266 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6267 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6268 return false;
6270 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6271 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6272 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6273 UNSPEC_REV_SUBREG);
6274 emit_insn (gen_rtx_SET (dest, unspec));
6275 return true;
6278 /* Return a copy of X with mode MODE, without changing its other
6279 attributes. Unlike gen_lowpart, this doesn't care whether the
6280 mode change is valid. */
6283 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6285 if (GET_MODE (x) == mode)
6286 return x;
6288 x = shallow_copy_rtx (x);
6289 set_mode_and_regno (x, mode, REGNO (x));
6290 return x;
6293 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6294 stored in wider integer containers. */
6296 static unsigned int
6297 aarch64_sve_rev_unspec (machine_mode mode)
6299 switch (GET_MODE_UNIT_SIZE (mode))
6301 case 1: return UNSPEC_REVB;
6302 case 2: return UNSPEC_REVH;
6303 case 4: return UNSPEC_REVW;
6305 gcc_unreachable ();
6308 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6309 operands. */
6311 void
6312 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6314 /* Decide which REV operation we need. The mode with wider elements
6315 determines the mode of the operands and the mode with the narrower
6316 elements determines the reverse width. */
6317 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6318 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6319 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6320 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6321 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6323 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6324 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6326 /* Get the operands in the appropriate modes and emit the instruction. */
6327 ptrue = gen_lowpart (pred_mode, ptrue);
6328 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6329 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6330 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6331 dest, ptrue, src));
6334 static bool
6335 aarch64_function_ok_for_sibcall (tree, tree exp)
6337 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6338 return false;
6340 tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
6341 if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
6342 return false;
6343 for (auto state : { "za", "zt0" })
6344 if (bool (aarch64_cfun_shared_flags (state))
6345 != bool (aarch64_fntype_shared_flags (fntype, state)))
6346 return false;
6347 return true;
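/* Hedged illustration of the PSTATE.SM check above: a non-streaming
   caller cannot sibcall a streaming callee, because the call would need
   an SMSTART/SMSTOP transition around it.  Using the ACLE keyword
   attribute (with SME enabled, e.g. -march=armv9-a+sme):

	void callee (void) __arm_streaming;

	void
	caller (void)
	{
	  callee ();   // a normal call, never a tail call
	}

   Mismatches in shared ZA or ZT0 state rule out sibcalls in the same
   way.  */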
6350 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6351 passed in SVE registers. */
6353 static bool
6354 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6355 const function_arg_info &arg)
6357 HOST_WIDE_INT size;
6358 machine_mode dummymode;
6359 int nregs;
6361 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6362 if (arg.mode == BLKmode && arg.type)
6363 size = int_size_in_bytes (arg.type);
6364 else
6365 /* No frontends can create types with variable-sized modes, so we
6366 shouldn't be asked to pass or return them. */
6367 size = GET_MODE_SIZE (arg.mode).to_constant ();
6369 /* Aggregates are passed by reference based on their size. */
6370 if (arg.aggregate_type_p ())
6371 size = int_size_in_bytes (arg.type);
6373 /* Variable sized arguments are always passed by reference. */
6374 if (size < 0)
6375 return true;
6377 /* Can this be a candidate to be passed in fp/simd register(s)? */
6378 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6379 &dummymode, &nregs, NULL,
6380 !pcum || pcum->silent_p))
6381 return false;
6383 /* Arguments which are variable sized or larger than 2 registers are
6384 passed by reference unless they are a homogeneous floating point
6385 aggregate. */
6386 return size > 2 * UNITS_PER_WORD;
6389 /* Implement TARGET_PASS_BY_REFERENCE. */
6391 static bool
6392 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6393 const function_arg_info &arg)
6395 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6397 if (!arg.type)
6398 return aarch64_pass_by_reference_1 (pcum, arg);
6400 pure_scalable_type_info pst_info;
6401 switch (pst_info.analyze (arg.type))
6403 case pure_scalable_type_info::IS_PST:
6404 if (pcum && !pcum->silent_p && !TARGET_SVE)
6405 /* We can't gracefully recover at this point, so make this a
6406 fatal error. */
6407 fatal_error (input_location, "arguments of type %qT require"
6408 " the SVE ISA extension", arg.type);
6410 /* Variadic SVE types are passed by reference. Normal non-variadic
6411 arguments are too if we've run out of registers. */
6412 return (!arg.named
6413 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6414 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6416 case pure_scalable_type_info::DOESNT_MATTER:
6417 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6418 return true;
6420 case pure_scalable_type_info::NO_ABI_IDENTITY:
6421 case pure_scalable_type_info::ISNT_PST:
6422 return aarch64_pass_by_reference_1 (pcum, arg);
6424 gcc_unreachable ();
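/* Hedged AAPCS64 examples of the size-based rule above:

	struct two { long x[2]; };	// 16 bytes: passed by value in x-regs
	struct big { long x[3]; };	// 24 bytes: copied, pointer passed
	struct hfa { double d[4]; };	// HFA: passed by value in d0-d3

   Only BIG takes the pass-by-reference path; the HFA stays by value even
   though it is 32 bytes, because it is an fp/simd candidate.  */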
6427 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6428 static bool
6429 aarch64_return_in_msb (const_tree valtype)
6431 machine_mode dummy_mode;
6432 int dummy_int;
6434 /* Never happens in little-endian mode. */
6435 if (!BYTES_BIG_ENDIAN)
6436 return false;
6438 /* Only composite types smaller than or equal to 16 bytes can
6439 potentially be returned in registers. */
6440 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6441 || int_size_in_bytes (valtype) <= 0
6442 || int_size_in_bytes (valtype) > 16)
6443 return false;
6445 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6446 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6447 is always passed/returned in the least significant bits of fp/simd
6448 register(s). */
6449 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6450 &dummy_mode, &dummy_int, NULL,
6451 false))
6452 return false;
6454 /* Likewise pure scalable types for SVE vector and predicate registers. */
6455 pure_scalable_type_info pst_info;
6456 if (pst_info.analyze_registers (valtype))
6457 return false;
6459 return true;
6462 /* Implement TARGET_FUNCTION_VALUE.
6463 Define how to find the value returned by a function. */
6465 static rtx
6466 aarch64_function_value (const_tree type, const_tree func,
6467 bool outgoing ATTRIBUTE_UNUSED)
6469 machine_mode mode;
6470 int unsignedp;
6472 mode = TYPE_MODE (type);
6473 if (INTEGRAL_TYPE_P (type))
6474 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6476 pure_scalable_type_info pst_info;
6477 if (type && pst_info.analyze_registers (type))
6478 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6480 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6481 are returned in memory, not by value. */
6482 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6483 bool sve_p = (vec_flags & VEC_ANY_SVE);
6485 if (aarch64_return_in_msb (type))
6487 HOST_WIDE_INT size = int_size_in_bytes (type);
6489 if (size % UNITS_PER_WORD != 0)
6491 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6492 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6496 int count;
6497 machine_mode ag_mode;
6498 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6499 NULL, false))
6501 gcc_assert (!sve_p);
6502 if (!aarch64_composite_type_p (type, mode))
6504 gcc_assert (count == 1 && mode == ag_mode);
6505 return gen_rtx_REG (mode, V0_REGNUM);
6507 else if (aarch64_advsimd_full_struct_mode_p (mode)
6508 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6509 return gen_rtx_REG (mode, V0_REGNUM);
6510 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6511 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6512 return gen_rtx_REG (mode, V0_REGNUM);
6513 else
6515 int i;
6516 rtx par;
6518 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6519 for (i = 0; i < count; i++)
6521 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6522 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6523 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6524 XVECEXP (par, 0, i) = tmp;
6526 return par;
6529 else
6531 if (sve_p)
6533 /* Vector types can acquire a partial SVE mode using things like
6534 __attribute__((vector_size(N))), and this is potentially useful.
6535 However, the choice of mode doesn't affect the type's ABI
6536 identity, so we should treat the types as though they had
6537 the associated integer mode, just like they did before SVE
6538 was introduced.
6540 We know that the vector must be 128 bits or smaller,
6541 otherwise we'd have returned it in memory instead. */
6542 gcc_assert (type
6543 && (aarch64_some_values_include_pst_objects_p (type)
6544 || (vec_flags & VEC_PARTIAL)));
6546 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6547 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6548 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6549 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6551 return gen_rtx_REG (mode, R0_REGNUM);
6555 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6556 Return true if REGNO is the number of a hard register in which the values
6557 of called function may come back. */
6559 static bool
6560 aarch64_function_value_regno_p (const unsigned int regno)
6562 /* Maximum of 16 bytes can be returned in the general registers. Examples
6563 of 16-byte return values are: 128-bit integers and 16-byte small
6564 structures (excluding homogeneous floating-point aggregates). */
6565 if (regno == R0_REGNUM || regno == R1_REGNUM)
6566 return true;
6568 /* Up to four fp/simd registers can return a function value, e.g. a
6569 homogeneous floating-point aggregate having four members. */
6570 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6571 return TARGET_FLOAT;
6573 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
6574 return TARGET_SVE;
6576 return false;
6579 /* Subroutine for aarch64_return_in_memory for types that are not returned
6580 in SVE registers. */
6582 static bool
6583 aarch64_return_in_memory_1 (const_tree type)
6585 HOST_WIDE_INT size;
6586 machine_mode ag_mode;
6587 int count;
6589 if (!AGGREGATE_TYPE_P (type)
6590 && TREE_CODE (type) != BITINT_TYPE
6591 && TREE_CODE (type) != COMPLEX_TYPE
6592 && TREE_CODE (type) != VECTOR_TYPE)
6593 /* Simple scalar types are always returned in registers. */
6594 return false;
6596 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6597 &ag_mode, &count, NULL, false))
6598 return false;
6600 /* Types larger than 2 registers are returned in memory. */
6601 size = int_size_in_bytes (type);
6602 return (size < 0 || size > 2 * UNITS_PER_WORD);
6605 /* Implement TARGET_RETURN_IN_MEMORY.
6607 If the type T of the result of a function is such that
6608 void func (T arg)
6609 would require that arg be passed as a value in a register (or set of
6610 registers) according to the parameter passing rules, then the result
6611 is returned in the same registers as would be used for such an
6612 argument. */
6614 static bool
6615 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6617 pure_scalable_type_info pst_info;
6618 switch (pst_info.analyze (type))
6620 case pure_scalable_type_info::IS_PST:
6621 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6622 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6624 case pure_scalable_type_info::DOESNT_MATTER:
6625 gcc_assert (aarch64_return_in_memory_1 (type));
6626 return true;
6628 case pure_scalable_type_info::NO_ABI_IDENTITY:
6629 case pure_scalable_type_info::ISNT_PST:
6630 return aarch64_return_in_memory_1 (type);
6632 gcc_unreachable ();
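/* Hedged examples of the return rules implemented above:

	struct s16 { long a, b; };	// 16 bytes: returned in x0/x1
	struct s24 { long a, b, c; };	// 24 bytes: returned in memory via x8
	struct hfa { float f[4]; };	// HFA: returned in s0-s3

   The x8 (indirect result) case corresponds to aarch64_return_in_memory_1
   returning true.  */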
6635 static bool
6636 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6637 const_tree type, int *nregs)
6639 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6640 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6641 &pcum->aapcs_vfp_rmode,
6642 nregs, NULL, pcum->silent_p);
6645 /* Given MODE and TYPE of a function argument, return the alignment in
6646 bits. The idea is to suppress any stronger alignment requested by
6647 the user and opt for the natural alignment (specified in AAPCS64 \S
6648 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6649 was incorrectly calculated in versions of GCC prior to GCC 9.
6650 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6651 calculated in versions between GCC 9 and GCC 13. If the alignment
6652 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6653 is the old GCC 13 alignment, otherwise it is zero.
6655 This is a helper function for local use only. */
6657 static unsigned int
6658 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6659 unsigned int *abi_break_gcc_9,
6660 unsigned int *abi_break_gcc_13,
6661 unsigned int *abi_break_gcc_14)
6663 *abi_break_gcc_9 = 0;
6664 *abi_break_gcc_13 = 0;
6665 *abi_break_gcc_14 = 0;
6666 if (!type)
6667 return GET_MODE_ALIGNMENT (mode);
6669 if (integer_zerop (TYPE_SIZE (type)))
6670 return 0;
6672 gcc_assert (TYPE_MODE (type) == mode);
6674 if (!AGGREGATE_TYPE_P (type))
6676 /* The ABI alignment is the natural alignment of the type, without
6677 any attributes applied. Normally this is the alignment of the
6678 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6679 For now we just handle the known exceptions explicitly. */
6680 type = TYPE_MAIN_VARIANT (type);
6681 if (POINTER_TYPE_P (type))
6683 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
6684 return POINTER_SIZE;
6686 if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
6688 *abi_break_gcc_14 = TYPE_ALIGN (type);
6689 type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
6691 gcc_assert (!TYPE_USER_ALIGN (type));
6692 return TYPE_ALIGN (type);
6695 if (TREE_CODE (type) == ARRAY_TYPE)
6696 return TYPE_ALIGN (TREE_TYPE (type));
6698 unsigned int alignment = 0;
6699 unsigned int bitfield_alignment_with_packed = 0;
6700 unsigned int bitfield_alignment = 0;
6701 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6702 if (TREE_CODE (field) == FIELD_DECL)
6704 /* Note that we explicitly consider zero-sized fields here,
6705 even though they don't map to AAPCS64 machine types.
6706 For example, in:
6708 struct __attribute__((aligned(8))) empty {};
6710 struct s {
6711 [[no_unique_address]] empty e;
6712 int x;
6715 "s" contains only one Fundamental Data Type (the int field)
6716 but gains 8-byte alignment and size thanks to "e". */
6717 alignment = std::max (alignment, DECL_ALIGN (field));
6718 if (DECL_BIT_FIELD_TYPE (field))
6720 /* Take the bit-field type's alignment into account only
6721 if the user didn't reduce this field's alignment with
6722 the packed attribute. */
6723 if (!DECL_PACKED (field))
6724 bitfield_alignment
6725 = std::max (bitfield_alignment,
6726 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6728 /* Compute the alignment even if the bit-field is
6729 packed, so that we can emit a warning in case the
6730 alignment changed between GCC versions. */
6731 bitfield_alignment_with_packed
6732 = std::max (bitfield_alignment_with_packed,
6733 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6737 /* Emit a warning if the alignment is different when taking the
6738 'packed' attribute into account. */
6739 if (bitfield_alignment != bitfield_alignment_with_packed
6740 && bitfield_alignment_with_packed > alignment)
6741 *abi_break_gcc_13 = bitfield_alignment_with_packed;
6743 if (bitfield_alignment > alignment)
6745 *abi_break_gcc_9 = alignment;
6746 return bitfield_alignment;
6749 return alignment;
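/* Hedged sketch of the cases the ABI_BREAK_* outputs track (the exact
   layouts are what -Wpsabi reports, so treat this only as orientation):

	struct s { __int128 x : 1; };

   Releases before GCC 9.1 ignored the declared __int128 type of X and so
   could compute too small an alignment for S (ABI_BREAK_GCC_9).  If X is
   additionally marked __attribute__((packed)), GCC 9 to 12 still counted
   the underlying type despite the packing, which GCC 13.1 stopped doing
   (ABI_BREAK_GCC_13).  ABI_BREAK_GCC_14 covers the ENUMERAL_TYPE handling
   above, where GCC 14.1 switched to using the alignment of the enum's
   underlying type.  */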
6752 /* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
6753 _BitInt(N) type. These include ARRAY_TYPE's with an element that is a
6754 _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
6755 with a field member that is a _BitInt(N) or an aggregate that uses it.
6756 Return false otherwise. */
6758 static bool
6759 bitint_or_aggr_of_bitint_p (tree type)
6761 if (!type)
6762 return false;
6764 if (TREE_CODE (type) == BITINT_TYPE)
6765 return true;
6767 /* If ARRAY_TYPE, check its element type. */
6768 if (TREE_CODE (type) == ARRAY_TYPE)
6769 return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));
6771 /* If RECORD_TYPE or UNION_TYPE, check the fields' types. */
6772 if (RECORD_OR_UNION_TYPE_P (type))
6773 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
6775 if (TREE_CODE (field) != FIELD_DECL)
6776 continue;
6777 if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
6778 return true;
6780 return false;
6783 /* Layout a function argument according to the AAPCS64 rules. The rule
6784 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6785 mode that was originally given to us by the target hook, whereas the
6786 mode in ARG might be the result of replacing partial SVE modes with
6787 the equivalent integer mode. */
6789 static void
6790 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6792 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6793 tree type = arg.type;
6794 machine_mode mode = arg.mode;
6795 int ncrn, nvrn, nregs;
6796 bool allocate_ncrn, allocate_nvrn;
6797 HOST_WIDE_INT size;
6798 unsigned int abi_break_gcc_9;
6799 unsigned int abi_break_gcc_13;
6800 unsigned int abi_break_gcc_14;
6802 /* We need to do this once per argument. */
6803 if (pcum->aapcs_arg_processed)
6804 return;
6806 bool warn_pcs_change
6807 = (warn_psabi
6808 && !pcum->silent_p
6809 && (currently_expanding_function_start
6810 || currently_expanding_gimple_stmt));
6812 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6814 typedef struct foo {
6815 __Int8x16_t foo[2] __attribute__((aligned(32)));
6816 } foo;
6818 is still a HVA despite its larger-than-normal alignment.
6819 However, such over-aligned HFAs and HVAs are guaranteed to have
6820 no padding.
6822 If we exclude HFAs and HVAs from the discussion below, then there
6823 are several things to note:
6825 - Both the C and AAPCS64 interpretations of a type's alignment should
6826 give a value that is no greater than the type's size.
6828 - Types bigger than 16 bytes are passed indirectly.
6830 - If an argument of type T is passed indirectly, TYPE and MODE describe
6831 a pointer to T rather than T itself.
6833 It follows that the AAPCS64 alignment of TYPE must be no greater
6834 than 16 bytes.
6836 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6837 and so could calculate an alignment that was too small. If this
6838 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6840 Although GCC 9.1 fixed that bug, it introduced a different one:
6841 it would consider the alignment of a bitfield's underlying type even
6842 if the field was packed (which should have the effect of overriding
6843 the alignment of the underlying type). This was fixed in GCC 13.1.
6845 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6846 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6847 this older, too-big alignment.
6849 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6850 alignments meant they could calculate type alignments that were
6851 bigger than the type's size, contrary to the assumption above.
6852 The handling of register arguments was nevertheless (and justifiably)
6853 written to follow the assumption that the alignment can never be
6854 greater than the size. The same was not true for stack arguments;
6855 their alignment was instead handled by MIN bounds in
6856 aarch64_function_arg_boundary.
6858 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6859 an alignment of more than 16 bytes for TYPE then:
6861 - If the argument was passed in registers, these GCC versions
6862 would treat the alignment as though it was *less than* 16 bytes.
6864 - If the argument was passed on the stack, these GCC versions
6865 would treat the alignment as though it was *equal to* 16 bytes.
6867 Both behaviors were wrong, but in different cases. */
6869 pcum->aapcs_arg_processed = true;
6871 pure_scalable_type_info pst_info;
6872 if (type && pst_info.analyze_registers (type))
6874 /* aarch64_function_arg_alignment has never had an effect on
6875 this case. */
6877 /* The PCS says that it is invalid to pass an SVE value to an
6878 unprototyped function. There is no ABI-defined location we
6879 can return in this case, so we have no real choice but to raise
6880 an error immediately, even though this is only a query function. */
6881 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6883 gcc_assert (!pcum->silent_p);
6884 error ("SVE type %qT cannot be passed to an unprototyped function",
6885 arg.type);
6886 /* Avoid repeating the message, and avoid tripping the assert
6887 below. */
6888 pcum->pcs_variant = ARM_PCS_SVE;
6891 /* We would have converted the argument into pass-by-reference
6892 form if it didn't fit in registers. */
6893 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6894 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6895 gcc_assert (arg.named
6896 && pcum->pcs_variant == ARM_PCS_SVE
6897 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6898 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6899 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6900 P0_REGNUM + pcum->aapcs_nprn);
6901 return;
6904 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6905 are passed by reference, not by value. */
6906 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6907 bool sve_p = (vec_flags & VEC_ANY_SVE);
6908 if (sve_p)
6909 /* Vector types can acquire a partial SVE mode using things like
6910 __attribute__((vector_size(N))), and this is potentially useful.
6911 However, the choice of mode doesn't affect the type's ABI
6912 identity, so we should treat the types as though they had
6913 the associated integer mode, just like they did before SVE
6914 was introduced.
6916 We know that the vector must be 128 bits or smaller,
6917 otherwise we'd have passed it in memory instead. */
6918 gcc_assert (type
6919 && (aarch64_some_values_include_pst_objects_p (type)
6920 || (vec_flags & VEC_PARTIAL)));
6922 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
6923 if (type)
6924 size = int_size_in_bytes (type);
6925 else
6926 /* No frontends can create types with variable-sized modes, so we
6927 shouldn't be asked to pass or return them. */
6928 size = GET_MODE_SIZE (mode).to_constant ();
6929 size = ROUND_UP (size, UNITS_PER_WORD);
6931 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6932 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6933 mode,
6934 type,
6935 &nregs);
6936 gcc_assert (!sve_p || !allocate_nvrn);
6938 unsigned int alignment
6939 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
6940 &abi_break_gcc_13, &abi_break_gcc_14);
6942 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
6943 && (!alignment || abi_break_gcc_9 < alignment)
6944 && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
6946 /* _BitInt(N) was only added in GCC 14. */
6947 bool warn_pcs_change_le_gcc14
6948 = warn_pcs_change && !bitint_or_aggr_of_bitint_p (type);
6950 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6951 The following code thus handles passing by SIMD/FP registers first. */
6953 nvrn = pcum->aapcs_nvrn;
6955 /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
6956 and homogeneous short-vector aggregates (HVA). */
6957 if (allocate_nvrn)
6959 /* aarch64_function_arg_alignment has never had an effect on
6960 this case. */
6961 if (!pcum->silent_p && !TARGET_FLOAT)
6962 aarch64_err_no_fpadvsimd (mode);
6964 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6966 pcum->aapcs_nextnvrn = nvrn + nregs;
6967 if (!aarch64_composite_type_p (type, mode))
6969 gcc_assert (nregs == 1);
6970 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6972 else if (aarch64_advsimd_full_struct_mode_p (mode)
6973 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
6974 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6975 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6976 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
6977 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6978 else
6980 rtx par;
6981 int i;
6982 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6983 for (i = 0; i < nregs; i++)
6985 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6986 V0_REGNUM + nvrn + i);
6987 rtx offset = gen_int_mode
6988 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6989 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6990 XVECEXP (par, 0, i) = tmp;
6992 pcum->aapcs_reg = par;
6994 return;
6996 else
6998 /* C.3 NSRN is set to 8. */
6999 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7000 goto on_stack;
7004 ncrn = pcum->aapcs_ncrn;
7005 nregs = size / UNITS_PER_WORD;
7007 /* C6 - C9, though the sign and zero extension semantics are
7008 handled elsewhere. This is the case where the argument fits
7009 entirely in general registers. */
7010 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7012 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7014 /* C.8 if the argument has an alignment of 16 then the NGRN is
7015 rounded up to the next even number. */
7016 if (nregs == 2
7017 && ncrn % 2)
7019 /* Emit a warning if the alignment changed when taking the
7020 'packed' attribute into account. */
7021 if (warn_pcs_change_le_gcc14
7022 && abi_break_gcc_13
7023 && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
7024 != (alignment == 16 * BITS_PER_UNIT)))
7025 inform (input_location, "parameter passing for argument of type "
7026 "%qT changed in GCC 13.1", type);
7028 if (warn_pcs_change_le_gcc14
7029 && abi_break_gcc_14
7030 && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
7031 != (alignment == 16 * BITS_PER_UNIT)))
7032 inform (input_location, "parameter passing for argument of type "
7033 "%qT changed in GCC 14.1", type);
7035 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7036 comparison is there because for > 16 * BITS_PER_UNIT
7037 alignment nregs should be > 2 and therefore it should be
7038 passed by reference rather than value. */
7039 if (alignment == 16 * BITS_PER_UNIT)
7041 if (warn_pcs_change_le_gcc14
7042 && abi_break_gcc_9)
7043 inform (input_location, "parameter passing for argument of type "
7044 "%qT changed in GCC 9.1", type);
7045 ++ncrn;
7046 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7050 /* If an argument with an SVE mode needs to be shifted up to the
7051 high part of the register, treat it as though it had an integer mode.
7052 Using the normal (parallel [...]) would suppress the shifting. */
7053 if (sve_p
7054 && BYTES_BIG_ENDIAN
7055 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7056 && aarch64_pad_reg_upward (mode, type, false))
7058 mode = int_mode_for_mode (mode).require ();
7059 sve_p = false;
7062 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7063 A reg is still generated for it, but the caller should be smart
7064 enough not to use it. */
7065 if (nregs == 0
7066 || (nregs == 1 && !sve_p)
7067 || GET_MODE_CLASS (mode) == MODE_INT)
7068 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7069 else
7071 rtx par;
7072 int i;
7074 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7075 for (i = 0; i < nregs; i++)
7077 scalar_int_mode reg_mode = word_mode;
7078 if (nregs == 1)
7079 reg_mode = int_mode_for_mode (mode).require ();
7080 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7081 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7082 GEN_INT (i * UNITS_PER_WORD));
7083 XVECEXP (par, 0, i) = tmp;
7085 pcum->aapcs_reg = par;
7088 pcum->aapcs_nextncrn = ncrn + nregs;
7089 return;
7092 /* C.11 */
7093 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7095 /* The argument is passed on stack; record the needed number of words for
7096 this argument and align the total size if necessary. */
7097 on_stack:
7098 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7100 if (warn_pcs_change_le_gcc14
7101 && abi_break_gcc_13
7102 && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
7103 != (alignment >= 16 * BITS_PER_UNIT)))
7104 inform (input_location, "parameter passing for argument of type "
7105 "%qT changed in GCC 13.1", type);
7107 if (warn_pcs_change_le_gcc14
7108 && abi_break_gcc_14
7109 && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
7110 != (alignment >= 16 * BITS_PER_UNIT)))
7111 inform (input_location, "parameter passing for argument of type "
7112 "%qT changed in GCC 14.1", type);
7114 if (alignment == 16 * BITS_PER_UNIT)
7116 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7117 if (pcum->aapcs_stack_size != new_size)
7119 if (warn_pcs_change_le_gcc14
7120 && abi_break_gcc_9)
7121 inform (input_location, "parameter passing for argument of type "
7122 "%qT changed in GCC 9.1", type);
7123 pcum->aapcs_stack_size = new_size;
7126 return;
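/* Rough arithmetic for the rounding above: aapcs_stack_size is counted
   in words, and with UNITS_PER_WORD == 8 the ROUND_UP to
   16 / UNITS_PER_WORD == 2 words gives 16-byte alignment.  For example,
   if three words (24 bytes) of stack arguments have already been
   allocated and the next argument needs 16-byte alignment, the count is
   rounded up to four words before the argument is placed.  */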
7129 /* Add the current argument register to the set of those that need
7130 to be saved and restored around a change to PSTATE.SM. */
7132 static void
7133 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7135 subrtx_var_iterator::array_type array;
7136 FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
7138 rtx x = *iter;
7139 if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
7141 unsigned int i = pcum->num_sme_mode_switch_args++;
7142 gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
7143 pcum->sme_mode_switch_args[i] = x;
7148 /* Return a parallel that contains all the registers that need to be
7149 saved around a change to PSTATE.SM. Return const0_rtx if there is
7150 no such mode switch, or if no registers need to be saved. */
7152 static rtx
7153 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7155 if (!pcum->num_sme_mode_switch_args)
7156 return const0_rtx;
7158 auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
7159 pcum->sme_mode_switch_args);
7160 return gen_rtx_PARALLEL (VOIDmode, argvec);
7163 /* Implement TARGET_FUNCTION_ARG. */
7165 static rtx
7166 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7168 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7169 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7170 || pcum->pcs_variant == ARM_PCS_SIMD
7171 || pcum->pcs_variant == ARM_PCS_SVE);
7173 if (arg.end_marker_p ())
7175 rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
7176 pcum->pcs_variant);
7177 rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
7178 rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
7179 rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
7180 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
7181 sme_mode_switch_args,
7182 shared_za_flags,
7183 shared_zt0_flags));
7186 aarch64_layout_arg (pcum_v, arg);
7187 return pcum->aapcs_reg;
7190 void
7191 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7192 const_tree fntype,
7193 rtx libname ATTRIBUTE_UNUSED,
7194 const_tree fndecl,
7195 unsigned n_named ATTRIBUTE_UNUSED,
7196 bool silent_p)
7198 pcum->aapcs_ncrn = 0;
7199 pcum->aapcs_nvrn = 0;
7200 pcum->aapcs_nprn = 0;
7201 pcum->aapcs_nextncrn = 0;
7202 pcum->aapcs_nextnvrn = 0;
7203 pcum->aapcs_nextnprn = 0;
7204 if (fntype)
7206 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7207 pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
7209 else
7211 pcum->pcs_variant = ARM_PCS_AAPCS64;
7212 pcum->isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
7214 pcum->aapcs_reg = NULL_RTX;
7215 pcum->aapcs_arg_processed = false;
7216 pcum->aapcs_stack_words = 0;
7217 pcum->aapcs_stack_size = 0;
7218 pcum->silent_p = silent_p;
7219 pcum->shared_za_flags
7220 = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
7221 pcum->shared_zt0_flags
7222 = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
7223 pcum->num_sme_mode_switch_args = 0;
7225 if (!silent_p
7226 && !TARGET_FLOAT
7227 && fntype && fntype != error_mark_node)
7229 const_tree type = TREE_TYPE (fntype);
7230 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7231 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7232 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7233 &mode, &nregs, NULL, false))
7234 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7237 if (!silent_p
7238 && !TARGET_SVE
7239 && pcum->pcs_variant == ARM_PCS_SVE)
7241 /* We can't gracefully recover at this point, so make this a
7242 fatal error. */
7243 if (fndecl)
7244 fatal_error (input_location, "%qE requires the SVE ISA extension",
7245 fndecl);
7246 else
7247 fatal_error (input_location, "calls to functions of type %qT require"
7248 " the SVE ISA extension", fntype);
7252 static void
7253 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7254 const function_arg_info &arg)
7256 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7257 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7258 || pcum->pcs_variant == ARM_PCS_SIMD
7259 || pcum->pcs_variant == ARM_PCS_SVE)
7261 aarch64_layout_arg (pcum_v, arg);
7262 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7263 != (pcum->aapcs_stack_words != 0));
7264 if (pcum->aapcs_reg
7265 && aarch64_call_switches_pstate_sm (pcum->isa_mode))
7266 aarch64_record_sme_mode_switch_args (pcum);
7268 pcum->aapcs_arg_processed = false;
7269 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7270 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7271 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7272 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7273 pcum->aapcs_stack_words = 0;
7274 pcum->aapcs_reg = NULL_RTX;
7278 bool
7279 aarch64_function_arg_regno_p (unsigned regno)
7281 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7282 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7283 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7286 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7287 PARM_BOUNDARY bits of alignment, but will be given anything up
7288 to STACK_BOUNDARY bits if the type requires it. This makes sure
7289 that both before and after the layout of each argument, the Next
7290 Stacked Argument Address (NSAA) will have a minimum alignment of
7291 8 bytes. */
7293 static unsigned int
7294 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7296 unsigned int abi_break_gcc_9;
7297 unsigned int abi_break_gcc_13;
7298 unsigned int abi_break_gcc_14;
7299 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7300 &abi_break_gcc_9,
7301 &abi_break_gcc_13,
7302 &abi_break_gcc_14);
7303 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7304 to emit warnings about ABI incompatibility. */
7305 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7306 return alignment;
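/* A small example of the clamping above, assuming the usual values of
   PARM_BOUNDARY (64 bits) and STACK_BOUNDARY (128 bits) for this
   target: a plain 'int' argument is raised to 64-bit alignment, while
   an over-aligned type requesting, say, 256-bit alignment is capped at
   128 bits.  */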
7309 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7311 static fixed_size_mode
7312 aarch64_get_reg_raw_mode (int regno)
7314 /* Don't use any non-GP registers for __builtin_apply and
7315 __builtin_return if general-registers-only mode is requested. */
7316 if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
7317 return as_a <fixed_size_mode> (VOIDmode);
7318 if (TARGET_SVE && FP_REGNUM_P (regno))
7319 /* Don't use the SVE part of the register for __builtin_apply and
7320 __builtin_return. The SVE registers aren't used by the normal PCS,
7321 so using them there would be a waste of time. The PCS extensions
7322 for SVE types are fundamentally incompatible with the
7323 __builtin_return/__builtin_apply interface. */
7324 return as_a <fixed_size_mode> (V16QImode);
7325 if (PR_REGNUM_P (regno))
7326 /* For SVE PR regs, indicate that they should be ignored for
7327 __builtin_apply/__builtin_return. */
7328 return as_a <fixed_size_mode> (VOIDmode);
7329 return default_get_reg_raw_mode (regno);
7332 /* Implement TARGET_FUNCTION_ARG_PADDING.
7334 Small aggregate types are placed in the lowest memory address.
7336 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7338 static pad_direction
7339 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7341 /* On little-endian targets, the least significant byte of every stack
7342 argument is passed at the lowest byte address of the stack slot. */
7343 if (!BYTES_BIG_ENDIAN)
7344 return PAD_UPWARD;
7346 /* Otherwise, integral, floating-point and pointer types are padded downward:
7347 the least significant byte of a stack argument is passed at the highest
7348 byte address of the stack slot. */
7349 if (type
7350 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7351 || POINTER_TYPE_P (type))
7352 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7353 return PAD_DOWNWARD;
7355 /* Everything else is padded upward, i.e. the data starts at the first byte of the stack slot. */
7356 return PAD_UPWARD;
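/* For example, on a big-endian target a 'short' passed on the stack is
   padded downward, so its two data bytes occupy the highest-addressed
   bytes of the slot, whereas a small structure is padded upward and
   starts at the lowest address of the slot.  On little-endian targets
   everything is padded upward.  */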
7359 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7361 It specifies the padding for the last (and possibly the only)
7362 element of a block move between registers and memory. Viewing
7363 the block as it appears in memory, padding upward means that
7364 the last element is padded after its most significant byte,
7365 while with downward padding the last element is padded on its
7366 least significant byte side.
7368 Small aggregates and small complex types are always padded
7369 upwards.
7371 We don't need to worry about homogeneous floating-point or
7372 short-vector aggregates; their move is not affected by the
7373 padding direction determined here. Regardless of endianness,
7374 each element of such an aggregate is put in the least
7375 significant bits of a fp/simd register.
7377 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7378 register has useful data, and return the opposite if the most
7379 significant byte does. */
7381 bool
7382 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7383 bool first ATTRIBUTE_UNUSED)
7386 /* Aside from pure scalable types, small composite types are always
7387 padded upward. */
7388 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7390 HOST_WIDE_INT size;
7391 if (type)
7392 size = int_size_in_bytes (type);
7393 else
7394 /* No frontends can create types with variable-sized modes, so we
7395 shouldn't be asked to pass or return them. */
7396 size = GET_MODE_SIZE (mode).to_constant ();
7397 if (size < 2 * UNITS_PER_WORD)
7399 pure_scalable_type_info pst_info;
7400 if (pst_info.analyze_registers (type))
7401 return false;
7402 return true;
7406 /* Otherwise, use the default padding. */
7407 return !BYTES_BIG_ENDIAN;
7410 static scalar_int_mode
7411 aarch64_libgcc_cmp_return_mode (void)
7413 return SImode;
7416 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7418 /* We use the 12-bit shifted immediate arithmetic instructions, so values
7419 must be a multiple of (1 << 12), i.e. 4096. */
7420 #define ARITH_FACTOR 4096
7422 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7423 #error Cannot use simple address calculation for stack probing
7424 #endif
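/* In other words, since ARITH_FACTOR is 4096, the check above requires
   PROBE_INTERVAL to be a multiple of 4096, i.e.
   STACK_CHECK_PROBE_INTERVAL_EXP must be at least 12; otherwise the
   probe offsets could not be encoded as 12-bit shifted immediates.  */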
7426 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7427 inclusive. These are offsets from the current stack pointer. */
7429 static void
7430 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7432 HOST_WIDE_INT size;
7433 if (!poly_size.is_constant (&size))
7435 sorry ("stack probes for SVE frames");
7436 return;
7439 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7441 /* See the same assertion on PROBE_INTERVAL above. */
7442 gcc_assert ((first % ARITH_FACTOR) == 0);
7444 /* See if we have a constant small number of probes to generate. If so,
7445 that's the easy case. */
7446 if (size <= PROBE_INTERVAL)
7448 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7450 emit_set_insn (reg1,
7451 plus_constant (Pmode,
7452 stack_pointer_rtx, -(first + base)));
7453 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
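/* Worked example (assuming, for illustration, PROBE_INTERVAL == 4096):
   for FIRST == 16 and SIZE == 4000, BASE is rounded up to 4096, REG1
   becomes SP - 4112, and the probe lands at REG1 + 96 == SP - 4016,
   i.e. exactly FIRST + SIZE bytes below the stack pointer.  */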
7456 /* The run-time loop is made up of 8 insns in the generic case while the
7457 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
7458 else if (size <= 4 * PROBE_INTERVAL)
7460 HOST_WIDE_INT i, rem;
7462 emit_set_insn (reg1,
7463 plus_constant (Pmode,
7464 stack_pointer_rtx,
7465 -(first + PROBE_INTERVAL)));
7466 emit_stack_probe (reg1);
7468 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7469 it exceeds SIZE. If only two probes are needed, this will not
7470 generate any code. Then probe at FIRST + SIZE. */
7471 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7473 emit_set_insn (reg1,
7474 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7475 emit_stack_probe (reg1);
7478 rem = size - (i - PROBE_INTERVAL);
7479 if (rem > 256)
7481 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7483 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7484 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7486 else
7487 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7490 /* Otherwise, do the same as above, but in a loop. Note that we must be
7491 extra careful with variables wrapping around because we might be at
7492 the very top (or the very bottom) of the address space and we have
7493 to be able to handle this case properly; in particular, we use an
7494 equality test for the loop condition. */
7495 else
7497 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7499 /* Step 1: round SIZE to the previous multiple of the interval. */
7501 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7504 /* Step 2: compute initial and final value of the loop counter. */
7506 /* TEST_ADDR = SP + FIRST. */
7507 emit_set_insn (reg1,
7508 plus_constant (Pmode, stack_pointer_rtx, -first));
7510 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7511 HOST_WIDE_INT adjustment = - (first + rounded_size);
7512 if (! aarch64_uimm12_shift (adjustment))
7514 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7515 true, Pmode);
7516 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7518 else
7519 emit_set_insn (reg2,
7520 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7522 /* Step 3: the loop
7526 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7527 probe at TEST_ADDR
7529 while (TEST_ADDR != LAST_ADDR)
7531 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7532 until it is equal to ROUNDED_SIZE. */
7534 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7537 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7538 that SIZE is equal to ROUNDED_SIZE. */
7540 if (size != rounded_size)
7542 HOST_WIDE_INT rem = size - rounded_size;
7544 if (rem > 256)
7546 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7548 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7549 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7551 else
7552 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7556 /* Make sure nothing is scheduled before we are done. */
7557 emit_insn (gen_blockage ());
7560 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7561 absolute addresses. */
7563 const char *
7564 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7566 static int labelno = 0;
7567 char loop_lab[32];
7568 rtx xops[2];
7570 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7572 /* Loop. */
7573 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7575 HOST_WIDE_INT stack_clash_probe_interval
7576 = 1 << param_stack_clash_protection_guard_size;
7578 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7579 xops[0] = reg1;
7580 HOST_WIDE_INT interval;
7581 if (flag_stack_clash_protection)
7582 interval = stack_clash_probe_interval;
7583 else
7584 interval = PROBE_INTERVAL;
7586 gcc_assert (aarch64_uimm12_shift (interval));
7587 xops[1] = GEN_INT (interval);
7589 output_asm_insn ("sub\t%0, %0, %1", xops);
7591 /* If doing stack clash protection then we probe up by the ABI-specified
7592 amount. We do this because we're dropping full pages at a time in the
7593 loop. But if we're doing non-stack-clash probing, probe at offset 0 instead. */
7594 if (flag_stack_clash_protection)
7595 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7596 else
7597 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7599 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7600 by this amount for each iteration. */
7601 output_asm_insn ("str\txzr, [%0, %1]", xops);
7603 /* Test if TEST_ADDR == LAST_ADDR. */
7604 xops[1] = reg2;
7605 output_asm_insn ("cmp\t%0, %1", xops);
7607 /* Branch. */
7608 fputs ("\tb.ne\t", asm_out_file);
7609 assemble_name_raw (asm_out_file, loop_lab);
7610 fputc ('\n', asm_out_file);
7612 return "";
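/* The emitted loop has roughly the following shape (register names,
   the label number and the probe offset are placeholders; the offset
   depends on whether stack-clash protection is enabled):

	.LPSRL0:
		sub	x9, x9, #interval
		str	xzr, [x9, #offset]
		cmp	x9, x10
		b.ne	.LPSRL0
*/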
7615 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7616 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7617 of GUARD_SIZE. When a probe is emitted it is done at most
7618 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7619 at most MIN_PROBE_THRESHOLD. By the end of this function
7620 BASE = BASE - ADJUSTMENT. */
7622 const char *
7623 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7624 rtx min_probe_threshold, rtx guard_size)
7626 /* This function is not allowed to use any instruction generation function
7627 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7628 so instead emit the code you want using output_asm_insn. */
7629 gcc_assert (flag_stack_clash_protection);
7630 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7631 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7633 /* The minimum required allocation before the residual requires probing. */
7634 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7636 /* Clamp the value down to the nearest value that can be used with a cmp. */
7637 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7638 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7640 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7641 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7643 static int labelno = 0;
7644 char loop_start_lab[32];
7645 char loop_end_lab[32];
7646 rtx xops[2];
7648 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7649 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7651 /* Emit loop start label. */
7652 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7654 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7655 xops[0] = adjustment;
7656 xops[1] = probe_offset_value_rtx;
7657 output_asm_insn ("cmp\t%0, %1", xops);
7659 /* Branch to end if not enough adjustment to probe. */
7660 fputs ("\tb.lt\t", asm_out_file);
7661 assemble_name_raw (asm_out_file, loop_end_lab);
7662 fputc ('\n', asm_out_file);
7664 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7665 xops[0] = base;
7666 xops[1] = probe_offset_value_rtx;
7667 output_asm_insn ("sub\t%0, %0, %1", xops);
7669 /* Probe at BASE. */
7670 xops[1] = const0_rtx;
7671 output_asm_insn ("str\txzr, [%0, %1]", xops);
7673 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7674 xops[0] = adjustment;
7675 xops[1] = probe_offset_value_rtx;
7676 output_asm_insn ("sub\t%0, %0, %1", xops);
7678 /* Branch to start if still more bytes to allocate. */
7679 fputs ("\tb\t", asm_out_file);
7680 assemble_name_raw (asm_out_file, loop_start_lab);
7681 fputc ('\n', asm_out_file);
7683 /* Exit label: the remaining adjustment is too small to need a probe. */
7684 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7686 /* BASE = BASE - ADJUSTMENT. */
7687 xops[0] = base;
7688 xops[1] = adjustment;
7689 output_asm_insn ("sub\t%0, %0, %1", xops);
7690 return "";
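/* The emitted sequence has roughly the following shape (operand names
   and label numbers are symbolic placeholders):

	.SVLPSPL0:
		cmp	adjustment, guard
		b.lt	.SVLPEND0
		sub	base, base, guard
		str	xzr, [base, #0]
		sub	adjustment, adjustment, guard
		b	.SVLPSPL0
	.SVLPEND0:
		sub	base, base, adjustment
*/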
7693 /* Determine whether a frame chain needs to be generated. */
7694 static bool
7695 aarch64_needs_frame_chain (void)
7697 if (frame_pointer_needed)
7698 return true;
7700 /* A leaf function cannot have calls or write LR. */
7701 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7703 /* Don't use a frame chain in leaf functions if leaf frame pointers
7704 are disabled. */
7705 if (flag_omit_leaf_frame_pointer && is_leaf)
7706 return false;
7708 return aarch64_use_frame_pointer;
7711 /* Return true if the current function should save registers above
7712 the locals area, rather than below it. */
7714 static bool
7715 aarch64_save_regs_above_locals_p ()
7717 /* When using stack smash protection, make sure that the canary slot
7718 comes between the locals and the saved registers. Otherwise,
7719 it would be possible for a carefully sized smash attack to change
7720 the saved registers (particularly LR and FP) without reaching the
7721 canary. */
7722 return crtl->stack_protect_guard;
7725 /* Return true if the current function needs to record the incoming
7726 value of PSTATE.SM. */
7727 static bool
7728 aarch64_need_old_pstate_sm ()
7730 /* Exit early if the incoming value of PSTATE.SM is known at
7731 compile time. */
7732 if (aarch64_cfun_incoming_pstate_sm () != 0)
7733 return false;
7735 if (aarch64_cfun_enables_pstate_sm ())
7736 return true;
7738 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
7739 but the function needs to return with PSTATE.SM unchanged. */
7740 if (nonlocal_goto_handler_labels)
7741 return true;
7743 /* Likewise for exception handlers. */
7744 eh_landing_pad lp;
7745 for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
7746 if (lp && lp->post_landing_pad)
7747 return true;
7749 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
7750 streaming-compatible functions without SME being available, so PSTATE.SM
7751 should only be changed if it is currently set to one. */
7752 if (crtl->has_nonlocal_goto)
7753 return true;
7755 if (cfun->machine->call_switches_pstate_sm)
7756 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
7757 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
7758 if (!SIBLING_CALL_P (call))
7760 /* Return true if there is a call to a non-streaming-compatible
7761 function. */
7762 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
7763 if (aarch64_call_switches_pstate_sm (callee_isa_mode))
7764 return true;
7766 return false;
7769 /* Mark the registers that need to be saved by the callee and calculate
7770 the size of the callee-saved registers area and frame record (both FP
7771 and LR may be omitted). */
7772 static void
7773 aarch64_layout_frame (void)
7775 unsigned regno, last_fp_reg = INVALID_REGNUM;
7776 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7777 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7778 bool frame_related_fp_reg_p = false;
7779 aarch64_frame &frame = cfun->machine->frame;
7780 poly_int64 top_of_locals = -1;
7781 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
7783 vec_safe_truncate (frame.saved_gprs, 0);
7784 vec_safe_truncate (frame.saved_fprs, 0);
7785 vec_safe_truncate (frame.saved_prs, 0);
7787 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7789 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7790 the mid-end is doing. */
7791 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7793 #define SLOT_NOT_REQUIRED (-2)
7794 #define SLOT_REQUIRED (-1)
7796 frame.wb_push_candidate1 = INVALID_REGNUM;
7797 frame.wb_push_candidate2 = INVALID_REGNUM;
7798 frame.spare_pred_reg = INVALID_REGNUM;
7800 /* First mark all the registers that really need to be saved... */
7801 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7802 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
7803 frame.old_svcr_offset = SLOT_NOT_REQUIRED;
7805 /* ... that includes the eh data registers (if needed)... */
7806 if (crtl->calls_eh_return)
7807 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
7808 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
7810 /* ... and any callee saved register that dataflow says is live. */
7811 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7812 if (df_regs_ever_live_p (regno)
7813 && !fixed_regs[regno]
7814 && (regno == R30_REGNUM
7815 || !crtl->abi->clobbers_full_reg_p (regno)))
7816 frame.reg_offset[regno] = SLOT_REQUIRED;
7818 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7819 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7820 && !fixed_regs[regno]
7821 && !crtl->abi->clobbers_full_reg_p (regno))
7823 frame.reg_offset[regno] = SLOT_REQUIRED;
7824 last_fp_reg = regno;
7825 if (aarch64_emit_cfi_for_reg_p (regno))
7826 frame_related_fp_reg_p = true;
7829 /* Big-endian SVE frames need a spare predicate register in order
7830 to save Z8-Z15. Decide which register they should use. Prefer
7831 an unused argument register if possible, so that we don't force P4
7832 to be saved unnecessarily. */
7833 if (frame_related_fp_reg_p
7834 && crtl->abi->id () == ARM_PCS_SVE
7835 && BYTES_BIG_ENDIAN)
7837 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7838 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7839 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7840 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7841 break;
7842 gcc_assert (regno <= P7_REGNUM);
7843 frame.spare_pred_reg = regno;
7844 df_set_regs_ever_live (regno, true);
7847 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7848 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7849 && !fixed_regs[regno]
7850 && !crtl->abi->clobbers_full_reg_p (regno))
7851 frame.reg_offset[regno] = SLOT_REQUIRED;
7853 bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
7855 poly_int64 offset = crtl->outgoing_args_size;
7856 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
7857 if (regs_at_top_p)
7859 offset += get_frame_size ();
7860 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7861 top_of_locals = offset;
7863 frame.bytes_below_saved_regs = offset;
7864 frame.sve_save_and_probe = INVALID_REGNUM;
7866 /* Now assign stack slots for the registers. Start with the predicate
7867 registers, since predicate LDR and STR have a relatively small
7868 offset range. These saves happen below the hard frame pointer. */
7869 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7870 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7872 vec_safe_push (frame.saved_prs, regno);
7873 if (frame.sve_save_and_probe == INVALID_REGNUM)
7874 frame.sve_save_and_probe = regno;
7875 frame.reg_offset[regno] = offset;
7876 offset += BYTES_PER_SVE_PRED;
7879 poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
7880 if (maybe_ne (saved_prs_size, 0))
7882 /* If we have any vector registers to save above the predicate registers,
7883 the offset of the vector register save slots needs to be a multiple
7884 of the vector size. This lets us use the immediate forms of LDR/STR
7885 (or LD1/ST1 for big-endian).
7887 A vector register is 8 times the size of a predicate register,
7888 and we need to save a maximum of 12 predicate registers, so the
7889 first vector register will be at either #1, MUL VL or #2, MUL VL.
7891 If we don't have any vector registers to save, and we know how
7892 big the predicate save area is, we can just round it up to the
7893 next 16-byte boundary. */
7894 if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
7895 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7896 else
7898 if (known_le (saved_prs_size, vector_save_size))
7899 offset = frame.bytes_below_saved_regs + vector_save_size;
7900 else if (known_le (saved_prs_size, vector_save_size * 2))
7901 offset = frame.bytes_below_saved_regs + vector_save_size * 2;
7902 else
7903 gcc_unreachable ();
7907 /* If we need to save any SVE vector registers, add them next. */
7908 if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7909 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7910 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7912 vec_safe_push (frame.saved_fprs, regno);
7913 if (frame.sve_save_and_probe == INVALID_REGNUM)
7914 frame.sve_save_and_probe = regno;
7915 frame.reg_offset[regno] = offset;
7916 offset += vector_save_size;
7919 /* OFFSET is now the offset of the hard frame pointer from the bottom
7920 of the callee save area. */
7921 auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
7922 bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
7923 gcc_assert (!saves_below_hard_fp_p
7924 || (frame.sve_save_and_probe != INVALID_REGNUM
7925 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
7926 frame.bytes_below_saved_regs)));
7928 frame.bytes_below_hard_fp = offset;
7929 frame.hard_fp_save_and_probe = INVALID_REGNUM;
7931 auto allocate_gpr_slot = [&](unsigned int regno)
7933 vec_safe_push (frame.saved_gprs, regno);
7934 frame.reg_offset[regno] = offset;
7935 offset += UNITS_PER_WORD;
7938 if (frame.emit_frame_chain)
7940 /* FP and LR are placed in the linkage record. */
7941 allocate_gpr_slot (R29_REGNUM);
7942 allocate_gpr_slot (R30_REGNUM);
7944 else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
7945 && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
7946 /* Put the LR save slot first, since it makes a good choice of probe
7947 for stack clash purposes. The idea is that the link register usually
7948 has to be saved before a call anyway, and so we lose little by
7949 stopping it from being individually shrink-wrapped. */
7950 allocate_gpr_slot (R30_REGNUM);
7952 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7953 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7954 allocate_gpr_slot (regno);
7956 if (aarch64_need_old_pstate_sm ())
7958 frame.old_svcr_offset = offset;
7959 offset += UNITS_PER_WORD;
7962 /* If the current function changes the SVE vector length, ensure that the
7963 old value of the DWARF VG register is saved and available in the CFI,
7964 so that outer frames with VL-sized offsets can be processed correctly. */
7965 if (cfun->machine->call_switches_pstate_sm
7966 || aarch64_cfun_enables_pstate_sm ())
7968 frame.reg_offset[VG_REGNUM] = offset;
7969 offset += UNITS_PER_WORD;
7972 poly_int64 max_int_offset = offset;
7973 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7974 bool has_align_gap = maybe_ne (offset, max_int_offset);
7976 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7977 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7979 vec_safe_push (frame.saved_fprs, regno);
7980 /* If there is an alignment gap between integer and fp callee-saves,
7981 allocate the last fp register to it if possible. */
7982 if (regno == last_fp_reg
7983 && has_align_gap
7984 && known_eq (vector_save_size, 8)
7985 && multiple_p (offset, 16))
7987 frame.reg_offset[regno] = max_int_offset;
7988 break;
7991 frame.reg_offset[regno] = offset;
7992 offset += vector_save_size;
7995 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7996 auto saved_regs_size = offset - frame.bytes_below_saved_regs;
7998 array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
7999 ? frame.saved_gprs
8000 : frame.saved_fprs);
8001 if (!push_regs.empty ()
8002 && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
8004 frame.hard_fp_save_and_probe = push_regs[0];
8005 frame.wb_push_candidate1 = push_regs[0];
8006 if (push_regs.size () > 1)
8007 frame.wb_push_candidate2 = push_regs[1];
8010 /* With stack-clash, a register must be saved in non-leaf functions.
8011 The saving of the bottommost register counts as an implicit probe,
8012 which allows us to maintain the invariant described in the comment
8013 at expand_prologue. */
8014 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
8016 if (!regs_at_top_p)
8018 offset += get_frame_size ();
8019 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8020 top_of_locals = offset;
8022 offset += frame.saved_varargs_size;
8023 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8024 frame.frame_size = offset;
8026 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
8027 gcc_assert (known_ge (top_of_locals, 0));
8028 frame.bytes_above_locals = frame.frame_size - top_of_locals;
8030 frame.initial_adjust = 0;
8031 frame.final_adjust = 0;
8032 frame.callee_adjust = 0;
8033 frame.sve_callee_adjust = 0;
8035 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8036 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8038 /* Shadow call stack is only used for functions that push the LR onto
8039 the stack and that do not specify the "no_sanitize" attribute with
8040 the argument "shadow-call-stack". */
8041 frame.is_scs_enabled
8042 = (!crtl->calls_eh_return
8043 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8044 && known_ge (frame.reg_offset[LR_REGNUM], 0));
8046 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8047 restore x30, and we don't need to pop x30 again in the traditional
8048 way. Pop candidates record the registers that need to be popped
8049 eventually. */
8050 if (frame.is_scs_enabled)
8052 if (frame.wb_pop_candidate2 == R30_REGNUM)
8053 frame.wb_pop_candidate2 = INVALID_REGNUM;
8054 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8055 frame.wb_pop_candidate1 = INVALID_REGNUM;
8058 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8059 256 to ensure that the offset meets the requirements of emit_move_insn.
8060 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8061 max_push_offset to 0, because no registers are popped at this time,
8062 so callee_adjust cannot be adjusted. */
8063 HOST_WIDE_INT max_push_offset = 0;
8064 if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8066 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8067 max_push_offset = 512;
8068 else
8069 max_push_offset = 256;
8072 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
8073 HOST_WIDE_INT const_saved_regs_size;
8074 if (known_eq (saved_regs_size, 0))
8075 frame.initial_adjust = frame.frame_size;
8076 else if (frame.frame_size.is_constant (&const_size)
8077 && const_size < max_push_offset
8078 && known_eq (frame.bytes_above_hard_fp, const_size))
8080 /* Simple, small frame with no data below the saved registers.
8082 stp reg1, reg2, [sp, -frame_size]!
8083 stp reg3, reg4, [sp, 16] */
8084 frame.callee_adjust = const_size;
8086 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
8087 && saved_regs_size.is_constant (&const_saved_regs_size)
8088 && const_below_saved_regs + const_saved_regs_size < 512
8089 /* We could handle this case even with data below the saved
8090 registers, provided that that data left us with valid offsets
8091 for all predicate and vector save slots. It's such a rare
8092 case that it hardly seems worth the effort though. */
8093 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8094 && !(cfun->calls_alloca
8095 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8096 && const_above_fp < max_push_offset))
8098 /* Frame with small area below the saved registers:
8100 sub sp, sp, frame_size
8101 stp reg1, reg2, [sp, bytes_below_saved_regs]
8102 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8103 frame.initial_adjust = frame.frame_size;
8105 else if (saves_below_hard_fp_p
8106 && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8108 /* Frame in which all saves are SVE saves:
8110 sub sp, sp, frame_size - bytes_below_saved_regs
8111 save SVE registers relative to SP
8112 sub sp, sp, bytes_below_saved_regs */
8113 frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8114 frame.final_adjust = frame.bytes_below_saved_regs;
8116 else if (frame.wb_push_candidate1 != INVALID_REGNUM
8117 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8118 && const_above_fp < max_push_offset)
8120 /* Frame with large area below the saved registers, or with SVE saves,
8121 but with a small area above:
8123 stp reg1, reg2, [sp, -hard_fp_offset]!
8124 stp reg3, reg4, [sp, 16]
8125 [sub sp, sp, below_hard_fp_saved_regs_size]
8126 [save SVE registers relative to SP]
8127 sub sp, sp, bytes_below_saved_regs */
8128 frame.callee_adjust = const_above_fp;
8129 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8130 frame.final_adjust = frame.bytes_below_saved_regs;
8132 else
8134 /* General case:
8136 sub sp, sp, hard_fp_offset
8137 stp x29, x30, [sp, 0]
8138 add x29, sp, 0
8139 stp reg3, reg4, [sp, 16]
8140 [sub sp, sp, below_hard_fp_saved_regs_size]
8141 [save SVE registers relative to SP]
8142 sub sp, sp, bytes_below_saved_regs */
8143 frame.initial_adjust = frame.bytes_above_hard_fp;
8144 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8145 frame.final_adjust = frame.bytes_below_saved_regs;
8148 /* The frame is allocated in pieces, with each non-final piece
8149 including a register save at offset 0 that acts as a probe for
8150 the following piece. In addition, the save of the bottommost register
8151 acts as a probe for callees and allocas. Roll back any probes that
8152 aren't needed.
8154 A probe isn't needed if it is associated with the final allocation
8155 (including callees and allocas) that happens before the epilogue is
8156 executed. */
8157 if (crtl->is_leaf
8158 && !cfun->calls_alloca
8159 && known_eq (frame.final_adjust, 0))
8161 if (maybe_ne (frame.sve_callee_adjust, 0))
8162 frame.sve_save_and_probe = INVALID_REGNUM;
8163 else
8164 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8167 /* Make sure the individual adjustments add up to the full frame size. */
8168 gcc_assert (known_eq (frame.initial_adjust
8169 + frame.callee_adjust
8170 + frame.sve_callee_adjust
8171 + frame.final_adjust, frame.frame_size));
8173 if (frame.callee_adjust == 0)
8175 /* We've decided not to do a "real" push and pop. However,
8176 setting up the frame chain is treated as being essentially
8177 a multi-instruction push. */
8178 frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
8179 if (!frame.emit_frame_chain)
8180 frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
8183 frame.laid_out = true;
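/* A rough sketch of the resulting layout, from low to high addresses
   (bracketed areas may be absent, and the locals appear either below
   or above the saved registers depending on
   aarch64_save_regs_above_locals_p):

	outgoing arguments
	[locals, if saved registers go above them]
	[SVE predicate saves]
	[SVE vector saves]
	[FP, LR and other GPR saves]   <- hard frame pointer
	[old SVCR and VG slots]
	[FP/SIMD saves]
	[locals, in the usual case]
	[varargs save area]            <- incoming stack pointer
*/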
8186 /* Return true if the register REGNO is saved on entry to
8187 the current function. */
8189 static bool
8190 aarch64_register_saved_on_entry (int regno)
8192 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8195 /* Push the register number REGNO of mode MODE to the stack with write-back
8196 adjusting the stack by ADJUSTMENT. */
8198 static void
8199 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8200 HOST_WIDE_INT adjustment)
8202 rtx base_rtx = stack_pointer_rtx;
8203 rtx insn, reg, mem;
8205 reg = gen_rtx_REG (mode, regno);
8206 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8207 plus_constant (Pmode, base_rtx, -adjustment));
8208 mem = gen_frame_mem (mode, mem);
8210 insn = emit_move_insn (mem, reg);
8211 RTX_FRAME_RELATED_P (insn) = 1;
8214 /* Generate and return an instruction to store the pair of registers
8215 REG and REG2 of mode MODE to location BASE with write-back adjusting
8216 the stack location BASE by ADJUSTMENT. */
8218 static rtx
8219 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8220 HOST_WIDE_INT adjustment)
8222 rtx new_base = plus_constant (Pmode, base, -adjustment);
8223 rtx mem = gen_frame_mem (mode, new_base);
8224 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8226 return gen_rtx_PARALLEL (VOIDmode,
8227 gen_rtvec (3,
8228 gen_rtx_SET (base, new_base),
8229 gen_rtx_SET (mem, reg),
8230 gen_rtx_SET (mem2, reg2)));
8233 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8234 stack pointer by ADJUSTMENT. */
8236 static void
8237 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8239 rtx_insn *insn;
8240 machine_mode mode = aarch64_reg_save_mode (regno1);
8242 if (regno2 == INVALID_REGNUM)
8243 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8245 rtx reg1 = gen_rtx_REG (mode, regno1);
8246 rtx reg2 = gen_rtx_REG (mode, regno2);
8248 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8249 reg2, adjustment));
8250 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8251 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8252 RTX_FRAME_RELATED_P (insn) = 1;
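/* The parallel built above corresponds to a single write-back store
   pair, along the lines of
	stp	reg1, reg2, [sp, #-adjustment]!
   with both stores marked frame-related so that the CFI records the
   register saves as well as the stack adjustment.  */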
8255 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8256 adjusting it by ADJUSTMENT afterwards. */
8258 static rtx
8259 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8260 HOST_WIDE_INT adjustment)
8262 rtx mem = gen_frame_mem (mode, base);
8263 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8264 rtx new_base = plus_constant (Pmode, base, adjustment);
8266 return gen_rtx_PARALLEL (VOIDmode,
8267 gen_rtvec (3,
8268 gen_rtx_SET (base, new_base),
8269 gen_rtx_SET (reg, mem),
8270 gen_rtx_SET (reg2, mem2)));
8273 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8274 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8275 into CFI_OPS. */
8277 static void
8278 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8279 rtx *cfi_ops)
8281 machine_mode mode = aarch64_reg_save_mode (regno1);
8282 rtx reg1 = gen_rtx_REG (mode, regno1);
8284 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8286 if (regno2 == INVALID_REGNUM)
8288 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8289 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8290 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8292 else
8294 rtx reg2 = gen_rtx_REG (mode, regno2);
8295 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8296 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8297 reg2, adjustment));
8301 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8302 for a mem rtx representing the entire pair. */
8304 static machine_mode
8305 aarch64_pair_mode_for_mode (machine_mode mode)
8307 if (known_eq (GET_MODE_SIZE (mode), 4))
8308 return V2x4QImode;
8309 else if (known_eq (GET_MODE_SIZE (mode), 8))
8310 return V2x8QImode;
8311 else if (known_eq (GET_MODE_SIZE (mode), 16))
8312 return V2x16QImode;
8313 else
8314 gcc_unreachable ();
8317 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8318 operand, return an rtx like MEM which instead represents the entire pair. */
8320 static rtx
8321 aarch64_pair_mem_from_base (rtx mem)
8323 auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
8324 mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
8325 gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
8326 return mem;
8329 /* Generate and return a store pair instruction to store REG1 and REG2
8330 into memory starting at BASE_MEM. All three rtxes should have modes of the
8331 same size. */
8334 aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
8336 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8338 return gen_rtx_SET (pair_mem,
8339 gen_rtx_UNSPEC (GET_MODE (pair_mem),
8340 gen_rtvec (2, reg1, reg2),
8341 UNSPEC_STP));
8344 /* Generate and return a load pair instruction to load a pair of
8345 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8346 UNKNOWN, all three rtxes should have modes of the same size.
8347 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8348 and REG{1,2} should be in DImode. */
8351 aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
8353 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8355 const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
8356 if (any_extend_p)
8357 gcc_checking_assert (GET_MODE (base_mem) == SImode
8358 && GET_MODE (reg1) == DImode
8359 && GET_MODE (reg2) == DImode);
8360 else
8361 gcc_assert (code == UNKNOWN);
8363 rtx unspecs[2] = {
8364 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
8365 gen_rtvec (1, pair_mem),
8366 UNSPEC_LDP_FST),
8367 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
8368 gen_rtvec (1, copy_rtx (pair_mem)),
8369 UNSPEC_LDP_SND)
8372 if (any_extend_p)
8373 for (int i = 0; i < 2; i++)
8374 unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
8376 return gen_rtx_PARALLEL (VOIDmode,
8377 gen_rtvec (2,
8378 gen_rtx_SET (reg1, unspecs[0]),
8379 gen_rtx_SET (reg2, unspecs[1])));
8382 /* Return TRUE if return address signing should be enabled for the current
8383 function, otherwise return FALSE. */
8385 bool
8386 aarch64_return_address_signing_enabled (void)
8388 /* This function should only be called after the frame is laid out. */
8389 gcc_assert (cfun->machine->frame.laid_out);
8391 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8392 if its LR is pushed onto stack. */
8393 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8394 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8395 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8398 /* Only used by the arm backend. */
8399 void aarch_bti_arch_check (void)
8402 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8403 bool
8404 aarch_bti_enabled (void)
8406 return (aarch_enable_bti == 1);
8409 /* Check if INSN is a BTI J insn. */
8410 bool
8411 aarch_bti_j_insn_p (rtx_insn *insn)
8413 if (!insn || !INSN_P (insn))
8414 return false;
8416 rtx pat = PATTERN (insn);
8417 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8420 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8421 bool
8422 aarch_pac_insn_p (rtx x)
8424 if (!INSN_P (x))
8425 return false;
8427 subrtx_var_iterator::array_type array;
8428 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8430 rtx sub = *iter;
8431 if (sub && GET_CODE (sub) == UNSPEC)
8433 int unspec_val = XINT (sub, 1);
8434 switch (unspec_val)
8436 case UNSPEC_PACIASP:
8437 case UNSPEC_PACIBSP:
8438 return true;
8440 default:
8441 return false;
8443 iter.skip_subrtxes ();
8446 return false;
8449 rtx aarch_gen_bti_c (void)
8451 return gen_bti_c ();
8454 rtx aarch_gen_bti_j (void)
8456 return gen_bti_j ();
8459 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8460 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8461 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8463 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8464 or LD1D address
8466 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
8467 if the variable isn't already nonnull
8469 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8470 Handle this case using a temporary base register that is suitable for
8471 all offsets in that range. Use ANCHOR_REG as this base register if it
8472 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8474 static inline void
8475 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8476 rtx &anchor_reg, poly_int64 &offset,
8477 rtx &ptrue)
8479 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8481 /* This is the maximum valid offset of the anchor from the base.
8482 Lower values would be valid too. */
8483 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8484 if (!anchor_reg)
8486 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8487 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8488 gen_int_mode (anchor_offset, Pmode)));
8490 base_rtx = anchor_reg;
8491 offset -= anchor_offset;
8493 if (!ptrue)
8495 int pred_reg = cfun->machine->frame.spare_pred_reg;
8496 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8497 CONSTM1_RTX (VNx16BImode));
8498 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
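/* Sketch of the arithmetic above: the adjustment is only made when
   OFFSET is in [8, 16] * GET_MODE_SIZE (MODE), so after subtracting the
   anchor offset of 16 * GET_MODE_SIZE (MODE) the residual offset lies
   in [-8, 0] vector lengths, which fits the signed scaled immediate
   range of ST1D/LD1D addressing.  */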
8502 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8503 is saved at BASE + OFFSET. */
8505 static void
8506 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8507 rtx base, poly_int64 offset)
8509 rtx mem = gen_frame_mem (GET_MODE (reg),
8510 plus_constant (Pmode, base, offset));
8511 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8514 /* Emit code to save the callee-saved registers in REGS. Skip any
8515 write-back candidates if SKIP_WB is true, otherwise consider only
8516 write-back candidates.
8518 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8519 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8520 has been set up. */
8522 static void
8523 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8524 array_slice<unsigned int> regs, bool skip_wb,
8525 bool hard_fp_valid_p)
8527 aarch64_frame &frame = cfun->machine->frame;
8528 rtx_insn *insn;
8529 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8531 auto skip_save_p = [&](unsigned int regno)
8533 if (cfun->machine->reg_is_wrapped_separately[regno])
8534 return true;
8536 if (skip_wb == (regno == frame.wb_push_candidate1
8537 || regno == frame.wb_push_candidate2))
8538 return true;
8540 return false;
8543 for (unsigned int i = 0; i < regs.size (); ++i)
8545 unsigned int regno = regs[i];
8546 poly_int64 offset;
8547 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8549 if (skip_save_p (regno))
8550 continue;
8552 machine_mode mode = aarch64_reg_save_mode (regno);
8553 rtx reg = gen_rtx_REG (mode, regno);
8554 rtx move_src = reg;
8555 offset = frame.reg_offset[regno] - bytes_below_sp;
8556 if (regno == VG_REGNUM)
8558 move_src = gen_rtx_REG (DImode, IP0_REGNUM);
8559 emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
8561 rtx base_rtx = stack_pointer_rtx;
8562 poly_int64 sp_offset = offset;
8564 HOST_WIDE_INT const_offset;
8565 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8566 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8567 offset, ptrue);
8568 else if (GP_REGNUM_P (REGNO (reg))
8569 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8571 poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8572 if (hard_fp_valid_p)
8573 base_rtx = hard_frame_pointer_rtx;
8574 else
8576 if (!anchor_reg)
8578 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8579 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8580 gen_int_mode (fp_offset, Pmode)));
8582 base_rtx = anchor_reg;
8584 offset -= fp_offset;
8586 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8587 rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
8588 stack_pointer_rtx,
8589 sp_offset));
8590 rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
8591 bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);
8593 unsigned int regno2;
8594 if (!aarch64_sve_mode_p (mode)
8595 && reg == move_src
8596 && i + 1 < regs.size ()
8597 && (regno2 = regs[i + 1], !skip_save_p (regno2))
8598 && known_eq (GET_MODE_SIZE (mode),
8599 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8601 rtx reg2 = gen_rtx_REG (mode, regno2);
8603 offset += GET_MODE_SIZE (mode);
8604 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8606 rtx cfi_mem2
8607 = gen_frame_mem (mode,
8608 plus_constant (Pmode,
8609 stack_pointer_rtx,
8610 sp_offset + GET_MODE_SIZE (mode)));
8611 rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);
8613 /* The first part of a frame-related parallel insn is always
8614 assumed to be relevant to the frame calculations;
8615 subsequent parts are only frame-related if
8616 explicitly marked. */
8617 if (aarch64_emit_cfi_for_reg_p (regno2))
8618 RTX_FRAME_RELATED_P (cfi_set2) = 1;
8620 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8621 representation of stp cannot be understood directly by
8622 dwarf2cfi. */
8623 rtx par = gen_rtx_PARALLEL (VOIDmode,
8624 gen_rtvec (2, cfi_set, cfi_set2));
8625 add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);
8627 regno = regno2;
8628 ++i;
8630 else
8632 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8634 insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
8635 ptrue, move_src));
8636 need_cfi_note_p = true;
8638 else if (aarch64_sve_mode_p (mode))
8639 insn = emit_insn (gen_rtx_SET (mem, move_src));
8640 else
8641 insn = emit_move_insn (mem, move_src);
8643 if (frame_related_p && (need_cfi_note_p || move_src != reg))
8644 add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
8647 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8649 /* Emit a fake instruction to indicate that the VG save slot has
8650 been initialized. */
8651 if (regno == VG_REGNUM)
8652 emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
8656 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8657 and any other registers that are handled separately. Write the appropriate
8658 REG_CFA_RESTORE notes into CFI_OPS.
8660 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8661 of the static frame. */
8663 static void
8664 aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
8665 array_slice<unsigned int> regs, rtx *cfi_ops)
8667 aarch64_frame &frame = cfun->machine->frame;
8668 poly_int64 offset;
8669 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8671 auto skip_restore_p = [&](unsigned int regno)
8673 if (cfun->machine->reg_is_wrapped_separately[regno])
8674 return true;
8676 if (regno == frame.wb_pop_candidate1
8677 || regno == frame.wb_pop_candidate2)
8678 return true;
8680 /* The shadow call stack code restores LR separately. */
8681 if (frame.is_scs_enabled && regno == LR_REGNUM)
8682 return true;
8684 return false;
8687 for (unsigned int i = 0; i < regs.size (); ++i)
8689 unsigned int regno = regs[i];
8690 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8691 if (skip_restore_p (regno))
8692 continue;
8694 machine_mode mode = aarch64_reg_save_mode (regno);
8695 rtx reg = gen_rtx_REG (mode, regno);
8696 offset = frame.reg_offset[regno] - bytes_below_sp;
8697 rtx base_rtx = stack_pointer_rtx;
8698 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8699 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8700 offset, ptrue);
8701 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8703 unsigned int regno2;
8704 if (!aarch64_sve_mode_p (mode)
8705 && i + 1 < regs.size ()
8706 && (regno2 = regs[i + 1], !skip_restore_p (regno2))
8707 && known_eq (GET_MODE_SIZE (mode),
8708 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8710 rtx reg2 = gen_rtx_REG (mode, regno2);
8712 offset += GET_MODE_SIZE (mode);
8713 emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
8715 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8716 regno = regno2;
8717 ++i;
8719 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8720 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8721 else if (aarch64_sve_mode_p (mode))
8722 emit_insn (gen_rtx_SET (reg, mem));
8723 else
8724 emit_move_insn (reg, mem);
8725 if (frame_related_p)
8726 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8730 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8731 of MODE. */
8733 static inline bool
8734 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8736 HOST_WIDE_INT multiple;
8737 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8738 && IN_RANGE (multiple, -8, 7));
8741 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8742 of MODE. */
8744 static inline bool
8745 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8747 HOST_WIDE_INT multiple;
8748 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8749 && IN_RANGE (multiple, -32, 31));
8752 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8753 of MODE. */
8755 static inline bool
8756 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8758 HOST_WIDE_INT multiple;
8759 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8760 && IN_RANGE (multiple, 0, 63));
8763 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8764 of MODE. */
8766 bool
8767 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8769 HOST_WIDE_INT multiple;
8770 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8771 && IN_RANGE (multiple, -64, 63));
8774 /* Return true if OFFSET is a signed 9-bit value. */
8776 bool
8777 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8778 poly_int64 offset)
8780 HOST_WIDE_INT const_offset;
8781 return (offset.is_constant (&const_offset)
8782 && IN_RANGE (const_offset, -256, 255));
8785 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8786 of MODE. */
8788 static inline bool
8789 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8791 HOST_WIDE_INT multiple;
8792 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8793 && IN_RANGE (multiple, -256, 255));
8796 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8797 of MODE. */
8799 static inline bool
8800 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8802 HOST_WIDE_INT multiple;
8803 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8804 && IN_RANGE (multiple, 0, 4095));
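/* For example, with MODE == DImode (8 bytes) the predicate above
   accepts byte offsets 0, 8, ..., 32760, matching the unsigned scaled
   12-bit immediate form of LDR/STR.  */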
8807 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8809 static sbitmap
8810 aarch64_get_separate_components (void)
8812 aarch64_frame &frame = cfun->machine->frame;
8813 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8814 bitmap_clear (components);
8816 /* The registers we need saved to the frame. */
8817 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
8818 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8819 if (aarch64_register_saved_on_entry (regno))
8821 /* Disallow shrink wrapping for registers that will be clobbered
8822 by an SMSTART SM in the prologue. */
8823 if (enables_pstate_sm
8824 && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
8825 continue;
8827 /* Punt on saves and restores that use ST1D and LD1D. We could
8828 try to be smarter, but it would involve making sure that the
8829 spare predicate register itself is safe to use at the save
8830 and restore points. Also, when a frame pointer is being used,
8831 the slots are often out of reach of ST1D and LD1D anyway. */
8832 machine_mode mode = aarch64_reg_save_mode (regno);
8833 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8834 continue;
8836 poly_int64 offset = frame.reg_offset[regno];
8838 /* Get the offset relative to the register we'll use. */
8839 if (frame_pointer_needed)
8840 offset -= frame.bytes_below_hard_fp;
8842 /* Check that we can access the stack slot of the register with one
8843 direct load with no adjustments needed. */
8844 if (aarch64_sve_mode_p (mode)
8845 ? offset_9bit_signed_scaled_p (mode, offset)
8846 : offset_12bit_unsigned_scaled_p (mode, offset))
8847 bitmap_set_bit (components, regno);
8850 /* Don't mess with the hard frame pointer. */
8851 if (frame_pointer_needed)
8852 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8854 /* If the spare predicate register used by big-endian SVE code
8855 is call-preserved, it must be saved in the main prologue
8856 before any saves that use it. */
8857 if (frame.spare_pred_reg != INVALID_REGNUM)
8858 bitmap_clear_bit (components, frame.spare_pred_reg);
8860 unsigned reg1 = frame.wb_push_candidate1;
8861 unsigned reg2 = frame.wb_push_candidate2;
8862 /* If registers have been chosen to be stored/restored with
8863 writeback, don't interfere with them, to avoid having to output explicit
8864 stack adjustment instructions. */
8865 if (reg2 != INVALID_REGNUM)
8866 bitmap_clear_bit (components, reg2);
8867 if (reg1 != INVALID_REGNUM)
8868 bitmap_clear_bit (components, reg1);
8870 bitmap_clear_bit (components, LR_REGNUM);
8871 bitmap_clear_bit (components, SP_REGNUM);
8872 if (flag_stack_clash_protection)
8874 if (frame.sve_save_and_probe != INVALID_REGNUM)
8875 bitmap_clear_bit (components, frame.sve_save_and_probe);
8876 if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
8877 bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
8880 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8881 to find one. */
8882 bitmap_clear_bit (components, VG_REGNUM);
8884 return components;
8887 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8889 static sbitmap
8890 aarch64_components_for_bb (basic_block bb)
8892 bitmap in = DF_LIVE_IN (bb);
8893 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8894 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8896 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8897 bitmap_clear (components);
8899 /* Clobbered registers don't generate values in any meaningful sense,
8900 since nothing after the clobber can rely on their value. And we can't
8901 say that partially-clobbered registers are unconditionally killed,
8902 because whether they're killed or not depends on the mode of the
8903 value they're holding. Thus partially call-clobbered registers
8904 appear in neither the kill set nor the gen set.
8906 Check manually for any calls that clobber more of a register than the
8907 current function can. */
8908 function_abi_aggregator callee_abis;
8909 rtx_insn *insn;
8910 FOR_BB_INSNS (bb, insn)
8911 if (CALL_P (insn))
8912 callee_abis.note_callee_abi (insn_callee_abi (insn));
8913 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8915 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8916 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8917 if (!fixed_regs[regno]
8918 && !crtl->abi->clobbers_full_reg_p (regno)
8919 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8920 || bitmap_bit_p (in, regno)
8921 || bitmap_bit_p (gen, regno)
8922 || bitmap_bit_p (kill, regno)))
8924 bitmap_set_bit (components, regno);
8926 /* If there is a callee-save at an adjacent offset, add it too
8927 to increase the use of LDP/STP. */
8928 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8929 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
8931 if (regno2 <= LAST_SAVED_REGNUM)
8933 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8934 if (regno < regno2
8935 ? known_eq (offset + 8, offset2)
8936 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
8937 bitmap_set_bit (components, regno2);
8941 return components;
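/* Illustrative only, not part of GCC: a standalone sketch of the
   adjacent-slot pairing test used in the loop above, with plain integers
   standing in for poly_int64.  The helper name is an assumption made for
   the example; the block is kept under "#if 0" so it never affects the
   build.  */
#if 0
/* Given REGNO saved at OFFSET, return the register whose save slot would
   complete a 16-byte-aligned LDP/STP pair, assuming that candidate is
   saved at OFFSET2, or -1 if the slots don't line up.  */
static int
example_ldp_partner (int regno, long long offset, long long offset2)
{
  int regno2 = (offset % 16 == 0) ? regno + 1 : regno - 1;
  bool ok = (regno < regno2
	     ? offset + 8 == offset2
	     : offset2 % 16 == 0 && offset2 + 8 == offset);
  return ok ? regno2 : -1;
}
#endif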
8944 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8945 Nothing to do for aarch64. */
8947 static void
8948 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8952 /* Return the next set bit in BMP from START onwards. Return the total number
8953 of bits in BMP if no set bit is found at or after START. */
8955 static unsigned int
8956 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8958 unsigned int nbits = SBITMAP_SIZE (bmp);
8959 if (start == nbits)
8960 return start;
8962 gcc_assert (start < nbits);
8963 for (unsigned int i = start; i < nbits; i++)
8964 if (bitmap_bit_p (bmp, i))
8965 return i;
8967 return nbits;
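/* Illustrative only, not part of GCC: the same next-set-bit scan written
   against a plain bool array, with NBITS playing the role of
   SBITMAP_SIZE.  Kept under "#if 0" so it never affects the build.  */
#if 0
static unsigned int
example_next_set_bit (const bool *bits, unsigned int nbits,
		      unsigned int start)
{
  for (unsigned int i = start; i < nbits; i++)
    if (bits[i])
      return i;
  return nbits;
}
#endif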
8970 /* Do the work for aarch64_emit_prologue_components and
8971 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8972 to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
8973 for these components or the epilogue sequence. That is, it determines
8974 whether we should emit stores or loads and what kind of CFA notes to attach
8975 to the insns. Otherwise the logic for the two sequences is very
8976 similar. */
8978 static void
8979 aarch64_process_components (sbitmap components, bool prologue_p)
8981 aarch64_frame &frame = cfun->machine->frame;
8982 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8983 ? HARD_FRAME_POINTER_REGNUM
8984 : STACK_POINTER_REGNUM);
8986 unsigned last_regno = SBITMAP_SIZE (components);
8987 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
8988 rtx_insn *insn = NULL;
8990 while (regno != last_regno)
8992 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8993 machine_mode mode = aarch64_reg_save_mode (regno);
8995 rtx reg = gen_rtx_REG (mode, regno);
8996 poly_int64 offset = frame.reg_offset[regno];
8997 if (frame_pointer_needed)
8998 offset -= frame.bytes_below_hard_fp;
9000 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9001 rtx mem = gen_frame_mem (mode, addr);
9003 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9004 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9005 /* No more registers to handle after REGNO.
9006 Emit a single save/restore and exit. */
9007 if (regno2 == last_regno)
9009 insn = emit_insn (set);
9010 if (frame_related_p)
9012 RTX_FRAME_RELATED_P (insn) = 1;
9013 if (prologue_p)
9014 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9015 else
9016 add_reg_note (insn, REG_CFA_RESTORE, reg);
9018 break;
9021 poly_int64 offset2 = frame.reg_offset[regno2];
9022 /* The next register is not of the same class or its offset is not
9023 mergeable with the current one into a pair. */
9024 if (aarch64_sve_mode_p (mode)
9025 || !satisfies_constraint_Ump (mem)
9026 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9027 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9028 || maybe_ne ((offset2 - frame.reg_offset[regno]),
9029 GET_MODE_SIZE (mode)))
9031 insn = emit_insn (set);
9032 if (frame_related_p)
9034 RTX_FRAME_RELATED_P (insn) = 1;
9035 if (prologue_p)
9036 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9037 else
9038 add_reg_note (insn, REG_CFA_RESTORE, reg);
9041 regno = regno2;
9042 continue;
9045 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9047 /* REGNO2 can be saved/restored in a pair with REGNO. */
9048 rtx reg2 = gen_rtx_REG (mode, regno2);
9049 if (frame_pointer_needed)
9050 offset2 -= frame.bytes_below_hard_fp;
9051 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9052 rtx mem2 = gen_frame_mem (mode, addr2);
9053 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9054 : gen_rtx_SET (reg2, mem2);
9056 if (prologue_p)
9057 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
9058 else
9059 insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
9061 if (frame_related_p || frame_related2_p)
9063 RTX_FRAME_RELATED_P (insn) = 1;
9064 if (prologue_p)
9066 if (frame_related_p)
9067 add_reg_note (insn, REG_CFA_OFFSET, set);
9068 if (frame_related2_p)
9069 add_reg_note (insn, REG_CFA_OFFSET, set2);
9071 else
9073 if (frame_related_p)
9074 add_reg_note (insn, REG_CFA_RESTORE, reg);
9075 if (frame_related2_p)
9076 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9080 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9084 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9086 static void
9087 aarch64_emit_prologue_components (sbitmap components)
9089 aarch64_process_components (components, true);
9092 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9094 static void
9095 aarch64_emit_epilogue_components (sbitmap components)
9097 aarch64_process_components (components, false);
9100 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9102 static void
9103 aarch64_set_handled_components (sbitmap components)
9105 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9106 if (bitmap_bit_p (components, regno))
9107 cfun->machine->reg_is_wrapped_separately[regno] = true;
9110 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
9111 determine the probe offset for alloca. */
9113 static HOST_WIDE_INT
9114 aarch64_stack_clash_protection_alloca_probe_range (void)
9116 return STACK_CLASH_CALLER_GUARD;
9119 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9120 subsequent memory accesses and that requires the stack pointer and REG
9121 to have their current values. REG can be stack_pointer_rtx if no
9122 other register's value needs to be fixed. */
9124 static void
9125 aarch64_emit_stack_tie (rtx reg)
9127 emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
9130 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9131 registers. If POLY_SIZE is not large enough to require a probe this function
9132 will only adjust the stack. When allocating the stack space
9133 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9134 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9135 the saved registers. If we are, then we ensure that any allocation
9136 larger than the ABI-defined buffer needs a probe, so that the
9137 invariant of having a 1KB buffer is maintained.
9139 We emit barriers after each stack adjustment to prevent optimizations from
9140 breaking the invariant that we never drop the stack more than a page. This
9141 invariant is needed to make it easier to handle asynchronous events
9142 correctly: if we were to allow the stack to be dropped by more than a page
9143 and only then emit the probes, and a signal arrived somewhere in between,
9144 the signal handler would not know the state of the stack and could make no
9145 assumptions about which pages have been probed.
9147 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
9148 is measured relative to the SME vector length instead of the current
9149 prevailing vector length. It is 0 otherwise. */
9151 static void
9152 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9153 poly_int64 poly_size,
9154 aarch64_feature_flags force_isa_mode,
9155 bool frame_related_p,
9156 bool final_adjustment_p)
9158 aarch64_frame &frame = cfun->machine->frame;
9159 HOST_WIDE_INT guard_size
9160 = 1 << param_stack_clash_protection_guard_size;
9161 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9162 HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9163 gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9164 HOST_WIDE_INT min_probe_threshold
9165 = (final_adjustment_p
9166 ? guard_used_by_caller + byte_sp_alignment
9167 : guard_size - guard_used_by_caller);
9168 poly_int64 frame_size = frame.frame_size;
9170 /* We should always have a positive probe threshold. */
9171 gcc_assert (min_probe_threshold > 0);
9173 if (flag_stack_clash_protection && !final_adjustment_p)
9175 poly_int64 initial_adjust = frame.initial_adjust;
9176 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9177 poly_int64 final_adjust = frame.final_adjust;
9179 if (known_eq (frame_size, 0))
9181 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9183 else if (known_lt (initial_adjust + sve_callee_adjust,
9184 guard_size - guard_used_by_caller)
9185 && known_lt (final_adjust, guard_used_by_caller))
9187 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9191 /* If SIZE is not large enough to require probing, just adjust the stack and
9192 exit. */
9193 if (known_lt (poly_size, min_probe_threshold)
9194 || !flag_stack_clash_protection)
9196 aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
9197 frame_related_p);
9198 return;
9201 HOST_WIDE_INT size;
9202 /* Handle the SVE non-constant case first. */
9203 if (!poly_size.is_constant (&size))
9205 if (dump_file)
9207 fprintf (dump_file, "Stack clash SVE prologue: ");
9208 print_dec (poly_size, dump_file);
9209 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9212 /* First calculate the number of bytes we're actually spilling. */
9213 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9214 poly_size, temp1, temp2, force_isa_mode,
9215 false, true);
9217 rtx_insn *insn = get_last_insn ();
9219 if (frame_related_p)
9221 /* This is done to provide unwinding information for the stack
9222 adjustments we're about to do. However, to prevent the optimizers
9223 from removing the R11 move and leaving the CFA note (which would be
9224 very wrong), we tie the old and new stack pointers together.
9225 The tie will expand to nothing, but the optimizers will not touch
9226 the instruction. */
9227 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9228 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9229 aarch64_emit_stack_tie (stack_ptr_copy);
9231 /* We want the CFA independent of the stack pointer for the
9232 duration of the loop. */
9233 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9234 RTX_FRAME_RELATED_P (insn) = 1;
9237 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9238 rtx guard_const = gen_int_mode (guard_size, Pmode);
9240 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9241 stack_pointer_rtx, temp1,
9242 probe_const, guard_const));
9244 /* Now reset the CFA register if needed. */
9245 if (frame_related_p)
9247 add_reg_note (insn, REG_CFA_DEF_CFA,
9248 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9249 gen_int_mode (poly_size, Pmode)));
9250 RTX_FRAME_RELATED_P (insn) = 1;
9253 return;
9256 if (dump_file)
9257 fprintf (dump_file,
9258 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9259 " bytes, probing will be required.\n", size);
9261 /* Round size down to a multiple of guard_size, and calculate the
9262 residual as the difference between the original size and the rounded
9263 size. */
9264 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9265 HOST_WIDE_INT residual = size - rounded_size;
9267 /* We can handle a small number of allocations/probes inline. Otherwise
9268 punt to a loop. */
9269 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9271 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9273 aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
9274 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9275 guard_used_by_caller));
9276 emit_insn (gen_blockage ());
9278 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9280 else
9282 /* Compute the ending address. */
9283 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9284 temp1, NULL, force_isa_mode, false, true);
9285 rtx_insn *insn = get_last_insn ();
9287 /* For the initial allocation, we don't have a frame pointer
9288 set up, so we always need CFI notes. If we're doing the
9289 final allocation, then we may have a frame pointer, in which
9290 case it is the CFA, otherwise we need CFI notes.
9292 We can determine which allocation we are doing by looking at
9293 the value of FRAME_RELATED_P since the final allocations are not
9294 frame related. */
9295 if (frame_related_p)
9297 /* We want the CFA independent of the stack pointer for the
9298 duration of the loop. */
9299 add_reg_note (insn, REG_CFA_DEF_CFA,
9300 plus_constant (Pmode, temp1, rounded_size));
9301 RTX_FRAME_RELATED_P (insn) = 1;
9304 /* This allocates and probes the stack. Note that this re-uses some of
9305 the existing Ada stack protection code. However we are guaranteed not
9306 to enter the non-loop or residual branches of that code.
9308 The non-loop part won't be entered because if our allocation amount
9309 doesn't require a loop, the case above would handle it.
9311 The residual branch won't be entered because TEMP1 is a multiple of
9312 the allocation size, so the residual will always be 0. As such, the only
9313 part we are actually using from that code is the loop setup. The
9314 actual probing is done in aarch64_output_probe_stack_range. */
9315 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9316 stack_pointer_rtx, temp1));
9318 /* Now reset the CFA register if needed. */
9319 if (frame_related_p)
9321 add_reg_note (insn, REG_CFA_DEF_CFA,
9322 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9323 RTX_FRAME_RELATED_P (insn) = 1;
9326 emit_insn (gen_blockage ());
9327 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9330 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9331 be probed. This maintains the requirement that each page is probed at
9332 least once. For initial probing we probe only if the allocation is
9333 more than GUARD_SIZE - buffer, and below the saved registers we probe
9334 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9335 GUARD_SIZE. This ensures that for any allocation large enough to
9336 trigger a probe here, we'll have at least one; and if an allocation isn't
9337 large enough for this code to emit anything for it, the page will have been
9338 probed by the save of FP/LR, either by this function or by any callees. If
9339 we don't have any callees then we won't have more stack adjustments and so
9340 are still safe. */
9341 if (residual)
9343 gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9345 /* If we're doing final adjustments, and we've done any full page
9346 allocations then any residual needs to be probed. */
9347 if (final_adjustment_p && rounded_size != 0)
9348 min_probe_threshold = 0;
9350 aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
9351 if (residual >= min_probe_threshold)
9353 if (dump_file)
9354 fprintf (dump_file,
9355 "Stack clash AArch64 prologue residuals: "
9356 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9357 "\n", residual);
9359 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9360 guard_used_by_caller));
9361 emit_insn (gen_blockage ());
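/* Illustrative only, not part of GCC: a standalone sketch of how a constant
   allocation is split above into whole guard-sized pages plus a residual,
   and of when that residual itself needs a probe.  The structure and helper
   names are assumptions made for the example, the 16-byte value stands in
   for STACK_BOUNDARY / BITS_PER_UNIT, and the block is kept under "#if 0"
   so it never affects the build.  */
#if 0
struct example_probe_plan
{
  long long rounded_size;	/* Bytes allocated in whole guard pages.  */
  long long residual;		/* Remaining bytes allocated afterwards.  */
  bool probe_residual;		/* Whether the residual needs a probe.  */
};

static example_probe_plan
example_split_allocation (long long size, long long guard_size,
			  long long guard_used_by_caller,
			  bool final_adjustment_p)
{
  example_probe_plan plan;
  plan.rounded_size = size - size % guard_size;
  plan.residual = size - plan.rounded_size;
  long long min_probe_threshold
    = (final_adjustment_p
       ? guard_used_by_caller + 16
       : guard_size - guard_used_by_caller);
  /* As above: once a full page has been allocated, any residual of the
     final adjustment must be probed.  */
  if (final_adjustment_p && plan.rounded_size != 0)
    min_probe_threshold = 0;
  plan.probe_residual = plan.residual >= min_probe_threshold;
  return plan;
}

/* E.g. with a 64KB guard and 1KB caller buffer, a 135000-byte initial
   allocation splits into two probed 65536-byte pages plus a 3928-byte
   residual, which is below the 64512-byte threshold and so not probed.  */
#endif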
9366 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9368 void
9369 aarch64_extra_live_on_entry (bitmap regs)
9371 if (TARGET_ZA)
9373 bitmap_set_bit (regs, LOWERING_REGNUM);
9374 bitmap_set_bit (regs, SME_STATE_REGNUM);
9375 bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
9376 bitmap_set_bit (regs, ZA_FREE_REGNUM);
9377 bitmap_set_bit (regs, ZA_SAVED_REGNUM);
9379 /* The only time ZA can't have live contents on entry is when
9380 the function explicitly treats it as a pure output. */
9381 auto za_flags = aarch64_cfun_shared_flags ("za");
9382 if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9383 bitmap_set_bit (regs, ZA_REGNUM);
9385 /* Since ZT0 is call-clobbered, it is only live on input if
9386 it is explicitly shared, and is not a pure output. */
9387 auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
9388 if (zt0_flags != 0
9389 && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9390 bitmap_set_bit (regs, ZT0_REGNUM);
9394 /* Return 1 if the register is used by the epilogue. We need to say the
9395 return register is used, but only after epilogue generation is complete.
9396 Note that in the case of sibcalls, the values "used by the epilogue" are
9397 considered live at the start of the called function. */
9400 aarch64_epilogue_uses (int regno)
9402 if (epilogue_completed)
9404 if (regno == LR_REGNUM)
9405 return 1;
9407 if (regno == LOWERING_REGNUM && TARGET_ZA)
9408 return 1;
9409 if (regno == SME_STATE_REGNUM && TARGET_ZA)
9410 return 1;
9411 if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
9412 return 1;
9413 /* If the function shares SME state with its caller, ensure that that
9414 data is not in the lazy save buffer on exit. */
9415 if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
9416 return 1;
9417 if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
9418 return 1;
9419 if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
9420 return 1;
9421 return 0;
9424 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9426 static bool
9427 aarch64_use_late_prologue_epilogue ()
9429 return aarch64_cfun_enables_pstate_sm ();
9432 /* The current function's frame has a save slot for the incoming state
9433 of SVCR. Return a legitimate memory for the slot, based on the hard
9434 frame pointer. */
9436 static rtx
9437 aarch64_old_svcr_mem ()
9439 gcc_assert (frame_pointer_needed
9440 && known_ge (cfun->machine->frame.old_svcr_offset, 0));
9441 rtx base = hard_frame_pointer_rtx;
9442 poly_int64 offset = (0
9443 /* hard fp -> bottom of frame. */
9444 - cfun->machine->frame.bytes_below_hard_fp
9445 /* bottom of frame -> save slot. */
9446 + cfun->machine->frame.old_svcr_offset);
9447 return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
9450 /* The current function's frame has a save slot for the incoming state
9451 of SVCR. Load the slot into register REGNO and return the register. */
9453 static rtx
9454 aarch64_read_old_svcr (unsigned int regno)
9456 rtx svcr = gen_rtx_REG (DImode, regno);
9457 emit_move_insn (svcr, aarch64_old_svcr_mem ());
9458 return svcr;
9461 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9462 load the incoming value of SVCR from its save slot into temporary
9463 register REGNO. */
9465 static rtx_insn *
9466 aarch64_guard_switch_pstate_sm (unsigned int regno,
9467 aarch64_feature_flags local_mode)
9469 rtx old_svcr = aarch64_read_old_svcr (regno);
9470 return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
9473 /* AArch64 stack frames generated by this compiler look like:
9475 +-------------------------------+
9477 | incoming stack arguments |
9479 +-------------------------------+
9480 | | <-- incoming stack pointer (aligned)
9481 | callee-allocated save area |
9482 | for register varargs |
9484 +-------------------------------+
9485 | local variables (1) | <-- frame_pointer_rtx
9487 +-------------------------------+
9488 | padding (1) |
9489 +-------------------------------+
9490 | callee-saved registers |
9491 +-------------------------------+
9492 | LR' |
9493 +-------------------------------+
9494 | FP' |
9495 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9496 | SVE vector registers |
9497 +-------------------------------+
9498 | SVE predicate registers |
9499 +-------------------------------+
9500 | local variables (2) |
9501 +-------------------------------+
9502 | padding (2) |
9503 +-------------------------------+
9504 | dynamic allocation |
9505 +-------------------------------+
9506 | padding |
9507 +-------------------------------+
9508 | outgoing stack arguments | <-- arg_pointer
9510 +-------------------------------+
9511 | | <-- stack_pointer_rtx (aligned)
9513 The regions marked (1) and (2) are mutually exclusive. (2) is used
9514 when aarch64_save_regs_above_locals_p is true.
9516 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9517 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9518 unchanged.
9520 By default for stack-clash we assume the guard is at least 64KB, but this
9521 value is configurable to either 4KB or 64KB. We also force the guard size to
9522 be the same as the probing interval and both values are kept in sync.
9524 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9525 on the guard size) of stack space without probing.
9527 When probing is needed, we emit a probe at the start of the prologue
9528 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9530 We can also use register saves as probes. These are stored in
9531 sve_save_and_probe and hard_fp_save_and_probe.
9533 For outgoing arguments we probe if the size is larger than 1KB, such that
9534 the ABI specified buffer is maintained for the next callee.
9536 The following registers are reserved during frame layout and should not be
9537 used for any other purpose:
9539 - r11: Used by stack clash protection when SVE is enabled, and also
9540 as an anchor register when saving and restoring registers
9541 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9542 - r14 and r15: Used for speculation tracking.
9543 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9544 - r30(LR), r29(FP): Used by standard frame layout.
9546 These registers must be avoided in frame layout related code unless the
9547 explicit intention is to interact with one of the features listed above. */
9549 /* Generate the prologue instructions for entry into a function.
9550 Establish the stack frame by decreasing the stack pointer with a
9551 properly calculated size and, if necessary, create a frame record
9552 filled with the values of LR and previous frame pointer. The
9553 current FP is also set up if it is in use. */
9555 void
9556 aarch64_expand_prologue (void)
9558 aarch64_frame &frame = cfun->machine->frame;
9559 poly_int64 frame_size = frame.frame_size;
9560 poly_int64 initial_adjust = frame.initial_adjust;
9561 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9562 poly_int64 final_adjust = frame.final_adjust;
9563 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9564 unsigned reg1 = frame.wb_push_candidate1;
9565 unsigned reg2 = frame.wb_push_candidate2;
9566 bool emit_frame_chain = frame.emit_frame_chain;
9567 rtx_insn *insn;
9568 aarch64_feature_flags force_isa_mode = 0;
9569 if (aarch64_cfun_enables_pstate_sm ())
9570 force_isa_mode = AARCH64_FL_SM_ON;
9572 if (flag_stack_clash_protection
9573 && known_eq (callee_adjust, 0)
9574 && known_lt (frame.reg_offset[VG_REGNUM], 0))
9576 /* Fold the SVE allocation into the initial allocation.
9577 We don't do this in aarch64_layout_frame to avoid pessimizing
9578 the epilogue code. */
9579 initial_adjust += sve_callee_adjust;
9580 sve_callee_adjust = 0;
9583 /* Sign return address for functions. */
9584 if (aarch64_return_address_signing_enabled ())
9586 switch (aarch64_ra_sign_key)
9588 case AARCH64_KEY_A:
9589 insn = emit_insn (gen_paciasp ());
9590 break;
9591 case AARCH64_KEY_B:
9592 insn = emit_insn (gen_pacibsp ());
9593 break;
9594 default:
9595 gcc_unreachable ();
9597 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9598 RTX_FRAME_RELATED_P (insn) = 1;
9601 /* Push return address to shadow call stack. */
9602 if (frame.is_scs_enabled)
9603 emit_insn (gen_scs_push ());
9605 if (flag_stack_usage_info)
9606 current_function_static_stack_size = constant_lower_bound (frame_size);
9608 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9610 if (crtl->is_leaf && !cfun->calls_alloca)
9612 if (maybe_gt (frame_size, PROBE_INTERVAL)
9613 && maybe_gt (frame_size, get_stack_check_protect ()))
9614 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9615 (frame_size
9616 - get_stack_check_protect ()));
9618 else if (maybe_gt (frame_size, 0))
9619 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9622 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9623 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9625 /* In theory we should never have both an initial adjustment
9626 and a callee save adjustment. Verify that is the case since the
9627 code below does not handle it for -fstack-clash-protection. */
9628 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9630 /* Will only probe if the initial adjustment is larger than the guard
9631 less the amount of the guard reserved for use by the caller's
9632 outgoing args. */
9633 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9634 force_isa_mode, true, false);
9636 if (callee_adjust != 0)
9637 aarch64_push_regs (reg1, reg2, callee_adjust);
9639 /* The offset of the current SP from the bottom of the static frame. */
9640 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9642 if (emit_frame_chain)
9644 /* The offset of the frame chain record (if any) from the current SP. */
9645 poly_int64 chain_offset = (initial_adjust + callee_adjust
9646 - frame.bytes_above_hard_fp);
9647 gcc_assert (known_ge (chain_offset, 0));
9649 gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
9650 if (callee_adjust == 0)
9651 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
9652 false, false);
9653 else
9654 gcc_assert (known_eq (chain_offset, 0));
9655 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9656 stack_pointer_rtx, chain_offset,
9657 tmp1_rtx, tmp0_rtx, force_isa_mode,
9658 frame_pointer_needed);
9659 if (frame_pointer_needed && !frame_size.is_constant ())
9661 /* Variable-sized frames need to describe the save slot
9662 address using DW_CFA_expression rather than DW_CFA_offset.
9663 This means that, without taking further action, the
9664 locations of the registers that we've already saved would
9665 remain based on the stack pointer even after we redefine
9666 the CFA based on the frame pointer. We therefore need new
9667 DW_CFA_expressions to re-express the save slots with addresses
9668 based on the frame pointer. */
9669 rtx_insn *insn = get_last_insn ();
9670 gcc_assert (RTX_FRAME_RELATED_P (insn));
9672 /* Add an explicit CFA definition if this was previously
9673 implicit. */
9674 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9676 rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9677 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9678 gen_rtx_SET (hard_frame_pointer_rtx, src));
9681 /* Change the save slot expressions for the registers that
9682 we've already saved. */
9683 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9684 hard_frame_pointer_rtx, UNITS_PER_WORD);
9685 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9686 hard_frame_pointer_rtx, 0);
9688 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9691 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
9692 emit_frame_chain);
9693 if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
9695 unsigned int saved_regs[] = { VG_REGNUM };
9696 aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
9697 emit_frame_chain);
9699 if (maybe_ne (sve_callee_adjust, 0))
9701 gcc_assert (!flag_stack_clash_protection
9702 || known_eq (initial_adjust, 0)
9703 /* The VG save isn't shrink-wrapped and so serves as
9704 a probe of the initial allocation. */
9705 || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
9706 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9707 sve_callee_adjust,
9708 force_isa_mode,
9709 !frame_pointer_needed, false);
9710 bytes_below_sp -= sve_callee_adjust;
9712 aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
9713 emit_frame_chain);
9714 aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
9715 emit_frame_chain);
9717 /* We may need to probe the final adjustment if it is larger than the guard
9718 that is assumed by the callee. */
9719 gcc_assert (known_eq (bytes_below_sp, final_adjust));
9720 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9721 force_isa_mode,
9722 !frame_pointer_needed, true);
9723 if (emit_frame_chain && maybe_ne (final_adjust, 0))
9724 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9726 /* Save the incoming value of PSTATE.SM, if required. Code further
9727 down does this for locally-streaming functions. */
9728 if (known_ge (frame.old_svcr_offset, 0)
9729 && !aarch64_cfun_enables_pstate_sm ())
9731 rtx mem = aarch64_old_svcr_mem ();
9732 MEM_VOLATILE_P (mem) = 1;
9733 if (TARGET_SME)
9735 rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
9736 emit_insn (gen_aarch64_read_svcr (reg));
9737 emit_move_insn (mem, reg);
9739 else
9741 rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
9742 auto &args = crtl->args.info;
9743 if (args.aapcs_ncrn > 0)
9745 old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
9746 emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
9748 if (args.aapcs_ncrn > 1)
9750 old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
9751 emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
9753 emit_insn (gen_aarch64_get_sme_state ());
9754 emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
9755 if (old_r0)
9756 emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
9757 if (old_r1)
9758 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
9762 /* Enable PSTATE.SM, if required. */
9763 if (aarch64_cfun_enables_pstate_sm ())
9765 rtx_insn *guard_label = nullptr;
9766 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9768 /* The current function is streaming-compatible. Save the
9769 original state of PSTATE.SM. */
9770 rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
9771 emit_insn (gen_aarch64_read_svcr (svcr));
9772 emit_move_insn (aarch64_old_svcr_mem (), svcr);
9773 guard_label = aarch64_guard_switch_pstate_sm (svcr,
9774 aarch64_isa_flags);
9776 aarch64_sme_mode_switch_regs args_switch;
9777 auto &args = crtl->args.info;
9778 for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
9780 rtx x = args.sme_mode_switch_args[i];
9781 args_switch.add_reg (GET_MODE (x), REGNO (x));
9783 args_switch.emit_prologue ();
9784 emit_insn (gen_aarch64_smstart_sm ());
9785 args_switch.emit_epilogue ();
9786 if (guard_label)
9787 emit_label (guard_label);
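/* Illustrative only, not part of GCC: a standalone check of the running
   BYTES_BELOW_SP bookkeeping in aarch64_expand_prologue, using made-up
   constant adjustments.  The function name and the sample numbers are
   assumptions for the example; the block is kept under "#if 0" so it never
   affects the build.  */
#if 0
#include <cassert>

static void
example_prologue_accounting (void)
{
  /* A hypothetical frame: 96 bytes of callee saves pushed with writeback
     and 112 bytes of outgoing arguments, no SVE saves.  */
  long long frame_size = 208, initial_adjust = 0, callee_adjust = 96;
  long long sve_callee_adjust = 0, final_adjust = 112;

  long long bytes_below_sp = frame_size - initial_adjust - callee_adjust;
  bytes_below_sp -= sve_callee_adjust;

  /* Mirrors the gcc_assert just before the final allocation above.  */
  assert (bytes_below_sp == final_adjust);
}
#endif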
9791 /* Return TRUE if we can use a simple_return insn.
9793 This function checks whether the callee-saved stack is empty, which
9794 means no restore actions are needed. The pro_and_epilogue pass will use
9795 this to check whether the shrink-wrapping optimization is feasible. */
9797 bool
9798 aarch64_use_return_insn_p (void)
9800 if (!reload_completed)
9801 return false;
9803 if (crtl->profile)
9804 return false;
9806 return known_eq (cfun->machine->frame.frame_size, 0);
9809 /* Generate the epilogue instructions for returning from a function.
9810 This is almost exactly the reverse of the prologue sequence, except
9811 that we need to insert barriers to avoid scheduling loads that read
9812 from a deallocated stack, and we optimize the unwind records by
9813 emitting them all together if possible. */
9814 void
9815 aarch64_expand_epilogue (rtx_call_insn *sibcall)
9817 aarch64_frame &frame = cfun->machine->frame;
9818 poly_int64 initial_adjust = frame.initial_adjust;
9819 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9820 poly_int64 final_adjust = frame.final_adjust;
9821 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9822 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
9823 unsigned reg1 = frame.wb_pop_candidate1;
9824 unsigned reg2 = frame.wb_pop_candidate2;
9825 rtx cfi_ops = NULL;
9826 rtx_insn *insn;
9827 /* A stack clash protection prologue may not have left EP0_REGNUM or
9828 EP1_REGNUM in a usable state. The same is true for allocations
9829 with an SVE component, since we then need both temporary registers
9830 for each allocation. For stack clash we are in a usable state if
9831 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9832 HOST_WIDE_INT guard_size
9833 = 1 << param_stack_clash_protection_guard_size;
9834 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9835 aarch64_feature_flags force_isa_mode = 0;
9836 if (aarch64_cfun_enables_pstate_sm ())
9837 force_isa_mode = AARCH64_FL_SM_ON;
9839 /* We can re-use the registers when:
9841 (a) the deallocation amount is the same as the corresponding
9842 allocation amount (which is false if we combine the initial
9843 and SVE callee save allocations in the prologue); and
9845 (b) the allocation amount doesn't need a probe (which is false
9846 if the amount is guard_size - guard_used_by_caller or greater).
9848 In such situations the register should remain live with the correct
9849 value. */
9850 bool can_inherit_p = (initial_adjust.is_constant ()
9851 && final_adjust.is_constant ()
9852 && (!flag_stack_clash_protection
9853 || (known_lt (initial_adjust,
9854 guard_size - guard_used_by_caller)
9855 && known_eq (sve_callee_adjust, 0))));
9857 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
9858 bool need_barrier_p
9859 = maybe_ne (get_frame_size ()
9860 + frame.saved_varargs_size, 0);
9862 /* Reset PSTATE.SM, if required. */
9863 if (aarch64_cfun_enables_pstate_sm ())
9865 rtx_insn *guard_label = nullptr;
9866 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9867 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
9868 aarch64_isa_flags);
9869 aarch64_sme_mode_switch_regs return_switch;
9870 if (sibcall)
9871 return_switch.add_call_args (sibcall);
9872 else if (crtl->return_rtx && REG_P (crtl->return_rtx))
9873 return_switch.add_reg (GET_MODE (crtl->return_rtx),
9874 REGNO (crtl->return_rtx));
9875 return_switch.emit_prologue ();
9876 emit_insn (gen_aarch64_smstop_sm ());
9877 return_switch.emit_epilogue ();
9878 if (guard_label)
9879 emit_label (guard_label);
9882 /* Emit a barrier to prevent loads from a deallocated stack. */
9883 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9884 || cfun->calls_alloca
9885 || crtl->calls_eh_return)
9887 aarch64_emit_stack_tie (stack_pointer_rtx);
9888 need_barrier_p = false;
9891 /* Restore the stack pointer from the frame pointer if it may not
9892 be the same as the stack pointer. */
9893 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9894 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9895 if (frame_pointer_needed
9896 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9897 /* If writeback is used when restoring callee-saves, the CFA
9898 is restored on the instruction doing the writeback. */
9899 aarch64_add_offset (Pmode, stack_pointer_rtx,
9900 hard_frame_pointer_rtx,
9901 -bytes_below_hard_fp + final_adjust,
9902 tmp1_rtx, tmp0_rtx, force_isa_mode,
9903 callee_adjust == 0);
9904 else
9905 /* The case where we need to re-use the register here is very rare, so
9906 avoid the complicated condition and just always emit a move if the
9907 immediate doesn't fit. */
9908 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
9910 /* Restore the vector registers before the predicate registers,
9911 so that we can use P4 as a temporary for big-endian SVE frames. */
9912 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
9913 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
9914 if (maybe_ne (sve_callee_adjust, 0))
9915 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
9916 force_isa_mode, true);
9918 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9919 restore x30, so we don't need to restore x30 again in the traditional
9920 way. */
9921 aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
9922 frame.saved_gprs, &cfi_ops);
9924 if (need_barrier_p)
9925 aarch64_emit_stack_tie (stack_pointer_rtx);
9927 if (callee_adjust != 0)
9928 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9930 /* If we have no register restore information, the CFA must have been
9931 defined in terms of the stack pointer since the end of the prologue. */
9932 gcc_assert (cfi_ops || !frame_pointer_needed);
9934 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9936 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9937 insn = get_last_insn ();
9938 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9939 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9940 RTX_FRAME_RELATED_P (insn) = 1;
9941 cfi_ops = NULL;
9944 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
9945 restrict the emit_move optimization to leaf functions. */
9946 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
9947 (!can_inherit_p || !crtl->is_leaf
9948 || df_regs_ever_live_p (EP0_REGNUM)));
9950 if (cfi_ops)
9952 /* Emit delayed restores and reset the CFA to be SP. */
9953 insn = get_last_insn ();
9954 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9955 REG_NOTES (insn) = cfi_ops;
9956 RTX_FRAME_RELATED_P (insn) = 1;
9959 /* Pop return address from shadow call stack. */
9960 if (frame.is_scs_enabled)
9962 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
9963 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
9965 insn = emit_insn (gen_scs_pop ());
9966 add_reg_note (insn, REG_CFA_RESTORE, reg);
9967 RTX_FRAME_RELATED_P (insn) = 1;
9970 /* Stack adjustment for exception handler. */
9971 if (crtl->calls_eh_return && !sibcall)
9973 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
9974 to unwind the stack and jump to the handler, otherwise
9975 skip this eh_return logic and continue with normal
9976 return after the label. We have already reset the CFA
9977 to be SP; letting the CFA move during this adjustment
9978 is just as correct as retaining the CFA from the body
9979 of the function. Therefore, do nothing special. */
9980 rtx_code_label *label = gen_label_rtx ();
9981 rtx x = aarch64_gen_compare_zero_and_branch (EQ, EH_RETURN_TAKEN_RTX,
9982 label);
9983 rtx jump = emit_jump_insn (x);
9984 JUMP_LABEL (jump) = label;
9985 LABEL_NUSES (label)++;
9986 emit_insn (gen_add2_insn (stack_pointer_rtx,
9987 EH_RETURN_STACKADJ_RTX));
9988 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
9989 emit_barrier ();
9990 emit_label (label);
9993 /* We prefer to emit the combined return/authenticate instruction RETAA;
9994 however, there are two cases in which we must instead emit an explicit
9995 authentication instruction.
9997 1) Sibcalls don't return in a normal way, so if we're about to call one
9998 we must authenticate.
10000 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10001 generating code for !TARGET_ARMV8_3 we can't use it and must
10002 explicitly authenticate.
10004 if (aarch64_return_address_signing_enabled ()
10005 && (sibcall || !TARGET_ARMV8_3))
10007 switch (aarch64_ra_sign_key)
10009 case AARCH64_KEY_A:
10010 insn = emit_insn (gen_autiasp ());
10011 break;
10012 case AARCH64_KEY_B:
10013 insn = emit_insn (gen_autibsp ());
10014 break;
10015 default:
10016 gcc_unreachable ();
10018 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10019 RTX_FRAME_RELATED_P (insn) = 1;
10022 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10023 if (!sibcall)
10024 emit_jump_insn (ret_rtx);
10027 /* Output code to add DELTA to the first argument, and then jump
10028 to FUNCTION. Used for C++ multiple inheritance. */
10029 static void
10030 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10031 HOST_WIDE_INT delta,
10032 HOST_WIDE_INT vcall_offset,
10033 tree function)
10035 /* The this pointer is always in x0. Note that this differs from
10036 Arm where the this pointer may be bumped to r1 if r0 is required
10037 to return a pointer to an aggregate. On AArch64 a result value
10038 pointer will be in x8. */
10039 int this_regno = R0_REGNUM;
10040 rtx this_rtx, temp0, temp1, addr, funexp;
10041 rtx_insn *insn;
10042 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10044 if (aarch_bti_enabled ())
10045 emit_insn (gen_bti_c());
10047 reload_completed = 1;
10048 emit_note (NOTE_INSN_PROLOGUE_END);
10050 this_rtx = gen_rtx_REG (Pmode, this_regno);
10051 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10052 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10054 if (vcall_offset == 0)
10055 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
10056 0, false);
10057 else
10059 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10061 addr = this_rtx;
10062 if (delta != 0)
10064 if (delta >= -256 && delta < 256)
10065 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10066 plus_constant (Pmode, this_rtx, delta));
10067 else
10068 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10069 temp1, temp0, 0, false);
10072 if (Pmode == ptr_mode)
10073 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10074 else
10075 aarch64_emit_move (temp0,
10076 gen_rtx_ZERO_EXTEND (Pmode,
10077 gen_rtx_MEM (ptr_mode, addr)));
10079 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10080 addr = plus_constant (Pmode, temp0, vcall_offset);
10081 else
10083 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10084 Pmode);
10085 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10088 if (Pmode == ptr_mode)
10089 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10090 else
10091 aarch64_emit_move (temp1,
10092 gen_rtx_SIGN_EXTEND (Pmode,
10093 gen_rtx_MEM (ptr_mode, addr)));
10095 emit_insn (gen_add2_insn (this_rtx, temp1));
10098 /* Generate a tail call to the target function. */
10099 if (!TREE_USED (function))
10101 assemble_external (function);
10102 TREE_USED (function) = 1;
10104 funexp = XEXP (DECL_RTL (function), 0);
10105 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10106 auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
10107 auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
10108 rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant);
10109 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10110 SIBLING_CALL_P (insn) = 1;
10112 insn = get_insns ();
10113 shorten_branches (insn);
10115 assemble_start_function (thunk, fnname);
10116 final_start_function (insn, file, 1);
10117 final (insn, file, 1);
10118 final_end_function ();
10119 assemble_end_function (thunk, fnname);
10121 /* Stop pretending to be a post-reload pass. */
10122 reload_completed = 0;
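/* Illustrative only, not part of GCC: a standalone sketch of the pointer
   adjustment that the thunk emitted above performs, written as plain C++.
   The function name is an assumption for the example and the vtable access
   uses ordinary pointer casts purely for illustration; the block is kept
   under "#if 0" so it never affects the build.  */
#if 0
static void *
example_thunk_adjust (void *this_ptr, long long delta, long long vcall_offset)
{
  /* First bump the this pointer by the fixed DELTA...  */
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      /* ...then add the adjustment stored in the vtable at VCALL_OFFSET,
	 where the vtable pointer lives at the adjusted this pointer.  */
      char *vtable = *(char **) p;
      p += *(long long *) (vtable + vcall_offset);
    }
  return p;
}
#endif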
10125 static bool
10126 aarch64_tls_referenced_p (rtx x)
10128 if (!TARGET_HAVE_TLS)
10129 return false;
10130 subrtx_iterator::array_type array;
10131 FOR_EACH_SUBRTX (iter, array, x, ALL)
10133 const_rtx x = *iter;
10134 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10135 return true;
10136 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10137 TLS offsets, not real symbol references. */
10138 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10139 iter.skip_subrtxes ();
10141 return false;
10145 static bool
10146 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10148 if (GET_CODE (x) == HIGH)
10149 return true;
10151 /* There's no way to calculate VL-based values using relocations. */
10152 subrtx_iterator::array_type array;
10153 HOST_WIDE_INT factor;
10154 FOR_EACH_SUBRTX (iter, array, x, ALL)
10155 if (GET_CODE (*iter) == CONST_POLY_INT
10156 || aarch64_sme_vq_unspec_p (x, &factor))
10157 return true;
10159 poly_int64 offset;
10160 rtx base = strip_offset_and_salt (x, &offset);
10161 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10163 /* We checked for POLY_INT_CST offsets above. */
10164 if (aarch64_classify_symbol (base, offset.to_constant ())
10165 != SYMBOL_FORCE_TO_MEM)
10166 return true;
10167 else
10168 /* Avoid generating a 64-bit relocation in ILP32; leave it
10169 to aarch64_expand_mov_immediate to handle it properly. */
10170 return mode != ptr_mode;
10173 return aarch64_tls_referenced_p (x);
10176 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10177 The expansion for a table switch is quite expensive due to the number
10178 of instructions, the table lookup and the hard-to-predict indirect jump.
10179 When optimizing for speed with -O3 or higher, use the per-core tuning if
10180 set; otherwise use tables for >= 11 cases as a tradeoff between size and
10181 performance. When optimizing for size, use 8 for the smallest code size. */
10183 static unsigned int
10184 aarch64_case_values_threshold (void)
10186 /* Use the specified limit for the number of cases before using jump
10187 tables at higher optimization levels. */
10188 if (optimize > 2
10189 && aarch64_tune_params.max_case_values != 0)
10190 return aarch64_tune_params.max_case_values;
10191 else
10192 return optimize_size ? 8 : 11;
10195 /* Return true if register REGNO is a valid index register.
10196 STRICT_P is true if REG_OK_STRICT is in effect. */
10198 bool
10199 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10201 if (!HARD_REGISTER_NUM_P (regno))
10203 if (!strict_p)
10204 return true;
10206 if (!reg_renumber)
10207 return false;
10209 regno = reg_renumber[regno];
10211 return GP_REGNUM_P (regno);
10214 /* Return true if register REGNO is a valid base register.
10215 STRICT_P is true if REG_OK_STRICT is in effect. */
10217 bool
10218 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10220 if (!HARD_REGISTER_NUM_P (regno))
10222 if (!strict_p)
10223 return true;
10225 if (!reg_renumber)
10226 return false;
10228 regno = reg_renumber[regno];
10231 /* The fake registers will be eliminated to either the stack or
10232 hard frame pointer, both of which are usually valid base registers.
10233 Reload deals with the cases where the eliminated form isn't valid. */
10234 return (GP_REGNUM_P (regno)
10235 || regno == SP_REGNUM
10236 || regno == FRAME_POINTER_REGNUM
10237 || regno == ARG_POINTER_REGNUM);
10240 /* Return true if X is a valid base register.
10241 STRICT_P is true if REG_OK_STRICT is in effect. */
10243 static bool
10244 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10246 if (!strict_p
10247 && SUBREG_P (x)
10248 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10249 x = SUBREG_REG (x);
10251 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10254 /* Return true if address offset is a valid index. If it is, fill in INFO
10255 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10257 static bool
10258 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10259 machine_mode mode, bool strict_p)
10261 enum aarch64_address_type type;
10262 rtx index;
10263 int shift;
10265 /* (reg:P) */
10266 if ((REG_P (x) || SUBREG_P (x))
10267 && GET_MODE (x) == Pmode)
10269 type = ADDRESS_REG_REG;
10270 index = x;
10271 shift = 0;
10273 /* (sign_extend:DI (reg:SI)) */
10274 else if ((GET_CODE (x) == SIGN_EXTEND
10275 || GET_CODE (x) == ZERO_EXTEND)
10276 && GET_MODE (x) == DImode
10277 && GET_MODE (XEXP (x, 0)) == SImode)
10279 type = (GET_CODE (x) == SIGN_EXTEND)
10280 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10281 index = XEXP (x, 0);
10282 shift = 0;
10284 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10285 else if (GET_CODE (x) == MULT
10286 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10287 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10288 && GET_MODE (XEXP (x, 0)) == DImode
10289 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10290 && CONST_INT_P (XEXP (x, 1)))
10292 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10293 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10294 index = XEXP (XEXP (x, 0), 0);
10295 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10297 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10298 else if (GET_CODE (x) == ASHIFT
10299 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10300 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10301 && GET_MODE (XEXP (x, 0)) == DImode
10302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10303 && CONST_INT_P (XEXP (x, 1)))
10305 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10306 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10307 index = XEXP (XEXP (x, 0), 0);
10308 shift = INTVAL (XEXP (x, 1));
10310 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10311 (const_int 0xffffffff<<shift)) */
10312 else if (GET_CODE (x) == AND
10313 && GET_MODE (x) == DImode
10314 && GET_CODE (XEXP (x, 0)) == MULT
10315 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10316 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10317 && CONST_INT_P (XEXP (x, 1)))
10319 type = ADDRESS_REG_UXTW;
10320 index = XEXP (XEXP (x, 0), 0);
10321 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10322 /* Avoid invoking undefined behaviour when the shift amount is -1. */
10323 if (shift != -1
10324 && INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10325 shift = -1;
10327 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10328 (const_int 0xffffffff<<shift)) */
10329 else if (GET_CODE (x) == AND
10330 && GET_MODE (x) == DImode
10331 && GET_CODE (XEXP (x, 0)) == ASHIFT
10332 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10333 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10334 && CONST_INT_P (XEXP (x, 1)))
10336 type = ADDRESS_REG_UXTW;
10337 index = XEXP (XEXP (x, 0), 0);
10338 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10339 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10340 shift = -1;
10342 /* (mult:P (reg:P) (const_int scale)) */
10343 else if (GET_CODE (x) == MULT
10344 && GET_MODE (x) == Pmode
10345 && GET_MODE (XEXP (x, 0)) == Pmode
10346 && CONST_INT_P (XEXP (x, 1)))
10348 type = ADDRESS_REG_REG;
10349 index = XEXP (x, 0);
10350 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10352 /* (ashift:P (reg:P) (const_int shift)) */
10353 else if (GET_CODE (x) == ASHIFT
10354 && GET_MODE (x) == Pmode
10355 && GET_MODE (XEXP (x, 0)) == Pmode
10356 && CONST_INT_P (XEXP (x, 1)))
10358 type = ADDRESS_REG_REG;
10359 index = XEXP (x, 0);
10360 shift = INTVAL (XEXP (x, 1));
10362 else
10363 return false;
10365 if (!strict_p
10366 && SUBREG_P (index)
10367 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10368 index = SUBREG_REG (index);
10370 if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
10372 if (type != ADDRESS_REG_REG
10373 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10374 return false;
10376 else
10378 if (shift != 0
10379 && !(IN_RANGE (shift, 1, 3)
10380 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10381 return false;
10384 if (REG_P (index)
10385 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10387 info->type = type;
10388 info->offset = index;
10389 info->shift = shift;
10390 return true;
10393 return false;
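/* Illustrative only, not part of GCC: a standalone restatement of the
   shift range check near the end of aarch64_classify_index above for
   non-SVE modes, with a plain integer access size.  The helper name is an
   assumption for the example; the block is kept under "#if 0" so it never
   affects the build.  */
#if 0
static bool
example_index_shift_ok_p (int shift, long long access_size)
{
  if (shift == 0)
    return true;			/* [base, index]  */
  return (shift >= 1 && shift <= 3	/* [base, index, LSL #shift]  */
	  && (1LL << shift) == access_size);
}
#endif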
10396 /* Return true if MODE is one of the modes for which we
10397 support LDP/STP operations. */
10399 static bool
10400 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10402 return mode == SImode || mode == DImode
10403 || mode == SFmode || mode == DFmode
10404 || mode == SDmode || mode == DDmode
10405 || (aarch64_vector_mode_supported_p (mode)
10406 && (known_eq (GET_MODE_SIZE (mode), 8)
10407 || known_eq (GET_MODE_SIZE (mode), 16)));
10410 /* Return true if REGNO is a virtual pointer register, or an eliminable
10411 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10412 include stack_pointer or hard_frame_pointer. */
10413 static bool
10414 virt_or_elim_regno_p (unsigned regno)
10416 return ((regno >= FIRST_VIRTUAL_REGISTER
10417 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10418 || regno == FRAME_POINTER_REGNUM
10419 || regno == ARG_POINTER_REGNUM);
10422 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10423 If it is, fill in INFO appropriately. STRICT_P is true if
10424 REG_OK_STRICT is in effect. */
10426 bool
10427 aarch64_classify_address (struct aarch64_address_info *info,
10428 rtx x, machine_mode mode, bool strict_p,
10429 aarch64_addr_query_type type)
10431 enum rtx_code code = GET_CODE (x);
10432 rtx op0, op1;
10433 poly_int64 offset;
10435 HOST_WIDE_INT const_size;
10437 /* Whether a vector mode is partial doesn't affect address legitimacy.
10438 Partial vectors like VNx8QImode allow the same indexed addressing
10439 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10440 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10441 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10442 vec_flags &= ~VEC_PARTIAL;
10444 /* On BE, we use load/store pair for all large int mode load/stores.
10445 TI/TF/TDmode may also use a load/store pair. */
10446 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10447 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10448 || type == ADDR_QUERY_LDP_STP_N
10449 || mode == TImode
10450 || mode == TFmode
10451 || mode == TDmode
10452 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10453 && advsimd_struct_p));
10454 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10455 corresponds to the actual size of the memory being loaded/stored, and the
10456 mode used to validate the addressing mode is half of that. */
10457 if (type == ADDR_QUERY_LDP_STP_N)
10459 if (known_eq (GET_MODE_SIZE (mode), 32))
10460 mode = V16QImode;
10461 else if (known_eq (GET_MODE_SIZE (mode), 16))
10462 mode = DFmode;
10463 else if (known_eq (GET_MODE_SIZE (mode), 8))
10464 mode = SFmode;
10465 else
10466 return false;
10468 /* This isn't really an Advanced SIMD struct mode, but a mode
10469 used to represent the complete mem in a load/store pair. */
10470 advsimd_struct_p = false;
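/* For example, an LDP of two X registers accesses 16 bytes in total, but each
   register of the pair covers only 8 bytes, so the offset is validated as if
   it were a single DFmode access.  */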
10473 bool allow_reg_index_p = (!load_store_pair_p
10474 && ((vec_flags == 0
10475 && known_lt (GET_MODE_SIZE (mode), 16))
10476 || vec_flags == VEC_ADVSIMD
10477 || vec_flags & VEC_SVE_DATA
10478 || mode == VNx1TImode));
10480 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10481 The latter is not valid for SVE predicates, and that's rejected through
10482 allow_reg_index_p above. */
10483 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10484 && (code != REG && code != PLUS))
10485 return false;
10487 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10488 REG addressing. */
10489 if (advsimd_struct_p
10490 && TARGET_SIMD
10491 && !BYTES_BIG_ENDIAN
10492 && (code != POST_INC && code != REG))
10493 return false;
10495 gcc_checking_assert (GET_MODE (x) == VOIDmode
10496 || SCALAR_INT_MODE_P (GET_MODE (x)));
10498 switch (code)
10500 case REG:
10501 case SUBREG:
10502 info->type = ADDRESS_REG_IMM;
10503 info->base = x;
10504 info->offset = const0_rtx;
10505 info->const_offset = 0;
10506 return aarch64_base_register_rtx_p (x, strict_p);
10508 case PLUS:
10509 op0 = XEXP (x, 0);
10510 op1 = XEXP (x, 1);
10512 if (! strict_p
10513 && REG_P (op0)
10514 && virt_or_elim_regno_p (REGNO (op0))
10515 && poly_int_rtx_p (op1, &offset))
10517 info->type = ADDRESS_REG_IMM;
10518 info->base = op0;
10519 info->offset = op1;
10520 info->const_offset = offset;
10522 return true;
10525 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10526 && aarch64_base_register_rtx_p (op0, strict_p)
10527 && poly_int_rtx_p (op1, &offset))
10529 info->type = ADDRESS_REG_IMM;
10530 info->base = op0;
10531 info->offset = op1;
10532 info->const_offset = offset;
10534 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10535 registers and individual Q registers. The available
10536 address modes are:
10537 X,X: 7-bit signed scaled offset
10538 Q: 9-bit signed offset
10539 We conservatively require an offset representable in either mode.
10540 When performing the check for pairs of X registers i.e. LDP/STP
10541 pass down DImode since that is the natural size of the LDP/STP
10542 instruction memory accesses. */
10543 if (mode == TImode || mode == TFmode || mode == TDmode)
10544 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10545 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10546 || offset_12bit_unsigned_scaled_p (mode, offset)));
10548 if (mode == V8DImode)
10549 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10550 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10552 /* A 7-bit offset check because OImode will emit an ldp/stp
10553 instruction (only !TARGET_SIMD or big endian will get here).
10554 For ldp/stp instructions, the offset is scaled for the size of a
10555 single element of the pair. */
10556 if (aarch64_advsimd_partial_struct_mode_p (mode)
10557 && known_eq (GET_MODE_SIZE (mode), 16))
10558 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10559 if (aarch64_advsimd_full_struct_mode_p (mode)
10560 && known_eq (GET_MODE_SIZE (mode), 32))
10561 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10563 /* Three 9/12-bit offset checks because CImode will emit three
10564 ldr/str instructions (only !TARGET_SIMD or big endian will
10565 get here). */
10566 if (aarch64_advsimd_partial_struct_mode_p (mode)
10567 && known_eq (GET_MODE_SIZE (mode), 24))
10568 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10569 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10570 offset + 16)
10571 || offset_12bit_unsigned_scaled_p (DImode,
10572 offset + 16)));
10573 if (aarch64_advsimd_full_struct_mode_p (mode)
10574 && known_eq (GET_MODE_SIZE (mode), 48))
10575 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10576 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10577 offset + 32)
10578 || offset_12bit_unsigned_scaled_p (TImode,
10579 offset + 32)));
10581 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10582 instructions (only big endian will get here). */
10583 if (aarch64_advsimd_partial_struct_mode_p (mode)
10584 && known_eq (GET_MODE_SIZE (mode), 32))
10585 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10586 && aarch64_offset_7bit_signed_scaled_p (DImode,
10587 offset + 16));
10588 if (aarch64_advsimd_full_struct_mode_p (mode)
10589 && known_eq (GET_MODE_SIZE (mode), 64))
10590 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10591 && aarch64_offset_7bit_signed_scaled_p (TImode,
10592 offset + 32));
10594 /* Make "m" use the LD1 offset range for SVE data modes, so
10595 that pre-RTL optimizers like ivopts will work to that range
10596 instead of the wider LDR/STR range. */
10597 if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
10598 return (type == ADDR_QUERY_M
10599 ? offset_4bit_signed_scaled_p (mode, offset)
10600 : offset_9bit_signed_scaled_p (mode, offset));
10602 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10604 poly_int64 end_offset = (offset
10605 + GET_MODE_SIZE (mode)
10606 - BYTES_PER_SVE_VECTOR);
10607 return (type == ADDR_QUERY_M
10608 ? offset_4bit_signed_scaled_p (mode, offset)
10609 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10610 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10611 end_offset)));
10614 if (vec_flags == VEC_SVE_PRED)
10615 return offset_9bit_signed_scaled_p (mode, offset);
10617 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
10619 poly_int64 end_offset = (offset
10620 + GET_MODE_SIZE (mode)
10621 - BYTES_PER_SVE_PRED);
10622 return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
10623 && offset_9bit_signed_scaled_p (VNx16BImode, offset));
10626 if (load_store_pair_p)
10627 return ((known_eq (GET_MODE_SIZE (mode), 4)
10628 || known_eq (GET_MODE_SIZE (mode), 8)
10629 || known_eq (GET_MODE_SIZE (mode), 16))
10630 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10631 else
10632 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10633 || offset_12bit_unsigned_scaled_p (mode, offset));
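/* For a DImode access, for example, the signed 9-bit unscaled range is
   [-256, 255] (LDUR/STUR) and the unsigned 12-bit scaled range is
   [0, 32760] in multiples of 8 (LDR/STR).  */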
10636 if (allow_reg_index_p)
10638 /* Look for base + (scaled/extended) index register. */
10639 if (aarch64_base_register_rtx_p (op0, strict_p)
10640 && aarch64_classify_index (info, op1, mode, strict_p))
10642 info->base = op0;
10643 return true;
10645 if (aarch64_base_register_rtx_p (op1, strict_p)
10646 && aarch64_classify_index (info, op0, mode, strict_p))
10648 info->base = op1;
10649 return true;
10653 return false;
10655 case POST_INC:
10656 case POST_DEC:
10657 case PRE_INC:
10658 case PRE_DEC:
10659 info->type = ADDRESS_REG_WB;
10660 info->base = XEXP (x, 0);
10661 info->offset = NULL_RTX;
10662 return aarch64_base_register_rtx_p (info->base, strict_p);
10664 case POST_MODIFY:
10665 case PRE_MODIFY:
10666 info->type = ADDRESS_REG_WB;
10667 info->base = XEXP (x, 0);
10668 if (GET_CODE (XEXP (x, 1)) == PLUS
10669 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10670 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10671 && aarch64_base_register_rtx_p (info->base, strict_p))
10673 info->offset = XEXP (XEXP (x, 1), 1);
10674 info->const_offset = offset;
10676 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10677 registers and individual Q registers. The available
10678 address modes are:
10679 X,X: 7-bit signed scaled offset
10680 Q: 9-bit signed offset
10681 We conservatively require an offset representable in either mode. */
10683 if (mode == TImode || mode == TFmode || mode == TDmode)
10684 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10685 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10687 if (load_store_pair_p)
10688 return ((known_eq (GET_MODE_SIZE (mode), 4)
10689 || known_eq (GET_MODE_SIZE (mode), 8)
10690 || known_eq (GET_MODE_SIZE (mode), 16))
10691 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10692 else
10693 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10695 return false;
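/* For example, (post_modify (reg x0) (plus (reg x0) (const_int 16)))
   corresponds to a post-indexed access such as ldr x1, [x0], #16, while the
   pair form ldp x1, x2, [x0], #16 must also satisfy the scaled 7-bit range.  */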
10697 case CONST:
10698 case SYMBOL_REF:
10699 case LABEL_REF:
10700 /* load literal: pc-relative constant pool entry. Only supported
10701 for SI mode or larger. */
10702 info->type = ADDRESS_SYMBOLIC;
10704 if (!load_store_pair_p
10705 && GET_MODE_SIZE (mode).is_constant (&const_size)
10706 && const_size >= 4)
10708 poly_int64 offset;
10709 rtx sym = strip_offset_and_salt (x, &offset);
10710 return ((LABEL_REF_P (sym)
10711 || (SYMBOL_REF_P (sym)
10712 && CONSTANT_POOL_ADDRESS_P (sym)
10713 && aarch64_pcrelative_literal_loads)));
10715 return false;
10717 case LO_SUM:
10718 info->type = ADDRESS_LO_SUM;
10719 info->base = XEXP (x, 0);
10720 info->offset = XEXP (x, 1);
10721 if (allow_reg_index_p
10722 && aarch64_base_register_rtx_p (info->base, strict_p))
10724 poly_int64 offset;
10725 HOST_WIDE_INT const_offset;
10726 rtx sym = strip_offset_and_salt (info->offset, &offset);
10727 if (SYMBOL_REF_P (sym)
10728 && offset.is_constant (&const_offset)
10729 && (aarch64_classify_symbol (sym, const_offset)
10730 == SYMBOL_SMALL_ABSOLUTE))
10732 /* The symbol and offset must be aligned to the access size. */
10733 unsigned int align;
10735 if (CONSTANT_POOL_ADDRESS_P (sym))
10736 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10737 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10739 tree exp = SYMBOL_REF_DECL (sym);
10740 align = TYPE_ALIGN (TREE_TYPE (exp));
10741 align = aarch64_constant_alignment (exp, align);
10743 else if (SYMBOL_REF_DECL (sym))
10744 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10745 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10746 && SYMBOL_REF_BLOCK (sym) != NULL)
10747 align = SYMBOL_REF_BLOCK (sym)->alignment;
10748 else
10749 align = BITS_PER_UNIT;
10751 poly_int64 ref_size = GET_MODE_SIZE (mode);
10752 if (known_eq (ref_size, 0))
10753 ref_size = GET_MODE_SIZE (DImode);
10755 return (multiple_p (const_offset, ref_size)
10756 && multiple_p (align / BITS_PER_UNIT, ref_size));
10759 return false;
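/* LO_SUM addresses implement the small-code-model ADRP sequence, e.g.:
     adrp x0, sym
     ldr  w1, [x0, #:lo12:sym]
   The :lo12: offset is only usable if the symbol plus offset is suitably
   aligned for the access size, hence the alignment checks above.  */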
10761 default:
10762 return false;
10766 /* Return true if the address X is valid for a PRFM instruction.
10767 STRICT_P is true if we should do strict checking with
10768 aarch64_classify_address. */
10770 bool
10771 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10773 struct aarch64_address_info addr;
10775 /* PRFM accepts the same addresses as DImode... */
10776 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10777 if (!res)
10778 return false;
10780 /* ... except writeback forms. */
10781 return addr.type != ADDRESS_REG_WB;
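/* PRFM has no writeback forms: prfm pldl1keep, [x0, #8] is valid, but there
   is no pre- or post-indexed variant.  */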
10784 bool
10785 aarch64_symbolic_address_p (rtx x)
10787 poly_int64 offset;
10788 x = strip_offset_and_salt (x, &offset);
10789 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10792 /* Classify the base of symbolic expression X. */
10794 enum aarch64_symbol_type
10795 aarch64_classify_symbolic_expression (rtx x)
10797 rtx offset;
10799 split_const (x, &x, &offset);
10800 return aarch64_classify_symbol (x, INTVAL (offset));
10804 /* Return TRUE if X is a legitimate address for accessing memory in
10805 mode MODE. */
10806 static bool
10807 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
10808 code_helper = ERROR_MARK)
10810 struct aarch64_address_info addr;
10812 return aarch64_classify_address (&addr, x, mode, strict_p);
10815 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10816 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10817 bool
10818 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10819 aarch64_addr_query_type type)
10821 struct aarch64_address_info addr;
10823 return aarch64_classify_address (&addr, x, mode, strict_p, type);
10826 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10828 static bool
10829 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10830 poly_int64 orig_offset,
10831 machine_mode mode)
10833 HOST_WIDE_INT size;
10834 if (GET_MODE_SIZE (mode).is_constant (&size))
10836 HOST_WIDE_INT const_offset, second_offset;
10838 /* A general SVE offset is A * VQ + B. Remove the A component from
10839 coefficient 0 in order to get the constant B. */
10840 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10842 /* Split an out-of-range address displacement into a base and
10843 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10844 range otherwise to increase opportunities for sharing the same base
10845 address between accesses of different sizes. Unaligned accesses use the signed
10846 9-bit range, TImode/TFmode/TDmode use the intersection of signed
10847 scaled 7-bit and signed 9-bit offset. */
10848 if (mode == TImode || mode == TFmode || mode == TDmode)
10849 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10850 else if ((const_offset & (size - 1)) != 0)
10851 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10852 else
10853 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10855 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10856 return false;
10858 /* Split the offset into second_offset and the rest. */
10859 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10860 *offset2 = gen_int_mode (second_offset, Pmode);
10861 return true;
10863 else
10865 /* Get the mode we should use as the basis of the range. For structure
10866 modes this is the mode of one vector. */
10867 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10868 machine_mode step_mode
10869 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10871 /* Get the "mul vl" multiplier we'd like to use. */
10872 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10873 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10874 if (vec_flags & VEC_SVE_DATA)
10875 /* LDR supports a 9-bit range, but the move patterns for
10876 structure modes require all vectors to be in range of the
10877 same base. The simplest way of accommodating that while still
10878 promoting reuse of anchor points between different modes is
10879 to use an 8-bit range unconditionally. */
10880 vnum = ((vnum + 128) & 255) - 128;
10881 else
10882 /* Predicates are only handled singly, so we might as well use
10883 the full range. */
10884 vnum = ((vnum + 256) & 511) - 256;
10885 if (vnum == 0)
10886 return false;
10888 /* Convert the "mul vl" multiplier into a byte offset. */
10889 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10890 if (known_eq (second_offset, orig_offset))
10891 return false;
10893 /* Split the offset into second_offset and the rest. */
10894 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10895 *offset2 = gen_int_mode (second_offset, Pmode);
10896 return true;
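/* As an example of the constant-size path above, a DImode access at constant
   offset 0x10010 is split into an anchor offset of 0x10000 plus a residual
   offset of 0x10 (0x10010 & 0x3ffc), so that the anchor can be shared with
   other nearby accesses.  */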
10900 /* Return the binary representation of floating point constant VALUE in INTVAL.
10901 If the value cannot be converted, return false without setting INTVAL.
10902 The conversion is done in the given MODE. */
10903 bool
10904 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10907 /* We make a general exception for 0. */
10908 if (aarch64_float_const_zero_rtx_p (value))
10910 *intval = 0;
10911 return true;
10914 scalar_float_mode mode;
10915 if (!CONST_DOUBLE_P (value)
10916 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
10917 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10918 /* Only support up to DF mode. */
10919 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10920 return false;
10922 unsigned HOST_WIDE_INT ival = 0;
10924 long res[2];
10925 real_to_target (res,
10926 CONST_DOUBLE_REAL_VALUE (value),
10927 REAL_MODE_FORMAT (mode));
10929 if (mode == DFmode || mode == DDmode)
10931 int order = BYTES_BIG_ENDIAN ? 1 : 0;
10932 ival = zext_hwi (res[order], 32);
10933 ival |= (zext_hwi (res[1 - order], 32) << 32);
10935 else
10936 ival = zext_hwi (res[0], 32);
10938 *intval = ival;
10939 return true;
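/* For example, the DFmode constant 1.0 produces 0x3ff0000000000000 and the
   SFmode constant 1.0 produces 0x3f800000.  */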
10942 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10943 single MOV(+MOVK) followed by an FMOV. */
10944 bool
10945 aarch64_float_const_rtx_p (rtx x)
10947 machine_mode mode = GET_MODE (x);
10948 if (mode == VOIDmode)
10949 return false;
10951 /* Determine whether it's cheaper to write float constants as
10952 mov/movk pairs over ldr/adrp pairs. */
10953 unsigned HOST_WIDE_INT ival;
10955 if (CONST_DOUBLE_P (x)
10956 && SCALAR_FLOAT_MODE_P (mode)
10957 && aarch64_reinterpret_float_as_int (x, &ival))
10959 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
10960 int num_instr = aarch64_internal_mov_immediate
10961 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10962 return num_instr < 3;
10965 return false;
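/* For example, double 1.0 (0x3ff0000000000000) needs only a single MOVZ of
   the top 16 bits followed by an FMOV, which is cheaper than an ADRP/LDR
   from the constant pool.  */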
10968 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
10969 Floating Point). */
10970 bool
10971 aarch64_float_const_zero_rtx_p (rtx x)
10973 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
10974 zr as our callers expect, so no need to check the actual
10975 value if X is of Decimal Floating Point type. */
10976 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
10977 return false;
10979 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
10980 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
10981 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
10984 /* Return true if X is any kind of constant zero rtx. */
10986 bool
10987 aarch64_const_zero_rtx_p (rtx x)
10989 return (x == CONST0_RTX (GET_MODE (x))
10990 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
10993 /* Return TRUE if rtx X is an immediate constant that fits in a single
10994 MOVI immediate operation. */
10995 bool
10996 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
10998 if (!TARGET_SIMD)
10999 return false;
11001 machine_mode vmode;
11002 scalar_int_mode imode;
11003 unsigned HOST_WIDE_INT ival;
11005 if (CONST_DOUBLE_P (x)
11006 && SCALAR_FLOAT_MODE_P (mode))
11008 if (!aarch64_reinterpret_float_as_int (x, &ival))
11009 return false;
11011 /* We make a general exception for 0. */
11012 if (aarch64_float_const_zero_rtx_p (x))
11013 return true;
11015 imode = int_mode_for_mode (mode).require ();
11017 else if (CONST_INT_P (x)
11018 && is_a <scalar_int_mode> (mode, &imode))
11019 ival = INTVAL (x);
11020 else
11021 return false;
11023 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
11024 use a 128-bit vector mode. */
11025 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11027 vmode = aarch64_simd_container_mode (imode, width);
11028 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11030 return aarch64_simd_valid_immediate (v_op, NULL);
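/* For example, the DImode value 0x00ff00ff00ff00ff is accepted because MOVI
   has a 64-bit "bytemask" immediate form in which every byte is either 0x00
   or 0xff; an arbitrary 64-bit constant generally is not.  */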
11034 /* Return the fixed registers used for condition codes. */
11036 static bool
11037 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11039 *p1 = CC_REGNUM;
11040 *p2 = INVALID_REGNUM;
11041 return true;
11044 /* Return a fresh memory reference to the current function's TPIDR2 block,
11045 creating a block if necessary. */
11047 static rtx
11048 aarch64_get_tpidr2_block ()
11050 if (!cfun->machine->tpidr2_block)
11051 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
11052 boundary. */
11053 cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
11054 return copy_rtx (cfun->machine->tpidr2_block);
11057 /* Return a fresh register that points to the current function's
11058 TPIDR2 block, creating a block if necessary. */
11060 static rtx
11061 aarch64_get_tpidr2_ptr ()
11063 rtx block = aarch64_get_tpidr2_block ();
11064 return force_reg (Pmode, XEXP (block, 0));
11067 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
11068 current function's TPIDR2 block. */
11070 static void
11071 aarch64_init_tpidr2_block ()
11073 rtx block = aarch64_get_tpidr2_block ();
11075 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
11076 rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
11077 rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
11078 rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
11079 svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
11080 rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
11081 BITS_PER_UNIT, -1, true);
11082 za_save_buffer = force_reg (Pmode, za_save_buffer);
11083 cfun->machine->za_save_buffer = za_save_buffer;
11085 /* The first word of the block points to the save buffer and the second
11086 word is the number of ZA slices to save. */
11087 rtx block_0 = adjust_address (block, DImode, 0);
11088 emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
11090 if (!memory_operand (block, V16QImode))
11091 block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
11092 emit_insn (gen_aarch64_setup_local_tpidr2 (block));
11095 /* Restore the contents of ZA from the lazy save buffer, given that
11096 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
11097 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11099 void
11100 aarch64_restore_za (rtx tpidr2_block)
11102 emit_insn (gen_aarch64_smstart_za ());
11103 if (REGNO (tpidr2_block) != R0_REGNUM)
11104 emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
11105 emit_insn (gen_aarch64_tpidr2_restore ());
11108 /* Return the ZT0 save buffer, creating one if necessary. */
11110 static rtx
11111 aarch64_get_zt0_save_buffer ()
11113 if (!cfun->machine->zt0_save_buffer)
11114 cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
11115 return cfun->machine->zt0_save_buffer;
11118 /* Save ZT0 to the current function's save buffer. */
11120 static void
11121 aarch64_save_zt0 ()
11123 rtx mem = aarch64_get_zt0_save_buffer ();
11124 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11125 emit_insn (gen_aarch64_sme_str_zt0 (mem));
11128 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11129 is true if the load is happening after a call to a private-ZA function,
11130 false if it can be treated as a normal load. */
11132 static void
11133 aarch64_restore_zt0 (bool from_lazy_save_p)
11135 rtx mem = aarch64_get_zt0_save_buffer ();
11136 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11137 emit_insn (from_lazy_save_p
11138 ? gen_aarch64_restore_zt0 (mem)
11139 : gen_aarch64_sme_ldr_zt0 (mem));
11142 /* Implement TARGET_START_CALL_ARGS. */
11144 static void
11145 aarch64_start_call_args (cumulative_args_t ca_v)
11147 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11149 if (!TARGET_SME && (ca->isa_mode & AARCH64_FL_SM_ON))
11151 error ("calling a streaming function requires the ISA extension %qs",
11152 "sme");
11153 inform (input_location, "you can enable %qs using the command-line"
11154 " option %<-march%>, or by using the %<target%>"
11155 " attribute or pragma", "sme");
11158 if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11159 && !aarch64_cfun_has_state ("za"))
11160 error ("call to a function that shares %qs state from a function"
11161 " that has no %qs state", "za", "za");
11162 else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11163 && !aarch64_cfun_has_state ("zt0"))
11164 error ("call to a function that shares %qs state from a function"
11165 " that has no %qs state", "zt0", "zt0");
11166 else if (!TARGET_ZA && (ca->isa_mode & AARCH64_FL_ZA_ON))
11167 error ("call to a function that shares SME state from a function"
11168 " that has no SME state");
11170 /* If this is a call to a private ZA function, emit a marker to
11171 indicate where any necessary set-up code could be inserted.
11172 The code itself is inserted by the mode-switching pass. */
11173 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11174 emit_insn (gen_aarch64_start_private_za_call ());
11176 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11177 save and restore ZT0 around the call. */
11178 if (aarch64_cfun_has_state ("zt0")
11179 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11180 && ca->shared_zt0_flags == 0)
11181 aarch64_save_zt0 ();
11184 /* This function is used by the call expanders of the machine description.
11185 RESULT is the register in which the result is returned. It's NULL for
11186 "call" and "sibcall".
11187 MEM is the location of the function call.
11188 COOKIE is either:
11189 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11190 - a PARALLEL that contains such a const_int as its first element.
11191 The second element is a PARALLEL that lists all the argument
11192 registers that need to be saved and restored around a change
11193 in PSTATE.SM, or const0_rtx if no such switch is needed.
11194 The third and fourth elements are const_ints that contain the
11195 sharing flags for ZA and ZT0 respectively.
11196 SIBCALL indicates whether this function call is a normal call or a sibling
11197 call; a different pattern is generated accordingly. */
11199 void
11200 aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
11202 rtx call, callee, tmp;
11203 rtvec vec;
11204 machine_mode mode;
11206 rtx callee_abi = cookie;
11207 rtx sme_mode_switch_args = const0_rtx;
11208 unsigned int shared_za_flags = 0;
11209 unsigned int shared_zt0_flags = 0;
11210 if (GET_CODE (cookie) == PARALLEL)
11212 callee_abi = XVECEXP (cookie, 0, 0);
11213 sme_mode_switch_args = XVECEXP (cookie, 0, 1);
11214 shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
11215 shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
11218 gcc_assert (CONST_INT_P (callee_abi));
11219 auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
11221 if (aarch64_cfun_has_state ("za")
11222 && (callee_isa_mode & AARCH64_FL_ZA_ON)
11223 && !shared_za_flags)
11225 sorry ("call to a function that shares state other than %qs"
11226 " from a function that has %qs state", "za", "za");
11227 inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
11228 " callee preserves ZA");
11231 gcc_assert (MEM_P (mem));
11232 callee = XEXP (mem, 0);
11233 mode = GET_MODE (callee);
11234 gcc_assert (mode == Pmode);
11236 /* Decide if we should generate indirect calls by loading the
11237 address of the callee into a register before performing
11238 the branch-and-link. */
11239 if (SYMBOL_REF_P (callee)
11240 ? (aarch64_is_long_call_p (callee)
11241 || aarch64_is_noplt_call_p (callee))
11242 : !REG_P (callee))
11243 XEXP (mem, 0) = force_reg (mode, callee);
11245 /* Accumulate the return values, including state that is shared via
11246 attributes. */
11247 auto_vec<rtx, 8> return_values;
11248 if (result)
11250 if (GET_CODE (result) == PARALLEL)
11251 for (int i = 0; i < XVECLEN (result, 0); ++i)
11252 return_values.safe_push (XVECEXP (result, 0, i));
11253 else
11254 return_values.safe_push (result);
11256 unsigned int orig_num_return_values = return_values.length ();
11257 if (shared_za_flags & AARCH64_STATE_OUT)
11258 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11259 /* When calling private-ZA functions from functions with ZA state,
11260 we want to know whether the call committed a lazy save. */
11261 if (TARGET_ZA && !shared_za_flags)
11262 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11263 if (shared_zt0_flags & AARCH64_STATE_OUT)
11264 return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
11266 /* Create the new return value, if necessary. */
11267 if (orig_num_return_values != return_values.length ())
11269 if (return_values.length () == 1)
11270 result = return_values[0];
11271 else
11273 for (rtx &x : return_values)
11274 if (GET_CODE (x) != EXPR_LIST)
11275 x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
11276 rtvec v = gen_rtvec_v (return_values.length (),
11277 return_values.address ());
11278 result = gen_rtx_PARALLEL (VOIDmode, v);
11282 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11284 if (result != NULL_RTX)
11285 call = gen_rtx_SET (result, call);
11287 if (sibcall)
11288 tmp = ret_rtx;
11289 else
11290 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11292 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11293 UNSPEC_CALLEE_ABI);
11295 vec = gen_rtvec (3, call, callee_abi, tmp);
11296 call = gen_rtx_PARALLEL (VOIDmode, vec);
11298 auto call_insn = aarch64_emit_call_insn (call);
11300 /* Check whether the call requires a change to PSTATE.SM. We can't
11301 emit the instructions to change PSTATE.SM yet, since they involve
11302 a change in vector length and a change in instruction set, which
11303 cannot be represented in RTL.
11305 For now, just record which registers will be clobbered and used
11306 by the changes to PSTATE.SM. */
11307 if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
11309 aarch64_sme_mode_switch_regs args_switch;
11310 if (sme_mode_switch_args != const0_rtx)
11312 unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
11313 for (unsigned int i = 0; i < num_args; ++i)
11315 rtx x = XVECEXP (sme_mode_switch_args, 0, i);
11316 args_switch.add_reg (GET_MODE (x), REGNO (x));
11320 aarch64_sme_mode_switch_regs result_switch;
11321 if (result)
11322 result_switch.add_call_result (call_insn);
11324 unsigned int num_gprs = MAX (args_switch.num_gprs (),
11325 result_switch.num_gprs ());
11326 for (unsigned int i = 0; i < num_gprs; ++i)
11327 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11328 gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
11330 for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
11331 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11332 gen_rtx_REG (V4x16QImode, regno));
11334 for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
11335 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11336 gen_rtx_REG (VNx16BImode, regno));
11338 /* Ensure that the VG save slot has been initialized. Also emit
11339 an instruction to model the effect of the temporary clobber
11340 of VG, so that the prologue/epilogue pass sees the need to
11341 save the old value. */
11342 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11343 gen_rtx_REG (DImode, VG_REGNUM));
11344 emit_insn_before (gen_aarch64_update_vg (), call_insn);
11346 cfun->machine->call_switches_pstate_sm = true;
11349 /* Add any ZA-related information.
11351 ZA_REGNUM represents the current function's ZA state, rather than
11352 the contents of the ZA register itself. We ensure that the function's
11353 ZA state is preserved by private-ZA call sequences, so the call itself
11354 does not use or clobber ZA_REGNUM. The same thing applies to
11355 ZT0_REGNUM. */
11356 if (TARGET_ZA)
11358 /* The callee requires ZA to be active if the callee is shared-ZA,
11359 otherwise it requires ZA to be dormant or off. The state of ZA is
11360 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11361 and ZA_SAVED_REGNUM. */
11362 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11363 gen_rtx_REG (DImode, SME_STATE_REGNUM));
11364 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11365 gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
11366 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11367 gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11369 /* Keep the aarch64_start/end_private_za_call markers live. */
11370 if (!(callee_isa_mode & AARCH64_FL_ZA_ON))
11371 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11372 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
11374 /* If the callee is a shared-ZA function, record whether it uses the
11375 current value of ZA and ZT0. */
11376 if (shared_za_flags & AARCH64_STATE_IN)
11377 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11378 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11380 if (shared_zt0_flags & AARCH64_STATE_IN)
11381 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11382 gen_rtx_REG (V8DImode, ZT0_REGNUM));
11386 /* Implement TARGET_END_CALL_ARGS. */
11388 static void
11389 aarch64_end_call_args (cumulative_args_t ca_v)
11391 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11393 /* If this is a call to a private ZA function, emit a marker to
11394 indicate where any necessary restoration code could be inserted.
11395 The code itself is inserted by the mode-switching pass. */
11396 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11397 emit_insn (gen_aarch64_end_private_za_call ());
11399 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11400 save and restore ZT0 around the call. */
11401 if (aarch64_cfun_has_state ("zt0")
11402 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11403 && ca->shared_zt0_flags == 0)
11404 aarch64_restore_zt0 (false);
11407 /* Emit call insn with PAT and do aarch64-specific handling. */
11409 rtx_call_insn *
11410 aarch64_emit_call_insn (rtx pat)
11412 auto insn = emit_call_insn (pat);
11414 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11415 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11416 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11417 return as_a<rtx_call_insn *> (insn);
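/* IP0 and IP1 (x16 and x17) are the intra-procedure-call temporaries of the
   AAPCS64; linker-inserted veneers and PLT stubs are allowed to clobber them,
   so every call is modelled as clobbering both.  */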
11420 machine_mode
11421 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11423 machine_mode mode_x = GET_MODE (x);
11424 rtx_code code_x = GET_CODE (x);
11426 /* All floating point compares return CCFP if it is an equality
11427 comparison, and CCFPE otherwise. */
11428 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11430 switch (code)
11432 case EQ:
11433 case NE:
11434 case UNORDERED:
11435 case ORDERED:
11436 case UNLT:
11437 case UNLE:
11438 case UNGT:
11439 case UNGE:
11440 case UNEQ:
11441 return CCFPmode;
11443 case LT:
11444 case LE:
11445 case GT:
11446 case GE:
11447 case LTGT:
11448 return CCFPEmode;
11450 default:
11451 gcc_unreachable ();
11455 /* Equality comparisons of short modes against zero can be performed
11456 using the TST instruction with the appropriate bitmask. */
11457 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11458 && (code == EQ || code == NE)
11459 && (mode_x == HImode || mode_x == QImode))
11460 return CC_Zmode;
11462 /* Similarly, comparisons of zero_extends from shorter modes can
11463 be performed using an ANDS with an immediate mask. */
11464 if (y == const0_rtx && code_x == ZERO_EXTEND
11465 && (mode_x == SImode || mode_x == DImode)
11466 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11467 && (code == EQ || code == NE))
11468 return CC_Zmode;
11470 /* Zero extracts support equality comparisons. */
11471 if ((mode_x == SImode || mode_x == DImode)
11472 && y == const0_rtx
11473 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11474 && CONST_INT_P (XEXP (x, 2)))
11475 && (code == EQ || code == NE))
11476 return CC_Zmode;
11478 /* ANDS/BICS/TST support equality and all signed comparisons. */
11479 if ((mode_x == SImode || mode_x == DImode)
11480 && y == const0_rtx
11481 && (code_x == AND)
11482 && (code == EQ || code == NE || code == LT || code == GE
11483 || code == GT || code == LE))
11484 return CC_NZVmode;
11486 /* ADDS/SUBS correctly set N and Z flags. */
11487 if ((mode_x == SImode || mode_x == DImode)
11488 && y == const0_rtx
11489 && (code == EQ || code == NE || code == LT || code == GE)
11490 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11491 return CC_NZmode;
11493 /* A compare with a shifted operand. Because of canonicalization,
11494 the comparison will have to be swapped when we emit the assembly
11495 code. */
11496 if ((mode_x == SImode || mode_x == DImode)
11497 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11498 && (code_x == ASHIFT || code_x == ASHIFTRT
11499 || code_x == LSHIFTRT
11500 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11501 return CC_SWPmode;
11503 /* Similarly for a negated operand, but we can only do this for
11504 equalities. */
11505 if ((mode_x == SImode || mode_x == DImode)
11506 && (REG_P (y) || SUBREG_P (y))
11507 && (code == EQ || code == NE)
11508 && code_x == NEG)
11509 return CC_Zmode;
11511 /* A test for unsigned overflow from an addition. */
11512 if ((mode_x == DImode || mode_x == TImode)
11513 && (code == LTU || code == GEU)
11514 && code_x == PLUS
11515 && rtx_equal_p (XEXP (x, 0), y))
11516 return CC_Cmode;
11518 /* A test for unsigned overflow from an add with carry. */
11519 if ((mode_x == DImode || mode_x == TImode)
11520 && (code == LTU || code == GEU)
11521 && code_x == PLUS
11522 && CONST_SCALAR_INT_P (y)
11523 && (rtx_mode_t (y, mode_x)
11524 == (wi::shwi (1, mode_x)
11525 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11526 return CC_ADCmode;
11528 /* A test for signed overflow. */
11529 if ((mode_x == DImode || mode_x == TImode)
11530 && code == NE
11531 && code_x == PLUS
11532 && GET_CODE (y) == SIGN_EXTEND)
11533 return CC_Vmode;
11535 /* For everything else, return CCmode. */
11536 return CCmode;
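/* As an example of CC_SWPmode: comparing (ashift x 2) with y is emitted as
   cmp y, x, lsl #2, because only the second source operand of CMP can be
   shifted, so the operands (and later the condition) must be swapped.  */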
11539 static int
11540 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11543 aarch64_get_condition_code (rtx x)
11545 machine_mode mode = GET_MODE (XEXP (x, 0));
11546 enum rtx_code comp_code = GET_CODE (x);
11548 if (GET_MODE_CLASS (mode) != MODE_CC)
11549 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11550 return aarch64_get_condition_code_1 (mode, comp_code);
11553 static int
11554 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11556 switch (mode)
11558 case E_CCFPmode:
11559 case E_CCFPEmode:
11560 switch (comp_code)
11562 case GE: return AARCH64_GE;
11563 case GT: return AARCH64_GT;
11564 case LE: return AARCH64_LS;
11565 case LT: return AARCH64_MI;
11566 case NE: return AARCH64_NE;
11567 case EQ: return AARCH64_EQ;
11568 case ORDERED: return AARCH64_VC;
11569 case UNORDERED: return AARCH64_VS;
11570 case UNLT: return AARCH64_LT;
11571 case UNLE: return AARCH64_LE;
11572 case UNGT: return AARCH64_HI;
11573 case UNGE: return AARCH64_PL;
11574 default: return -1;
11576 break;
11578 case E_CCmode:
11579 switch (comp_code)
11581 case NE: return AARCH64_NE;
11582 case EQ: return AARCH64_EQ;
11583 case GE: return AARCH64_GE;
11584 case GT: return AARCH64_GT;
11585 case LE: return AARCH64_LE;
11586 case LT: return AARCH64_LT;
11587 case GEU: return AARCH64_CS;
11588 case GTU: return AARCH64_HI;
11589 case LEU: return AARCH64_LS;
11590 case LTU: return AARCH64_CC;
11591 default: return -1;
11593 break;
11595 case E_CC_SWPmode:
11596 switch (comp_code)
11598 case NE: return AARCH64_NE;
11599 case EQ: return AARCH64_EQ;
11600 case GE: return AARCH64_LE;
11601 case GT: return AARCH64_LT;
11602 case LE: return AARCH64_GE;
11603 case LT: return AARCH64_GT;
11604 case GEU: return AARCH64_LS;
11605 case GTU: return AARCH64_CC;
11606 case LEU: return AARCH64_CS;
11607 case LTU: return AARCH64_HI;
11608 default: return -1;
11610 break;
11612 case E_CC_NZCmode:
11613 switch (comp_code)
11615 case NE: return AARCH64_NE; /* = any */
11616 case EQ: return AARCH64_EQ; /* = none */
11617 case GE: return AARCH64_PL; /* = nfrst */
11618 case LT: return AARCH64_MI; /* = first */
11619 case GEU: return AARCH64_CS; /* = nlast */
11620 case GTU: return AARCH64_HI; /* = pmore */
11621 case LEU: return AARCH64_LS; /* = plast */
11622 case LTU: return AARCH64_CC; /* = last */
11623 default: return -1;
11625 break;
11627 case E_CC_NZVmode:
11628 switch (comp_code)
11630 case NE: return AARCH64_NE;
11631 case EQ: return AARCH64_EQ;
11632 case GE: return AARCH64_PL;
11633 case LT: return AARCH64_MI;
11634 case GT: return AARCH64_GT;
11635 case LE: return AARCH64_LE;
11636 default: return -1;
11638 break;
11640 case E_CC_NZmode:
11641 switch (comp_code)
11643 case NE: return AARCH64_NE;
11644 case EQ: return AARCH64_EQ;
11645 case GE: return AARCH64_PL;
11646 case LT: return AARCH64_MI;
11647 default: return -1;
11649 break;
11651 case E_CC_Zmode:
11652 switch (comp_code)
11654 case NE: return AARCH64_NE;
11655 case EQ: return AARCH64_EQ;
11656 default: return -1;
11658 break;
11660 case E_CC_Cmode:
11661 switch (comp_code)
11663 case LTU: return AARCH64_CS;
11664 case GEU: return AARCH64_CC;
11665 default: return -1;
11667 break;
11669 case E_CC_ADCmode:
11670 switch (comp_code)
11672 case GEU: return AARCH64_CS;
11673 case LTU: return AARCH64_CC;
11674 default: return -1;
11676 break;
11678 case E_CC_Vmode:
11679 switch (comp_code)
11681 case NE: return AARCH64_VS;
11682 case EQ: return AARCH64_VC;
11683 default: return -1;
11685 break;
11687 default:
11688 return -1;
11691 return -1;
11694 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
11695 duplicate of such constants. If so, store in RET_WI the wide_int
11696 representation of the constant paired with the inner mode of the vector mode
11697 or MODE for scalar X constants. If MODE is not provided then TImode is
11698 used. */
11700 static bool
11701 aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
11702 scalar_mode mode = TImode)
11704 rtx elt = unwrap_const_vec_duplicate (x);
11705 if (!CONST_SCALAR_INT_P (elt))
11706 return false;
11707 scalar_mode smode
11708 = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
11709 *ret_wi = rtx_mode_t (elt, smode);
11710 return true;
11713 /* Return true if X is a scalar or a constant vector of integer
11714 immediates that represent the rounding constant used in the fixed-point
11715 arithmetic instructions.
11716 The accepted form of the constant is (1 << (C - 1)) where C is in the range
11717 [1, MODE_WIDTH/2]. */
11719 bool
11720 aarch64_rnd_imm_p (rtx x)
11722 wide_int rnd_cst;
11723 if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
11724 return false;
11725 int log2 = wi::exact_log2 (rnd_cst);
11726 if (log2 < 0)
11727 return false;
11728 return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
11731 /* Return true if RND is a constant vector of integer rounding constants
11732 corresponding to a constant vector of shifts, SHIFT.
11733 The relationship should be RND == (1 << (SHIFT - 1)). */
11735 bool
11736 aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
11738 wide_int rnd_cst, shft_cst;
11739 if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
11740 || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
11741 return false;
11743 return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
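/* For example, a rounding shift right by 4 uses the rounding constant
   1 << 3 == 8, i.e. half of the shift quantum, so that the result is rounded
   to nearest rather than truncated.  */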
11746 bool
11747 aarch64_const_vec_all_same_in_range_p (rtx x,
11748 HOST_WIDE_INT minval,
11749 HOST_WIDE_INT maxval)
11751 rtx elt;
11752 return (const_vec_duplicate_p (x, &elt)
11753 && CONST_INT_P (elt)
11754 && IN_RANGE (INTVAL (elt), minval, maxval));
11757 /* Some constants can't be made using normal mov instructions in Advanced SIMD
11758 but we can still create them in various ways. If the constant VAL can be
11759 created using one of these alternate methods, return true and, if TARGET
11760 is not NULL, additionally set TARGET to the rtx for the sequence.
11761 Otherwise return false. */
11763 bool
11764 aarch64_maybe_generate_simd_constant (rtx target, rtx val, machine_mode mode)
11766 wide_int wval;
11767 auto smode = GET_MODE_INNER (mode);
11768 if (!aarch64_extract_vec_duplicate_wide_int (val, &wval, smode))
11769 return false;
11771 /* For Advanced SIMD we can create an integer with only the top bit set
11772 using fneg (0.0f). */
11773 if (TARGET_SIMD
11774 && !TARGET_SVE
11775 && smode == DImode
11776 && wi::only_sign_bit_p (wval))
11778 if (!target)
11779 return true;
11781 /* Use the same base type as aarch64_gen_shareable_zero. */
11782 rtx zero = CONST0_RTX (V4SImode);
11783 emit_move_insn (lowpart_subreg (V4SImode, target, mode), zero);
11784 rtx neg = lowpart_subreg (V2DFmode, target, mode);
11785 emit_insn (gen_negv2df2 (neg, copy_rtx (neg)));
11786 return true;
11789 return false;
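/* The sequence emitted above is effectively:
     movi v0.4s, #0
     fneg v0.2d, v0.2d
   Negating +0.0 flips only the sign bit, leaving 0x8000000000000000 in each
   64-bit lane.  */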
11792 /* Check if the value in VAL with mode MODE can be created using special
11793 instruction sequences. */
11795 bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
11797 return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
11800 bool
11801 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11803 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11806 /* Return true if VEC is a constant in which every element is in the range
11807 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11809 static bool
11810 aarch64_const_vec_all_in_range_p (rtx vec,
11811 HOST_WIDE_INT minval,
11812 HOST_WIDE_INT maxval)
11814 if (!CONST_VECTOR_P (vec)
11815 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11816 return false;
11818 int nunits;
11819 if (!CONST_VECTOR_STEPPED_P (vec))
11820 nunits = const_vector_encoded_nelts (vec);
11821 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11822 return false;
11824 for (int i = 0; i < nunits; i++)
11826 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11827 if (!CONST_INT_P (vec_elem)
11828 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11829 return false;
11831 return true;
11834 /* N Z C V. */
11835 #define AARCH64_CC_V 1
11836 #define AARCH64_CC_C (1 << 1)
11837 #define AARCH64_CC_Z (1 << 2)
11838 #define AARCH64_CC_N (1 << 3)
11840 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11841 static const int aarch64_nzcv_codes[] =
11843 0, /* EQ, Z == 1. */
11844 AARCH64_CC_Z, /* NE, Z == 0. */
11845 0, /* CS, C == 1. */
11846 AARCH64_CC_C, /* CC, C == 0. */
11847 0, /* MI, N == 1. */
11848 AARCH64_CC_N, /* PL, N == 0. */
11849 0, /* VS, V == 1. */
11850 AARCH64_CC_V, /* VC, V == 0. */
11851 0, /* HI, C == 1 && Z == 0. */
11852 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11853 AARCH64_CC_V, /* GE, N == V. */
11854 0, /* LT, N != V. */
11855 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11856 0, /* LE, !(Z == 0 && N == V). */
11857 0, /* AL, Any. */
11858 0 /* NV, Any. */
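/* Each entry is the NZCV immediate that makes the corresponding condition
   false; in an && chain, a conditional compare (CCMP) whose governing
   condition fails sets the flags to this value, so the chained condition
   also fails.  */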
11861 /* Print floating-point vector immediate operand X to F, negating it
11862 first if NEGATE is true. Return true on success, false if it isn't
11863 a constant we can handle. */
11865 static bool
11866 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11868 rtx elt;
11870 if (!const_vec_duplicate_p (x, &elt))
11871 return false;
11873 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11874 if (negate)
11875 r = real_value_negate (&r);
11877 /* Handle the SVE single-bit immediates specially, since they have a
11878 fixed form in the assembly syntax. */
11879 if (real_equal (&r, &dconst0))
11880 asm_fprintf (f, "0.0");
11881 else if (real_equal (&r, &dconst2))
11882 asm_fprintf (f, "2.0");
11883 else if (real_equal (&r, &dconst1))
11884 asm_fprintf (f, "1.0");
11885 else if (real_equal (&r, &dconsthalf))
11886 asm_fprintf (f, "0.5");
11887 else
11889 const int buf_size = 20;
11890 char float_buf[buf_size] = {'\0'};
11891 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11892 1, GET_MODE (elt));
11893 asm_fprintf (f, "%s", float_buf);
11896 return true;
11899 /* Return the equivalent letter for size. */
11900 static char
11901 sizetochar (int size)
11903 switch (size)
11905 case 64: return 'd';
11906 case 32: return 's';
11907 case 16: return 'h';
11908 case 8 : return 'b';
11909 default: gcc_unreachable ();
11913 /* Print operand X to file F in a target specific manner according to CODE.
11914 The acceptable formatting commands given by CODE are:
11915 'c': An integer or symbol address without a preceding #
11916 sign.
11917 'C': Take the duplicated element in a vector constant
11918 and print it in hex.
11919 'D': Take the duplicated element in a vector constant
11920 and print it as an unsigned integer, in decimal.
11921 'e': Print the sign/zero-extend size as a character 8->b,
11922 16->h, 32->w. Can also be used for masks:
11923 0xff->b, 0xffff->h, 0xffffffff->w.
11924 'I': If the operand is a duplicated vector constant,
11925 replace it with the duplicated scalar. If the
11926 operand is then a floating-point constant, replace
11927 it with the integer bit representation. Print the
11928 transformed constant as a signed decimal number.
11929 'p': Prints N such that 2^N == X (X must be a power of 2 and
11930 a const_int).
11931 'P': Print the number of non-zero bits in X (a const_int).
11932 'H': Print the higher numbered register of a pair (TImode)
11933 of regs.
11934 'm': Print a condition (eq, ne, etc).
11935 'M': Same as 'm', but invert condition.
11936 'N': Take the duplicated element in a vector constant
11937 and print the negative of it in decimal.
11938 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11939 'Z': Same for SVE registers. ('z' was already taken.)
11940 Note that it is not necessary to use %Z for operands
11941 that have SVE modes. The convention is to use %Z
11942 only for non-SVE (or potentially non-SVE) modes.
11943 'S/T/U/V': Print a FP/SIMD register name for a register list.
11944 The register printed is the FP/SIMD register name
11945 of X + 0/1/2/3 for S/T/U/V.
11946 'R': Print a scalar Integer/FP/SIMD register name + 1.
11947 'X': Print bottom 16 bits of integer constant in hex.
11948 'w/x': Print a general register name or the zero register
11949 (32-bit or 64-bit).
11950 '0': Print a normal operand, if it's a general register,
11951 then we assume DImode.
11952 'k': Print NZCV for conditional compare instructions.
11953 'K': Print a predicate register as pn<N> rather than p<N>.
11954 'A': Output address constant representing the first
11955 argument of X, specifying a relocation offset
11956 if appropriate.
11957 'L': Output constant address specified by X
11958 with a relocation offset if appropriate.
11959 'G': Prints address of X, specifying a PC relative
11960 relocation mode if appropriate.
11961 'y': Output address of LDP or STP - this is used for
11962 some LDP/STPs which don't use a PARALLEL in their
11963 pattern (so the mode needs to be adjusted).
11964 'z': Output address of a typical LDP or STP. */
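/* For example, an output template such as "add %w0, %w1, %w2" prints the
   32-bit register names for operands 0-2, whereas "%x0" would print the
   64-bit name of the same register.  */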
11966 static void
11967 aarch64_print_operand (FILE *f, rtx x, int code)
11969 rtx elt;
11970 switch (code)
11972 case 'c':
11973 if (CONST_INT_P (x))
11974 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11975 else
11977 poly_int64 offset;
11978 rtx base = strip_offset_and_salt (x, &offset);
11979 if (SYMBOL_REF_P (base))
11980 output_addr_const (f, x);
11981 else
11982 output_operand_lossage ("unsupported operand for code '%c'", code);
11984 break;
11986 case 'e':
11988 x = unwrap_const_vec_duplicate (x);
11989 if (!CONST_INT_P (x))
11991 output_operand_lossage ("invalid operand for '%%%c'", code);
11992 return;
11995 HOST_WIDE_INT val = INTVAL (x);
11996 if ((val & ~7) == 8 || val == 0xff)
11997 fputc ('b', f);
11998 else if ((val & ~7) == 16 || val == 0xffff)
11999 fputc ('h', f);
12000 else if ((val & ~7) == 32 || val == 0xffffffff)
12001 fputc ('w', f);
12002 else
12004 output_operand_lossage ("invalid operand for '%%%c'", code);
12005 return;
12008 break;
12010 case 'p':
12012 int n;
12014 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
12016 output_operand_lossage ("invalid operand for '%%%c'", code);
12017 return;
12020 asm_fprintf (f, "%d", n);
12022 break;
12024 case 'P':
12025 if (!CONST_INT_P (x))
12027 output_operand_lossage ("invalid operand for '%%%c'", code);
12028 return;
12031 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
12032 break;
12034 case 'H':
12035 if (x == const0_rtx)
12037 asm_fprintf (f, "xzr");
12038 break;
12041 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
12043 output_operand_lossage ("invalid operand for '%%%c'", code);
12044 return;
12047 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
12048 break;
12050 case 'I':
12052 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
12053 if (CONST_INT_P (x))
12054 asm_fprintf (f, "%wd", INTVAL (x));
12055 else
12057 output_operand_lossage ("invalid operand for '%%%c'", code);
12058 return;
12060 break;
12063 case 'M':
12064 case 'm':
12066 int cond_code;
12067 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12068 if (x == const_true_rtx)
12070 if (code == 'M')
12071 fputs ("nv", f);
12072 return;
12075 if (!COMPARISON_P (x))
12077 output_operand_lossage ("invalid operand for '%%%c'", code);
12078 return;
12081 cond_code = aarch64_get_condition_code (x);
12082 gcc_assert (cond_code >= 0);
12083 if (code == 'M')
12084 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
12085 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
12086 fputs (aarch64_sve_condition_codes[cond_code], f);
12087 else
12088 fputs (aarch64_condition_codes[cond_code], f);
12090 break;
12092 case 'N':
12093 if (!const_vec_duplicate_p (x, &elt))
12095 output_operand_lossage ("invalid vector constant");
12096 return;
12099 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12100 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12101 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12102 && aarch64_print_vector_float_operand (f, x, true))
12104 else
12106 output_operand_lossage ("invalid vector constant");
12107 return;
12109 break;
12111 case 'b':
12112 case 'h':
12113 case 's':
12114 case 'd':
12115 case 'q':
12116 case 'Z':
12117 code = TOLOWER (code);
12118 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12120 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12121 return;
12123 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12124 break;
12126 case 'S':
12127 case 'T':
12128 case 'U':
12129 case 'V':
12130 if (!REG_P (x) || (!FP_REGNUM_P (REGNO (x)) && !PR_REGNUM_P (REGNO (x))))
12132 output_operand_lossage ("incompatible operand for '%%%c'", code);
12133 return;
12135 if (PR_REGNUM_P (REGNO (x)))
12136 asm_fprintf (f, "p%d", REGNO (x) - P0_REGNUM + (code - 'S'));
12137 else
12138 asm_fprintf (f, "%c%d",
12139 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12140 REGNO (x) - V0_REGNUM + (code - 'S'));
12141 break;
12143 case 'R':
12144 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12145 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12146 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12147 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12148 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12149 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12150 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12151 else
12152 output_operand_lossage ("incompatible register operand for '%%%c'",
12153 code);
12154 break;
12156 case 'X':
12157 if (!CONST_INT_P (x))
12159 output_operand_lossage ("invalid operand for '%%%c'", code);
12160 return;
12162 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12163 break;
12165 case 'C':
12167 /* Print a replicated constant in hex. */
12168 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12170 output_operand_lossage ("invalid operand for '%%%c'", code);
12171 return;
12173 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12174 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12176 break;
12178 case 'D':
12180 /* Print a replicated constant in decimal, treating it as
12181 unsigned. */
12182 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12184 output_operand_lossage ("invalid operand for '%%%c'", code);
12185 return;
12187 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12188 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12190 break;
12192 case 'w':
12193 case 'x':
12194 if (aarch64_const_zero_rtx_p (x))
12196 asm_fprintf (f, "%czr", code);
12197 break;
12200 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12202 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12203 break;
12206 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12208 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12209 break;
12212 /* Fall through */
12214 case 0:
12215 if (x == NULL)
12217 output_operand_lossage ("missing operand");
12218 return;
12221 switch (GET_CODE (x))
12223 case CONST_STRING:
12225 asm_fprintf (f, "%s", XSTR (x, 0));
12226 break;
12228 case REG:
12229 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12231 if (REG_NREGS (x) == 1)
12232 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12233 else
12235 char suffix
12236 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12237 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12238 REGNO (x) - V0_REGNUM, suffix,
12239 END_REGNO (x) - V0_REGNUM - 1, suffix);
12242 else
12243 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12244 break;
12246 case MEM:
12247 output_address (GET_MODE (x), XEXP (x, 0));
12248 break;
12250 case LABEL_REF:
12251 case SYMBOL_REF:
12252 output_addr_const (asm_out_file, x);
12253 break;
12255 case CONST_INT:
12256 asm_fprintf (f, "%wd", INTVAL (x));
12257 break;
12259 case CONST:
12260 if (!VECTOR_MODE_P (GET_MODE (x)))
12262 output_addr_const (asm_out_file, x);
12263 break;
12265 /* fall through */
12267 case CONST_VECTOR:
12268 if (!const_vec_duplicate_p (x, &elt))
12270 output_operand_lossage ("invalid vector constant");
12271 return;
12274 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12275 asm_fprintf (f, "%wd", INTVAL (elt));
12276 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12277 && aarch64_print_vector_float_operand (f, x, false))
12279 else
12281 output_operand_lossage ("invalid vector constant");
12282 return;
12284 break;
12286 case CONST_DOUBLE:
12287 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12288 be getting CONST_DOUBLEs holding integers. */
12289 gcc_assert (GET_MODE (x) != VOIDmode);
12290 if (aarch64_float_const_zero_rtx_p (x))
12292 fputc ('0', f);
12293 break;
12295 else if (aarch64_float_const_representable_p (x))
12297 #define buf_size 20
12298 char float_buf[buf_size] = {'\0'};
12299 real_to_decimal_for_mode (float_buf,
12300 CONST_DOUBLE_REAL_VALUE (x),
12301 buf_size, buf_size,
12302 1, GET_MODE (x));
12303 asm_fprintf (asm_out_file, "%s", float_buf);
12304 break;
12305 #undef buf_size
12307 output_operand_lossage ("invalid constant");
12308 return;
12309 default:
12310 output_operand_lossage ("invalid operand");
12311 return;
12313 break;
12315 case 'A':
12316 if (GET_CODE (x) == HIGH)
12317 x = XEXP (x, 0);
12319 switch (aarch64_classify_symbolic_expression (x))
12321 case SYMBOL_SMALL_GOT_4G:
12322 asm_fprintf (asm_out_file, ":got:");
12323 break;
12325 case SYMBOL_SMALL_TLSGD:
12326 asm_fprintf (asm_out_file, ":tlsgd:");
12327 break;
12329 case SYMBOL_SMALL_TLSDESC:
12330 asm_fprintf (asm_out_file, ":tlsdesc:");
12331 break;
12333 case SYMBOL_SMALL_TLSIE:
12334 asm_fprintf (asm_out_file, ":gottprel:");
12335 break;
12337 case SYMBOL_TLSLE24:
12338 asm_fprintf (asm_out_file, ":tprel:");
12339 break;
12341 case SYMBOL_TINY_GOT:
12342 gcc_unreachable ();
12343 break;
12345 default:
12346 break;
12348 output_addr_const (asm_out_file, x);
12349 break;
12351 case 'L':
12352 switch (aarch64_classify_symbolic_expression (x))
12354 case SYMBOL_SMALL_GOT_4G:
12355 asm_fprintf (asm_out_file, ":got_lo12:");
12356 break;
12358 case SYMBOL_SMALL_TLSGD:
12359 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12360 break;
12362 case SYMBOL_SMALL_TLSDESC:
12363 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12364 break;
12366 case SYMBOL_SMALL_TLSIE:
12367 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12368 break;
12370 case SYMBOL_TLSLE12:
12371 asm_fprintf (asm_out_file, ":tprel_lo12:");
12372 break;
12374 case SYMBOL_TLSLE24:
12375 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12376 break;
12378 case SYMBOL_TINY_GOT:
12379 asm_fprintf (asm_out_file, ":got:");
12380 break;
12382 case SYMBOL_TINY_TLSIE:
12383 asm_fprintf (asm_out_file, ":gottprel:");
12384 break;
12386 default:
12387 break;
12389 output_addr_const (asm_out_file, x);
12390 break;
12392 case 'G':
12393 switch (aarch64_classify_symbolic_expression (x))
12395 case SYMBOL_TLSLE24:
12396 asm_fprintf (asm_out_file, ":tprel_hi12:");
12397 break;
12398 default:
12399 break;
12401 output_addr_const (asm_out_file, x);
12402 break;
12404 case 'k':
12406 HOST_WIDE_INT cond_code;
12408 if (!CONST_INT_P (x))
12410 output_operand_lossage ("invalid operand for '%%%c'", code);
12411 return;
12414 cond_code = INTVAL (x);
12415 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12416 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12418 break;
12420 case 'K':
12421 if (!REG_P (x) || !PR_REGNUM_P (REGNO (x)))
12423 output_operand_lossage ("invalid operand for '%%%c'", code);
12424 return;
12426 asm_fprintf (f, "pn%d", REGNO (x) - P0_REGNUM);
12427 break;
12429 case 'y':
12430 case 'z':
12432 machine_mode mode = GET_MODE (x);
12434 if (!MEM_P (x)
12435 || (code == 'y'
12436 && maybe_ne (GET_MODE_SIZE (mode), 8)
12437 && maybe_ne (GET_MODE_SIZE (mode), 16)
12438 && maybe_ne (GET_MODE_SIZE (mode), 32)))
12440 output_operand_lossage ("invalid operand for '%%%c'", code);
12441 return;
12444 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12445 code == 'y'
12446 ? ADDR_QUERY_LDP_STP_N
12447 : ADDR_QUERY_LDP_STP))
12448 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12450 break;
12452 default:
12453 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12454 return;
12458 /* Print address 'x' of a memory access with mode 'mode'.
12459 'type' is the aarch64_addr_query_type context required by aarch64_classify_address,
12460 e.g. ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
12461 static bool
12462 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12463 aarch64_addr_query_type type)
12465 struct aarch64_address_info addr;
12466 unsigned int size, vec_flags;
12468 /* Check all addresses are Pmode - including ILP32. */
12469 if (GET_MODE (x) != Pmode
12470 && (!CONST_INT_P (x)
12471 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12473 output_operand_lossage ("invalid address mode");
12474 return false;
12477 const bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
12478 || type == ADDR_QUERY_LDP_STP_N);
12480 if (aarch64_classify_address (&addr, x, mode, true, type))
12481 switch (addr.type)
12483 case ADDRESS_REG_IMM:
12484 if (known_eq (addr.const_offset, 0))
12486 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12487 return true;
12490 vec_flags = aarch64_classify_vector_mode (mode);
12491 if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
12493 HOST_WIDE_INT vnum
12494 = exact_div (addr.const_offset,
12495 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12496 asm_fprintf (f, "[%s, #%wd, mul vl]",
12497 reg_names[REGNO (addr.base)], vnum);
12498 return true;
12501 if (!CONST_INT_P (addr.offset))
12502 return false;
12504 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12505 INTVAL (addr.offset));
12506 return true;
12508 case ADDRESS_REG_REG:
12509 if (addr.shift == 0)
12510 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12511 reg_names [REGNO (addr.offset)]);
12512 else
12513 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12514 reg_names [REGNO (addr.offset)], addr.shift);
12515 return true;
12517 case ADDRESS_REG_UXTW:
12518 if (addr.shift == 0)
12519 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12520 REGNO (addr.offset) - R0_REGNUM);
12521 else
12522 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12523 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12524 return true;
12526 case ADDRESS_REG_SXTW:
12527 if (addr.shift == 0)
12528 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12529 REGNO (addr.offset) - R0_REGNUM);
12530 else
12531 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12532 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12533 return true;
12535 case ADDRESS_REG_WB:
12536 /* Writeback is only supported for fixed-width modes. */
12537 size = GET_MODE_SIZE (mode).to_constant ();
12538 switch (GET_CODE (x))
12540 case PRE_INC:
12541 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12542 return true;
12543 case POST_INC:
12544 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12545 return true;
12546 case PRE_DEC:
12547 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12548 return true;
12549 case POST_DEC:
12550 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12551 return true;
12552 case PRE_MODIFY:
12553 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12554 INTVAL (addr.offset));
12555 return true;
12556 case POST_MODIFY:
12557 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12558 INTVAL (addr.offset));
12559 return true;
12560 default:
12561 break;
12563 break;
12565 case ADDRESS_LO_SUM:
12566 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12567 output_addr_const (f, addr.offset);
12568 asm_fprintf (f, "]");
12569 return true;
12571 case ADDRESS_SYMBOLIC:
12572 output_addr_const (f, x);
12573 return true;
12576 return false;
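/* Editorial note (illustrative, not in the original source): the cases above
   cover syntax such as "[x0]", "[x0, 16]", "[x0, #3, mul vl]" for SVE,
   "[x0, x1, lsl 3]", "[x0, w1, sxtw 2]", the writeback forms "[x0, 16]!" and
   "[x0], 16", and "[x0, #:lo12:sym]" for a LO_SUM; the register numbers and
   symbol name here are hypothetical.  */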
12579 /* Print address 'x' of a memory access with mode 'mode'. */
12580 static void
12581 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12583 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12584 output_addr_const (f, x);
12587 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12589 static bool
12590 aarch64_output_addr_const_extra (FILE *file, rtx x)
12592 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12594 output_addr_const (file, XVECEXP (x, 0, 0));
12595 return true;
12597 return false;
12600 bool
12601 aarch64_label_mentioned_p (rtx x)
12603 const char *fmt;
12604 int i;
12606 if (LABEL_REF_P (x))
12607 return true;
12609 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12610 referencing instruction, but they are constant offsets, not
12611 symbols. */
12612 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12613 return false;
12615 fmt = GET_RTX_FORMAT (GET_CODE (x));
12616 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12618 if (fmt[i] == 'E')
12620 int j;
12622 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12623 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12624 return 1;
12626 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12627 return 1;
12630 return 0;
12633 /* Implement REGNO_REG_CLASS. */
12635 enum reg_class
12636 aarch64_regno_regclass (unsigned regno)
12638 if (W8_W11_REGNUM_P (regno))
12639 return W8_W11_REGS;
12641 if (W12_W15_REGNUM_P (regno))
12642 return W12_W15_REGS;
12644 if (STUB_REGNUM_P (regno))
12645 return STUB_REGS;
12647 if (GP_REGNUM_P (regno))
12648 return GENERAL_REGS;
12650 if (regno == SP_REGNUM)
12651 return STACK_REG;
12653 if (regno == FRAME_POINTER_REGNUM
12654 || regno == ARG_POINTER_REGNUM)
12655 return POINTER_REGS;
12657 if (FP_REGNUM_P (regno))
12658 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12659 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12661 if (PR_REGNUM_P (regno))
12662 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12664 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12665 return FFR_REGS;
12667 if (FAKE_REGNUM_P (regno))
12668 return FAKE_REGS;
12670 return NO_REGS;
12673 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12674 If OFFSET is out of range, return an offset of an anchor point
12675 that is in range. Return 0 otherwise. */
12677 static HOST_WIDE_INT
12678 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12679 machine_mode mode)
12681 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12682 if (size > 16)
12683 return (offset + 0x400) & ~0x7f0;
12685 /* For offsets that aren't a multiple of the access size, the limit is
12686 -256...255. */
12687 if (offset & (size - 1))
12689 /* BLKmode typically uses LDP of X-registers. */
12690 if (mode == BLKmode)
12691 return (offset + 512) & ~0x3ff;
12692 return (offset + 0x100) & ~0x1ff;
12695 /* Small negative offsets are supported. */
12696 if (IN_RANGE (offset, -256, 0))
12697 return 0;
12699 if (mode == TImode || mode == TFmode || mode == TDmode)
12700 return (offset + 0x100) & ~0x1ff;
12702 /* Otherwise use a 12-bit unsigned offset, scaled by the access size. */
12703 return offset & (~0xfff * size);
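/* Editorial note (illustrative, not in the original source): for example,
   with a 4-byte access and OFFSET == 0x12344, none of the earlier cases
   apply and the code above returns 0x12344 & ~0x3fff == 0x10000; the
   remaining displacement of 0x2344 then fits the scaled unsigned 12-bit
   immediate range of a word LDR/STR.  */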
12706 static rtx
12707 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12709 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12710 where mask is selected by alignment and size of the offset.
12711 We try to pick as large a range for the offset as possible to
12712 maximize the chance of a CSE. However, for aligned addresses
12713 we limit the range to 4k so that structures with different sized
12714 elements are likely to use the same base. We need to be careful
12715 not to split a CONST for some forms of address expression, otherwise
12716 it will generate sub-optimal code. */
12718 /* First split X + CONST (base, offset) into (base + X) + offset. */
12719 if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
12721 poly_int64 offset;
12722 rtx base = strip_offset (XEXP (x, 1), &offset);
12724 base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
12725 NULL_RTX, true, OPTAB_DIRECT);
12726 x = plus_constant (Pmode, base, offset);
12729 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12731 rtx base = XEXP (x, 0);
12732 rtx offset_rtx = XEXP (x, 1);
12733 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12735 if (GET_CODE (base) == PLUS)
12737 rtx op0 = XEXP (base, 0);
12738 rtx op1 = XEXP (base, 1);
12740 /* Force any scaling into a temp for CSE. */
12741 op0 = force_reg (Pmode, op0);
12742 op1 = force_reg (Pmode, op1);
12744 /* Let the pointer register be in op0. */
12745 if (REG_POINTER (op1))
12746 std::swap (op0, op1);
12748 /* If the pointer is virtual or frame related, then we know that
12749 virtual register instantiation or register elimination is going
12750 to apply a second constant. We want the two constants folded
12751 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12752 if (virt_or_elim_regno_p (REGNO (op0)))
12754 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12755 NULL_RTX, true, OPTAB_DIRECT);
12756 return gen_rtx_PLUS (Pmode, base, op1);
12759 /* Otherwise, in order to encourage CSE (and thence loop strength
12760 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
12761 base = expand_binop (Pmode, add_optab, op0, op1,
12762 NULL_RTX, true, OPTAB_DIRECT);
12763 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12766 HOST_WIDE_INT size;
12767 if (GET_MODE_SIZE (mode).is_constant (&size))
12769 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12770 mode);
12771 if (base_offset != 0)
12773 base = plus_constant (Pmode, base, base_offset);
12774 base = force_operand (base, NULL_RTX);
12775 return plus_constant (Pmode, base, offset - base_offset);
12780 return x;
12783 static reg_class_t
12784 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12785 reg_class_t rclass,
12786 machine_mode mode,
12787 secondary_reload_info *sri)
12789 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12790 LDR and STR. See the comment at the head of aarch64-sve.md for
12791 more details about the big-endian handling. */
12792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12793 if (reg_class_subset_p (rclass, FP_REGS)
12794 && !((REG_P (x) && HARD_REGISTER_P (x))
12795 || aarch64_simd_valid_immediate (x, NULL))
12796 && mode != VNx16QImode
12797 && (vec_flags & VEC_SVE_DATA)
12798 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12800 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12801 return NO_REGS;
12804 /* If we have to disable direct literal pool loads and stores because the
12805 function is too big, then we need a scratch register. */
12806 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12807 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12808 || targetm.vector_mode_supported_p (GET_MODE (x)))
12809 && !aarch64_pcrelative_literal_loads)
12811 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12812 return NO_REGS;
12815 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12816 Q register to a Q register directly. We need a scratch. */
12817 if (REG_P (x)
12818 && (mode == TFmode
12819 || mode == TImode
12820 || mode == TDmode
12821 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12822 && mode == GET_MODE (x)
12823 && !TARGET_SIMD
12824 && FP_REGNUM_P (REGNO (x))
12825 && reg_class_subset_p (rclass, FP_REGS))
12827 sri->icode = code_for_aarch64_reload_mov (mode);
12828 return NO_REGS;
12831 /* A TFmode, TImode or TDmode memory access should be handled via an FP register
12832 because AArch64 has richer addressing modes for LDR/STR instructions
12833 than LDP/STP instructions. */
12834 if (TARGET_FLOAT && rclass == GENERAL_REGS
12835 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12836 return FP_REGS;
12838 if (rclass == FP_REGS
12839 && (mode == TImode || mode == TFmode || mode == TDmode)
12840 && CONSTANT_P (x))
12841 return GENERAL_REGS;
12843 return NO_REGS;
12846 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12848 static bool
12849 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12850 reg_class_t class2)
12852 if (!TARGET_SIMD
12853 && reg_classes_intersect_p (class1, FP_REGS)
12854 && reg_classes_intersect_p (class2, FP_REGS))
12856 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12857 so we can't easily split a move involving tuples of 128-bit
12858 vectors. Force the copy through memory instead.
12860 (Tuples of 64-bit vectors are fine.) */
12861 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12862 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12863 return true;
12865 return false;
12868 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
12870 static bool
12871 aarch64_frame_pointer_required ()
12873 /* If the function needs to record the incoming value of PSTATE.SM,
12874 make sure that the slot is accessible from the frame pointer. */
12875 return aarch64_need_old_pstate_sm ();
12878 static bool
12879 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12881 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12883 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12884 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12885 if (frame_pointer_needed)
12886 return to == HARD_FRAME_POINTER_REGNUM;
12887 return true;
12890 poly_int64
12891 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12893 aarch64_frame &frame = cfun->machine->frame;
12895 if (to == HARD_FRAME_POINTER_REGNUM)
12897 if (from == ARG_POINTER_REGNUM)
12898 return frame.bytes_above_hard_fp;
12900 if (from == FRAME_POINTER_REGNUM)
12901 return frame.bytes_above_hard_fp - frame.bytes_above_locals;
12904 if (to == STACK_POINTER_REGNUM)
12906 if (from == FRAME_POINTER_REGNUM)
12907 return frame.frame_size - frame.bytes_above_locals;
12910 return frame.frame_size;
12914 /* Get return address without mangling. */
12917 aarch64_return_addr_rtx (void)
12919 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12920 /* Note: aarch64_return_address_signing_enabled only
12921 works after cfun->machine->frame.laid_out is set,
12922 so here we don't know if the return address will
12923 be signed or not. */
12924 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12925 emit_move_insn (lr, val);
12926 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12927 return lr;
12931 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12932 previous frame. */
12935 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12937 if (count != 0)
12938 return const0_rtx;
12939 return aarch64_return_addr_rtx ();
12942 static void
12943 aarch64_asm_trampoline_template (FILE *f)
12945 /* Even if the current function doesn't have branch protection, some
12946 later function might, so since this template is only generated once
12947 we have to add a BTI just in case. */
12948 asm_fprintf (f, "\thint\t34 // bti c\n");
12950 if (TARGET_ILP32)
12952 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12953 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12955 else
12957 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12958 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12960 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12962 /* We always emit a speculation barrier.
12963 This is because the same trampoline template is used for every nested
12964 function. Since nested functions are not particularly common or
12965 performant we don't worry too much about the extra instructions to copy
12966 around.
12967 This is not yet a problem, since we have not yet implemented function
12968 specific attributes to choose between hardening against straight line
12969 speculation or not, but such function specific attributes are likely to
12970 happen in the future. */
12971 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12973 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12974 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12977 static void
12978 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12980 rtx fnaddr, mem, a_tramp;
12981 const int tramp_code_sz = 24;
12983 /* Don't need to copy the trailing D-words, we fill those in below. */
12984 /* We create our own memory address in Pmode so that `emit_block_move` can
12985 use parts of the backend which expect Pmode addresses. */
12986 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12987 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12988 assemble_trampoline_template (),
12989 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12990 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12991 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12992 if (GET_MODE (fnaddr) != ptr_mode)
12993 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12994 emit_move_insn (mem, fnaddr);
12996 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12997 emit_move_insn (mem, chain_value);
12999 /* XXX We should really define a "clear_cache" pattern and use
13000 gen_clear_cache(). */
13001 a_tramp = XEXP (m_tramp, 0);
13002 maybe_emit_call_builtin___clear_cache (a_tramp,
13003 plus_constant (ptr_mode,
13004 a_tramp,
13005 TRAMPOLINE_SIZE));
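/* Editorial note (illustrative summary, not in the original source): on LP64
   the resulting trampoline is the 24 bytes of code from the template above,
   followed by the target function address at offset 24 and the static chain
   value at offset 24 + POINTER_BYTES; those are the two slots that the
   template's "ldr" instructions read via their ".+20"/".+24" PC-relative
   offsets.  */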
13008 static unsigned char
13009 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
13011 /* ??? Logically we should only need to provide a value when
13012 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13013 can hold MODE, but at the moment we need to handle all modes.
13014 Just ignore any runtime parts for registers that can't store them. */
13015 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
13016 unsigned int nregs, vec_flags;
13017 switch (regclass)
13019 case W8_W11_REGS:
13020 case W12_W15_REGS:
13021 case STUB_REGS:
13022 case TAILCALL_ADDR_REGS:
13023 case POINTER_REGS:
13024 case GENERAL_REGS:
13025 case ALL_REGS:
13026 case POINTER_AND_FP_REGS:
13027 case FP_REGS:
13028 case FP_LO_REGS:
13029 case FP_LO8_REGS:
13030 vec_flags = aarch64_classify_vector_mode (mode);
13031 if ((vec_flags & VEC_SVE_DATA)
13032 && constant_multiple_p (GET_MODE_SIZE (mode),
13033 aarch64_vl_bytes (mode, vec_flags), &nregs))
13034 return nregs;
13035 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
13036 return GET_MODE_SIZE (mode).to_constant () / 8;
13037 return (vec_flags & VEC_ADVSIMD
13038 ? CEIL (lowest_size, UNITS_PER_VREG)
13039 : CEIL (lowest_size, UNITS_PER_WORD));
13041 case PR_REGS:
13042 case PR_LO_REGS:
13043 case PR_HI_REGS:
13044 return mode == VNx32BImode ? 2 : 1;
13046 case STACK_REG:
13047 case FFR_REGS:
13048 case PR_AND_FFR_REGS:
13049 case FAKE_REGS:
13050 return 1;
13052 case NO_REGS:
13053 return 0;
13055 default:
13056 break;
13058 gcc_unreachable ();
13061 static reg_class_t
13062 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
13064 if (regclass == POINTER_REGS)
13065 return GENERAL_REGS;
13067 if (regclass == STACK_REG)
13069 if (REG_P (x)
13070 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
13071 return regclass;
13073 return NO_REGS;
13076 /* Register elimination can result in a request for
13077 SP+constant->FP_REGS. We cannot support such operations, which
13078 use SP as source and an FP_REG as destination, so reject them
13079 outright. */
13080 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
13082 rtx lhs = XEXP (x, 0);
13084 /* Look through a possible SUBREG introduced by ILP32. */
13085 if (SUBREG_P (lhs))
13086 lhs = SUBREG_REG (lhs);
13088 gcc_assert (REG_P (lhs));
13089 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
13090 POINTER_REGS));
13091 return NO_REGS;
13094 return regclass;
13097 void
13098 aarch64_asm_output_labelref (FILE* f, const char *name)
13100 asm_fprintf (f, "%U%s", name);
13103 static void
13104 aarch64_elf_asm_constructor (rtx symbol, int priority)
13106 if (priority == DEFAULT_INIT_PRIORITY)
13107 default_ctor_section_asm_out_constructor (symbol, priority);
13108 else
13110 section *s;
13111 /* Although priority is known to be in the range [0, 65535], so 18 bytes
13112 would be enough, the compiler might not know that. To avoid a
13113 -Wformat-truncation false positive, use a larger size. */
13114 char buf[23];
13115 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
13116 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13117 switch_to_section (s);
13118 assemble_align (POINTER_SIZE);
13119 assemble_aligned_integer (POINTER_BYTES, symbol);
13123 static void
13124 aarch64_elf_asm_destructor (rtx symbol, int priority)
13126 if (priority == DEFAULT_INIT_PRIORITY)
13127 default_dtor_section_asm_out_destructor (symbol, priority);
13128 else
13130 section *s;
13131 /* Although priority is known to be in the range [0, 65535], so 18 bytes
13132 would be enough, the compiler might not know that. To avoid a
13133 -Wformat-truncation false positive, use a larger size. */
13134 char buf[23];
13135 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
13136 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13137 switch_to_section (s);
13138 assemble_align (POINTER_SIZE);
13139 assemble_aligned_integer (POINTER_BYTES, symbol);
13143 const char*
13144 aarch64_output_casesi (rtx *operands)
13146 char buf[100];
13147 char label[100];
13148 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
13149 int index;
13150 static const char *const patterns[4][2] =
13153 "ldrb\t%w3, [%0,%w1,uxtw]",
13154 "add\t%3, %4, %w3, sxtb #2"
13157 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13158 "add\t%3, %4, %w3, sxth #2"
13161 "ldr\t%w3, [%0,%w1,uxtw #2]",
13162 "add\t%3, %4, %w3, sxtw #2"
13164 /* We assume that DImode is only generated when not optimizing and
13165 that we don't really need 64-bit address offsets. That would
13166 imply an object file with 8GB of code in a single function! */
13168 "ldr\t%w3, [%0,%w1,uxtw #2]",
13169 "add\t%3, %4, %w3, sxtw #2"
13173 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13175 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13176 index = exact_log2 (GET_MODE_SIZE (mode));
13178 gcc_assert (index >= 0 && index <= 3);
13180 /* Need to implement table size reduction, by changing the code below. */
13181 output_asm_insn (patterns[index][0], operands);
13182 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13183 snprintf (buf, sizeof (buf),
13184 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13185 output_asm_insn (buf, operands);
13186 output_asm_insn (patterns[index][1], operands);
13187 output_asm_insn ("br\t%3", operands);
13188 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13189 operands);
13190 assemble_label (asm_out_file, label);
13191 return "";
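/* Editorial note (illustrative, not in the original source): for a byte-sized
   dispatch table the sequence emitted above looks roughly like

	ldrb	w3, [x0, w1, uxtw]
	adr	x4, .LrtxN
	add	x3, x4, w3, sxtb #2
	br	x3
   .LrtxN:

   where the register numbers and the .LrtxN label are hypothetical and the
   SLS barrier, if enabled, is emitted just before the label.  */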
13194 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13195 operand is MASK. */
13196 const char *
13197 aarch64_output_sme_zero_za (rtx mask)
13199 auto mask_val = UINTVAL (mask);
13200 if (mask_val == 0)
13201 return "zero\t{}";
13203 if (mask_val == 0xff)
13204 return "zero\t{ za }";
13206 static constexpr struct { unsigned char mask; char letter; } tiles[] = {
13207 { 0xff, 'b' },
13208 { 0x55, 'h' },
13209 { 0x11, 's' },
13210 { 0x01, 'd' }
13212 /* The last entry in the list has the form "za7.d }", but that's the
13213 same length as "za7.d, ". */
13214 static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13215 for (auto &tile : tiles)
13217 unsigned int tile_mask = tile.mask;
13218 unsigned int tile_index = 0;
13219 unsigned int i = snprintf (buffer, sizeof (buffer), "zero\t");
13220 const char *prefix = "{ ";
13221 auto remaining_mask = mask_val;
13222 while (tile_mask < 0x100)
13224 if ((remaining_mask & tile_mask) == tile_mask)
13226 i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
13227 prefix, tile_index, tile.letter);
13228 prefix = ", ";
13229 remaining_mask &= ~tile_mask;
13231 tile_mask <<= 1;
13232 tile_index += 1;
13234 if (remaining_mask == 0)
13236 gcc_assert (i + 3 <= sizeof (buffer));
13237 snprintf (buffer + i, sizeof (buffer) - i, " }");
13238 return buffer;
13241 gcc_unreachable ();
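/* Editorial note (illustrative, not in the original source): for example the
   loop above prints mask 0x55 as "zero\t{ za0.h }", mask 0x11 as
   "zero\t{ za0.s }" and mask 0x81 as "zero\t{ za0.d, za7.d }", while 0 and
   0xff are handled by the early returns.  */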
13244 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13245 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13246 operator. */
13249 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13251 if (shift >= 0 && shift <= 4)
13253 int size;
13254 for (size = 8; size <= 32; size *= 2)
13256 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13257 if (mask == bits << shift)
13258 return size;
13261 return 0;
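/* Editorial note (illustrative, not in the original source): for example
   aarch64_uxt_size (0, 0xff) == 8 and aarch64_uxt_size (2, 0x3fc) == 8,
   matching the UXTB forms of an extended-register operand, while
   aarch64_uxt_size (0, 0xffff) == 16 (UXTH); a mask that is not an 8-, 16-
   or 32-bit block shifted by 0..4 yields 0.  */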
13264 /* Constant pools are per-function only when PC-relative
13265 literal loads are enabled or we are using the large memory
13266 model. */
13268 static inline bool
13269 aarch64_can_use_per_function_literal_pools_p (void)
13271 return (aarch64_pcrelative_literal_loads
13272 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13275 static bool
13276 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13278 /* We can't use blocks for constants when we're using a per-function
13279 constant pool. */
13280 return !aarch64_can_use_per_function_literal_pools_p ();
13283 /* Select appropriate section for constants depending
13284 on where we place literal pools. */
13286 static section *
13287 aarch64_select_rtx_section (machine_mode mode,
13288 rtx x,
13289 unsigned HOST_WIDE_INT align)
13291 if (aarch64_can_use_per_function_literal_pools_p ())
13292 return function_section (current_function_decl);
13294 return default_elf_select_rtx_section (mode, x, align);
13297 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13298 void
13299 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13300 HOST_WIDE_INT offset)
13302 /* When using per-function literal pools, we must ensure that any code
13303 section is aligned to the minimal instruction length, lest we get
13304 errors from the assembler re "unaligned instructions". */
13305 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13306 ASM_OUTPUT_ALIGN (f, 2);
13309 /* Costs. */
13311 /* Helper function for rtx cost calculation. Strip a shift expression
13312 from X. Returns the inner operand if successful, or the original
13313 expression on failure. */
13314 static rtx
13315 aarch64_strip_shift (rtx x)
13317 rtx op = x;
13319 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13320 we can convert both to ROR during final output. */
13321 if ((GET_CODE (op) == ASHIFT
13322 || GET_CODE (op) == ASHIFTRT
13323 || GET_CODE (op) == LSHIFTRT
13324 || GET_CODE (op) == ROTATERT
13325 || GET_CODE (op) == ROTATE)
13326 && CONST_INT_P (XEXP (op, 1)))
13327 return XEXP (op, 0);
13329 if (GET_CODE (op) == MULT
13330 && CONST_INT_P (XEXP (op, 1))
13331 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13332 return XEXP (op, 0);
13334 return x;
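/* Editorial note (illustrative, not in the original source): e.g. both
   (ashift (reg) (const_int 3)) and (mult (reg) (const_int 8)) are stripped
   to (reg) by the helper above, since such a multiply will be emitted as a
   shift; a multiply by a non-power-of-two is returned unchanged.  */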
13337 /* Helper function for rtx cost calculation. Strip an extend
13338 expression from X. Returns the inner operand if successful, or the
13339 original expression on failure. We deal with a number of possible
13340 canonicalization variations here. If STRIP_SHIFT is true, then
13341 we can strip off a shift also. */
13342 static rtx
13343 aarch64_strip_extend (rtx x, bool strip_shift)
13345 scalar_int_mode mode;
13346 rtx op = x;
13348 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13349 return op;
13351 if (GET_CODE (op) == AND
13352 && GET_CODE (XEXP (op, 0)) == MULT
13353 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13354 && CONST_INT_P (XEXP (op, 1))
13355 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13356 INTVAL (XEXP (op, 1))) != 0)
13357 return XEXP (XEXP (op, 0), 0);
13359 /* Now handle extended register, as this may also have an optional
13360 left shift by 1..4. */
13361 if (strip_shift
13362 && GET_CODE (op) == ASHIFT
13363 && CONST_INT_P (XEXP (op, 1))
13364 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13365 op = XEXP (op, 0);
13367 if (GET_CODE (op) == ZERO_EXTEND
13368 || GET_CODE (op) == SIGN_EXTEND)
13369 op = XEXP (op, 0);
13371 if (op != x)
13372 return op;
13374 return x;
13377 /* Helper function for rtx cost calculation. Strip extension as well as any
13378 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13379 successful, or the original expression on failure. */
13380 static rtx
13381 aarch64_strip_extend_vec_half (rtx x)
13383 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13385 x = XEXP (x, 0);
13386 if (GET_CODE (x) == VEC_SELECT
13387 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13388 XEXP (x, 1)))
13389 x = XEXP (x, 0);
13391 return x;
13394 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13395 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13396 operand if successful, or the original expression on failure. */
13397 static rtx
13398 aarch64_strip_duplicate_vec_elt (rtx x)
13400 if (GET_CODE (x) == VEC_DUPLICATE
13401 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13403 x = XEXP (x, 0);
13404 if (GET_CODE (x) == VEC_SELECT)
13405 x = XEXP (x, 0);
13406 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13407 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13408 x = XEXP (XEXP (x, 0), 0);
13410 return x;
13413 /* Return true iff CODE is a shift supported in combination
13414 with arithmetic instructions. */
13416 static bool
13417 aarch64_shift_p (enum rtx_code code)
13419 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13423 /* Return true iff X is a cheap shift without a sign extend. */
13425 static bool
13426 aarch64_cheap_mult_shift_p (rtx x)
13428 rtx op0, op1;
13430 op0 = XEXP (x, 0);
13431 op1 = XEXP (x, 1);
13433 if (!(aarch64_tune_params.extra_tuning_flags
13434 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13435 return false;
13437 if (GET_CODE (op0) == SIGN_EXTEND)
13438 return false;
13440 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13441 && UINTVAL (op1) <= 4)
13442 return true;
13444 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13445 return false;
13447 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13449 if (l2 > 0 && l2 <= 4)
13450 return true;
13452 return false;
13455 /* Helper function for rtx cost calculation. Calculate the cost of
13456 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13457 Return the calculated cost of the expression, recursing manually in to
13458 operands where needed. */
13460 static int
13461 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13463 rtx op0, op1;
13464 const struct cpu_cost_table *extra_cost
13465 = aarch64_tune_params.insn_extra_cost;
13466 int cost = 0;
13467 bool compound_p = (outer == PLUS || outer == MINUS);
13468 machine_mode mode = GET_MODE (x);
13470 gcc_checking_assert (code == MULT);
13472 op0 = XEXP (x, 0);
13473 op1 = XEXP (x, 1);
13475 if (VECTOR_MODE_P (mode))
13477 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13478 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13480 /* The select-operand-high-half versions of the instruction have the
13481 same cost as the three vector version - don't add the costs of the
13482 extension or selection into the costs of the multiply. */
13483 op0 = aarch64_strip_extend_vec_half (op0);
13484 op1 = aarch64_strip_extend_vec_half (op1);
13485 /* The by-element versions of the instruction have the same costs as
13486 the normal 3-vector version. We make an assumption that the input
13487 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13488 costing of a MUL by element pre RA is a bit optimistic. */
13489 op0 = aarch64_strip_duplicate_vec_elt (op0);
13490 op1 = aarch64_strip_duplicate_vec_elt (op1);
13492 cost += rtx_cost (op0, mode, MULT, 0, speed);
13493 cost += rtx_cost (op1, mode, MULT, 1, speed);
13494 if (speed)
13496 if (GET_CODE (x) == MULT)
13497 cost += extra_cost->vect.mult;
13498 /* This is to catch the SSRA costing currently flowing here. */
13499 else
13500 cost += extra_cost->vect.alu;
13502 return cost;
13505 /* Integer multiply/fma. */
13506 if (GET_MODE_CLASS (mode) == MODE_INT)
13508 /* The multiply will be canonicalized as a shift, cost it as such. */
13509 if (aarch64_shift_p (GET_CODE (x))
13510 || (CONST_INT_P (op1)
13511 && exact_log2 (INTVAL (op1)) > 0))
13513 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13514 || GET_CODE (op0) == SIGN_EXTEND;
13515 if (speed)
13517 if (compound_p)
13519 /* If the shift is considered cheap,
13520 then don't add any cost. */
13521 if (aarch64_cheap_mult_shift_p (x))
13523 else if (REG_P (op1))
13524 /* ARITH + shift-by-register. */
13525 cost += extra_cost->alu.arith_shift_reg;
13526 else if (is_extend)
13527 /* ARITH + extended register. We don't have a cost field
13528 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13529 cost += extra_cost->alu.extend_arith;
13530 else
13531 /* ARITH + shift-by-immediate. */
13532 cost += extra_cost->alu.arith_shift;
13534 else
13535 /* LSL (immediate). */
13536 cost += extra_cost->alu.shift;
13539 /* Strip extends as we will have costed them in the case above. */
13540 if (is_extend)
13541 op0 = aarch64_strip_extend (op0, true);
13543 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13545 return cost;
13548 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13549 compound and let the below cases handle it. After all, MNEG is a
13550 special-case alias of MSUB. */
13551 if (GET_CODE (op0) == NEG)
13553 op0 = XEXP (op0, 0);
13554 compound_p = true;
13557 /* Integer multiplies or FMAs have zero/sign extending variants. */
13558 if ((GET_CODE (op0) == ZERO_EXTEND
13559 && GET_CODE (op1) == ZERO_EXTEND)
13560 || (GET_CODE (op0) == SIGN_EXTEND
13561 && GET_CODE (op1) == SIGN_EXTEND))
13563 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13564 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13566 if (speed)
13568 if (compound_p)
13569 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13570 cost += extra_cost->mult[0].extend_add;
13571 else
13572 /* MUL/SMULL/UMULL. */
13573 cost += extra_cost->mult[0].extend;
13576 return cost;
13579 /* This is either an integer multiply or a MADD. In both cases
13580 we want to recurse and cost the operands. */
13581 cost += rtx_cost (op0, mode, MULT, 0, speed);
13582 cost += rtx_cost (op1, mode, MULT, 1, speed);
13584 if (speed)
13586 if (compound_p)
13587 /* MADD/MSUB. */
13588 cost += extra_cost->mult[mode == DImode].add;
13589 else
13590 /* MUL. */
13591 cost += extra_cost->mult[mode == DImode].simple;
13594 return cost;
13596 else
13598 if (speed)
13600 /* Floating-point FMA/FMUL can also support negations of the
13601 operands, unless the rounding mode is upward or downward in
13602 which case FNMUL is different from FMUL with operand negation. */
13603 bool neg0 = GET_CODE (op0) == NEG;
13604 bool neg1 = GET_CODE (op1) == NEG;
13605 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13607 if (neg0)
13608 op0 = XEXP (op0, 0);
13609 if (neg1)
13610 op1 = XEXP (op1, 0);
13613 if (compound_p)
13614 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13615 cost += extra_cost->fp[mode == DFmode].fma;
13616 else
13617 /* FMUL/FNMUL. */
13618 cost += extra_cost->fp[mode == DFmode].mult;
13621 cost += rtx_cost (op0, mode, MULT, 0, speed);
13622 cost += rtx_cost (op1, mode, MULT, 1, speed);
13623 return cost;
13627 static int
13628 aarch64_address_cost (rtx x,
13629 machine_mode mode,
13630 addr_space_t as ATTRIBUTE_UNUSED,
13631 bool speed)
13633 enum rtx_code c = GET_CODE (x);
13634 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13635 struct aarch64_address_info info;
13636 int cost = 0;
13637 info.shift = 0;
13639 if (!aarch64_classify_address (&info, x, mode, false))
13641 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13643 /* This is a CONST or SYMBOL ref which will be split
13644 in a different way depending on the code model in use.
13645 Cost it through the generic infrastructure. */
13646 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13647 /* Divide through by the cost of one instruction to
13648 bring it to the same units as the address costs. */
13649 cost_symbol_ref /= COSTS_N_INSNS (1);
13650 /* The cost is then the cost of preparing the address,
13651 followed by an immediate (possibly 0) offset. */
13652 return cost_symbol_ref + addr_cost->imm_offset;
13654 else
13656 /* This is most likely a jump table from a case
13657 statement. */
13658 return addr_cost->register_offset;
13662 switch (info.type)
13664 case ADDRESS_LO_SUM:
13665 case ADDRESS_SYMBOLIC:
13666 case ADDRESS_REG_IMM:
13667 cost += addr_cost->imm_offset;
13668 break;
13670 case ADDRESS_REG_WB:
13671 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13672 cost += addr_cost->pre_modify;
13673 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13675 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13676 if (nvectors == 3)
13677 cost += addr_cost->post_modify_ld3_st3;
13678 else if (nvectors == 4)
13679 cost += addr_cost->post_modify_ld4_st4;
13680 else
13681 cost += addr_cost->post_modify;
13683 else
13684 gcc_unreachable ();
13686 break;
13688 case ADDRESS_REG_REG:
13689 cost += addr_cost->register_offset;
13690 break;
13692 case ADDRESS_REG_SXTW:
13693 cost += addr_cost->register_sextend;
13694 break;
13696 case ADDRESS_REG_UXTW:
13697 cost += addr_cost->register_zextend;
13698 break;
13700 default:
13701 gcc_unreachable ();
13705 if (info.shift > 0)
13707 /* For the sake of calculating the cost of the shifted register
13708 component, we can treat same sized modes in the same way. */
13709 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13710 cost += addr_cost->addr_scale_costs.hi;
13711 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13712 cost += addr_cost->addr_scale_costs.si;
13713 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13714 cost += addr_cost->addr_scale_costs.di;
13715 else
13716 /* We can't tell, or this is a 128-bit vector. */
13717 cost += addr_cost->addr_scale_costs.ti;
13720 return cost;
13723 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13724 optimizing for speed. If PREDICTABLE_P is true then the branch is
13725 expected to be well predicted. */
13728 aarch64_branch_cost (bool speed_p, bool predictable_p)
13730 /* When optimizing for speed, use the cost of unpredictable branches. */
13731 const struct cpu_branch_cost *branch_costs =
13732 aarch64_tune_params.branch_costs;
13734 if (!speed_p || predictable_p)
13735 return branch_costs->predictable;
13736 else
13737 return branch_costs->unpredictable;
13740 /* Return true if X is a zero or sign extract
13741 usable in an ADD or SUB (extended register) instruction. */
13742 static bool
13743 aarch64_rtx_arith_op_extract_p (rtx x)
13745 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13746 No shift. */
13747 if (GET_CODE (x) == SIGN_EXTEND
13748 || GET_CODE (x) == ZERO_EXTEND)
13749 return REG_P (XEXP (x, 0));
13751 return false;
13754 static bool
13755 aarch64_frint_unspec_p (unsigned int u)
13757 switch (u)
13759 case UNSPEC_FRINTZ:
13760 case UNSPEC_FRINTP:
13761 case UNSPEC_FRINTM:
13762 case UNSPEC_FRINTA:
13763 case UNSPEC_FRINTN:
13764 case UNSPEC_FRINTX:
13765 case UNSPEC_FRINTI:
13766 return true;
13768 default:
13769 return false;
13773 /* Return true iff X is an rtx that will match an extr instruction,
13774 i.e. one described by the *extr<mode>5_insn family of patterns.
13775 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts
13776 involved on success and to NULL_RTX otherwise. */
13778 static bool
13779 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13781 rtx op0, op1;
13782 scalar_int_mode mode;
13783 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13784 return false;
13786 *res_op0 = NULL_RTX;
13787 *res_op1 = NULL_RTX;
13789 if (GET_CODE (x) != IOR)
13790 return false;
13792 op0 = XEXP (x, 0);
13793 op1 = XEXP (x, 1);
13795 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13796 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13798 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13799 if (GET_CODE (op1) == ASHIFT)
13800 std::swap (op0, op1);
13802 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13803 return false;
13805 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13806 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13808 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13809 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13811 *res_op0 = XEXP (op0, 0);
13812 *res_op1 = XEXP (op1, 0);
13813 return true;
13817 return false;
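/* Editorial note (illustrative, not in the original source): e.g. in DImode
   (ior (ashift x (const_int 16)) (lshiftrt y (const_int 48))) passes the
   check above because 16 + 48 == 64; it can be emitted as something like
   "extr xd, xx, xy, #48", with *RES_OP0 set to x and *RES_OP1 to y.  */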
13820 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13821 storing it in *COST. Result is true if the total cost of the operation
13822 has now been calculated. */
13823 static bool
13824 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13826 rtx inner;
13827 rtx comparator;
13828 enum rtx_code cmpcode;
13829 const struct cpu_cost_table *extra_cost
13830 = aarch64_tune_params.insn_extra_cost;
13832 if (COMPARISON_P (op0))
13834 inner = XEXP (op0, 0);
13835 comparator = XEXP (op0, 1);
13836 cmpcode = GET_CODE (op0);
13838 else
13840 inner = op0;
13841 comparator = const0_rtx;
13842 cmpcode = NE;
13845 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13847 /* Conditional branch. */
13848 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13849 return true;
13850 else
13852 if (cmpcode == NE || cmpcode == EQ)
13854 if (comparator == const0_rtx)
13856 /* TBZ/TBNZ/CBZ/CBNZ. */
13857 if (GET_CODE (inner) == ZERO_EXTRACT)
13858 /* TBZ/TBNZ. */
13859 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13860 ZERO_EXTRACT, 0, speed);
13861 else
13862 /* CBZ/CBNZ. */
13863 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13865 return true;
13867 if (register_operand (inner, VOIDmode)
13868 && aarch64_imm24 (comparator, VOIDmode))
13870 /* SUB and SUBS. */
13871 *cost += COSTS_N_INSNS (2);
13872 if (speed)
13873 *cost += extra_cost->alu.arith * 2;
13874 return true;
13877 else if (cmpcode == LT || cmpcode == GE)
13879 /* TBZ/TBNZ. */
13880 if (comparator == const0_rtx)
13881 return true;
13885 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13887 /* CCMP. */
13888 if (GET_CODE (op1) == COMPARE)
13890 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13891 if (XEXP (op1, 1) == const0_rtx)
13892 *cost += 1;
13893 if (speed)
13895 machine_mode mode = GET_MODE (XEXP (op1, 0));
13897 if (GET_MODE_CLASS (mode) == MODE_INT)
13898 *cost += extra_cost->alu.arith;
13899 else
13900 *cost += extra_cost->fp[mode == DFmode].compare;
13902 return true;
13905 /* It's a conditional operation based on the status flags,
13906 so it must be some flavor of CSEL. */
13908 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13909 if (GET_CODE (op1) == NEG
13910 || GET_CODE (op1) == NOT
13911 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13912 op1 = XEXP (op1, 0);
13913 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13915 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13916 op1 = XEXP (op1, 0);
13917 op2 = XEXP (op2, 0);
13919 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13921 inner = XEXP (op1, 0);
13922 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13923 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13924 op1 = XEXP (inner, 0);
13926 else if (op1 == constm1_rtx || op1 == const1_rtx)
13928 /* Use CSINV or CSINC. */
13929 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13930 return true;
13932 else if (op2 == constm1_rtx || op2 == const1_rtx)
13934 /* Use CSINV or CSINC. */
13935 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13936 return true;
13939 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13940 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13941 return true;
13944 /* We don't know what this is, cost all operands. */
13945 return false;
13948 /* Check whether X is a bitfield operation of the form shift + extend that
13949 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13950 operand to which the bitfield operation is applied. Otherwise return
13951 NULL_RTX. */
13953 static rtx
13954 aarch64_extend_bitfield_pattern_p (rtx x)
13956 rtx_code outer_code = GET_CODE (x);
13957 machine_mode outer_mode = GET_MODE (x);
13959 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13960 && outer_mode != SImode && outer_mode != DImode)
13961 return NULL_RTX;
13963 rtx inner = XEXP (x, 0);
13964 rtx_code inner_code = GET_CODE (inner);
13965 machine_mode inner_mode = GET_MODE (inner);
13966 rtx op = NULL_RTX;
13968 switch (inner_code)
13970 case ASHIFT:
13971 if (CONST_INT_P (XEXP (inner, 1))
13972 && (inner_mode == QImode || inner_mode == HImode))
13973 op = XEXP (inner, 0);
13974 break;
13975 case LSHIFTRT:
13976 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13977 && (inner_mode == QImode || inner_mode == HImode))
13978 op = XEXP (inner, 0);
13979 break;
13980 case ASHIFTRT:
13981 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13982 && (inner_mode == QImode || inner_mode == HImode))
13983 op = XEXP (inner, 0);
13984 break;
13985 default:
13986 break;
13989 return op;
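/* Editorial note (illustrative, not in the original source): e.g.
   (zero_extend:SI (ashift:HI (reg) (const_int 3))) and
   (sign_extend:SI (ashiftrt:QI (reg) (const_int 2))) both return the inner
   (reg), corresponding to UBFIZ and SBFX respectively.  */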
13992 /* Return true if the mask and a shift amount from an RTX of the form
13993 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13994 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13996 bool
13997 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13998 rtx shft_amnt)
14000 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
14001 && INTVAL (mask) > 0
14002 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
14003 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
14004 && (UINTVAL (mask)
14005 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
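/* Editorial note (illustrative, not in the original source): e.g. with
   MODE == SImode, MASK == 0xff00 and SHFT_AMNT == 8 the predicate above
   holds: the mask shifted right by 8 is the contiguous value 0xff and no
   masked bit lies below the shift, so (x << 8) & 0xff00 can become
   something like "ubfiz w0, w1, #8, #8".  */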
14008 /* Return true if the masks and a shift amount from an RTX of the form
14009 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
14010 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
14012 bool
14013 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
14014 unsigned HOST_WIDE_INT mask1,
14015 unsigned HOST_WIDE_INT shft_amnt,
14016 unsigned HOST_WIDE_INT mask2)
14018 unsigned HOST_WIDE_INT t;
14020 /* Verify that there is no overlap in what bits are set in the two masks. */
14021 if (mask1 != ~mask2)
14022 return false;
14024 /* Verify that mask2 is not all zeros or ones. */
14025 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
14026 return false;
14028 /* The shift amount should always be less than the mode size. */
14029 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
14031 /* Verify that the mask being shifted is contiguous and would be in the
14032 least significant bits after shifting by shft_amnt. */
14033 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
14034 return (t == (t & -t));
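/* Editorial note (illustrative, not in the original source): e.g. with
   MASK2 == 0xff00, SHFT_AMNT == 8 and MASK1 == ~0xff00 the function above
   returns true, since 0xff00 + (1 << 8) is a power of two; the combination
   corresponds to a BFI inserting an 8-bit field at bit position 8.  */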
14037 /* Return true if X is an RTX representing an operation in the ABD family
14038 of instructions. */
14040 static bool
14041 aarch64_abd_rtx_p (rtx x)
14043 if (GET_CODE (x) != MINUS)
14044 return false;
14045 rtx max_arm = XEXP (x, 0);
14046 rtx min_arm = XEXP (x, 1);
14047 if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
14048 return false;
14049 bool signed_p = GET_CODE (max_arm) == SMAX;
14050 if (signed_p && GET_CODE (min_arm) != SMIN)
14051 return false;
14052 else if (!signed_p && GET_CODE (min_arm) != UMIN)
14053 return false;
14055 rtx maxop0 = XEXP (max_arm, 0);
14056 rtx maxop1 = XEXP (max_arm, 1);
14057 rtx minop0 = XEXP (min_arm, 0);
14058 rtx minop1 = XEXP (min_arm, 1);
14059 return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
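/* Editorial note (illustrative, not in the original source): e.g.
   (minus (smax a b) (smin a b)) satisfies the checks above and corresponds
   to an SABD-style absolute-difference operation; the umax/umin form
   corresponds to UABD.  */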
14062 /* Calculate the cost of calculating X, storing it in *COST. Result
14063 is true if the total cost of the operation has now been calculated. */
14064 static bool
14065 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
14066 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
14068 rtx op0, op1, op2;
14069 const struct cpu_cost_table *extra_cost
14070 = aarch64_tune_params.insn_extra_cost;
14071 rtx_code code = GET_CODE (x);
14072 scalar_int_mode int_mode;
14074 /* By default, assume that everything has equivalent cost to the
14075 cheapest instruction. Any additional costs are applied as a delta
14076 above this default. */
14077 *cost = COSTS_N_INSNS (1);
14079 switch (code)
14081 case SET:
14082 /* The cost depends entirely on the operands to SET. */
14083 *cost = 0;
14084 op0 = SET_DEST (x);
14085 op1 = SET_SRC (x);
14087 switch (GET_CODE (op0))
14089 case MEM:
14090 if (speed)
14092 rtx address = XEXP (op0, 0);
14093 if (VECTOR_MODE_P (mode))
14094 *cost += extra_cost->ldst.storev;
14095 else if (GET_MODE_CLASS (mode) == MODE_INT)
14096 *cost += extra_cost->ldst.store;
14097 else if (mode == SFmode || mode == SDmode)
14098 *cost += extra_cost->ldst.storef;
14099 else if (mode == DFmode || mode == DDmode)
14100 *cost += extra_cost->ldst.stored;
14102 *cost +=
14103 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14104 0, speed));
14107 *cost += rtx_cost (op1, mode, SET, 1, speed);
14108 return true;
14110 case SUBREG:
14111 if (! REG_P (SUBREG_REG (op0)))
14112 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
14114 /* Fall through. */
14115 case REG:
14116 /* The cost is one per vector-register copied. */
14117 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
14119 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
14120 *cost = COSTS_N_INSNS (nregs);
14122 /* const0_rtx is in general free, but we will use an
14123 instruction to set a register to 0. */
14124 else if (REG_P (op1) || op1 == const0_rtx)
14126 /* The cost is 1 per register copied. */
14127 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
14128 *cost = COSTS_N_INSNS (nregs);
14130 else
14131 /* Cost is just the cost of the RHS of the set. */
14132 *cost += rtx_cost (op1, mode, SET, 1, speed);
14133 return true;
14135 case ZERO_EXTRACT:
14136 case SIGN_EXTRACT:
14137 /* Bit-field insertion. Strip any redundant widening of
14138 the RHS to meet the width of the target. */
14139 if (SUBREG_P (op1))
14140 op1 = SUBREG_REG (op1);
14141 if ((GET_CODE (op1) == ZERO_EXTEND
14142 || GET_CODE (op1) == SIGN_EXTEND)
14143 && CONST_INT_P (XEXP (op0, 1))
14144 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
14145 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
14146 op1 = XEXP (op1, 0);
14148 if (CONST_INT_P (op1))
14150 /* MOV immediate is assumed to always be cheap. */
14151 *cost = COSTS_N_INSNS (1);
14153 else
14155 /* BFM. */
14156 if (speed)
14157 *cost += extra_cost->alu.bfi;
14158 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
14161 return true;
14163 default:
14164 /* We can't make sense of this, assume default cost. */
14165 *cost = COSTS_N_INSNS (1);
14166 return false;
14168 return false;
14170 case CONST_INT:
14171 /* If an instruction can incorporate a constant within the
14172 instruction, the instruction's expression avoids calling
14173 rtx_cost() on the constant. If rtx_cost() is called on a
14174 constant, then it is usually because the constant must be
14175 moved into a register by one or more instructions.
14177 The exception is constant 0, which can be expressed
14178 as XZR/WZR and is therefore free. The one case in which 0 is not
14179 free is (set (reg) (const0_rtx)), where we must cost
14180 the move. However, we can catch that when we cost the SET, so
14181 we don't need to consider that here. */
14182 if (x == const0_rtx)
14183 *cost = 0;
14184 else
14186 /* To a first approximation, the cost of building any other
14187 constant is proportional to the number of instructions
14188 required to build that constant. This is true whether we
14189 are compiling for SPEED or otherwise. */
14190 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
14191 ? SImode : DImode;
14192 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
14193 (NULL_RTX, x, false, imode));
14195 return true;
14197 case CONST_DOUBLE:
14199 /* First determine number of instructions to do the move
14200 as an integer constant. */
14201 if (!aarch64_float_const_representable_p (x)
14202 && !aarch64_can_const_movi_rtx_p (x, mode)
14203 && aarch64_float_const_rtx_p (x))
14205 unsigned HOST_WIDE_INT ival;
14206 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
14207 gcc_assert (succeed);
14209 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
14210 ? DImode : SImode;
14211 int ncost = aarch64_internal_mov_immediate
14212 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
14213 *cost += COSTS_N_INSNS (ncost);
14214 return true;
14217 if (speed)
14219 /* mov[df,sf]_aarch64. */
14220 if (aarch64_float_const_representable_p (x))
14221 /* FMOV (scalar immediate). */
14222 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
14223 else if (!aarch64_float_const_zero_rtx_p (x))
14225 /* This will be a load from memory. */
14226 if (mode == DFmode || mode == DDmode)
14227 *cost += extra_cost->ldst.loadd;
14228 else
14229 *cost += extra_cost->ldst.loadf;
14231 else
14232 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14233 or MOV v0.s[0], wzr - neither of which is modeled by the
14234 cost tables. Just use the default cost. */
14239 return true;
14241 case MEM:
14242 if (speed)
14244 /* For loads we want the base cost of a load, plus an
14245 approximation for the additional cost of the addressing
14246 mode. */
14247 rtx address = XEXP (x, 0);
14248 if (VECTOR_MODE_P (mode))
14249 *cost += extra_cost->ldst.loadv;
14250 else if (GET_MODE_CLASS (mode) == MODE_INT)
14251 *cost += extra_cost->ldst.load;
14252 else if (mode == SFmode || mode == SDmode)
14253 *cost += extra_cost->ldst.loadf;
14254 else if (mode == DFmode || mode == DDmode)
14255 *cost += extra_cost->ldst.loadd;
14257 *cost +=
14258 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14259 0, speed));
14262 return true;
14264 case NEG:
14265 op0 = XEXP (x, 0);
14267 if (VECTOR_MODE_P (mode))
14269 /* Many vector comparison operations are represented as NEG
14270 of a comparison. */
14271 if (COMPARISON_P (op0))
14273 rtx op00 = XEXP (op0, 0);
14274 rtx op01 = XEXP (op0, 1);
14275 machine_mode inner_mode = GET_MODE (op00);
14276 /* FACGE/FACGT. */
14277 if (GET_MODE_CLASS (inner_mode) == MODE_VECTOR_FLOAT
14278 && GET_CODE (op00) == ABS
14279 && GET_CODE (op01) == ABS)
14281 op00 = XEXP (op00, 0);
14282 op01 = XEXP (op01, 0);
14284 *cost += rtx_cost (op00, inner_mode, GET_CODE (op0), 0, speed);
14285 *cost += rtx_cost (op01, inner_mode, GET_CODE (op0), 1, speed);
14286 if (speed)
14287 *cost += extra_cost->vect.alu;
14288 return true;
14290 if (speed)
14292 /* FNEG. */
14293 *cost += extra_cost->vect.alu;
14295 return false;
14298 if (GET_MODE_CLASS (mode) == MODE_INT)
14300 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14301 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14303 /* CSETM. */
14304 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14305 return true;
14308 /* Cost this as SUB wzr, X. */
14309 op0 = CONST0_RTX (mode);
14310 op1 = XEXP (x, 0);
14311 goto cost_minus;
14314 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14316 /* Support (neg(fma...)) as a single instruction only if
14317 sign of zeros is unimportant. This matches the decision
14318 making in aarch64.md. */
14319 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14321 /* FNMADD. */
14322 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14323 return true;
14325 if (GET_CODE (op0) == MULT)
14327 /* FNMUL. */
14328 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14329 return true;
14331 if (speed)
14332 /* FNEG. */
14333 *cost += extra_cost->fp[mode == DFmode].neg;
14334 return false;
14337 return false;
14339 case CLRSB:
14340 case CLZ:
14341 if (speed)
14343 if (VECTOR_MODE_P (mode))
14344 *cost += extra_cost->vect.alu;
14345 else
14346 *cost += extra_cost->alu.clz;
14349 return false;
14351 case CTZ:
14352 if (VECTOR_MODE_P (mode))
14354 *cost = COSTS_N_INSNS (3);
14355 if (speed)
14356 *cost += extra_cost->vect.alu * 3;
14358 else if (TARGET_CSSC)
14360 *cost = COSTS_N_INSNS (1);
14361 if (speed)
14362 *cost += extra_cost->alu.clz;
14364 else
14366 *cost = COSTS_N_INSNS (2);
14367 if (speed)
14368 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14370 return false;
14372 case COMPARE:
14373 op0 = XEXP (x, 0);
14374 op1 = XEXP (x, 1);
14376 if (op1 == const0_rtx
14377 && GET_CODE (op0) == AND)
14379 x = op0;
14380 mode = GET_MODE (op0);
14381 goto cost_logic;
14384 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14386 /* TODO: A write to the CC flags possibly costs extra; this
14387 needs encoding in the cost tables. */
14389 mode = GET_MODE (op0);
14390 /* ANDS. */
14391 if (GET_CODE (op0) == AND)
14393 x = op0;
14394 goto cost_logic;
14397 if (GET_CODE (op0) == PLUS)
14399 /* ADDS (and CMN alias). */
14400 x = op0;
14401 goto cost_plus;
14404 if (GET_CODE (op0) == MINUS)
14406 /* SUBS. */
14407 x = op0;
14408 goto cost_minus;
14411 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14412 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14413 && CONST_INT_P (XEXP (op0, 2)))
14415 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14416 Handle it here directly rather than going to cost_logic
14417 since we know the immediate generated for the TST is valid
14418 so we can avoid creating an intermediate rtx for it only
14419 for costing purposes. */
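 /* For example (illustrative only): a ZERO_EXTRACT of 8 bits starting at
    bit 0, compared against zero, corresponds to TST x0, #0xff - the
    extracted field simply becomes the TST immediate mask. */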
14420 if (speed)
14421 *cost += extra_cost->alu.logical;
14423 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14424 ZERO_EXTRACT, 0, speed);
14425 return true;
14428 if (GET_CODE (op1) == NEG)
14430 /* CMN. */
14431 if (speed)
14432 *cost += extra_cost->alu.arith;
14434 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14435 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14436 return true;
14439 /* CMP.
14441 Compare can freely swap the order of operands, and
14442 canonicalization puts the more complex operation first.
14443 But the integer MINUS logic expects the shift/extend
14444 operation in op1. */
14445 if (! (REG_P (op0)
14446 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14448 op0 = XEXP (x, 1);
14449 op1 = XEXP (x, 0);
14451 goto cost_minus;
14454 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14456 /* FCMP. */
14457 if (speed)
14458 *cost += extra_cost->fp[mode == DFmode].compare;
14460 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14462 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14463 /* FCMP supports constant 0.0 for no extra cost. */
14464 return true;
14466 return false;
14469 if (VECTOR_MODE_P (mode))
14471 /* Vector compare. */
14472 if (speed)
14473 *cost += extra_cost->vect.alu;
14475 if (aarch64_float_const_zero_rtx_p (op1))
14477 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14478 cost. */
14479 return true;
14481 return false;
14483 return false;
14485 case MINUS:
14487 op0 = XEXP (x, 0);
14488 op1 = XEXP (x, 1);
14490 cost_minus:
14491 if (VECTOR_MODE_P (mode))
14493 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14494 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14496 /* Recognise the SABD and UABD operation here.
14497 Recursion from the PLUS case will catch the accumulating
14498 forms. */
14499 if (aarch64_abd_rtx_p (x))
14501 if (speed)
14502 *cost += extra_cost->vect.alu;
14503 return true;
14505 /* SUBL2 and SUBW2.
14506 The select-operand-high-half versions of the sub instruction
14507 have the same cost as the regular three-operand vector version,
14508 so don't add the cost of the select to the cost of the sub. */
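 /* As an illustrative example: subtracting the sign-extended high halves
    of two V8HI inputs is a single SSUBL2, so the vec_selects that pick
    the high halves should not be costed on top of the subtract itself. */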
14510 op0 = aarch64_strip_extend_vec_half (op0);
14511 op1 = aarch64_strip_extend_vec_half (op1);
14515 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14517 /* Detect valid immediates. */
14518 if ((GET_MODE_CLASS (mode) == MODE_INT
14519 || (GET_MODE_CLASS (mode) == MODE_CC
14520 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14521 && CONST_INT_P (op1)
14522 && aarch64_uimm12_shift (INTVAL (op1)))
14524 if (speed)
14525 /* SUB(S) (immediate). */
14526 *cost += extra_cost->alu.arith;
14527 return true;
14530 /* Look for SUB (extended register). */
14531 if (is_a <scalar_int_mode> (mode)
14532 && aarch64_rtx_arith_op_extract_p (op1))
14534 if (speed)
14535 *cost += extra_cost->alu.extend_arith;
14537 op1 = aarch64_strip_extend (op1, true);
14538 *cost += rtx_cost (op1, VOIDmode,
14539 (enum rtx_code) GET_CODE (op1), 0, speed);
14540 return true;
14543 rtx new_op1 = aarch64_strip_extend (op1, false);
14545 /* Cost this as an FMA-alike operation. */
14546 if ((GET_CODE (new_op1) == MULT
14547 || aarch64_shift_p (GET_CODE (new_op1)))
14548 && code != COMPARE)
14550 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14551 (enum rtx_code) code,
14552 speed);
14553 return true;
14556 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14558 if (speed)
14560 if (VECTOR_MODE_P (mode))
14562 /* Vector SUB. */
14563 *cost += extra_cost->vect.alu;
14565 else if (GET_MODE_CLASS (mode) == MODE_INT)
14567 /* SUB(S). */
14568 *cost += extra_cost->alu.arith;
14570 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14572 /* FSUB. */
14573 *cost += extra_cost->fp[mode == DFmode].addsub;
14576 return true;
14579 case PLUS:
14581 rtx new_op0;
14583 op0 = XEXP (x, 0);
14584 op1 = XEXP (x, 1);
14586 cost_plus:
14587 if (VECTOR_MODE_P (mode))
14589 /* ADDL2 and ADDW2. */
14590 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14591 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14593 /* The select-operand-high-half versions of the add instruction
14594 have the same cost as the regular three-operand vector version,
14595 so don't add the cost of the select to the cost of the add. */
14597 op0 = aarch64_strip_extend_vec_half (op0);
14598 op1 = aarch64_strip_extend_vec_half (op1);
14602 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14603 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14605 /* CSINC. */
14606 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14607 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14608 return true;
14611 if (GET_MODE_CLASS (mode) == MODE_INT
14612 && (aarch64_plus_immediate (op1, mode)
14613 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14615 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14617 if (speed)
14619 /* ADD (immediate). */
14620 *cost += extra_cost->alu.arith;
14622 /* Some tunings prefer not to use the VL-based scalar ops.
14623 Increase the cost of the poly immediate to prevent their
14624 formation. */
14625 if (GET_CODE (op1) == CONST_POLY_INT
14626 && (aarch64_tune_params.extra_tuning_flags
14627 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14628 *cost += COSTS_N_INSNS (1);
14630 return true;
14633 if (aarch64_pluslong_immediate (op1, mode))
14635 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14636 if ((INTVAL (op1) & 0xfff) != 0)
14637 *cost += COSTS_N_INSNS (1);
14639 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14640 return true;
14643 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14645 /* Look for ADD (extended register). */
14646 if (is_a <scalar_int_mode> (mode)
14647 && aarch64_rtx_arith_op_extract_p (op0))
14649 if (speed)
14650 *cost += extra_cost->alu.extend_arith;
14652 op0 = aarch64_strip_extend (op0, true);
14653 *cost += rtx_cost (op0, VOIDmode,
14654 (enum rtx_code) GET_CODE (op0), 0, speed);
14655 return true;
14658 /* Strip any extend; leave shifts behind, as we will
14659 cost them through mult_cost. */
14660 new_op0 = aarch64_strip_extend (op0, false);
14662 if (GET_CODE (new_op0) == MULT
14663 || aarch64_shift_p (GET_CODE (new_op0)))
14665 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14666 speed);
14667 return true;
14670 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14672 if (speed)
14674 if (VECTOR_MODE_P (mode))
14676 /* Vector ADD. */
14677 *cost += extra_cost->vect.alu;
14679 else if (GET_MODE_CLASS (mode) == MODE_INT)
14681 /* ADD. */
14682 *cost += extra_cost->alu.arith;
14684 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14686 /* FADD. */
14687 *cost += extra_cost->fp[mode == DFmode].addsub;
14690 return true;
14693 case BITREVERSE:
14694 case BSWAP:
14695 *cost = COSTS_N_INSNS (1);
14697 if (speed)
14699 if (VECTOR_MODE_P (mode))
14700 *cost += extra_cost->vect.alu;
14701 else
14702 *cost += extra_cost->alu.rev;
14704 return false;
14706 case IOR:
14707 if (aarch_rev16_p (x))
14709 *cost = COSTS_N_INSNS (1);
14711 if (speed)
14713 if (VECTOR_MODE_P (mode))
14714 *cost += extra_cost->vect.alu;
14715 else
14716 *cost += extra_cost->alu.rev;
14718 return true;
14721 if (aarch64_extr_rtx_p (x, &op0, &op1))
14723 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14724 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14725 if (speed)
14726 *cost += extra_cost->alu.shift;
14728 return true;
14730 /* Fall through. */
14731 case XOR:
14732 case AND:
14733 cost_logic:
14734 op0 = XEXP (x, 0);
14735 op1 = XEXP (x, 1);
14737 if (VECTOR_MODE_P (mode))
14739 if (speed)
14740 *cost += extra_cost->vect.alu;
14741 return true;
14744 if (code == AND
14745 && GET_CODE (op0) == MULT
14746 && CONST_INT_P (XEXP (op0, 1))
14747 && CONST_INT_P (op1)
14748 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14749 INTVAL (op1)) != 0)
14751 /* This is a UBFM/SBFM. */
14752 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14753 if (speed)
14754 *cost += extra_cost->alu.bfx;
14755 return true;
14758 if (is_int_mode (mode, &int_mode))
14760 if (CONST_INT_P (op1))
14762 /* We have a mask + shift version of a UBFIZ
14763 i.e. the *andim_ashift<mode>_bfiz pattern. */
14764 if (GET_CODE (op0) == ASHIFT
14765 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14766 XEXP (op0, 1)))
14768 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14769 (enum rtx_code) code, 0, speed);
14770 if (speed)
14771 *cost += extra_cost->alu.bfx;
14773 return true;
14775 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14777 /* We possibly get the immediate for free; this is not
14778 modelled. */
14779 *cost += rtx_cost (op0, int_mode,
14780 (enum rtx_code) code, 0, speed);
14781 if (speed)
14782 *cost += extra_cost->alu.logical;
14784 return true;
14787 else
14789 rtx new_op0 = op0;
14791 /* Handle ORN, EON, or BIC. */
14792 if (GET_CODE (op0) == NOT)
14793 op0 = XEXP (op0, 0);
14795 new_op0 = aarch64_strip_shift (op0);
14797 /* If we had a shift on op0 then this is a logical-shift-
14798 by-register/immediate operation. Otherwise, this is just
14799 a logical operation. */
14800 if (speed)
14802 if (new_op0 != op0)
14804 /* Shift by immediate. */
14805 if (CONST_INT_P (XEXP (op0, 1)))
14806 *cost += extra_cost->alu.log_shift;
14807 else
14808 *cost += extra_cost->alu.log_shift_reg;
14810 else
14811 *cost += extra_cost->alu.logical;
14814 /* In both cases we want to cost both operands. */
14815 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14816 0, speed);
14817 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14818 1, speed);
14820 return true;
14823 return false;
14825 case NOT:
14826 x = XEXP (x, 0);
14827 op0 = aarch64_strip_shift (x);
14829 if (VECTOR_MODE_P (mode))
14831 /* Vector NOT. */
14832 *cost += extra_cost->vect.alu;
14833 return false;
14836 /* MVN-shifted-reg. */
14837 if (op0 != x)
14839 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14841 if (speed)
14842 *cost += extra_cost->alu.log_shift;
14844 return true;
14846 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
14847 Handle the second form here, taking care that 'a' in the above can
14848 be a shift. */
14849 else if (GET_CODE (op0) == XOR)
14851 rtx newop0 = XEXP (op0, 0);
14852 rtx newop1 = XEXP (op0, 1);
14853 rtx op0_stripped = aarch64_strip_shift (newop0);
14855 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14856 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14858 if (speed)
14860 if (op0_stripped != newop0)
14861 *cost += extra_cost->alu.log_shift;
14862 else
14863 *cost += extra_cost->alu.logical;
14866 return true;
14868 /* MVN. */
14869 if (speed)
14870 *cost += extra_cost->alu.logical;
14872 return false;
14874 case ZERO_EXTEND:
14876 op0 = XEXP (x, 0);
14877 /* If a value is written in SI mode, then zero extended to DI
14878 mode, the operation will in general be free, as a write to
14879 a 'w' register implicitly zeroes the upper bits of an 'x'
14880 register. However, if this is
14882 (set (reg) (zero_extend (reg)))
14884 we must cost the explicit register move. */
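 /* Illustratively: ADD W0, W1, W2 already clears bits 63:32 of X0, so a
    following (zero_extend:DI ...) of that result is free, whereas a bare
    (set (reg:DI) (zero_extend:DI (reg:SI))) still needs an explicit MOV
    of the W register. */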
14885 if (mode == DImode
14886 && GET_MODE (op0) == SImode)
14888 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14890 /* If OP_COST is non-zero, then the cost of the zero extend
14891 is effectively the cost of the inner operation. Otherwise
14892 we have a MOV instruction and we take the cost from the MOV
14893 itself. This is true independently of whether we are
14894 optimizing for space or time. */
14895 if (op_cost)
14896 *cost = op_cost;
14898 return true;
14900 else if (MEM_P (op0))
14902 /* All loads can zero extend to any size for free. */
14903 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14904 return true;
14907 op0 = aarch64_extend_bitfield_pattern_p (x);
14908 if (op0)
14910 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14911 if (speed)
14912 *cost += extra_cost->alu.bfx;
14913 return true;
14916 if (speed)
14918 if (VECTOR_MODE_P (mode))
14920 /* UMOV. */
14921 *cost += extra_cost->vect.alu;
14923 else
14925 /* We generate an AND instead of UXTB/UXTH. */
14926 *cost += extra_cost->alu.logical;
14929 return false;
14931 case SIGN_EXTEND:
14932 if (MEM_P (XEXP (x, 0)))
14934 /* LDRSH. */
14935 if (speed)
14937 rtx address = XEXP (XEXP (x, 0), 0);
14938 *cost += extra_cost->ldst.load_sign_extend;
14940 *cost +=
14941 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14942 0, speed));
14944 return true;
14947 op0 = aarch64_extend_bitfield_pattern_p (x);
14948 if (op0)
14950 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14951 if (speed)
14952 *cost += extra_cost->alu.bfx;
14953 return true;
14956 if (speed)
14958 if (VECTOR_MODE_P (mode))
14959 *cost += extra_cost->vect.alu;
14960 else
14961 *cost += extra_cost->alu.extend;
14963 return false;
14965 case ROTATE:
14966 case ROTATERT:
14967 case LSHIFTRT:
14968 case ASHIFTRT:
14969 case ASHIFT:
14970 op0 = XEXP (x, 0);
14971 op1 = XEXP (x, 1);
14973 if (CONST_INT_P (op1))
14975 if (speed)
14977 if (VECTOR_MODE_P (mode))
14979 /* Vector shift (immediate). */
14980 *cost += extra_cost->vect.alu;
14982 else
14984 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
14985 These are all aliases. */
14986 *cost += extra_cost->alu.shift;
14990 /* We can incorporate zero/sign extend for free. */
14991 if (GET_CODE (op0) == ZERO_EXTEND
14992 || GET_CODE (op0) == SIGN_EXTEND)
14993 op0 = XEXP (op0, 0);
14995 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14996 return true;
14998 else
15000 if (VECTOR_MODE_P (mode))
15002 if (speed)
15003 /* Vector shift (register). */
15004 *cost += extra_cost->vect.alu;
15006 else
15008 if (speed)
15009 /* LSLV, ASRV. */
15010 *cost += extra_cost->alu.shift_reg;
15012 /* The register shift amount may be in a shorter mode expressed
15013 as a lowpart SUBREG. For costing purposes just look inside. */
15014 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
15015 op1 = SUBREG_REG (op1);
15016 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
15017 && CONST_INT_P (XEXP (op1, 1))
15018 && known_eq (INTVAL (XEXP (op1, 1)),
15019 GET_MODE_BITSIZE (mode) - 1))
15021 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
15022 /* We already demanded XEXP (op1, 0) to be REG_P, so
15023 don't recurse into it. */
15024 return true;
15027 return false; /* All arguments need to be in registers. */
15030 case SYMBOL_REF:
15032 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
15033 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
15035 /* LDR. */
15036 if (speed)
15037 *cost += extra_cost->ldst.load;
15039 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
15040 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
15042 /* ADRP, followed by ADD. */
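 /* Illustratively, a symbol address in the small code model is formed as
    ADRP x0, sym followed by ADD x0, x0, :lo12:sym, i.e. two ALU
    instructions. */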
15043 *cost += COSTS_N_INSNS (1);
15044 if (speed)
15045 *cost += 2 * extra_cost->alu.arith;
15047 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
15048 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
15050 /* ADR. */
15051 if (speed)
15052 *cost += extra_cost->alu.arith;
15055 if (flag_pic)
15057 /* One extra load instruction, after accessing the GOT. */
15058 *cost += COSTS_N_INSNS (1);
15059 if (speed)
15060 *cost += extra_cost->ldst.load;
15062 return true;
15064 case HIGH:
15065 case LO_SUM:
15066 /* ADRP/ADD (immediate). */
15067 if (speed)
15068 *cost += extra_cost->alu.arith;
15069 return true;
15071 case ZERO_EXTRACT:
15072 case SIGN_EXTRACT:
15073 /* UBFX/SBFX. */
15074 if (speed)
15076 if (VECTOR_MODE_P (mode))
15077 *cost += extra_cost->vect.alu;
15078 else
15079 *cost += extra_cost->alu.bfx;
15082 /* We can trust that the immediates used will be correct (there
15083 are no by-register forms), so we need only cost op0. */
15084 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
15085 return true;
15087 case MULT:
15088 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
15089 /* aarch64_rtx_mult_cost always handles recursion to its
15090 operands. */
15091 return true;
15093 case MOD:
15094 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15095 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
15096 that of an unconditional negate. This case should only ever be reached through
15097 the set_smod_pow2_cheap check in expmed.cc. */
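 /* A sketch of the expansion for x % 4 in SImode (register choices are
    illustrative only):
      negs  w1, w0
      and   w0, w0, 3
      and   w1, w1, 3
      csneg w0, w0, w1, mi
    i.e. four instructions, matching the COSTS_N_INSNS (4) baseline set
    below. */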
15098 if (CONST_INT_P (XEXP (x, 1))
15099 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
15100 && (mode == SImode || mode == DImode))
15102 /* We expand to 4 instructions. Reset the baseline. */
15103 *cost = COSTS_N_INSNS (4);
15105 if (speed)
15106 *cost += 2 * extra_cost->alu.logical
15107 + 2 * extra_cost->alu.arith;
15109 return true;
15112 /* Fall-through. */
15113 case UMOD:
15114 if (speed)
15116 /* Slightly prefer UMOD over SMOD. */
15117 if (VECTOR_MODE_P (mode))
15118 *cost += extra_cost->vect.alu;
15119 else if (GET_MODE_CLASS (mode) == MODE_INT)
15120 *cost += (extra_cost->mult[mode == DImode].add
15121 + extra_cost->mult[mode == DImode].idiv
15122 + (code == MOD ? 1 : 0));
15124 return false; /* All arguments need to be in registers. */
15126 case DIV:
15127 case UDIV:
15128 case SQRT:
15129 if (speed)
15131 if (VECTOR_MODE_P (mode))
15132 *cost += extra_cost->vect.alu;
15133 else if (GET_MODE_CLASS (mode) == MODE_INT)
15134 /* There is no integer SQRT, so only DIV and UDIV can get
15135 here. */
15136 *cost += (extra_cost->mult[mode == DImode].idiv
15137 /* Slightly prefer UDIV over SDIV. */
15138 + (code == DIV ? 1 : 0));
15139 else
15140 *cost += extra_cost->fp[mode == DFmode].div;
15142 return false; /* All arguments need to be in registers. */
15144 case IF_THEN_ELSE:
15145 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
15146 XEXP (x, 2), cost, speed);
15148 case EQ:
15149 case NE:
15150 case GT:
15151 case GTU:
15152 case LT:
15153 case LTU:
15154 case GE:
15155 case GEU:
15156 case LE:
15157 case LEU:
15159 return false; /* All arguments must be in registers. */
15161 case FMA:
15162 op0 = XEXP (x, 0);
15163 op1 = XEXP (x, 1);
15164 op2 = XEXP (x, 2);
15166 if (speed)
15168 if (VECTOR_MODE_P (mode))
15169 *cost += extra_cost->vect.alu;
15170 else
15171 *cost += extra_cost->fp[mode == DFmode].fma;
15174 /* FMSUB, FNMADD, and FNMSUB are free. */
15175 if (GET_CODE (op0) == NEG)
15176 op0 = XEXP (op0, 0);
15178 if (GET_CODE (op2) == NEG)
15179 op2 = XEXP (op2, 0);
15181 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15182 and the by-element operand as operand 0. */
15183 if (GET_CODE (op1) == NEG)
15184 op1 = XEXP (op1, 0);
15186 /* Catch vector-by-element operations. The by-element operand can
15187 either be (vec_duplicate (vec_select (x))) or just
15188 (vec_select (x)), depending on whether we are multiplying by
15189 a vector or a scalar.
15191 Canonicalization is not very good in these cases: FMA4 will put the
15192 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
15193 if (GET_CODE (op0) == VEC_DUPLICATE)
15194 op0 = XEXP (op0, 0);
15195 else if (GET_CODE (op1) == VEC_DUPLICATE)
15196 op1 = XEXP (op1, 0);
15198 if (GET_CODE (op0) == VEC_SELECT)
15199 op0 = XEXP (op0, 0);
15200 else if (GET_CODE (op1) == VEC_SELECT)
15201 op1 = XEXP (op1, 0);
15203 /* If the remaining parameters are not registers,
15204 get the cost to put them into registers. */
15205 *cost += rtx_cost (op0, mode, FMA, 0, speed);
15206 *cost += rtx_cost (op1, mode, FMA, 1, speed);
15207 *cost += rtx_cost (op2, mode, FMA, 2, speed);
15208 return true;
15210 case FLOAT:
15211 case UNSIGNED_FLOAT:
15212 if (speed)
15213 *cost += extra_cost->fp[mode == DFmode].fromint;
15214 return false;
15216 case FLOAT_EXTEND:
15217 if (speed)
15219 if (VECTOR_MODE_P (mode))
15221 /* Vector widening conversion. */
15222 *cost += extra_cost->vect.alu;
15224 else
15225 *cost += extra_cost->fp[mode == DFmode].widen;
15227 return false;
15229 case FLOAT_TRUNCATE:
15230 if (speed)
15232 if (VECTOR_MODE_P (mode))
15234 /* Vector conversion. */
15235 *cost += extra_cost->vect.alu;
15237 else
15238 *cost += extra_cost->fp[mode == DFmode].narrow;
15240 return false;
15242 case FIX:
15243 case UNSIGNED_FIX:
15244 x = XEXP (x, 0);
15245 /* Strip the rounding part. They will all be implemented
15246 by the fcvt* family of instructions anyway. */
15247 if (GET_CODE (x) == UNSPEC)
15249 unsigned int uns_code = XINT (x, 1);
15251 if (uns_code == UNSPEC_FRINTA
15252 || uns_code == UNSPEC_FRINTM
15253 || uns_code == UNSPEC_FRINTN
15254 || uns_code == UNSPEC_FRINTP
15255 || uns_code == UNSPEC_FRINTZ)
15256 x = XVECEXP (x, 0, 0);
15259 if (speed)
15261 if (VECTOR_MODE_P (mode))
15262 *cost += extra_cost->vect.alu;
15263 else
15264 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15267 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15268 fixed-point fcvt. */
15269 if (GET_CODE (x) == MULT
15270 && ((VECTOR_MODE_P (mode)
15271 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15272 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15274 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15275 0, speed);
15276 return true;
15279 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
15280 return true;
15282 case ABS:
15283 if (VECTOR_MODE_P (mode))
15285 /* ABS (vector). */
15286 if (speed)
15287 *cost += extra_cost->vect.alu;
15289 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15291 op0 = XEXP (x, 0);
15293 /* FABD, which is analogous to FADD. */
15294 if (GET_CODE (op0) == MINUS)
15296 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15297 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15298 if (speed)
15299 *cost += extra_cost->fp[mode == DFmode].addsub;
15301 return true;
15303 /* Simple FABS is analogous to FNEG. */
15304 if (speed)
15305 *cost += extra_cost->fp[mode == DFmode].neg;
15307 else
15309 /* Integer ABS will either be split into
15310 two arithmetic instructions, or will be an ABS
15311 (scalar), which we don't model. */
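 /* For example (an informal sketch), a scalar integer abs is typically
    expanded along the lines of
      cmp   w0, 0
      csneg w0, w0, w0, ge
    when the single-instruction form is not available. */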
15312 *cost = COSTS_N_INSNS (2);
15313 if (speed)
15314 *cost += 2 * extra_cost->alu.arith;
15316 return false;
15318 case SMAX:
15319 case SMIN:
15320 if (speed)
15322 if (VECTOR_MODE_P (mode))
15323 *cost += extra_cost->vect.alu;
15324 else
15326 /* FMAXNM/FMINNM/FMAX/FMIN.
15327 TODO: This may not be accurate for all implementations, but
15328 we do not model this in the cost tables. */
15329 *cost += extra_cost->fp[mode == DFmode].addsub;
15332 return false;
15334 case UNSPEC:
15335 /* The floating point round to integer frint* instructions. */
15336 if (aarch64_frint_unspec_p (XINT (x, 1)))
15338 if (speed)
15339 *cost += extra_cost->fp[mode == DFmode].roundint;
15341 return false;
15343 break;
15345 case TRUNCATE:
15347 /* Decompose <su>muldi3_highpart. */
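 /* For example (source-level sketch): the high 64 bits of a 64x64->128-bit
    multiply, i.e. (uint64_t) (((unsigned __int128) a * b) >> 64), match
    this pattern and map to a single UMULH (SMULH for the signed case). */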
15348 if (/* (truncate:DI */
15349 mode == DImode
15350 /* (lshiftrt:TI */
15351 && GET_MODE (XEXP (x, 0)) == TImode
15352 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15353 /* (mult:TI */
15354 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15355 /* (ANY_EXTEND:TI (reg:DI))
15356 (ANY_EXTEND:TI (reg:DI))) */
15357 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15358 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15359 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15360 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15361 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15362 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15363 /* (const_int 64) */
15364 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15365 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15367 /* UMULH/SMULH. */
15368 if (speed)
15369 *cost += extra_cost->mult[mode == DImode].extend;
15370 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15371 mode, MULT, 0, speed);
15372 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15373 mode, MULT, 1, speed);
15374 return true;
15376 break;
15377 case CONST_VECTOR:
15379 /* Load using MOVI/MVNI. */
15380 if (aarch64_simd_valid_immediate (x, NULL))
15381 *cost = extra_cost->vect.movi;
15382 else /* Load using constant pool. */
15383 *cost = extra_cost->ldst.load;
15384 break;
15386 case VEC_CONCAT:
15387 /* Depending on the operation, either DUP or INS.
15388 For now, keep default costing. */
15389 break;
15390 case VEC_DUPLICATE:
15391 /* Load using a DUP. */
15392 *cost = extra_cost->vect.dup;
15393 return false;
15394 case VEC_SELECT:
15396 rtx op0 = XEXP (x, 0);
15397 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15399 /* Cost subreg of 0 as free, otherwise as DUP. */
15400 rtx op1 = XEXP (x, 1);
15401 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15403 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15404 *cost = extra_cost->vect.dup;
15405 else
15406 *cost = extra_cost->vect.extract;
15407 return true;
15409 default:
15410 break;
15413 if (dump_file
15414 && flag_aarch64_verbose_cost)
15415 fprintf (dump_file,
15416 "\nFailed to cost RTX. Assuming default cost.\n");
15418 return true;
15421 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15422 calculated for X. This cost is stored in *COST. Returns true
15423 if the total cost of X was calculated. */
15424 static bool
15425 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15426 int param, int *cost, bool speed)
15428 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15430 if (dump_file
15431 && flag_aarch64_verbose_cost)
15433 print_rtl_single (dump_file, x);
15434 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15435 speed ? "Hot" : "Cold",
15436 *cost, result ? "final" : "partial");
15439 return result;
15442 static int
15443 aarch64_register_move_cost (machine_mode mode,
15444 reg_class_t from_i, reg_class_t to_i)
15446 enum reg_class from = (enum reg_class) from_i;
15447 enum reg_class to = (enum reg_class) to_i;
15448 const struct cpu_regmove_cost *regmove_cost
15449 = aarch64_tune_params.regmove_cost;
15451 /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS. */
15452 if (reg_class_subset_p (to, POINTER_REGS))
15453 to = GENERAL_REGS;
15455 if (reg_class_subset_p (from, POINTER_REGS))
15456 from = GENERAL_REGS;
15458 /* Make RDFFR very expensive. In particular, if we know that the FFR
15459 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15460 as a way of obtaining a PTRUE. */
15461 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15462 && hard_reg_set_subset_p (reg_class_contents[from_i],
15463 reg_class_contents[FFR_REGS]))
15464 return 80;
15466 /* Moving between a GPR and the stack register costs the same as GP2GP. */
15467 if ((from == GENERAL_REGS && to == STACK_REG)
15468 || (to == GENERAL_REGS && from == STACK_REG))
15469 return regmove_cost->GP2GP;
15471 /* To/from the stack register, we move via the GPRs. */
15472 if (to == STACK_REG || from == STACK_REG)
15473 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15474 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15476 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15477 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15478 && known_eq (GET_MODE_SIZE (mode), 16))
15480 /* 128-bit operations on general registers require 2 instructions. */
15481 if (from == GENERAL_REGS && to == GENERAL_REGS)
15482 return regmove_cost->GP2GP * 2;
15483 else if (from == GENERAL_REGS)
15484 return regmove_cost->GP2FP * 2;
15485 else if (to == GENERAL_REGS)
15486 return regmove_cost->FP2GP * 2;
15488 /* When AdvSIMD instructions are disabled it is not possible to move
15489 a 128-bit value directly between Q registers. This is handled in
15490 secondary reload. A general register is used as a scratch to move
15491 the upper DI value and the lower DI value is moved directly,
15492 hence the cost is the sum of three moves. */
15493 if (!TARGET_SIMD && !TARGET_SVE)
15494 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15496 return regmove_cost->FP2FP;
15499 if (from == GENERAL_REGS && to == GENERAL_REGS)
15500 return regmove_cost->GP2GP;
15501 else if (from == GENERAL_REGS)
15502 return regmove_cost->GP2FP;
15503 else if (to == GENERAL_REGS)
15504 return regmove_cost->FP2GP;
15506 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15508 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15509 The cost must be greater than 2 units to indicate that direct
15510 moves aren't possible. */
15511 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15512 + aarch64_tune_params.memmov_cost.store_fp);
15513 return MIN (CEIL (per_vector, 2), 4);
15516 return regmove_cost->FP2FP;
15519 /* Implements TARGET_MEMORY_MOVE_COST. */
15520 static int
15521 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15523 enum reg_class rclass = (enum reg_class) rclass_i;
15524 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15525 ? reg_classes_intersect_p (rclass, PR_REGS)
15526 : reg_class_subset_p (rclass, PR_REGS))
15527 return (in
15528 ? aarch64_tune_params.memmov_cost.load_pred
15529 : aarch64_tune_params.memmov_cost.store_pred);
15531 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15532 ? reg_classes_intersect_p (rclass, FP_REGS)
15533 : reg_class_subset_p (rclass, FP_REGS))
15534 return (in
15535 ? aarch64_tune_params.memmov_cost.load_fp
15536 : aarch64_tune_params.memmov_cost.store_fp);
15538 return (in
15539 ? aarch64_tune_params.memmov_cost.load_int
15540 : aarch64_tune_params.memmov_cost.store_int);
15543 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15544 much more productive here, such as using insn attributes to cost things.
15545 But we don't, not yet.
15547 The main point of this current definition is to make calling insn_cost
15548 on one instruction equivalent to calling seq_cost on a sequence that
15549 contains only that instruction. The default definition would instead
15550 only look at SET_SRCs, ignoring SET_DESTs.
15552 This ensures that, for example, storing a 128-bit zero vector is more
15553 expensive than storing a 128-bit vector register. A move of zero
15554 into a 128-bit vector register followed by multiple stores of that
15555 register is then cheaper than multiple stores of zero (which would
15556 use STP of XZR). This in turn allows STP Qs to be formed. */
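 /* An illustrative consequence: clearing 32 bytes as
      movi v0.4s, #0
      stp  q0, q0, [x0]
    is preferred over repeated STP XZR, XZR pairs once the MOVI has been
    paid for, because stores of a vector register are costed as cheaper
    than stores of the zero constant itself. */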
15557 static int
15558 aarch64_insn_cost (rtx_insn *insn, bool speed)
15560 if (rtx set = single_set (insn))
15561 return set_rtx_cost (set, speed);
15562 return pattern_cost (PATTERN (insn), speed);
15565 /* Implement TARGET_INIT_BUILTINS. */
15566 static void
15567 aarch64_init_builtins ()
15569 aarch64_general_init_builtins ();
15570 aarch64_sve::init_builtins ();
15571 #ifdef SUBTARGET_INIT_BUILTINS
15572 SUBTARGET_INIT_BUILTINS;
15573 #endif
15576 /* Implement TARGET_FOLD_BUILTIN. */
15577 static tree
15578 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15580 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15581 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15582 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15583 switch (code & AARCH64_BUILTIN_CLASS)
15585 case AARCH64_BUILTIN_GENERAL:
15586 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15588 case AARCH64_BUILTIN_SVE:
15589 return NULL_TREE;
15591 gcc_unreachable ();
15594 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15595 static bool
15596 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15598 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15599 tree fndecl = gimple_call_fndecl (stmt);
15600 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15601 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15602 gimple *new_stmt = NULL;
15603 switch (code & AARCH64_BUILTIN_CLASS)
15605 case AARCH64_BUILTIN_GENERAL:
15606 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15607 break;
15609 case AARCH64_BUILTIN_SVE:
15610 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15611 break;
15614 if (!new_stmt)
15615 return false;
15617 gsi_replace (gsi, new_stmt, false);
15618 return true;
15621 /* Implement TARGET_EXPAND_BUILTIN. */
15622 static rtx
15623 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15625 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15626 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15627 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15628 switch (code & AARCH64_BUILTIN_CLASS)
15630 case AARCH64_BUILTIN_GENERAL:
15631 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15633 case AARCH64_BUILTIN_SVE:
15634 return aarch64_sve::expand_builtin (subcode, exp, target);
15636 gcc_unreachable ();
15639 /* Implement TARGET_BUILTIN_DECL. */
15640 static tree
15641 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15643 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15644 switch (code & AARCH64_BUILTIN_CLASS)
15646 case AARCH64_BUILTIN_GENERAL:
15647 return aarch64_general_builtin_decl (subcode, initialize_p);
15649 case AARCH64_BUILTIN_SVE:
15650 return aarch64_sve::builtin_decl (subcode, initialize_p);
15652 gcc_unreachable ();
15655 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15656 to optimize 1.0/sqrt. */
15658 static bool
15659 use_rsqrt_p (machine_mode mode)
15661 return (!flag_trapping_math
15662 && flag_unsafe_math_optimizations
15663 && ((aarch64_tune_params.approx_modes->recip_sqrt
15664 & AARCH64_APPROX_MODE (mode))
15665 || flag_mrecip_low_precision_sqrt));
15668 /* Function to decide when to use the approximate reciprocal square root
15669 builtin. */
15671 static tree
15672 aarch64_builtin_reciprocal (tree fndecl)
15674 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15676 if (!use_rsqrt_p (mode))
15677 return NULL_TREE;
15678 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15679 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15680 switch (code & AARCH64_BUILTIN_CLASS)
15682 case AARCH64_BUILTIN_GENERAL:
15683 return aarch64_general_builtin_rsqrt (subcode);
15685 case AARCH64_BUILTIN_SVE:
15686 return NULL_TREE;
15688 gcc_unreachable ();
15691 /* Emit code to perform the floating-point operation:
15693 DST = SRC1 * SRC2
15695 where all three operands are already known to be registers.
15696 If the operation is an SVE one, PTRUE is a suitable all-true
15697 predicate. */
15699 static void
15700 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15702 if (ptrue)
15703 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15704 dst, ptrue, src1, src2,
15705 gen_int_mode (SVE_RELAXED_GP, SImode)));
15706 else
15707 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15710 /* Emit instruction sequence to compute either the approximate square root
15711 or its approximate reciprocal, depending on the flag RECP, and return
15712 whether the sequence was emitted or not. */
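 /* An informal sketch of the math: the code below performs Newton-Raphson
    iteration for 1/sqrt(d). Starting from the FRSQRTE estimate x0, each
    step computes x_{n+1} = x_n * (3 - d * x_n^2) / 2, with FRSQRTS
    supplying the (3 - a * b) / 2 term. */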
15714 bool
15715 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15717 machine_mode mode = GET_MODE (dst);
15719 if (GET_MODE_INNER (mode) == HFmode)
15721 gcc_assert (!recp);
15722 return false;
15725 if (!recp)
15727 if (!(flag_mlow_precision_sqrt
15728 || (aarch64_tune_params.approx_modes->sqrt
15729 & AARCH64_APPROX_MODE (mode))))
15730 return false;
15732 if (!flag_finite_math_only
15733 || flag_trapping_math
15734 || !flag_unsafe_math_optimizations
15735 || optimize_function_for_size_p (cfun))
15736 return false;
15738 else
15739 /* Caller assumes we cannot fail. */
15740 gcc_assert (use_rsqrt_p (mode));
15742 rtx pg = NULL_RTX;
15743 if (aarch64_sve_mode_p (mode))
15744 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15745 machine_mode mmsk = (VECTOR_MODE_P (mode)
15746 ? related_int_vector_mode (mode).require ()
15747 : int_mode_for_mode (mode).require ());
15748 rtx xmsk = NULL_RTX;
15749 if (!recp)
15751 /* When calculating the approximate square root, compare the
15752 argument with 0.0 and create a mask. */
15753 rtx zero = CONST0_RTX (mode);
15754 if (pg)
15756 xmsk = gen_reg_rtx (GET_MODE (pg));
15757 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15758 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15759 xmsk, pg, hint, src, zero));
15761 else
15763 xmsk = gen_reg_rtx (mmsk);
15764 emit_insn (gen_rtx_SET (xmsk,
15765 gen_rtx_NEG (mmsk,
15766 gen_rtx_EQ (mmsk, src, zero))));
15770 /* Estimate the approximate reciprocal square root. */
15771 rtx xdst = gen_reg_rtx (mode);
15772 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15774 /* Iterate over the series twice for SF and thrice for DF. */
15775 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15777 /* Optionally iterate over the series once less, trading some accuracy
15778 for faster performance. */
15779 if ((recp && flag_mrecip_low_precision_sqrt)
15780 || (!recp && flag_mlow_precision_sqrt))
15781 iterations--;
15783 /* Iterate over the series to calculate the approximate reciprocal square
15784 root. */
15785 rtx x1 = gen_reg_rtx (mode);
15786 while (iterations--)
15788 rtx x2 = gen_reg_rtx (mode);
15789 aarch64_emit_mult (x2, pg, xdst, xdst);
15791 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15793 if (iterations > 0)
15794 aarch64_emit_mult (xdst, pg, xdst, x1);
15797 if (!recp)
15799 if (pg)
15800 /* Multiply nonzero source values by the corresponding intermediate
15801 result elements, so that the final calculation is the approximate
15802 square root rather than its reciprocal. Select a zero result for
15803 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15804 otherwise. */
15805 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15806 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15807 else
15809 /* Qualify the approximate reciprocal square root when the
15810 argument is 0.0 by squashing the intermediate result to 0.0. */
15811 rtx xtmp = gen_reg_rtx (mmsk);
15812 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15813 gen_rtx_SUBREG (mmsk, xdst, 0)));
15814 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15816 /* Calculate the approximate square root. */
15817 aarch64_emit_mult (xdst, pg, xdst, src);
15821 /* Finalize the approximation. */
15822 aarch64_emit_mult (dst, pg, xdst, x1);
15824 return true;
15827 /* Emit the instruction sequence to compute the approximation for the division
15828 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
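 /* An informal sketch of the math: the code below performs Newton-Raphson
    iteration for 1/DEN. Starting from the FRECPE estimate x0, each step
    computes x_{n+1} = x_n * (2 - DEN * x_n), with FRECPS supplying the
    (2 - a * b) term. */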
15830 bool
15831 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15833 machine_mode mode = GET_MODE (quo);
15835 if (GET_MODE_INNER (mode) == HFmode)
15836 return false;
15838 bool use_approx_division_p = (flag_mlow_precision_div
15839 || (aarch64_tune_params.approx_modes->division
15840 & AARCH64_APPROX_MODE (mode)));
15842 if (!flag_finite_math_only
15843 || flag_trapping_math
15844 || !flag_unsafe_math_optimizations
15845 || optimize_function_for_size_p (cfun)
15846 || !use_approx_division_p)
15847 return false;
15849 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15850 return false;
15852 rtx pg = NULL_RTX;
15853 if (aarch64_sve_mode_p (mode))
15854 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15856 /* Estimate the approximate reciprocal. */
15857 rtx xrcp = gen_reg_rtx (mode);
15858 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15860 /* Iterate over the series twice for SF and thrice for DF. */
15861 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15863 /* Optionally iterate over the series fewer times, trading some accuracy
15864 for faster performance. The default is 2 for DF and 1 for SF. */
15865 if (flag_mlow_precision_div)
15866 iterations = (GET_MODE_INNER (mode) == DFmode
15867 ? aarch64_double_recp_precision
15868 : aarch64_float_recp_precision);
15870 /* Iterate over the series to calculate the approximate reciprocal. */
15871 rtx xtmp = gen_reg_rtx (mode);
15872 while (iterations--)
15874 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15876 if (iterations > 0)
15877 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15880 if (num != CONST1_RTX (mode))
15882 /* As the approximate reciprocal of DEN is already calculated, only
15883 calculate the approximate division when NUM is not 1.0. */
15884 rtx xnum = force_reg (mode, num);
15885 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15888 /* Finalize the approximation. */
15889 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15890 return true;
15893 /* Return the number of instructions that can be issued per cycle. */
15894 static int
15895 aarch64_sched_issue_rate (void)
15897 return aarch64_tune_params.issue_rate;
15900 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15901 static int
15902 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15904 if (DEBUG_INSN_P (insn))
15905 return more;
15907 rtx_code code = GET_CODE (PATTERN (insn));
15908 if (code == USE || code == CLOBBER)
15909 return more;
15911 if (get_attr_type (insn) == TYPE_NO_INSN)
15912 return more;
15914 return more - 1;
15917 static int
15918 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15920 int issue_rate = aarch64_sched_issue_rate ();
15922 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15926 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15927 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15928 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15930 static int
15931 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15932 int ready_index)
15934 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15938 /* Vectorizer cost model target hooks. */
15940 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15941 return the decl that should be recorded. Return null otherwise. */
15942 tree
15943 aarch64_vector_load_decl (tree addr)
15945 if (TREE_CODE (addr) != ADDR_EXPR)
15946 return NULL_TREE;
15947 tree base = get_base_address (TREE_OPERAND (addr, 0));
15948 if (TREE_CODE (base) != VAR_DECL)
15949 return NULL_TREE;
15950 return base;
15953 /* Return true if STMT_INFO accesses a decl that is known to be the
15954 argument to a vld1 in the same function. */
15955 static bool
15956 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15958 if (!cfun->machine->vector_load_decls)
15959 return false;
15960 auto dr = STMT_VINFO_DATA_REF (stmt_info);
15961 if (!dr)
15962 return false;
15963 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15964 return decl && cfun->machine->vector_load_decls->contains (decl);
15967 /* Information about how the CPU would issue the scalar, Advanced SIMD
15968 or SVE version of a vector loop, using the scheme defined by the
15969 aarch64_base_vec_issue_info hierarchy of structures. */
15970 class aarch64_vec_op_count
15972 public:
15973 aarch64_vec_op_count () = default;
15974 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15975 unsigned int = 1);
15977 unsigned int vec_flags () const { return m_vec_flags; }
15978 unsigned int vf_factor () const { return m_vf_factor; }
15980 const aarch64_base_vec_issue_info *base_issue_info () const;
15981 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15982 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15984 fractional_cost rename_cycles_per_iter () const;
15985 fractional_cost min_nonpred_cycles_per_iter () const;
15986 fractional_cost min_pred_cycles_per_iter () const;
15987 fractional_cost min_cycles_per_iter () const;
15989 void dump () const;
15991 /* The number of individual "general" operations. See the comments
15992 in aarch64_base_vec_issue_info for details. */
15993 unsigned int general_ops = 0;
15995 /* The number of load and store operations, under the same scheme
15996 as above. */
15997 unsigned int loads = 0;
15998 unsigned int stores = 0;
16000 /* The minimum number of cycles needed to execute all loop-carried
16001 operations, which in the vector code become associated with
16002 reductions. */
16003 unsigned int reduction_latency = 0;
16005 /* The number of individual predicate operations. See the comments
16006 in aarch64_sve_vec_issue_info for details. */
16007 unsigned int pred_ops = 0;
16009 private:
16010 /* The issue information for the core. */
16011 const aarch64_vec_issue_info *m_issue_info = nullptr;
16013 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
16014 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
16015 Advanced SIMD code.
16016 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
16017 SVE code. */
16018 unsigned int m_vec_flags = 0;
16020 /* Assume that, when the code is executing on the core described
16021 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
16022 times more data than the vectorizer anticipates.
16024 This is only ever different from 1 for SVE. It allows us to consider
16025 what would happen on a 256-bit SVE target even when the -mtune
16026 parameters say that the “likely” SVE length is 128 bits. */
16027 unsigned int m_vf_factor = 1;
16030 aarch64_vec_op_count::
16031 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
16032 unsigned int vec_flags, unsigned int vf_factor)
16033 : m_issue_info (issue_info),
16034 m_vec_flags (vec_flags),
16035 m_vf_factor (vf_factor)
16039 /* Return the base issue information (i.e. the parts that make sense
16040 for both scalar and vector code). Return null if we have no issue
16041 information. */
16042 const aarch64_base_vec_issue_info *
16043 aarch64_vec_op_count::base_issue_info () const
16045 if (auto *ret = simd_issue_info ())
16046 return ret;
16047 return m_issue_info->scalar;
16050 /* If the structure describes vector code and we have associated issue
16051 information, return that issue information, otherwise return null. */
16052 const aarch64_simd_vec_issue_info *
16053 aarch64_vec_op_count::simd_issue_info () const
16055 if (auto *ret = sve_issue_info ())
16056 return ret;
16057 if (m_vec_flags)
16058 return m_issue_info->advsimd;
16059 return nullptr;
16062 /* If the structure describes SVE code and we have associated issue
16063 information, return that issue information, otherwise return null. */
16064 const aarch64_sve_vec_issue_info *
16065 aarch64_vec_op_count::sve_issue_info () const
16067 if (m_vec_flags & VEC_ANY_SVE)
16068 return m_issue_info->sve;
16069 return nullptr;
16072 /* Estimate the minimum number of cycles per iteration needed to rename
16073 the instructions.
16075 ??? For now this is done inline rather than via cost tables, since it
16076 isn't clear how it should be parameterized for the general case. */
16077 fractional_cost
16078 aarch64_vec_op_count::rename_cycles_per_iter () const
16080 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
16081 || sve_issue_info () == &neoversen2_sve_issue_info
16082 || sve_issue_info () == &neoversev2_sve_issue_info)
16083 /* + 1 for an addition. We've already counted a general op for each
16084 store, so we don't need to account for stores separately. The branch
16085 reads no registers and so does not need to be counted either.
16087 ??? This value is very much on the pessimistic side, but seems to work
16088 pretty well in practice. */
16089 return { general_ops + loads + pred_ops + 1, 5 };
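 /* As a worked example (the operation counts are illustrative only): with
    6 general ops, 2 loads and 2 predicate ops per iteration, the estimate
    above is (6 + 2 + 2 + 1) / 5 = 2.2 cycles per iteration spent on
    renaming. */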
16091 return 0;
16094 /* Like min_cycles_per_iter, but excluding predicate operations. */
16095 fractional_cost
16096 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
16098 auto *issue_info = base_issue_info ();
16100 fractional_cost cycles = MAX (reduction_latency, 1);
16101 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
16102 cycles = std::max (cycles, { loads + stores,
16103 issue_info->loads_stores_per_cycle });
16104 cycles = std::max (cycles, { general_ops,
16105 issue_info->general_ops_per_cycle });
16106 cycles = std::max (cycles, rename_cycles_per_iter ());
16107 return cycles;
16110 /* Like min_cycles_per_iter, but including only the predicate operations. */
16111 fractional_cost
16112 aarch64_vec_op_count::min_pred_cycles_per_iter () const
16114 if (auto *issue_info = sve_issue_info ())
16115 return { pred_ops, issue_info->pred_ops_per_cycle };
16116 return 0;
16119 /* Estimate the minimum number of cycles needed to issue the operations.
16120 This is a very simplistic model! */
16121 fractional_cost
16122 aarch64_vec_op_count::min_cycles_per_iter () const
16124 return std::max (min_nonpred_cycles_per_iter (),
16125 min_pred_cycles_per_iter ());
16128 /* Dump information about the structure. */
16129 void
16130 aarch64_vec_op_count::dump () const
16132 dump_printf_loc (MSG_NOTE, vect_location,
16133 " load operations = %d\n", loads);
16134 dump_printf_loc (MSG_NOTE, vect_location,
16135 " store operations = %d\n", stores);
16136 dump_printf_loc (MSG_NOTE, vect_location,
16137 " general operations = %d\n", general_ops);
16138 if (sve_issue_info ())
16139 dump_printf_loc (MSG_NOTE, vect_location,
16140 " predicate operations = %d\n", pred_ops);
16141 dump_printf_loc (MSG_NOTE, vect_location,
16142 " reduction latency = %d\n", reduction_latency);
16143 if (auto rcpi = rename_cycles_per_iter ())
16144 dump_printf_loc (MSG_NOTE, vect_location,
16145 " estimated cycles per iteration to rename = %f\n",
16146 rcpi.as_double ());
16147 if (auto pred_cpi = min_pred_cycles_per_iter ())
16149 dump_printf_loc (MSG_NOTE, vect_location,
16150 " estimated min cycles per iteration"
16151 " without predication = %f\n",
16152 min_nonpred_cycles_per_iter ().as_double ());
16153 dump_printf_loc (MSG_NOTE, vect_location,
16154 " estimated min cycles per iteration"
16155 " for predication = %f\n", pred_cpi.as_double ());
16157 if (auto cpi = min_cycles_per_iter ())
16158 dump_printf_loc (MSG_NOTE, vect_location,
16159 " estimated min cycles per iteration = %f\n",
16160 cpi.as_double ());
16163 /* Information about vector code that we're in the process of costing. */
16164 class aarch64_vector_costs : public vector_costs
16166 public:
16167 aarch64_vector_costs (vec_info *, bool);
16169 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
16170 stmt_vec_info stmt_info, slp_tree, tree vectype,
16171 int misalign,
16172 vect_cost_model_location where) override;
16173 void finish_cost (const vector_costs *) override;
16174 bool better_main_loop_than_p (const vector_costs *other) const override;
16176 private:
16177 void record_potential_advsimd_unrolling (loop_vec_info);
16178 void analyze_loop_vinfo (loop_vec_info);
16179 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
16180 aarch64_vec_op_count *);
16181 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
16182 fractional_cost, unsigned int,
16183 unsigned int *, bool *);
16184 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
16185 unsigned int);
16186 bool prefer_unrolled_loop () const;
16187 unsigned int determine_suggested_unroll_factor ();
16189 /* True if we have performed one-time initialization based on the
16190 vec_info. */
16191 bool m_analyzed_vinfo = false;
16193 /* This loop uses an average operation that is not supported by SVE, but is
16194 supported by Advanced SIMD and SVE2. */
16195 bool m_has_avg = false;
16197 /* True if the vector body contains a store to a decl and if the
16198 function is known to have a vld1 from the same decl.
16200 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16201 initializing a vector is:
16203 float f[4] = { elts };
16204 float32x4_t x = vld1q_f32(f);
16206 We should strongly prefer vectorization of the initialization of f,
16207 so that the store to f and the load back can be optimized away,
16208 leaving a vectorization of { elts }. */
16209 bool m_stores_to_vector_load_decl = false;
16211 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16212 In this case the value is the number of insns in the last operation.
16214 On AArch64 vector promotions and demotions require us to first widen or
16215 narrow the input and only after that emit conversion instructions. For
16216 costing this means we need to emit the cost of the final conversions as
16217 well. */
16218 unsigned int m_num_last_promote_demote = 0;
16220 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16221 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16222 SIMD code.
16223 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16224 unsigned int m_vec_flags = 0;
16226 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16227 This means that code such as:
16229 a[0] = x;
16230 a[1] = x;
16232 will be costed as two scalar instructions and two vector instructions
16233 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16234 wins if the costs are equal, because of the fact that the vector costs
16235 include constant initializations whereas the scalar costs don't.
16236 We would therefore tend to vectorize the code above, even though
16237 the scalar version can use a single STP.
16239 We should eventually fix this and model LDP and STP in the main costs;
16240 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16241 Until then, we look specifically for code that does nothing more than
16242 STP-like operations. We cost them on that basis in addition to the
16243 normal latency-based costs.
16245 If the scalar or vector code could be a sequence of STPs +
16246 initialization, this variable counts the cost of the sequence,
16247 with 2 units per instruction. The variable is ~0U for other
16248 kinds of code. */
16249 unsigned int m_stp_sequence_cost = 0;
16251 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16252 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16253 situations, we try to predict whether an Advanced SIMD implementation
16254 of the loop could be completely unrolled and become straight-line code.
16255 If so, it is generally better to use the Advanced SIMD version rather
16256 than length-agnostic SVE, since the SVE loop would execute an unknown
16257 number of times and so could not be completely unrolled in the same way.
16259 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16260 number of Advanced SIMD loop iterations that would be unrolled and
16261 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16262 in the unrolled loop. Both values are zero if we're not applying
16263 the heuristic. */
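/* As an illustration (figures purely illustrative): a loop known to
   execute 32 scalar iterations with an Advanced SIMD VF of 4 would need
   only 8 vector iterations, which the unroller can turn into
   straight-line code, whereas the trip count of the equivalent
   length-agnostic SVE loop is unknown at compile time.  */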
16264 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
16265 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
16267 /* If we're vectorizing a loop that executes a constant number of times,
16268 this variable gives the number of times that the vector loop would
16269 iterate, otherwise it is zero. */
16270 uint64_t m_num_vector_iterations = 0;
16272 /* Used only when vectorizing loops. Estimates the number and kind of
16273 operations that would be needed by one iteration of the scalar
16274 or vector loop. There is one entry for each tuning option of
16275 interest. */
16276 auto_vec<aarch64_vec_op_count, 2> m_ops;
16279 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
16280 bool costing_for_scalar)
16281 : vector_costs (vinfo, costing_for_scalar),
16282 m_vec_flags (costing_for_scalar ? 0
16283 : aarch64_classify_vector_mode (vinfo->vector_mode))
16285 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
16287 m_ops.quick_push ({ issue_info, m_vec_flags });
16288 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16290 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
16291 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
16292 vf_factor });
16297 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
16298 vector_costs *
16299 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
16301 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16304 /* Return true if the current CPU should use the new costs defined
16305 in GCC 11. This should be removed for GCC 12 and above, with the
16306 costs applying to all CPUs instead. */
16307 static bool
16308 aarch64_use_new_vector_costs_p ()
16310 return (aarch64_tune_params.extra_tuning_flags
16311 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16314 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16315 static const simd_vec_cost *
16316 aarch64_simd_vec_costs (tree vectype)
16318 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16319 if (vectype != NULL
16320 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16321 && costs->sve != NULL)
16322 return costs->sve;
16323 return costs->advsimd;
16326 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16327 static const simd_vec_cost *
16328 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16330 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16331 if ((flags & VEC_ANY_SVE) && costs->sve)
16332 return costs->sve;
16333 return costs->advsimd;
16336 /* If STMT_INFO is a memory reference, return the scalar memory type,
16337 otherwise return null. */
16338 static tree
16339 aarch64_dr_type (stmt_vec_info stmt_info)
16341 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16342 return TREE_TYPE (DR_REF (dr));
16343 return NULL_TREE;
16346 /* Decide whether to use the unrolling heuristic described above
16347 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16348 describes the loop that we're vectorizing. */
16349 void
16350 aarch64_vector_costs::
16351 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16353 /* The heuristic only makes sense on targets that have the same
16354 vector throughput for SVE and Advanced SIMD. */
16355 if (!(aarch64_tune_params.extra_tuning_flags
16356 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16357 return;
16359 /* We only want to apply the heuristic if LOOP_VINFO is being
16360 vectorized for SVE. */
16361 if (!(m_vec_flags & VEC_ANY_SVE))
16362 return;
16364 /* Check whether it is possible in principle to use Advanced SIMD
16365 instead. */
16366 if (aarch64_autovec_preference == 2)
16367 return;
16369 /* We don't want to apply the heuristic to outer loops, since it's
16370 harder to track two levels of unrolling. */
16371 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16372 return;
16374 /* Only handle cases in which the number of Advanced SIMD iterations
16375 would be known at compile time but the number of SVE iterations
16376 would not. */
16377 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16378 || aarch64_sve_vg.is_constant ())
16379 return;
16381 /* Guess how many times the Advanced SIMD loop would iterate and make
16382 sure that it is within the complete unrolling limit. Even if the
16383 number of iterations is small enough, the number of statements might
16384 not be, which is why we need to estimate the number of statements too. */
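/* For example (figures purely illustrative): an estimated SVE VQ of 2
   and an SVE costing VF of 8 give an Advanced SIMD VF of CEIL (8, 2) = 4,
   so a loop with 64 known scalar iterations would need 64 / 4 = 16
   unrolled Advanced SIMD iterations, which is then checked against
   param_max_completely_peel_times below.  */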
16385 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16386 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16387 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16388 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16389 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16390 return;
16392 /* Record that we're applying the heuristic and should try to estimate
16393 the number of statements in the Advanced SIMD loop. */
16394 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
16397 /* Do one-time initialization of the aarch64_vector_costs given that we're
16398 costing the loop vectorization described by LOOP_VINFO. */
16399 void
16400 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16402 /* Record the number of times that the vector loop would execute,
16403 if known. */
16404 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16405 auto scalar_niters = max_stmt_executions_int (loop);
16406 if (scalar_niters >= 0)
16408 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16409 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16410 m_num_vector_iterations = scalar_niters / vf;
16411 else
16412 m_num_vector_iterations = CEIL (scalar_niters, vf);
16415 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16416 heuristic described above m_unrolled_advsimd_niters. */
16417 record_potential_advsimd_unrolling (loop_vinfo);
16420 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16421 static int
16422 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16423 tree vectype,
16424 int misalign ATTRIBUTE_UNUSED)
16426 unsigned elements;
16427 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16428 bool fp = false;
16430 if (vectype != NULL)
16431 fp = FLOAT_TYPE_P (vectype);
16433 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16435 switch (type_of_cost)
16437 case scalar_stmt:
16438 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16440 case scalar_load:
16441 return costs->scalar_load_cost;
16443 case scalar_store:
16444 return costs->scalar_store_cost;
16446 case vector_stmt:
16447 return fp ? simd_costs->fp_stmt_cost
16448 : simd_costs->int_stmt_cost;
16450 case vector_load:
16451 return simd_costs->align_load_cost;
16453 case vector_store:
16454 return simd_costs->store_cost;
16456 case vec_to_scalar:
16457 return simd_costs->vec_to_scalar_cost;
16459 case scalar_to_vec:
16460 return simd_costs->scalar_to_vec_cost;
16462 case unaligned_load:
16463 case vector_gather_load:
16464 return simd_costs->unalign_load_cost;
16466 case unaligned_store:
16467 case vector_scatter_store:
16468 return simd_costs->unalign_store_cost;
16470 case cond_branch_taken:
16471 return costs->cond_taken_branch_cost;
16473 case cond_branch_not_taken:
16474 return costs->cond_not_taken_branch_cost;
16476 case vec_perm:
16477 return simd_costs->permute_cost;
16479 case vec_promote_demote:
16480 return fp ? simd_costs->fp_stmt_cost
16481 : simd_costs->int_stmt_cost;
16483 case vec_construct:
16484 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16485 return elements / 2 + 1;
16487 default:
16488 gcc_unreachable ();
16492 /* If an access of kind KIND for STMT_INFO represents one vector of an
16493 LD[234] or ST[234] operation, return the total number of vectors
16494 (2, 3 or 4) involved, otherwise return a value outside that range. */
16495 static int
16496 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16498 if ((kind == vector_load
16499 || kind == unaligned_load
16500 || kind == vector_store
16501 || kind == unaligned_store)
16502 && STMT_VINFO_DATA_REF (stmt_info))
16504 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16505 if (stmt_info
16506 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16507 return DR_GROUP_SIZE (stmt_info);
16509 return 0;
16512 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16513 vectors would produce a series of LDP or STP operations. KIND is the
16514 kind of statement that STMT_INFO represents. */
16515 static bool
16516 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16517 stmt_vec_info stmt_info)
16519 switch (kind)
16521 case vector_load:
16522 case vector_store:
16523 case unaligned_load:
16524 case unaligned_store:
16525 break;
16527 default:
16528 return false;
16531 return is_gimple_assign (stmt_info->stmt);
16534 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16535 or multiply-subtract sequence that might be suitable for fusing into a
16536 single instruction. If VEC_FLAGS is zero, analyze the operation as
16537 a scalar one, otherwise analyze it as an operation on vectors with those
16538 VEC_* flags. */
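/* For instance, the two-statement sequence

     _1 = b_2 * c_3;
     x_4 = a_1 + _1;

   is a candidate for fusing into a single FMLA/MLA (illustrative gimple;
   the SSA names are arbitrary).  */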
16539 static bool
16540 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16541 unsigned int vec_flags)
16543 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16544 if (!assign)
16545 return false;
16546 tree_code code = gimple_assign_rhs_code (assign);
16547 if (code != PLUS_EXPR && code != MINUS_EXPR)
16548 return false;
16550 auto is_mul_result = [&](int i)
16552 tree rhs = gimple_op (assign, i);
16553 /* ??? Should we try to check for a single use as well? */
16554 if (TREE_CODE (rhs) != SSA_NAME)
16555 return false;
16557 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16558 if (!def_stmt_info
16559 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16560 return false;
16561 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16562 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16563 return false;
16565 if (vec_flags & VEC_ADVSIMD)
16567 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16568 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16569 only supports MLA forms, so will require a move if the result
16570 cannot be tied to the accumulator. The most important case in
16571 which this is true is when the accumulator input is invariant. */
16572 rhs = gimple_op (assign, 3 - i);
16573 if (TREE_CODE (rhs) != SSA_NAME)
16574 return false;
16575 def_stmt_info = vinfo->lookup_def (rhs);
16576 if (!def_stmt_info
16577 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
16578 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
16579 return false;
16582 return true;
16585 if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
16586 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16587 multiplication must be on the second operand (to form an FMLS).
16588 But if both operands are multiplications and the second operand
16589 is used more than once, we'll instead negate the second operand
16590 and use it as an accumulator for the first operand. */
16591 return (is_mul_result (2)
16592 && (has_single_use (gimple_assign_rhs2 (assign))
16593 || !is_mul_result (1)));
16595 return is_mul_result (1) || is_mul_result (2);
16598 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16599 expression sequence that might be suitable for fusing into a
16600 single instruction. If VEC_FLAGS is zero, analyze the operation as
16601 a scalar one, otherwise analyze it as an operation on vectors with those
16602 VEC_* flags. */
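/* For example, a sequence such as

     _1 = a_2 < b_3;
     _2 = c_4 < d_5;
     _3 = _1 & _2;

   can implement one of the comparisons as an SVE predicated compare,
   making the AND effectively free (illustrative gimple; the SSA names
   are arbitrary).  */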
16604 static bool
16605 aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
16606 unsigned int vec_flags)
16608 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16609 if (!assign
16610 || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
16611 || !STMT_VINFO_VECTYPE (stmt_info)
16612 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
16613 return false;
16615 for (int i = 1; i < 3; ++i)
16617 tree rhs = gimple_op (assign, i);
16619 if (TREE_CODE (rhs) != SSA_NAME)
16620 continue;
16622 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16623 if (!def_stmt_info
16624 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16625 continue;
16627 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16628 if (!rhs_assign
16629 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
16630 != tcc_comparison)
16631 continue;
16633 if (vec_flags & VEC_ADVSIMD)
16634 return false;
16636 return true;
16638 return false;
16641 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16642 in-loop reduction that SVE supports directly, return its latency in cycles,
16643 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16644 instructions. */
16645 static unsigned int
16646 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16647 stmt_vec_info stmt_info,
16648 const sve_vec_cost *sve_costs)
16650 switch (vect_reduc_type (vinfo, stmt_info))
16652 case EXTRACT_LAST_REDUCTION:
16653 return sve_costs->clast_cost;
16655 case FOLD_LEFT_REDUCTION:
16656 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16658 case E_HFmode:
16659 case E_BFmode:
16660 return sve_costs->fadda_f16_cost;
16662 case E_SFmode:
16663 return sve_costs->fadda_f32_cost;
16665 case E_DFmode:
16666 return sve_costs->fadda_f64_cost;
16668 default:
16669 break;
16671 break;
16674 return 0;
16677 /* STMT_INFO describes a loop-carried operation in the original scalar code
16678 that we are considering implementing as a reduction. Return one of the
16679 following values, depending on VEC_FLAGS:
16681 - If VEC_FLAGS is zero, return the loop carry latency of the original
16682 scalar operation.
16684 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16685 Advanced SIMD implementation.
16687 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16688 SVE implementation. */
16689 static unsigned int
16690 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16691 unsigned int vec_flags)
16693 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16694 const sve_vec_cost *sve_costs = nullptr;
16695 if (vec_flags & VEC_ANY_SVE)
16696 sve_costs = aarch64_tune_params.vec_costs->sve;
16698 /* If the caller is asking for the SVE latency, check for forms of reduction
16699 that only SVE can handle directly. */
16700 if (sve_costs)
16702 unsigned int latency
16703 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16704 if (latency)
16705 return latency;
16708 /* Handle scalar costs. */
16709 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16710 if (vec_flags == 0)
16712 if (is_float)
16713 return vec_costs->scalar_fp_stmt_cost;
16714 return vec_costs->scalar_int_stmt_cost;
16717 /* Otherwise, the loop body just contains normal integer or FP operations,
16718 with a vector reduction outside the loop. */
16719 const simd_vec_cost *simd_costs
16720 = aarch64_simd_vec_costs_for_flags (vec_flags);
16721 if (is_float)
16722 return simd_costs->fp_stmt_cost;
16723 return simd_costs->int_stmt_cost;
16726 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16727 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16728 try to subdivide the target-independent categorization provided by KIND
16729 to get a more accurate cost. */
16730 static fractional_cost
16731 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16732 stmt_vec_info stmt_info,
16733 fractional_cost stmt_cost)
16735 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16736 the extension with the load. */
16737 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16738 return 0;
16740 return stmt_cost;
16743 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16744 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16745 when vectorized would operate on vector type VECTYPE. Try to subdivide
16746 the target-independent categorization provided by KIND to get a more
16747 accurate cost. WHERE specifies where the cost associated with KIND
16748 occurs. */
16749 static fractional_cost
16750 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16751 stmt_vec_info stmt_info, tree vectype,
16752 enum vect_cost_model_location where,
16753 fractional_cost stmt_cost)
16755 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16756 const sve_vec_cost *sve_costs = nullptr;
16757 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16758 sve_costs = aarch64_tune_params.vec_costs->sve;
16760 /* It's generally better to avoid costing inductions, since the induction
16761 will usually be hidden by other operations. This is particularly true
16762 for things like COND_REDUCTIONS. */
16763 if (is_a<gphi *> (stmt_info->stmt))
16764 return 0;
16766 /* Detect cases in which vec_to_scalar is describing the extraction of a
16767 vector element in preparation for a scalar store. The store itself is
16768 costed separately. */
16769 if (vect_is_store_elt_extraction (kind, stmt_info))
16770 return simd_costs->store_elt_extra_cost;
16772 /* Detect SVE gather loads, which are costed as a single scalar_load
16773 for each element. We therefore need to divide the full-instruction
16774 cost by the number of elements in the vector. */
16775 if (kind == scalar_load
16776 && sve_costs
16777 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16779 unsigned int nunits = vect_nunits_for_cost (vectype);
16780 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16781 return { sve_costs->gather_load_x64_cost, nunits };
16782 return { sve_costs->gather_load_x32_cost, nunits };
16785 /* Detect cases in which a scalar_store is really storing one element
16786 in a scatter operation. */
16787 if (kind == scalar_store
16788 && sve_costs
16789 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16790 return sve_costs->scatter_store_elt_cost;
16792 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16793 if (kind == vec_to_scalar
16794 && where == vect_body
16795 && sve_costs)
16797 unsigned int latency
16798 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16799 if (latency)
16800 return latency;
16803 /* Detect cases in which vec_to_scalar represents a single reduction
16804 instruction like FADDP or MAXV. */
16805 if (kind == vec_to_scalar
16806 && where == vect_epilogue
16807 && vect_is_reduction (stmt_info))
16808 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16810 case E_QImode:
16811 return simd_costs->reduc_i8_cost;
16813 case E_HImode:
16814 return simd_costs->reduc_i16_cost;
16816 case E_SImode:
16817 return simd_costs->reduc_i32_cost;
16819 case E_DImode:
16820 return simd_costs->reduc_i64_cost;
16822 case E_HFmode:
16823 case E_BFmode:
16824 return simd_costs->reduc_f16_cost;
16826 case E_SFmode:
16827 return simd_costs->reduc_f32_cost;
16829 case E_DFmode:
16830 return simd_costs->reduc_f64_cost;
16832 default:
16833 break;
16836 /* Otherwise stick with the original categorization. */
16837 return stmt_cost;
16840 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16841 for STMT_INFO, which has cost kind KIND and which when vectorized would
16842 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16843 targets. */
16844 static fractional_cost
16845 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16846 stmt_vec_info stmt_info, tree vectype,
16847 fractional_cost stmt_cost)
16849 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16850 vector register size or number of units. Integer promotions of this
16851 type therefore map to SXT[BHW] or UXT[BHW].
16853 Most loads have extending forms that can do the sign or zero extension
16854 on the fly. Optimistically assume that a load followed by an extension
16855 will fold to this form during combine, and that the extension therefore
16856 comes for free. */
16857 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16858 stmt_cost = 0;
16860 /* For similar reasons, vector_stmt integer truncations are a no-op,
16861 because we can just ignore the unused upper bits of the source. */
16862 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16863 stmt_cost = 0;
16865 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16866 but there are no equivalent instructions for SVE. This means that
16867 (all other things being equal) 128-bit SVE needs twice as many load
16868 and store instructions as Advanced SIMD in order to process vector pairs.
16870 Also, scalar code can often use LDP and STP to access pairs of values,
16871 so it is too simplistic to say that one SVE load or store replaces
16872 VF scalar loads and stores.
16874 Ideally we would account for this in the scalar and Advanced SIMD
16875 costs by making suitable load/store pairs as cheap as a single
16876 load/store. However, that would be a very invasive change and in
16877 practice it tends to stress other parts of the cost model too much.
16878 E.g. stores of scalar constants currently count just a store,
16879 whereas stores of vector constants count a store and a vec_init.
16880 This is an artificial distinction for AArch64, where stores of
16881 nonzero scalar constants need the same kind of register invariant
16882 as vector stores.
16884 An alternative would be to double the cost of any SVE loads and stores
16885 that could be paired in Advanced SIMD (and possibly also paired in
16886 scalar code). But this tends to stress other parts of the cost model
16887 in the same way. It also means that we can fall back to Advanced SIMD
16888 even if full-loop predication would have been useful.
16890 Here we go for a more conservative version: double the costs of SVE
16891 loads and stores if one iteration of the scalar loop processes enough
16892 elements for it to use a whole number of Advanced SIMD LDP or STP
16893 instructions. This makes it very likely that the VF would be 1 for
16894 Advanced SIMD, and so no epilogue should be needed. */
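/* For example (figures purely illustrative): a group of four contiguous
   64-bit elements per scalar iteration covers 256 bits, i.e. exactly two
   Advanced SIMD registers and hence one LDP or STP, so the SVE load or
   store cost is doubled below.  */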
16895 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16897 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16898 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16899 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16900 if (multiple_p (count * elt_bits, 256)
16901 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16902 stmt_cost *= 2;
16905 return stmt_cost;
16908 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16909 and which when vectorized would operate on vector type VECTYPE. Add the
16910 cost of any embedded operations. */
16911 static fractional_cost
16912 aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
16913 stmt_vec_info stmt_info, tree vectype,
16914 unsigned vec_flags, fractional_cost stmt_cost)
16916 if (vectype)
16918 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16920 /* Detect cases in which a vector load or store represents an
16921 LD[234] or ST[234] instruction. */
16922 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16924 case 2:
16925 stmt_cost += simd_costs->ld2_st2_permute_cost;
16926 break;
16928 case 3:
16929 stmt_cost += simd_costs->ld3_st3_permute_cost;
16930 break;
16932 case 4:
16933 stmt_cost += simd_costs->ld4_st4_permute_cost;
16934 break;
16937 gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
16938 if ((kind == scalar_stmt || kind == vector_stmt) && assign)
16940 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
16941 if (!vect_is_reduction (stmt_info)
16942 && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
16943 return 0;
16945 /* For vector boolean ANDs with a compare operand we just need
16946 one insn. */
16947 if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
16948 return 0;
16951 if (kind == vector_stmt || kind == vec_to_scalar)
16952 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16954 if (FLOAT_TYPE_P (cmp_type))
16955 stmt_cost += simd_costs->fp_stmt_cost;
16956 else
16957 stmt_cost += simd_costs->int_stmt_cost;
16961 if (kind == scalar_stmt)
16962 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16964 if (FLOAT_TYPE_P (cmp_type))
16965 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16966 else
16967 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16970 return stmt_cost;
16973 /* Return true if STMT_INFO is part of a reduction that has the form:
16975 r = r op ...;
16976 r = r op ...;
16978 with the single accumulator being read and written multiple times. */
16979 static bool
16980 aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
16982 if (!STMT_VINFO_REDUC_DEF (stmt_info))
16983 return false;
16985 auto reduc_info = info_for_reduction (vinfo, stmt_info);
16986 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
16989 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16990 and they describe an operation in the body of a vector loop. Record issue
16991 information relating to the vector operation in OPS. */
16992 void
16993 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16994 stmt_vec_info stmt_info,
16995 aarch64_vec_op_count *ops)
16997 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16998 if (!base_issue)
16999 return;
17000 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
17001 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
17003 /* Calculate the minimum cycles per iteration imposed by a reduction
17004 operation. */
17005 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17006 && vect_is_reduction (stmt_info))
17008 unsigned int base
17009 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
17010 if (aarch64_force_single_cycle (m_vinfo, stmt_info))
17011 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17012 and then accumulate that, but at the moment the loop-carried
17013 dependency includes all copies. */
17014 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
17015 else
17016 ops->reduction_latency = MAX (ops->reduction_latency, base);
17019 if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))
17021 /* Assume that multiply-adds will become a single operation. */
17022 if (aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
17023 return;
17025 /* Assume that bool AND with compare operands will become a single
17026 operation. */
17027 if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
17028 return;
17032 /* Count the basic operation cost associated with KIND. */
17033 switch (kind)
17035 case cond_branch_taken:
17036 case cond_branch_not_taken:
17037 case vector_gather_load:
17038 case vector_scatter_store:
17039 /* We currently don't expect these to be used in a loop body. */
17040 break;
17042 case vec_perm:
17043 case vec_promote_demote:
17044 case vec_construct:
17045 case vec_to_scalar:
17046 case scalar_to_vec:
17047 case vector_stmt:
17048 case scalar_stmt:
17049 ops->general_ops += count;
17050 break;
17052 case scalar_load:
17053 case vector_load:
17054 case unaligned_load:
17055 ops->loads += count;
17056 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17057 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
17058 break;
17060 case vector_store:
17061 case unaligned_store:
17062 case scalar_store:
17063 ops->stores += count;
17064 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
17065 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
17066 break;
17069 /* Add any embedded comparison operations. */
17070 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
17071 && vect_embedded_comparison_type (stmt_info))
17072 ops->general_ops += count;
17074 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17075 have only accounted for one. */
17076 if ((kind == vector_stmt || kind == vec_to_scalar)
17077 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
17078 ops->general_ops += count;
17080 /* Count the predicate operations needed by an SVE comparison. */
17081 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
17082 if (tree type = vect_comparison_type (stmt_info))
17084 unsigned int base = (FLOAT_TYPE_P (type)
17085 ? sve_issue->fp_cmp_pred_ops
17086 : sve_issue->int_cmp_pred_ops);
17087 ops->pred_ops += base * count;
17090 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17091 if (simd_issue)
17092 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
17094 case 2:
17095 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
17096 break;
17098 case 3:
17099 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
17100 break;
17102 case 4:
17103 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
17104 break;
17107 /* Add any overhead associated with gather loads and scatter stores. */
17108 if (sve_issue
17109 && (kind == scalar_load || kind == scalar_store)
17110 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
17112 unsigned int pairs = CEIL (count, 2);
17113 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
17114 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
17118 /* Return true if STMT_INFO contains a memory access and if the constant
17119 component of the memory address is aligned to SIZE bytes. */
17120 static bool
17121 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
17122 poly_uint64 size)
17124 if (!STMT_VINFO_DATA_REF (stmt_info))
17125 return false;
17127 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
17128 stmt_info = first_stmt;
17129 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
17130 /* Needed for gathers & scatters, for example. */
17131 if (!constant_offset)
17132 return false;
17134 return multiple_p (wi::to_poly_offset (constant_offset), size);
17137 /* Check if a scalar or vector stmt could be part of a region of code
17138 that does nothing more than store values to memory, in the scalar
17139 case using STP. Return the cost of the stmt if so, counting 2 for
17140 one instruction. Return ~0U otherwise.
17142 The arguments are a subset of those passed to add_stmt_cost. */
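/* For example, the scalar sequence

     a[0] = x;
     a[1] = y;

   would use two STRs that peephole2 can later fuse into a single STP;
   each scalar_store is therefore counted as 1 unit below, giving 2 units
   for the one resulting STP (illustrative example only).  */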
17143 unsigned int
17144 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
17145 stmt_vec_info stmt_info, tree vectype)
17147 /* Code that stores vector constants uses a vector_load to create
17148 the constant. We don't apply the heuristic to that case for two
17149 main reasons:
17151 - At the moment, STPs are only formed via peephole2, and the
17152 constant scalar moves would often come between STRs and so
17153 prevent STP formation.
17155 - The scalar code also has to load the constant somehow, and that
17156 isn't costed. */
17157 switch (kind)
17159 case scalar_to_vec:
17160 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17161 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
17163 case vec_construct:
17164 if (FLOAT_TYPE_P (vectype))
17165 /* Count 1 insn for the maximum number of FP->SIMD INS
17166 instructions. */
17167 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
17169 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17170 maximum number of GPR->SIMD INS instructions. */
17171 return vect_nunits_for_cost (vectype) * 4 * count;
17173 case vector_store:
17174 case unaligned_store:
17175 /* Count 1 insn per vector if we can't form STP Q pairs. */
17176 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17177 return count * 2;
17179 if (stmt_info)
17181 /* Assume we won't be able to use STP if the constant offset
17182 component of the address is misaligned. ??? This could be
17183 removed if we formed STP pairs earlier, rather than relying
17184 on peephole2. */
17185 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
17186 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17187 return count * 2;
17189 return CEIL (count, 2) * 2;
17191 case scalar_store:
17192 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
17194 /* Check for a mode in which STP pairs can be formed. */
17195 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
17196 if (maybe_ne (size, 4) && maybe_ne (size, 8))
17197 return ~0U;
17199 /* Assume we won't be able to use STP if the constant offset
17200 component of the address is misaligned. ??? This could be
17201 removed if we formed STP pairs earlier, rather than relying
17202 on peephole2. */
17203 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17204 return ~0U;
17206 return count;
17208 default:
17209 return ~0U;
17213 unsigned
17214 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
17215 stmt_vec_info stmt_info, slp_tree,
17216 tree vectype, int misalign,
17217 vect_cost_model_location where)
17219 fractional_cost stmt_cost
17220 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
17222 bool in_inner_loop_p = (where == vect_body
17223 && stmt_info
17224 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
17226 /* Do one-time initialization based on the vinfo. */
17227 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17228 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
17230 if (loop_vinfo)
17231 analyze_loop_vinfo (loop_vinfo);
17233 m_analyzed_vinfo = true;
17236 /* Apply the heuristic described above m_stp_sequence_cost. */
17237 if (m_stp_sequence_cost != ~0U)
17239 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
17240 stmt_info, vectype);
17241 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
17244 /* Try to get a more accurate cost by looking at STMT_INFO instead
17245 of just looking at KIND. */
17246 if (stmt_info && aarch64_use_new_vector_costs_p ())
17248 /* If we scalarize a strided store, the vectorizer costs one
17249 vec_to_scalar for each element. However, we can store the first
17250 element using an FP store without a separate extract step. */
17251 if (vect_is_store_elt_extraction (kind, stmt_info))
17252 count -= 1;
17254 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
17255 stmt_info, stmt_cost);
17257 if (vectype && m_vec_flags)
17258 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
17259 stmt_info, vectype,
17260 where, stmt_cost);
17263 /* Do any SVE-specific adjustments to the cost. */
17264 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
17265 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17266 vectype, stmt_cost);
17268 /* Vector promotion and demotion requires us to widen the operation first
17269 and only after that perform the conversion. Unfortunately the mid-end
17270 expects this to be doable as a single operation and doesn't pass on
17271 enough context here for us to tell which operation is happening. To
17272 account for this we count every promote-demote operation twice and if
17273 the previously costed operation was also a promote-demote we reduce
17274 the cost of the currently being costed operation to simulate the final
17275 conversion cost. Note that for SVE we can do better here if the converted
17276 value comes from a load since the widening load would consume the widening
17277 operations. However since we're in stage 3 we can't change the helper
17278 vect_is_extending_load and duplicating the code seems not useful. */
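/* For example (illustrative): two consecutive vec_promote_demote
   FLOAT_EXPRs, each with a COUNT of 1, are charged 2 * 1 - 0 = 2 and
   then 2 * 1 - 1 = 1 operations respectively, per the scheme described
   above.  */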
17279 gassign *assign = NULL;
17280 if (kind == vec_promote_demote
17281 && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
17282 && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
17284 auto new_count = count * 2 - m_num_last_promote_demote;
17285 m_num_last_promote_demote = count;
17286 count = new_count;
17288 else
17289 m_num_last_promote_demote = 0;
17291 if (stmt_info && aarch64_use_new_vector_costs_p ())
17293 /* Account for any extra "embedded" costs that apply additively
17294 to the base cost calculated above. */
17295 stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17296 vectype, m_vec_flags, stmt_cost);
17298 /* If we're recording a nonzero vector loop body cost for the
17299 innermost loop, also estimate the operations that would need
17300 to be issued by all relevant implementations of the loop. */
17301 if (loop_vinfo
17302 && (m_costing_for_scalar || where == vect_body)
17303 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
17304 && stmt_cost != 0)
17305 for (auto &ops : m_ops)
17306 count_ops (count, kind, stmt_info, &ops);
17308 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17309 estimate the number of statements in the unrolled Advanced SIMD
17310 loop. For simplicity, we assume that one iteration of the
17311 Advanced SIMD loop would need the same number of statements
17312 as one iteration of the SVE loop. */
17313 if (where == vect_body && m_unrolled_advsimd_niters)
17314 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
17316 /* Detect the use of an averaging operation. */
17317 gimple *stmt = stmt_info->stmt;
17318 if (is_gimple_call (stmt)
17319 && gimple_call_internal_p (stmt))
17321 switch (gimple_call_internal_fn (stmt))
17323 case IFN_AVG_FLOOR:
17324 case IFN_AVG_CEIL:
17325 m_has_avg = true;
17326 default:
17327 break;
17332 /* If the statement stores to a decl that is known to be the argument
17333 to a vld1 in the same function, ignore the store for costing purposes.
17334 See the comment above m_stores_to_vector_load_decl for more details. */
17335 if (stmt_info
17336 && (kind == vector_store || kind == unaligned_store)
17337 && aarch64_accesses_vector_load_decl_p (stmt_info))
17339 stmt_cost = 0;
17340 m_stores_to_vector_load_decl = true;
17343 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
17346 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17347 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17348 says that we should prefer the Advanced SIMD loop. */
17349 bool
17350 aarch64_vector_costs::prefer_unrolled_loop () const
17352 if (!m_unrolled_advsimd_stmts)
17353 return false;
17355 if (dump_enabled_p ())
17356 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
17357 " unrolled Advanced SIMD loop = "
17358 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
17359 m_unrolled_advsimd_stmts);
17361 /* The balance here is tricky. On the one hand, we can't be sure whether
17362 the code is vectorizable with Advanced SIMD or not. However, even if
17363 it isn't vectorizable with Advanced SIMD, there's a possibility that
17364 the scalar code could also be unrolled. Some of the code might then
17365 benefit from SLP, or from using LDP and STP. We therefore apply
17366 the heuristic regardless of can_use_advsimd_p. */
17367 return (m_unrolled_advsimd_stmts
17368 && (m_unrolled_advsimd_stmts
17369 <= (unsigned int) param_max_completely_peeled_insns));
17372 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
17373 how fast the SVE code can be issued and compare it to the equivalent value
17374 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
17375 also compare it to the issue rate of Advanced SIMD code
17376 (ADVSIMD_CYCLES_PER_ITER).
17378 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17379 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17380 is true if we think the loop body is too expensive. */
17382 fractional_cost
17383 aarch64_vector_costs::
17384 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
17385 fractional_cost scalar_cycles_per_iter,
17386 unsigned int orig_body_cost, unsigned int *body_cost,
17387 bool *should_disparage)
17389 if (dump_enabled_p ())
17390 ops->dump ();
17392 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
17393 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
17395 /* If the scalar version of the loop could issue at least as
17396 quickly as the predicate parts of the SVE loop, make the SVE loop
17397 prohibitively expensive. In this case vectorization is adding an
17398 overhead that the original scalar code didn't have.
17400 This is mostly intended to detect cases in which WHILELOs dominate
17401 for very tight loops, which is something that normal latency-based
17402 costs would not model. Adding this kind of cliffedge would be
17403 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17404 code in the caller handles that case in a more conservative way. */
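/* A typical case (illustrative) is a very tight loop whose SVE form
   needs a WHILELO for each iteration: if the handful of scalar
   instructions could issue in no more cycles than those predicate
   operations, vectorization only adds overhead, so the body cost is
   raised below.  */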
17405 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17406 if (scalar_cycles_per_iter < sve_estimate)
17408 unsigned int min_cost
17409 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17410 if (*body_cost < min_cost)
17412 if (dump_enabled_p ())
17413 dump_printf_loc (MSG_NOTE, vect_location,
17414 "Increasing body cost to %d because the"
17415 " scalar code could issue within the limit"
17416 " imposed by predicate operations\n",
17417 min_cost);
17418 *body_cost = min_cost;
17419 *should_disparage = true;
17423 return sve_cycles_per_iter;
17426 unsigned int
17427 aarch64_vector_costs::determine_suggested_unroll_factor ()
17429 bool sve = m_vec_flags & VEC_ANY_SVE;
17430 /* If we are trying to unroll an Advanced SIMD main loop that contains
17431 an averaging operation that we do not support with SVE and we might use a
17432 predicated epilogue, we need to be conservative and block unrolling as
17433 this might lead to a less optimal loop for the first and only epilogue
17434 using the original loop's vectorization factor.
17435 TODO: Remove this constraint when we add support for multiple epilogue
17436 vectorization. */
17437 if (!sve && !TARGET_SVE2 && m_has_avg)
17438 return 1;
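/* The per-subtune calculation below caps the unroll factor so that the
   unrolled body still fits the issue rates.  For example (figures purely
   illustrative): with a reduction latency of 4, 1 store per iteration
   and 2 stores issuable per cycle, the store check gives
   CEIL (4 * 2, 1) = 8, which is then limited by aarch64_vect_unroll_limit
   and by the load and general-operation checks.  */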
17440 unsigned int max_unroll_factor = 1;
17441 for (auto vec_ops : m_ops)
17443 aarch64_simd_vec_issue_info const *vec_issue
17444 = vec_ops.simd_issue_info ();
17445 if (!vec_issue)
17446 return 1;
17447 /* Limit unroll factor to a value adjustable by the user, the default
17448 value is 4. */
17449 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17450 unsigned int factor
17451 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17452 unsigned int temp;
17454 /* Sanity check, this should never happen. */
17455 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17456 return 1;
17458 /* Check stores. */
17459 if (vec_ops.stores > 0)
17461 temp = CEIL (factor * vec_issue->stores_per_cycle,
17462 vec_ops.stores);
17463 unroll_factor = MIN (unroll_factor, temp);
17466 /* Check loads + stores. */
17467 if (vec_ops.loads > 0)
17469 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17470 vec_ops.loads + vec_ops.stores);
17471 unroll_factor = MIN (unroll_factor, temp);
17474 /* Check general ops. */
17475 if (vec_ops.general_ops > 0)
17477 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17478 vec_ops.general_ops);
17479 unroll_factor = MIN (unroll_factor, temp);
17481 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17484 /* Make sure unroll factor is power of 2. */
17485 return 1 << ceil_log2 (max_unroll_factor);
17488 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17489 and return the new cost. */
17490 unsigned int
17491 aarch64_vector_costs::
17492 adjust_body_cost (loop_vec_info loop_vinfo,
17493 const aarch64_vector_costs *scalar_costs,
17494 unsigned int body_cost)
17496 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17497 return body_cost;
17499 const auto &scalar_ops = scalar_costs->m_ops[0];
17500 const auto &vector_ops = m_ops[0];
17501 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17502 unsigned int orig_body_cost = body_cost;
17503 bool should_disparage = false;
17505 if (dump_enabled_p ())
17506 dump_printf_loc (MSG_NOTE, vect_location,
17507 "Original vector body cost = %d\n", body_cost);
17509 fractional_cost scalar_cycles_per_iter
17510 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17512 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17514 if (dump_enabled_p ())
17516 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17517 dump_printf_loc (MSG_NOTE, vect_location,
17518 "Vector loop iterates at most %wd times\n",
17519 m_num_vector_iterations);
17520 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17521 scalar_ops.dump ();
17522 dump_printf_loc (MSG_NOTE, vect_location,
17523 " estimated cycles per vector iteration"
17524 " (for VF %d) = %f\n",
17525 estimated_vf, scalar_cycles_per_iter.as_double ());
17528 if (vector_ops.sve_issue_info ())
17530 if (dump_enabled_p ())
17531 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17532 vector_cycles_per_iter
17533 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17534 orig_body_cost, &body_cost, &should_disparage);
17536 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17538 /* Also take Neoverse V1 tuning into account, doubling the
17539 scalar and Advanced SIMD estimates to account for the
17540 doubling in SVE vector length. */
17541 if (dump_enabled_p ())
17542 dump_printf_loc (MSG_NOTE, vect_location,
17543 "Neoverse V1 estimate:\n");
17544 auto vf_factor = m_ops[1].vf_factor ();
17545 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17546 orig_body_cost, &body_cost, &should_disparage);
17549 else
17551 if (dump_enabled_p ())
17553 dump_printf_loc (MSG_NOTE, vect_location,
17554 "Vector issue estimate:\n");
17555 vector_ops.dump ();
17559 /* Decide whether to stick to latency-based costs or whether to try to
17560 take issue rates into account. */
17561 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17562 if (m_vec_flags & VEC_ANY_SVE)
17563 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17565 if (m_num_vector_iterations >= 1
17566 && m_num_vector_iterations < threshold)
17568 if (dump_enabled_p ())
17569 dump_printf_loc (MSG_NOTE, vect_location,
17570 "Low iteration count, so using pure latency"
17571 " costs\n");
17573 /* Increase the cost of the vector code if it looks like the scalar code
17574 could issue more quickly. These values are only rough estimates,
17575 so minor differences should only result in minor changes. */
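/* For instance (figures purely illustrative): a vector estimate of 3
   cycles per iteration against a VF-scaled scalar estimate of 2 scales
   the body cost by 3/2 below.  */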
17576 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17578 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17579 scalar_cycles_per_iter);
17580 if (dump_enabled_p ())
17581 dump_printf_loc (MSG_NOTE, vect_location,
17582 "Increasing body cost to %d because scalar code"
17583 " would issue more quickly\n", body_cost);
17585 /* In general, it's expected that the proposed vector code would be able
17586 to issue more quickly than the original scalar code. This should
17587 already be reflected to some extent in the latency-based costs.
17589 However, the latency-based costs effectively assume that the scalar
17590 code and the vector code execute serially, which tends to underplay
17591 one important case: if the real (non-serialized) execution time of
17592 a scalar iteration is dominated by loop-carried dependencies,
17593 and if the vector code is able to reduce both the length of
17594 the loop-carried dependencies *and* the number of cycles needed
17595 to issue the code in general, we can be more confident that the
17596 vector code is an improvement, even if adding the other (non-loop-carried)
17597 latencies tends to hide this saving. We therefore reduce the cost of the
17598 vector loop body in proportion to the saving. */
17599 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17600 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17601 && scalar_cycles_per_iter > vector_cycles_per_iter
17602 && !should_disparage)
17604 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17605 scalar_cycles_per_iter);
17606 if (dump_enabled_p ())
17607 dump_printf_loc (MSG_NOTE, vect_location,
17608 "Decreasing body cost to %d account for smaller"
17609 " reduction latency\n", body_cost);
17612 return body_cost;
17615 void
17616 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17618 /* Record the issue information for any SVE WHILE instructions that the
17619 loop needs. */
17620 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17621 if (!m_ops.is_empty ()
17622 && loop_vinfo
17623 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
17625 unsigned int num_masks = 0;
17626 rgroup_controls *rgm;
17627 unsigned int num_vectors_m1;
17628 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
17629 num_vectors_m1, rgm)
17630 if (rgm->type)
17631 num_masks += num_vectors_m1 + 1;
17632 for (auto &ops : m_ops)
17633 if (auto *issue = ops.sve_issue_info ())
17634 ops.pred_ops += num_masks * issue->while_pred_ops;
17637 auto *scalar_costs
17638 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17639 if (loop_vinfo
17640 && m_vec_flags
17641 && aarch64_use_new_vector_costs_p ())
17643 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17644 m_costs[vect_body]);
17645 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17648 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17649 the scalar code in the event of a tie, since there is more chance
17650 of scalar code being optimized with surrounding operations.
17652 In addition, if the vector body is a simple store to a decl that
17653 is elsewhere loaded using vld1, strongly prefer the vector form,
17654 to the extent of giving the prologue a zero cost. See the comment
17655 above m_stores_to_vector_load_decl for details. */
17656 if (!loop_vinfo
17657 && scalar_costs
17658 && m_stp_sequence_cost != ~0U)
17660 if (m_stores_to_vector_load_decl)
17661 m_costs[vect_prologue] = 0;
17662 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17663 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17666 vector_costs::finish_cost (scalar_costs);
17669 bool
17670 aarch64_vector_costs::
17671 better_main_loop_than_p (const vector_costs *uncast_other) const
17673 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17675 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17676 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17678 if (dump_enabled_p ())
17679 dump_printf_loc (MSG_NOTE, vect_location,
17680 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17681 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17682 vect_vf_for_cost (this_loop_vinfo),
17683 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17684 vect_vf_for_cost (other_loop_vinfo));
17686 /* Apply the unrolling heuristic described above
17687 m_unrolled_advsimd_niters. */
17688 if (bool (m_unrolled_advsimd_stmts)
17689 != bool (other->m_unrolled_advsimd_stmts))
17691 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17692 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17693 if (this_prefer_unrolled != other_prefer_unrolled)
17695 if (dump_enabled_p ())
17696 dump_printf_loc (MSG_NOTE, vect_location,
17697 "Preferring Advanced SIMD loop because"
17698 " it can be unrolled\n");
17699 return other_prefer_unrolled;
17703 for (unsigned int i = 0; i < m_ops.length (); ++i)
17705 if (dump_enabled_p ())
17707 if (i)
17708 dump_printf_loc (MSG_NOTE, vect_location,
17709 "Reconsidering with subtuning %d\n", i);
17710 dump_printf_loc (MSG_NOTE, vect_location,
17711 "Issue info for %s loop:\n",
17712 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17713 this->m_ops[i].dump ();
17714 dump_printf_loc (MSG_NOTE, vect_location,
17715 "Issue info for %s loop:\n",
17716 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17717 other->m_ops[i].dump ();
17720 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17721 * this->m_ops[i].vf_factor ());
17722 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17723 * other->m_ops[i].vf_factor ());
17725 /* If it appears that one loop could process the same amount of data
17726 in fewer cycles, prefer that loop over the other one. */
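/* Cross-multiplying by the other loop's estimated VF makes the two
   quantities directly comparable: cpi_this * vf_other < cpi_other * vf_this
   is equivalent to cpi_this / vf_this < cpi_other / vf_other, i.e. to
   comparing estimated cycles per element.  */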
17727 fractional_cost this_cost
17728 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17729 fractional_cost other_cost
17730 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17731 if (dump_enabled_p ())
17733 dump_printf_loc (MSG_NOTE, vect_location,
17734 "Weighted cycles per iteration of %s loop ~= %f\n",
17735 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17736 this_cost.as_double ());
17737 dump_printf_loc (MSG_NOTE, vect_location,
17738 "Weighted cycles per iteration of %s loop ~= %f\n",
17739 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17740 other_cost.as_double ());
17742 if (this_cost != other_cost)
17744 if (dump_enabled_p ())
17745 dump_printf_loc (MSG_NOTE, vect_location,
17746 "Preferring loop with lower cycles"
17747 " per iteration\n");
17748 return this_cost < other_cost;
17751 /* If the issue rate of SVE code is limited by predicate operations
17752 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17753 and if Advanced SIMD code could issue within the limit imposed
17754 by the predicate operations, the predicate operations are adding an
17755 overhead that the original code didn't have and so we should prefer
17756 the Advanced SIMD version. */
17757 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17758 const aarch64_vec_op_count &b) -> bool
17760 if (a.pred_ops == 0
17761 && (b.min_pred_cycles_per_iter ()
17762 > b.min_nonpred_cycles_per_iter ()))
17764 if (dump_enabled_p ())
17765 dump_printf_loc (MSG_NOTE, vect_location,
17766 "Preferring Advanced SIMD loop since"
17767 " SVE loop is predicate-limited\n");
17768 return true;
17770 return false;
17772 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17773 return true;
17774 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17775 return false;
17778 return vector_costs::better_main_loop_than_p (other);
17781 static void initialize_aarch64_code_model (struct gcc_options *);
17783 /* Parse the TO_PARSE string and put the architecture struct that it
17784 selects into RES and the architectural features into ISA_FLAGS.
17785 Return an aarch_parse_opt_result describing the parse result.
17786 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17787 When the TO_PARSE string contains an invalid extension,
17788 a copy of the string is created and stored in INVALID_EXTENSION. */
17790 static enum aarch_parse_opt_result
17791 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17792 aarch64_feature_flags *isa_flags,
17793 std::string *invalid_extension)
17795 const char *ext;
17796 const struct processor *arch;
17797 size_t len;
17799 ext = strchr (to_parse, '+');
17801 if (ext != NULL)
17802 len = ext - to_parse;
17803 else
17804 len = strlen (to_parse);
17806 if (len == 0)
17807 return AARCH_PARSE_MISSING_ARG;
17810 /* Loop through the list of supported ARCHes to find a match. */
17811 for (arch = all_architectures; arch->name != NULL; arch++)
17813 if (strlen (arch->name) == len
17814 && strncmp (arch->name, to_parse, len) == 0)
17816 auto isa_temp = arch->flags;
17818 if (ext != NULL)
17820 /* TO_PARSE string contains at least one extension. */
17821 enum aarch_parse_opt_result ext_res
17822 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17824 if (ext_res != AARCH_PARSE_OK)
17825 return ext_res;
17827 /* Extension parsing was successful. Confirm the result
17828 arch and ISA flags. */
17829 *res = arch;
17830 *isa_flags = isa_temp;
17831 return AARCH_PARSE_OK;
17835 /* ARCH name not found in list. */
17836 return AARCH_PARSE_INVALID_ARG;
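/* Illustrative example (editor's note, assumed values): for a string such as
   "armv8.2-a+crc", the '+' is found with strchr, the leading "armv8.2-a" is
   matched against all_architectures, and "+crc" is then handed to
   aarch64_parse_extension, with any invalid extension name being copied
   into INVALID_EXTENSION.  */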
17839 /* Parse the TO_PARSE string and put the CPU that it selects into RES
17840 and the architectural features into ISA_FLAGS. Return an aarch_parse_opt_result
17841 describing the parse result. If there is an error parsing, RES and
17842 ISA_FLAGS are left unchanged.
17843 When the TO_PARSE string contains an invalid extension,
17844 a copy of the string is created and stored in INVALID_EXTENSION. */
17846 static enum aarch_parse_opt_result
17847 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17848 aarch64_feature_flags *isa_flags,
17849 std::string *invalid_extension)
17851 const char *ext;
17852 const struct processor *cpu;
17853 size_t len;
17855 ext = strchr (to_parse, '+');
17857 if (ext != NULL)
17858 len = ext - to_parse;
17859 else
17860 len = strlen (to_parse);
17862 if (len == 0)
17863 return AARCH_PARSE_MISSING_ARG;
17866 /* Loop through the list of supported CPUs to find a match. */
17867 for (cpu = all_cores; cpu->name != NULL; cpu++)
17869 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17871 auto isa_temp = cpu->flags;
17873 if (ext != NULL)
17875 /* TO_PARSE string contains at least one extension. */
17876 enum aarch_parse_opt_result ext_res
17877 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17879 if (ext_res != AARCH_PARSE_OK)
17880 return ext_res;
17882 /* Extension parsing was successful. Confirm the result
17883 cpu and ISA flags. */
17884 *res = cpu;
17885 *isa_flags = isa_temp;
17886 return AARCH_PARSE_OK;
17890 /* CPU name not found in list. */
17891 return AARCH_PARSE_INVALID_ARG;
17894 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17895 Return an aarch_parse_opt_result describing the parse result.
17896 If the parsing fails, RES does not change. */
17898 static enum aarch_parse_opt_result
17899 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17901 const struct processor *cpu;
17903 /* Loop through the list of supported CPUs to find a match. */
17904 for (cpu = all_cores; cpu->name != NULL; cpu++)
17906 if (strcmp (cpu->name, to_parse) == 0)
17908 *res = cpu;
17909 return AARCH_PARSE_OK;
17913 /* CPU name not found in list. */
17914 return AARCH_PARSE_INVALID_ARG;
17917 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17918 described in FLAG. If it is, return the index bit for that fusion type.
17919 If not, error (printing OPTION_NAME) and return zero. */
17921 static unsigned int
17922 aarch64_parse_one_option_token (const char *token,
17923 size_t length,
17924 const struct aarch64_flag_desc *flag,
17925 const char *option_name)
17927 for (; flag->name != NULL; flag++)
17929 if (length == strlen (flag->name)
17930 && !strncmp (flag->name, token, length))
17931 return flag->flag;
17934 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17935 return 0;
17938 /* Parse OPTION which is a comma-separated list of flags to enable.
17939 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17940 default state we inherit from the CPU tuning structures. OPTION_NAME
17941 gives the top-level option we are parsing in the -moverride string,
17942 for use in error messages. */
17944 static unsigned int
17945 aarch64_parse_boolean_options (const char *option,
17946 const struct aarch64_flag_desc *flags,
17947 unsigned int initial_state,
17948 const char *option_name)
17950 const char separator = '.';
17951 const char* specs = option;
17952 const char* ntoken = option;
17953 unsigned int found_flags = initial_state;
17955 while ((ntoken = strchr (specs, separator)))
17957 size_t token_length = ntoken - specs;
17958 unsigned token_ops = aarch64_parse_one_option_token (specs,
17959 token_length,
17960 flags,
17961 option_name);
17962 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17963 in the token stream, reset the supported operations. So:
17965 adrp+add.cmp+branch.none.adrp+add
17967 would have the result of turning on only adrp+add fusion. */
17968 if (!token_ops)
17969 found_flags = 0;
17971 found_flags |= token_ops;
17972 specs = ++ntoken;
17975 /* The string ended with a trailing separator (or was empty), so it is ill-formed. */
17976 if (!(*specs))
17978 error ("%qs string ill-formed", option_name);
17979 return 0;
17982 /* We still have one more token to parse. */
17983 size_t token_length = strlen (specs);
17984 unsigned token_ops = aarch64_parse_one_option_token (specs,
17985 token_length,
17986 flags,
17987 option_name);
17988 if (!token_ops)
17989 found_flags = 0;
17991 found_flags |= token_ops;
17992 return found_flags;
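/* Illustrative sketch of a call (not in the sources), reusing the example
   from the comment above:

     unsigned int ops
       = aarch64_parse_boolean_options ("adrp+add.cmp+branch.none.adrp+add",
					aarch64_fusible_pairs, 0, "fuse=");

   The "none" token resets the accumulated flags, so OPS would end up with
   only the adrp+add fusion bit set.  */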
17995 /* Support for overriding instruction fusion. */
17997 static void
17998 aarch64_parse_fuse_string (const char *fuse_string,
17999 struct tune_params *tune)
18001 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
18002 aarch64_fusible_pairs,
18003 tune->fusible_ops,
18004 "fuse=");
18007 /* Support for overriding other tuning flags. */
18009 static void
18010 aarch64_parse_tune_string (const char *tune_string,
18011 struct tune_params *tune)
18013 tune->extra_tuning_flags
18014 = aarch64_parse_boolean_options (tune_string,
18015 aarch64_tuning_flags,
18016 tune->extra_tuning_flags,
18017 "tune=");
18020 /* Parse the sve_width tuning moverride string in TUNE_STRING.
18021 Accept the valid SVE vector widths allowed by
18022 aarch64_sve_vector_bits_enum and use it to override sve_width
18023 in TUNE. */
18025 static void
18026 aarch64_parse_sve_width_string (const char *tune_string,
18027 struct tune_params *tune)
18029 int width = -1;
18031 int n = sscanf (tune_string, "%d", &width);
18032 if (n == EOF)
18034 error ("invalid format for %<sve_width%>");
18035 return;
18037 switch (width)
18039 case SVE_128:
18040 case SVE_256:
18041 case SVE_512:
18042 case SVE_1024:
18043 case SVE_2048:
18044 break;
18045 default:
18046 error ("invalid %<sve_width%> value: %d", width);
18048 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
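/* Illustrative example (editor's note): a width that is not one of the listed
   SVE_* values, say 384, falls through to the default case above and is
   rejected with "invalid sve_width value: 384"; a valid value such as 256 is
   cast directly to aarch64_sve_vector_bits_enum.  */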
18051 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
18052 we understand. If it is, extract the option string and hand it off to
18053 the appropriate parsing function. */
18055 void
18056 aarch64_parse_one_override_token (const char* token,
18057 size_t length,
18058 struct tune_params *tune)
18060 const struct aarch64_tuning_override_function *fn
18061 = aarch64_tuning_override_functions;
18063 const char *option_part = strchr (token, '=');
18064 if (!option_part)
18066 error ("tuning string missing in option (%s)", token);
18067 return;
18070 /* Get the length of the option name. */
18071 length = option_part - token;
18072 /* Skip the '=' to get to the option string. */
18073 option_part++;
18075 for (; fn->name != NULL; fn++)
18077 if (!strncmp (fn->name, token, length))
18079 fn->parse_override (option_part, tune);
18080 return;
18084 error ("unknown tuning option (%s)",token);
18085 return;
18088 /* Validate and clamp the requested TLS size against the code model in OPTS. */
18090 static void
18091 initialize_aarch64_tls_size (struct gcc_options *opts)
18093 if (aarch64_tls_size == 0)
18094 aarch64_tls_size = 24;
18096 switch (opts->x_aarch64_cmodel_var)
18098 case AARCH64_CMODEL_TINY:
18099 /* Both the default and maximum TLS size allowed under tiny are 1M, which
18100 needs two instructions to address, so we clamp the size to 24 bits. */
18101 if (aarch64_tls_size > 24)
18102 aarch64_tls_size = 24;
18103 break;
18104 case AARCH64_CMODEL_SMALL:
18105 /* The maximum TLS size allowed under small is 4G. */
18106 if (aarch64_tls_size > 32)
18107 aarch64_tls_size = 32;
18108 break;
18109 case AARCH64_CMODEL_LARGE:
18110 /* The maximum TLS size allowed under large is 16E.
18111 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset for now. */
18112 if (aarch64_tls_size > 48)
18113 aarch64_tls_size = 48;
18114 break;
18115 default:
18116 gcc_unreachable ();
18119 return;
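/* Illustrative example (editor's note): with the tiny code model, a requested
   TLS size of 32 bits is clamped to 24 bits above, matching the 1M limit that
   can be addressed with two instructions; under the small model the same
   request would be left at 32 bits.  */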
18122 /* Return the CPU corresponding to the enum CPU. */
18124 static const struct processor *
18125 aarch64_get_tune_cpu (enum aarch64_processor cpu)
18127 gcc_assert (cpu != aarch64_none);
18129 return &all_cores[cpu];
18132 /* Return the architecture corresponding to the enum ARCH. */
18134 static const struct processor *
18135 aarch64_get_arch (enum aarch64_arch arch)
18137 gcc_assert (arch != aarch64_no_arch);
18139 return &all_architectures[arch];
18142 /* Parse STRING looking for options in the format:
18143 string :: option:string
18144 option :: name=substring
18145 name :: {a-z}
18146 substring :: defined by option. */
18148 static void
18149 aarch64_parse_override_string (const char* input_string,
18150 struct tune_params* tune)
18152 const char separator = ':';
18153 size_t string_length = strlen (input_string) + 1;
18154 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
18155 char *string = string_root;
18156 strncpy (string, input_string, string_length);
18157 string[string_length - 1] = '\0';
18159 char* ntoken = string;
18161 while ((ntoken = strchr (string, separator)))
18163 size_t token_length = ntoken - string;
18165 /* NUL-terminate this token so it can be parsed as a standalone string. */
18165 *ntoken = '\0';
18166 aarch64_parse_one_override_token (string, token_length, tune);
18167 string = ++ntoken;
18170 /* One last option to parse. */
18171 aarch64_parse_one_override_token (string, strlen (string), tune);
18172 free (string_root);
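/* Illustrative example (hypothetical option string, built only from the
   override names documented in this file): an input such as
   "fuse=adrp+add.cmp+branch:sve_width=256" is split on ':' into two tokens,
   each of which is handed to aarch64_parse_one_override_token.  */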
18175 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18176 are best for a generic target with the currently-enabled architecture
18177 extensions. */
18178 static void
18179 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
18181 /* Neoverse V1 is the only core that is known to benefit from
18182 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18183 point enabling it for SVE2 and above. */
18184 if (TARGET_SVE2)
18185 current_tune.extra_tuning_flags
18186 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
18189 static void
18190 aarch64_override_options_after_change_1 (struct gcc_options *opts)
18192 /* PR 70044: We have to be careful about being called multiple times for the
18193 same function. This means all changes should be repeatable. */
18195 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18196 Disable the frame pointer flag so the mid-end will not use a frame
18197 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18198 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18199 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18200 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
18201 if (opts->x_flag_omit_frame_pointer == 0)
18202 opts->x_flag_omit_frame_pointer = 2;
18204 /* If not optimizing for size, set the default
18205 alignment to what the target wants. */
18206 if (!opts->x_optimize_size)
18208 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
18209 opts->x_str_align_loops = aarch64_tune_params.loop_align;
18210 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
18211 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
18212 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
18213 opts->x_str_align_functions = aarch64_tune_params.function_align;
18216 /* We default to no pc-relative literal loads. */
18218 aarch64_pcrelative_literal_loads = false;
18220 /* If -mpc-relative-literal-loads is set on the command line, this
18221 implies that the user asked for PC relative literal loads. */
18222 if (opts->x_pcrelative_literal_loads == 1)
18223 aarch64_pcrelative_literal_loads = true;
18225 /* In the tiny memory model it makes no sense to disallow PC relative
18226 literal pool loads. */
18227 if (aarch64_cmodel == AARCH64_CMODEL_TINY
18228 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
18229 aarch64_pcrelative_literal_loads = true;
18231 /* When enabling the lower precision Newton series for the square root, also
18232 enable it for the reciprocal square root, since the latter is an
18233 intermediary step for the former. */
18234 if (flag_mlow_precision_sqrt)
18235 flag_mrecip_low_precision_sqrt = true;
18238 /* 'Unpack' the internal tuning structs and update the options
18239 in OPTS. The caller must have set up selected_tune and selected_arch
18240 as all the other target-specific codegen decisions are
18241 derived from them. */
18243 void
18244 aarch64_override_options_internal (struct gcc_options *opts)
18246 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
18247 aarch64_tune_flags = tune->flags;
18248 aarch64_tune = tune->sched_core;
18249 /* Make a copy of the tuning parameters attached to the core, which
18250 we may later overwrite. */
18251 aarch64_tune_params = *(tune->tune);
18252 if (tune->tune == &generic_tunings)
18253 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
18255 if (opts->x_aarch64_override_tune_string)
18256 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
18257 &aarch64_tune_params);
18259 if (opts->x_aarch64_ldp_policy_param)
18260 aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;
18262 if (opts->x_aarch64_stp_policy_param)
18263 aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;
18265 /* This target defaults to strict volatile bitfields. */
18266 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
18267 opts->x_flag_strict_volatile_bitfields = 1;
18269 if (aarch64_stack_protector_guard == SSP_GLOBAL
18270 && opts->x_aarch64_stack_protector_guard_offset_str)
18272 error ("incompatible options %<-mstack-protector-guard=global%> and "
18273 "%<-mstack-protector-guard-offset=%s%>",
18274 aarch64_stack_protector_guard_offset_str);
18277 if (aarch64_stack_protector_guard == SSP_SYSREG
18278 && !(opts->x_aarch64_stack_protector_guard_offset_str
18279 && opts->x_aarch64_stack_protector_guard_reg_str))
18281 error ("both %<-mstack-protector-guard-offset%> and "
18282 "%<-mstack-protector-guard-reg%> must be used "
18283 "with %<-mstack-protector-guard=sysreg%>");
18286 if (opts->x_aarch64_stack_protector_guard_reg_str)
18288 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
18289 error ("specify a system register with a small string length");
18292 if (opts->x_aarch64_stack_protector_guard_offset_str)
18294 char *end;
18295 const char *str = aarch64_stack_protector_guard_offset_str;
18296 errno = 0;
18297 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
18298 if (!*str || *end || errno)
18299 error ("%qs is not a valid offset in %qs", str,
18300 "-mstack-protector-guard-offset=");
18301 aarch64_stack_protector_guard_offset = offs;
18304 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
18305 && !fixed_regs[R18_REGNUM])
18306 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18308 if ((opts->x_aarch64_isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
18309 && !(opts->x_aarch64_isa_flags & AARCH64_FL_SME))
18311 if (opts->x_aarch64_isa_flags & AARCH64_FL_SM_ON)
18312 error ("streaming functions require the ISA extension %qs", "sme");
18313 else
18314 error ("functions with SME state require the ISA extension %qs",
18315 "sme");
18316 inform (input_location, "you can enable %qs using the command-line"
18317 " option %<-march%>, or by using the %<target%>"
18318 " attribute or pragma", "sme");
18319 opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
18320 auto new_flags = (opts->x_aarch64_asm_isa_flags
18321 | feature_deps::SME ().enable);
18322 aarch64_set_asm_isa_flags (opts, new_flags);
18325 initialize_aarch64_code_model (opts);
18326 initialize_aarch64_tls_size (opts);
18327 aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;
18329 int queue_depth = 0;
18330 switch (aarch64_tune_params.autoprefetcher_model)
18332 case tune_params::AUTOPREFETCHER_OFF:
18333 queue_depth = -1;
18334 break;
18335 case tune_params::AUTOPREFETCHER_WEAK:
18336 queue_depth = 0;
18337 break;
18338 case tune_params::AUTOPREFETCHER_STRONG:
18339 queue_depth = max_insn_queue_index + 1;
18340 break;
18341 default:
18342 gcc_unreachable ();
18345 /* We don't mind passing in global_options_set here as we don't use
18346 the *options_set structs anyway. */
18347 SET_OPTION_IF_UNSET (opts, &global_options_set,
18348 param_sched_autopref_queue_depth, queue_depth);
18350 /* Set up parameters to be used in prefetching algorithm. Do not
18351 override the defaults unless we are tuning for a core we have
18352 researched values for. */
18353 if (aarch64_tune_params.prefetch->num_slots > 0)
18354 SET_OPTION_IF_UNSET (opts, &global_options_set,
18355 param_simultaneous_prefetches,
18356 aarch64_tune_params.prefetch->num_slots);
18357 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
18358 SET_OPTION_IF_UNSET (opts, &global_options_set,
18359 param_l1_cache_size,
18360 aarch64_tune_params.prefetch->l1_cache_size);
18361 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18362 SET_OPTION_IF_UNSET (opts, &global_options_set,
18363 param_l1_cache_line_size,
18364 aarch64_tune_params.prefetch->l1_cache_line_size);
18366 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18368 SET_OPTION_IF_UNSET (opts, &global_options_set,
18369 param_destruct_interfere_size,
18370 aarch64_tune_params.prefetch->l1_cache_line_size);
18371 SET_OPTION_IF_UNSET (opts, &global_options_set,
18372 param_construct_interfere_size,
18373 aarch64_tune_params.prefetch->l1_cache_line_size);
18375 else
18377 /* For a generic AArch64 target, cover the current range of cache line
18378 sizes. */
18379 SET_OPTION_IF_UNSET (opts, &global_options_set,
18380 param_destruct_interfere_size,
18381 256);
18382 SET_OPTION_IF_UNSET (opts, &global_options_set,
18383 param_construct_interfere_size,
18384 64);
18387 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
18388 SET_OPTION_IF_UNSET (opts, &global_options_set,
18389 param_l2_cache_size,
18390 aarch64_tune_params.prefetch->l2_cache_size);
18391 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
18392 SET_OPTION_IF_UNSET (opts, &global_options_set,
18393 param_prefetch_dynamic_strides, 0);
18394 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
18395 SET_OPTION_IF_UNSET (opts, &global_options_set,
18396 param_prefetch_minimum_stride,
18397 aarch64_tune_params.prefetch->minimum_stride);
18399 /* Use the alternative scheduling-pressure algorithm by default. */
18400 SET_OPTION_IF_UNSET (opts, &global_options_set,
18401 param_sched_pressure_algorithm,
18402 SCHED_PRESSURE_MODEL);
18404 /* Validate the guard size. */
18405 int guard_size = param_stack_clash_protection_guard_size;
18407 if (guard_size != 12 && guard_size != 16)
18408 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18409 "size. Given value %d (%llu KB) is out of range",
18410 guard_size, (1ULL << guard_size) / 1024ULL);
18412 /* Enforce that interval is the same size as size so the mid-end does the
18413 right thing. */
18414 SET_OPTION_IF_UNSET (opts, &global_options_set,
18415 param_stack_clash_protection_probe_interval,
18416 guard_size);
18418 /* The maybe_set calls won't update the value if the user has explicitly set
18419 one, which means we need to validate that the probing interval and guard size
18420 are equal. */
18421 int probe_interval
18422 = param_stack_clash_protection_probe_interval;
18423 if (guard_size != probe_interval)
18424 error ("stack clash guard size %<%d%> must be equal to probing interval "
18425 "%<%d%>", guard_size, probe_interval);
18427 /* Enable sw prefetching at specified optimization level for
18428 CPUS that have prefetch. Lower optimization level threshold by 1
18429 when profiling is enabled. */
18430 if (opts->x_flag_prefetch_loop_arrays < 0
18431 && !opts->x_optimize_size
18432 && aarch64_tune_params.prefetch->default_opt_level >= 0
18433 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18434 opts->x_flag_prefetch_loop_arrays = 1;
18436 /* Avoid loop-dependent FMA chains. */
18437 if (aarch64_tune_params.extra_tuning_flags
18438 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
18439 SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
18440 512);
18442 /* Consider fully pipelined FMA in reassociation. */
18443 if (aarch64_tune_params.extra_tuning_flags
18444 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
18445 SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
18448 aarch64_override_options_after_change_1 (opts);
18451 /* Print a hint with a suggestion for a core or architecture name that
18452 most closely resembles what the user passed in STR. ARCH is true if
18453 the user is asking for an architecture name. ARCH is false if the user
18454 is asking for a core name. */
18456 static void
18457 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
18459 auto_vec<const char *> candidates;
18460 const struct processor *entry = arch ? all_architectures : all_cores;
18461 for (; entry->name != NULL; entry++)
18462 candidates.safe_push (entry->name);
18464 #ifdef HAVE_LOCAL_CPU_DETECT
18465 /* Also add "native" as a possible value. */
18466 if (arch)
18467 candidates.safe_push ("native");
18468 #endif
18470 char *s;
18471 const char *hint = candidates_list_and_hint (str, s, candidates);
18472 if (hint)
18473 inform (input_location, "valid arguments are: %s;"
18474 " did you mean %qs?", s, hint);
18475 else
18476 inform (input_location, "valid arguments are: %s", s);
18478 XDELETEVEC (s);
18481 /* Print a hint with a suggestion for a core name that most closely resembles
18482 what the user passed in STR. */
18484 inline static void
18485 aarch64_print_hint_for_core (const char *str)
18487 aarch64_print_hint_for_core_or_arch (str, false);
18490 /* Print a hint with a suggestion for an architecture name that most closely
18491 resembles what the user passed in STR. */
18493 inline static void
18494 aarch64_print_hint_for_arch (const char *str)
18496 aarch64_print_hint_for_core_or_arch (str, true);
18500 /* Print a hint with a suggestion for an extension name
18501 that most closely resembles what the user passed in STR. */
18503 void
18504 aarch64_print_hint_for_extensions (const std::string &str)
18506 auto_vec<const char *> candidates;
18507 aarch64_get_all_extension_candidates (&candidates);
18508 char *s;
18509 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18510 if (hint)
18511 inform (input_location, "valid arguments are: %s;"
18512 " did you mean %qs?", s, hint);
18513 else
18514 inform (input_location, "valid arguments are: %s", s);
18516 XDELETEVEC (s);
18519 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18520 specified in STR and throw errors if appropriate. Put the results, if
18521 they are valid, in RES and ISA_FLAGS. Return whether the option is
18522 valid. */
18524 static bool
18525 aarch64_validate_mcpu (const char *str, const struct processor **res,
18526 aarch64_feature_flags *isa_flags)
18528 std::string invalid_extension;
18529 enum aarch_parse_opt_result parse_res
18530 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18532 if (parse_res == AARCH_PARSE_OK)
18533 return true;
18535 switch (parse_res)
18537 case AARCH_PARSE_MISSING_ARG:
18538 error ("missing cpu name in %<-mcpu=%s%>", str);
18539 break;
18540 case AARCH_PARSE_INVALID_ARG:
18541 error ("unknown value %qs for %<-mcpu%>", str);
18542 aarch64_print_hint_for_core (str);
18543 /* A common user error is confusing -march and -mcpu.
18544 If the -mcpu string matches a known architecture then suggest
18545 -march=. */
18546 parse_res = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18547 if (parse_res == AARCH_PARSE_OK)
18548 inform (input_location, "did you mean %<-march=%s%>?", str);
18549 break;
18550 case AARCH_PARSE_INVALID_FEATURE:
18551 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18552 invalid_extension.c_str (), str);
18553 aarch64_print_hint_for_extensions (invalid_extension);
18554 break;
18555 default:
18556 gcc_unreachable ();
18559 return false;
18562 /* Straight line speculation indicators. */
18563 enum aarch64_sls_hardening_type
18565 SLS_NONE = 0,
18566 SLS_RETBR = 1,
18567 SLS_BLR = 2,
18568 SLS_ALL = 3,
18570 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18572 /* Return whether we should mitigate Straight Line Speculation for the RET
18573 and BR instructions. */
18574 bool
18575 aarch64_harden_sls_retbr_p (void)
18577 return aarch64_sls_hardening & SLS_RETBR;
18580 /* Return whether we should mitigate Straight Line Speculation for the BLR
18581 instruction. */
18582 bool
18583 aarch64_harden_sls_blr_p (void)
18585 return aarch64_sls_hardening & SLS_BLR;
18588 /* For now we only allow setting these options globally; in the future we may
18589 allow setting them per function. */
18590 static void
18591 aarch64_validate_sls_mitigation (const char *const_str)
18593 char *token_save = NULL;
18594 char *str = NULL;
18596 if (strcmp (const_str, "none") == 0)
18598 aarch64_sls_hardening = SLS_NONE;
18599 return;
18601 if (strcmp (const_str, "all") == 0)
18603 aarch64_sls_hardening = SLS_ALL;
18604 return;
18607 char *str_root = xstrdup (const_str);
18608 str = strtok_r (str_root, ",", &token_save);
18609 if (!str)
18610 error ("invalid argument given to %<-mharden-sls=%>");
18612 int temp = SLS_NONE;
18613 while (str)
18615 if (strcmp (str, "blr") == 0)
18616 temp |= SLS_BLR;
18617 else if (strcmp (str, "retbr") == 0)
18618 temp |= SLS_RETBR;
18619 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18621 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18622 break;
18624 else
18626 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18627 break;
18629 str = strtok_r (NULL, ",", &token_save);
18631 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18632 free (str_root);
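/* Illustrative example (editor's note): "-mharden-sls=retbr,blr" sets both
   SLS_RETBR and SLS_BLR above, which is equivalent to SLS_ALL, so both
   aarch64_harden_sls_retbr_p and aarch64_harden_sls_blr_p then return
   true.  */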
18635 /* Validate a command-line -march option. Parse the arch and extensions
18636 (if any) specified in STR and throw errors if appropriate. Put the
18637 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18638 option is valid. */
18640 static bool
18641 aarch64_validate_march (const char *str, const struct processor **res,
18642 aarch64_feature_flags *isa_flags)
18644 std::string invalid_extension;
18645 enum aarch_parse_opt_result parse_res
18646 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18648 if (parse_res == AARCH_PARSE_OK)
18649 return true;
18651 switch (parse_res)
18653 case AARCH_PARSE_MISSING_ARG:
18654 error ("missing arch name in %<-march=%s%>", str);
18655 break;
18656 case AARCH_PARSE_INVALID_ARG:
18657 error ("unknown value %qs for %<-march%>", str);
18658 aarch64_print_hint_for_arch (str);
18659 /* A common user error is confusing -march and -mcpu.
18660 If the -march string matches a known CPU suggest -mcpu. */
18661 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18662 if (parse_res == AARCH_PARSE_OK)
18663 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18664 break;
18665 case AARCH_PARSE_INVALID_FEATURE:
18666 error ("invalid feature modifier %qs in %<-march=%s%>",
18667 invalid_extension.c_str (), str);
18668 aarch64_print_hint_for_extensions (invalid_extension);
18669 break;
18670 default:
18671 gcc_unreachable ();
18674 return false;
18677 /* Validate a command-line -mtune option. Parse the cpu
18678 specified in STR and throw errors if appropriate. Put the
18679 result, if it is valid, in RES. Return whether the option is
18680 valid. */
18682 static bool
18683 aarch64_validate_mtune (const char *str, const struct processor **res)
18685 enum aarch_parse_opt_result parse_res
18686 = aarch64_parse_tune (str, res);
18688 if (parse_res == AARCH_PARSE_OK)
18689 return true;
18691 switch (parse_res)
18693 case AARCH_PARSE_MISSING_ARG:
18694 error ("missing cpu name in %<-mtune=%s%>", str);
18695 break;
18696 case AARCH_PARSE_INVALID_ARG:
18697 error ("unknown value %qs for %<-mtune%>", str);
18698 aarch64_print_hint_for_core (str);
18699 break;
18700 default:
18701 gcc_unreachable ();
18703 return false;
18706 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18708 static poly_uint16
18709 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18711 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18712 on big-endian targets, so we would need to forbid subregs that convert
18713 from one to the other. By default a reinterpret sequence would then
18714 involve a store to memory in one mode and a load back in the other.
18715 Even if we optimize that sequence using reverse instructions,
18716 it would still be a significant potential overhead.
18718 For now, it seems better to generate length-agnostic code for that
18719 case instead. */
18720 if (value == SVE_SCALABLE
18721 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18722 return poly_uint16 (2, 2);
18723 else
18724 return (int) value / 64;
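/* Illustrative example (editor's note): -msve-vector-bits=256 maps to
   SVE_256, and 256 / 64 gives VG = 4, i.e. four 64-bit granules per SVE
   vector; SVE_SCALABLE (and SVE_128 on big-endian) instead yields the
   length-agnostic poly_uint16 (2, 2), meaning 2 + 2x granules.  */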
18727 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18728 aarch64_isa_flags accordingly. */
18730 void
18731 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18733 aarch64_set_asm_isa_flags (&global_options, flags);
18736 static void
18737 aarch64_handle_no_branch_protection (void)
18739 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18740 aarch_enable_bti = 0;
18743 static void
18744 aarch64_handle_standard_branch_protection (void)
18746 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18747 aarch64_ra_sign_key = AARCH64_KEY_A;
18748 aarch_enable_bti = 1;
18751 static void
18752 aarch64_handle_pac_ret_protection (void)
18754 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18755 aarch64_ra_sign_key = AARCH64_KEY_A;
18758 static void
18759 aarch64_handle_pac_ret_leaf (void)
18761 aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
18764 static void
18765 aarch64_handle_pac_ret_b_key (void)
18767 aarch64_ra_sign_key = AARCH64_KEY_B;
18770 static void
18771 aarch64_handle_bti_protection (void)
18773 aarch_enable_bti = 1;
18776 static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
18777 { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
18778 { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
18779 { NULL, false, NULL, NULL, 0 }
18782 static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
18784 { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
18785 { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
18786 { "pac-ret", false, aarch64_handle_pac_ret_protection,
18787 aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
18788 { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
18789 { NULL, false, NULL, NULL, 0 }
18792 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18793 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18794 tuning structs. In particular it must set selected_tune and
18795 aarch64_asm_isa_flags that define the available ISA features and tuning
18796 decisions. It must also set selected_arch as this will be used to
18797 output the .arch asm tags for each function. */
18799 static void
18800 aarch64_override_options (void)
18802 aarch64_feature_flags cpu_isa = 0;
18803 aarch64_feature_flags arch_isa = 0;
18804 aarch64_set_asm_isa_flags (0);
18806 const struct processor *cpu = NULL;
18807 const struct processor *arch = NULL;
18808 const struct processor *tune = NULL;
18810 if (aarch64_harden_sls_string)
18811 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18813 if (aarch64_branch_protection_string)
18814 aarch_validate_mbranch_protection (aarch64_branch_protect_types,
18815 aarch64_branch_protection_string,
18816 "-mbranch-protection=");
18818 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18819 If either of -march or -mtune is given, they override their
18820 respective component of -mcpu. */
18821 if (aarch64_cpu_string)
18822 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18824 if (aarch64_arch_string)
18825 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18827 if (aarch64_tune_string)
18828 aarch64_validate_mtune (aarch64_tune_string, &tune);
18830 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18831 SUBTARGET_OVERRIDE_OPTIONS;
18832 #endif
18834 auto isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
18835 if (cpu && arch)
18837 /* If both -mcpu and -march are specified, warn if they are not
18838 feature compatible. Not being feature compatible means that including
18839 the cpu features would end up disabling an architecture feature. In
18840 other words the cpu features need to be a strict superset of the arch
18841 features, and if so we prefer the -march ISA flags. */
18842 auto full_arch_flags = arch->flags | arch_isa;
18843 auto full_cpu_flags = cpu->flags | cpu_isa;
18844 if (~full_cpu_flags & full_arch_flags)
18846 std::string ext_diff
18847 = aarch64_get_extension_string_for_isa_flags (full_arch_flags,
18848 full_cpu_flags);
18849 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18850 "and resulted in options %<%s%> being added",
18851 aarch64_cpu_string,
18852 aarch64_arch_string,
18853 ext_diff.c_str ());
18856 selected_arch = arch->arch;
18857 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18859 else if (cpu)
18861 selected_arch = cpu->arch;
18862 aarch64_set_asm_isa_flags (cpu_isa | isa_mode);
18864 else if (arch)
18866 cpu = &all_cores[arch->ident];
18867 selected_arch = arch->arch;
18868 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18870 else
18872 /* No -mcpu or -march specified, so use the default CPU. */
18873 cpu = &all_cores[TARGET_CPU_DEFAULT];
18874 selected_arch = cpu->arch;
18875 aarch64_set_asm_isa_flags (cpu->flags | isa_mode);
18878 selected_tune = tune ? tune->ident : cpu->ident;
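/* Illustrative summary (editor's note, symbolic names): "-mcpu=X" on its own
   selects X's architecture and tunes for X; adding "-mtune=Y" keeps the
   architecture derived from X but switches selected_tune to Y; adding
   "-march=Z" instead makes Z's ISA flags win, with the warning above if X's
   features are not a superset of Z's.  */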
18880 if (aarch_enable_bti == 2)
18882 #ifdef TARGET_ENABLE_BTI
18883 aarch_enable_bti = 1;
18884 #else
18885 aarch_enable_bti = 0;
18886 #endif
18889 /* Return address signing is currently not supported for ILP32 targets. For
18890 LP64 targets use the configured option in the absence of a command-line
18891 option for -mbranch-protection. */
18892 if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
18894 #ifdef TARGET_ENABLE_PAC_RET
18895 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18896 #else
18897 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18898 #endif
18901 #ifndef HAVE_AS_MABI_OPTION
18902 /* The compiler may have been configured with 2.23.* binutils, which does
18903 not have support for ILP32. */
18904 if (TARGET_ILP32)
18905 error ("assembler does not support %<-mabi=ilp32%>");
18906 #endif
18908 /* Convert -msve-vector-bits to a VG count. */
18909 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18911 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
18912 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18914 /* The pass to insert speculation tracking runs before
18915 shrink-wrapping and the latter does not know how to update the
18916 tracking status. So disable it in this case. */
18917 if (aarch64_track_speculation)
18918 flag_shrink_wrap = 0;
18920 aarch64_override_options_internal (&global_options);
18922 /* Save these options as the default ones in case we push and pop them later
18923 while processing functions with potential target attributes. */
18924 target_option_default_node = target_option_current_node
18925 = build_target_option_node (&global_options, &global_options_set);
18928 /* Implement targetm.override_options_after_change. */
18930 static void
18931 aarch64_override_options_after_change (void)
18933 aarch64_override_options_after_change_1 (&global_options);
18936 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18937 static char *
18938 aarch64_offload_options (void)
18940 if (TARGET_ILP32)
18941 return xstrdup ("-foffload-abi=ilp32");
18942 else
18943 return xstrdup ("-foffload-abi=lp64");
18946 static struct machine_function *
18947 aarch64_init_machine_status (void)
18949 struct machine_function *machine;
18950 machine = ggc_cleared_alloc<machine_function> ();
18951 return machine;
18954 void
18955 aarch64_init_expanders (void)
18957 init_machine_status = aarch64_init_machine_status;
18960 /* Select the final code model in AARCH64_CMODEL, taking PIC and ABI restrictions into account. */
18961 static void
18962 initialize_aarch64_code_model (struct gcc_options *opts)
18964 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18965 switch (opts->x_aarch64_cmodel_var)
18967 case AARCH64_CMODEL_TINY:
18968 if (opts->x_flag_pic)
18969 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18970 break;
18971 case AARCH64_CMODEL_SMALL:
18972 if (opts->x_flag_pic)
18974 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18975 aarch64_cmodel = (flag_pic == 2
18976 ? AARCH64_CMODEL_SMALL_PIC
18977 : AARCH64_CMODEL_SMALL_SPIC);
18978 #else
18979 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18980 #endif
18982 break;
18983 case AARCH64_CMODEL_LARGE:
18984 if (opts->x_flag_pic)
18985 sorry ("code model %qs with %<-f%s%>", "large",
18986 opts->x_flag_pic > 1 ? "PIC" : "pic");
18987 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18988 sorry ("code model %qs not supported in ilp32 mode", "large");
18989 break;
18990 case AARCH64_CMODEL_TINY_PIC:
18991 case AARCH64_CMODEL_SMALL_PIC:
18992 case AARCH64_CMODEL_SMALL_SPIC:
18993 gcc_unreachable ();
18997 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18998 using the information saved in PTR. */
19000 static void
19001 aarch64_option_restore (struct gcc_options *opts,
19002 struct gcc_options * /* opts_set */,
19003 struct cl_target_option * /* ptr */)
19005 aarch64_override_options_internal (opts);
19008 /* Implement TARGET_OPTION_PRINT. */
19010 static void
19011 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
19013 const struct processor *cpu
19014 = aarch64_get_tune_cpu (ptr->x_selected_tune);
19015 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
19016 std::string extension
19017 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
19018 arch->flags);
19020 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
19021 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
19022 arch->name, extension.c_str ());
19025 static GTY(()) tree aarch64_previous_fndecl;
19027 void
19028 aarch64_reset_previous_fndecl (void)
19030 aarch64_previous_fndecl = NULL;
19033 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19034 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19035 make sure optab availability predicates are recomputed when necessary. */
19037 void
19038 aarch64_save_restore_target_globals (tree new_tree)
19040 if (TREE_TARGET_GLOBALS (new_tree))
19041 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
19042 else if (new_tree == target_option_default_node)
19043 restore_target_globals (&default_target_globals);
19044 else
19045 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
19048 /* Return the target_option_node for FNDECL, or the current options
19049 if FNDECL is null. */
19051 static tree
19052 aarch64_fndecl_options (tree fndecl)
19054 if (!fndecl)
19055 return target_option_current_node;
19057 if (tree options = DECL_FUNCTION_SPECIFIC_TARGET (fndecl))
19058 return options;
19060 return target_option_default_node;
19063 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
19064 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
19065 of the function, if such exists. This function may be called multiple
19066 times on a single function so use aarch64_previous_fndecl to avoid
19067 setting up identical state. */
19069 static void
19070 aarch64_set_current_function (tree fndecl)
19072 tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
19073 tree new_tree = aarch64_fndecl_options (fndecl);
19075 auto new_isa_mode = (fndecl
19076 ? aarch64_fndecl_isa_mode (fndecl)
19077 : AARCH64_FL_DEFAULT_ISA_MODE);
19078 auto isa_flags = TREE_TARGET_OPTION (new_tree)->x_aarch64_isa_flags;
19080 static bool reported_zt0_p;
19081 if (!reported_zt0_p
19082 && !(isa_flags & AARCH64_FL_SME2)
19083 && fndecl
19084 && aarch64_fndecl_has_state (fndecl, "zt0"))
19086 error ("functions with %qs state require the ISA extension %qs",
19087 "zt0", "sme2");
19088 inform (input_location, "you can enable %qs using the command-line"
19089 " option %<-march%>, or by using the %<target%>"
19090 " attribute or pragma", "sme2");
19091 reported_zt0_p = true;
19094 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
19095 the default have been handled by aarch64_save_restore_target_globals from
19096 aarch64_pragma_target_parse. */
19097 if (old_tree == new_tree
19098 && (!fndecl || aarch64_previous_fndecl)
19099 && (isa_flags & AARCH64_FL_ISA_MODES) == new_isa_mode)
19101 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19102 return;
19105 aarch64_previous_fndecl = fndecl;
19107 /* First set the target options. */
19108 cl_target_option_restore (&global_options, &global_options_set,
19109 TREE_TARGET_OPTION (new_tree));
19111 /* The ISA mode can vary based on function type attributes and
19112 function declaration attributes. Make sure that the target
19113 options correctly reflect these attributes. */
19114 if ((isa_flags & AARCH64_FL_ISA_MODES) != new_isa_mode)
19116 auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
19117 aarch64_set_asm_isa_flags (base_flags | new_isa_mode);
19119 aarch64_override_options_internal (&global_options);
19120 new_tree = build_target_option_node (&global_options,
19121 &global_options_set);
19122 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;
19124 tree new_optimize = build_optimization_node (&global_options,
19125 &global_options_set);
19126 if (new_optimize != optimization_default_node)
19127 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19130 aarch64_save_restore_target_globals (new_tree);
19132 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
19135 /* Enum describing the various ways we can handle attributes.
19136 In many cases we can reuse the generic option handling machinery. */
19138 enum aarch64_attr_opt_type
19140 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
19141 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
19142 aarch64_attr_enum, /* Attribute sets an enum variable. */
19143 aarch64_attr_custom /* Attribute requires a custom handling function. */
19146 /* All the information needed to handle a target attribute.
19147 NAME is the name of the attribute.
19148 ATTR_TYPE specifies the type of behavior of the attribute as described
19149 in the definition of enum aarch64_attr_opt_type.
19150 ALLOW_NEG is true if the attribute supports a "no-" form.
19151 HANDLER is the function that takes the attribute string as an argument.
19152 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
19153 OPT_NUM is the enum specifying the option that the attribute modifies.
19154 This is needed for attributes that mirror the behavior of a command-line
19155 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
19156 aarch64_attr_enum. */
19158 struct aarch64_attribute_info
19160 const char *name;
19161 enum aarch64_attr_opt_type attr_type;
19162 bool allow_neg;
19163 bool (*handler) (const char *);
19164 enum opt_code opt_num;
19167 /* Handle the ARCH_STR argument to the arch= target attribute. */
19169 static bool
19170 aarch64_handle_attr_arch (const char *str)
19172 const struct processor *tmp_arch = NULL;
19173 std::string invalid_extension;
19174 aarch64_feature_flags tmp_flags;
19175 enum aarch_parse_opt_result parse_res
19176 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
19178 if (parse_res == AARCH_PARSE_OK)
19180 gcc_assert (tmp_arch);
19181 selected_arch = tmp_arch->arch;
19182 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19183 return true;
19186 switch (parse_res)
19188 case AARCH_PARSE_MISSING_ARG:
19189 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19190 break;
19191 case AARCH_PARSE_INVALID_ARG:
19192 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
19193 aarch64_print_hint_for_arch (str);
19194 break;
19195 case AARCH_PARSE_INVALID_FEATURE:
19196 error ("invalid feature modifier %s of value %qs in "
19197 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19198 aarch64_print_hint_for_extensions (invalid_extension);
19199 break;
19200 default:
19201 gcc_unreachable ();
19204 return false;
19207 /* Handle the argument CPU_STR to the cpu= target attribute. */
19209 static bool
19210 aarch64_handle_attr_cpu (const char *str)
19212 const struct processor *tmp_cpu = NULL;
19213 std::string invalid_extension;
19214 aarch64_feature_flags tmp_flags;
19215 enum aarch_parse_opt_result parse_res
19216 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
19218 if (parse_res == AARCH_PARSE_OK)
19220 gcc_assert (tmp_cpu);
19221 selected_tune = tmp_cpu->ident;
19222 selected_arch = tmp_cpu->arch;
19223 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19224 return true;
19227 switch (parse_res)
19229 case AARCH_PARSE_MISSING_ARG:
19230 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19231 break;
19232 case AARCH_PARSE_INVALID_ARG:
19233 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
19234 aarch64_print_hint_for_core (str);
19235 break;
19236 case AARCH_PARSE_INVALID_FEATURE:
19237 error ("invalid feature modifier %qs of value %qs in "
19238 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19239 aarch64_print_hint_for_extensions (invalid_extension);
19240 break;
19241 default:
19242 gcc_unreachable ();
19245 return false;
19248 /* Handle the argument STR to the branch-protection= attribute. */
19250 static bool
19251 aarch64_handle_attr_branch_protection (const char* str)
19253 return aarch_validate_mbranch_protection (aarch64_branch_protect_types, str,
19254 "target(\"branch-protection=\")");
19257 /* Handle the argument STR to the tune= target attribute. */
19259 static bool
19260 aarch64_handle_attr_tune (const char *str)
19262 const struct processor *tmp_tune = NULL;
19263 enum aarch_parse_opt_result parse_res
19264 = aarch64_parse_tune (str, &tmp_tune);
19266 if (parse_res == AARCH_PARSE_OK)
19268 gcc_assert (tmp_tune);
19269 selected_tune = tmp_tune->ident;
19270 return true;
19273 switch (parse_res)
19275 case AARCH_PARSE_INVALID_ARG:
19276 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
19277 aarch64_print_hint_for_core (str);
19278 break;
19279 default:
19280 gcc_unreachable ();
19283 return false;
19286 /* Parse an architecture extension target attribute string specified in STR.
19287 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19288 if successful. Update aarch64_isa_flags to reflect the ISA features
19289 modified. */
19291 static bool
19292 aarch64_handle_attr_isa_flags (char *str)
19294 enum aarch_parse_opt_result parse_res;
19295 auto isa_flags = aarch64_asm_isa_flags;
19297 /* We allow "+nothing" in the beginning to clear out all architectural
19298 features if the user wants to handpick specific features. */
19299 if (strncmp ("+nothing", str, 8) == 0)
19301 isa_flags = AARCH64_ISA_MODE;
19302 str += 8;
19305 std::string invalid_extension;
19306 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
19308 if (parse_res == AARCH_PARSE_OK)
19310 aarch64_set_asm_isa_flags (isa_flags);
19311 return true;
19314 switch (parse_res)
19316 case AARCH_PARSE_MISSING_ARG:
19317 error ("missing value in %<target()%> pragma or attribute");
19318 break;
19320 case AARCH_PARSE_INVALID_FEATURE:
19321 error ("invalid feature modifier %qs of value %qs in "
19322 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19323 break;
19325 default:
19326 gcc_unreachable ();
19329 return false;
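/* Illustrative example (editor's note): an attribute string such as
   "+nothing+fp" first resets the feature set to just the ISA mode via the
   "+nothing" handling above and then re-enables the fp extension through
   aarch64_parse_extension.  */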
19332 /* The target attributes that we support. On top of these we also support just
19333 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19334 handled explicitly in aarch64_process_one_target_attr. */
19336 static const struct aarch64_attribute_info aarch64_attributes[] =
19338 { "general-regs-only", aarch64_attr_mask, false, NULL,
19339 OPT_mgeneral_regs_only },
19340 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
19341 OPT_mfix_cortex_a53_835769 },
19342 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
19343 OPT_mfix_cortex_a53_843419 },
19344 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
19345 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
19346 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
19347 OPT_momit_leaf_frame_pointer },
19348 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
19349 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
19350 OPT_march_ },
19351 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
19352 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
19353 OPT_mtune_ },
19354 { "branch-protection", aarch64_attr_custom, false,
19355 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
19356 { "sign-return-address", aarch64_attr_enum, false, NULL,
19357 OPT_msign_return_address_ },
19358 { "outline-atomics", aarch64_attr_bool, true, NULL,
19359 OPT_moutline_atomics},
19360 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
19363 /* Parse ARG_STR, which contains the definition of one target attribute.
19364 Show appropriate errors if any, or return true if the attribute is valid. */
19366 static bool
19367 aarch64_process_one_target_attr (char *arg_str)
19369 bool invert = false;
19371 size_t len = strlen (arg_str);
19373 if (len == 0)
19375 error ("malformed %<target()%> pragma or attribute");
19376 return false;
19379 char *str_to_check = (char *) alloca (len + 1);
19380 strcpy (str_to_check, arg_str);
19382 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19383 It is easier to detect and handle it explicitly here rather than going
19384 through the machinery for the rest of the target attributes in this
19385 function. */
19386 if (*str_to_check == '+')
19387 return aarch64_handle_attr_isa_flags (str_to_check);
19389 if (len > 3 && startswith (str_to_check, "no-"))
19391 invert = true;
19392 str_to_check += 3;
19394 char *arg = strchr (str_to_check, '=');
19396 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19397 and point ARG to "foo". */
19398 if (arg)
19400 *arg = '\0';
19401 arg++;
19403 const struct aarch64_attribute_info *p_attr;
19404 bool found = false;
19405 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
19407 /* If the names don't match up, or the user has given an argument
19408 to an attribute that doesn't accept one, or didn't give an argument
19409 to an attribute that expects one, fail to match. */
19410 if (strcmp (str_to_check, p_attr->name) != 0)
19411 continue;
19413 found = true;
19414 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
19415 || p_attr->attr_type == aarch64_attr_enum;
19417 if (attr_need_arg_p ^ (arg != NULL))
19419 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
19420 return false;
19423 /* If the name matches but the attribute does not allow "no-" versions
19424 then we can't match. */
19425 if (invert && !p_attr->allow_neg)
19427 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
19428 return false;
19431 switch (p_attr->attr_type)
19433 /* Has a custom handler registered.
19434 For example, cpu=, arch=, tune=. */
19435 case aarch64_attr_custom:
19436 gcc_assert (p_attr->handler);
19437 if (!p_attr->handler (arg))
19438 return false;
19439 break;
19441 /* Either set or unset a boolean option. */
19442 case aarch64_attr_bool:
19444 struct cl_decoded_option decoded;
19446 generate_option (p_attr->opt_num, NULL, !invert,
19447 CL_TARGET, &decoded);
19448 aarch64_handle_option (&global_options, &global_options_set,
19449 &decoded, input_location);
19450 break;
19452 /* Set or unset a bit in the target_flags. aarch64_handle_option
19453 should know what mask to apply given the option number. */
19454 case aarch64_attr_mask:
19456 struct cl_decoded_option decoded;
19457 /* We only need to specify the option number.
19458 aarch64_handle_option will know which mask to apply. */
19459 decoded.opt_index = p_attr->opt_num;
19460 decoded.value = !invert;
19461 aarch64_handle_option (&global_options, &global_options_set,
19462 &decoded, input_location);
19463 break;
19465 /* Use the option setting machinery to set an option to an enum. */
19466 case aarch64_attr_enum:
19468 gcc_assert (arg);
19469 bool valid;
19470 int value;
19471 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19472 &value, CL_TARGET);
19473 if (valid)
19475 set_option (&global_options, NULL, p_attr->opt_num, value,
19476 NULL, DK_UNSPECIFIED, input_location,
19477 global_dc);
19479 else
19481 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19483 break;
19485 default:
19486 gcc_unreachable ();
19490 /* If we reached here we either have found an attribute and validated
19491 it or didn't match any. If we matched an attribute but its arguments
19492 were malformed we will have returned false already. */
19493 return found;
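/* Illustrative example (editor's note): for the attribute string
   "no-omit-leaf-frame-pointer", the "no-" prefix sets INVERT, the remainder
   matches the aarch64_attr_bool entry for "omit-leaf-frame-pointer" (which
   allows negation), and generate_option is then called with !INVERT,
   i.e. the option is turned off.  */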
19496 /* Count how many times the character C appears in
19497 NULL-terminated string STR. */
19499 static unsigned int
19500 num_occurences_in_str (char c, char *str)
19502 unsigned int res = 0;
19503 while (*str != '\0')
19505 if (*str == c)
19506 res++;
19508 str++;
19511 return res;
19514 /* Parse the tree in ARGS that contains the target attribute information
19515 and update the global target options space. */
19517 bool
19518 aarch64_process_target_attr (tree args)
19520 if (TREE_CODE (args) == TREE_LIST)
19524 tree head = TREE_VALUE (args);
19525 if (head)
19527 if (!aarch64_process_target_attr (head))
19528 return false;
19530 args = TREE_CHAIN (args);
19531 } while (args);
19533 return true;
19536 if (TREE_CODE (args) != STRING_CST)
19538 error ("attribute %<target%> argument not a string");
19539 return false;
19542 size_t len = strlen (TREE_STRING_POINTER (args));
19543 char *str_to_check = (char *) alloca (len + 1);
19544 strcpy (str_to_check, TREE_STRING_POINTER (args));
19546 if (len == 0)
19548 error ("malformed %<target()%> pragma or attribute");
19549 return false;
19552 /* Used to catch empty entries between commas, e.g.
19553 attribute ((target ("attr1,,attr2"))). */
19554 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19556 /* Handle multiple target attributes separated by ','. */
19557 char *token = strtok_r (str_to_check, ",", &str_to_check);
19559 unsigned int num_attrs = 0;
19560 while (token)
19562 num_attrs++;
19563 if (!aarch64_process_one_target_attr (token))
19565 /* Check if token is possibly an arch extension without
19566 leading '+'. */
19567 aarch64_feature_flags isa_temp = 0;
19568 auto with_plus = std::string ("+") + token;
19569 enum aarch_parse_opt_result ext_res
19570 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19572 if (ext_res == AARCH_PARSE_OK)
19573 error ("arch extension %<%s%> should be prefixed by %<+%>",
19574 token);
19575 else
19576 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19577 return false;
19580 token = strtok_r (NULL, ",", &str_to_check);
19583 if (num_attrs != num_commas + 1)
19585 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19586 return false;
19589 return true;
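/* For illustration, the strings handled above follow the documented AArch64
   "target" attribute syntax: comma-separated tokens such as "arch=<arch>",
   "cpu=<cpu>", "tune=<cpu>" and "+<extension>", e.g.

     __attribute__ ((target ("arch=armv8.2-a+sve,tune=cortex-a75")))
     void f (void);

   A bare extension name must carry its leading '+' (e.g. "+crc"); the
   diagnostic above suggests exactly that when the prefix is missing. */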
19592 static bool aarch64_process_target_version_attr (tree args);
19594 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19595 process attribute ((target ("..."))). */
19597 static bool
19598 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19600 struct cl_target_option cur_target;
19601 bool ret;
19602 tree old_optimize;
19603 tree new_target, new_optimize;
19604 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19606 /* If what we're processing is the current pragma string then the
19607 target option node is already stored in target_option_current_node
19608 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19609 having to re-parse the string. This is especially useful to keep
19610 arm_neon.h compile times down since that header contains a lot
19611 of intrinsics enclosed in pragmas. */
19612 if (!existing_target && args == current_target_pragma)
19614 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19615 return true;
19617 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19619 old_optimize
19620 = build_optimization_node (&global_options, &global_options_set);
19621 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19623 /* If the function changed the optimization levels as well as setting
19624 target options, start with the optimizations specified. */
19625 if (func_optimize && func_optimize != old_optimize)
19626 cl_optimization_restore (&global_options, &global_options_set,
19627 TREE_OPTIMIZATION (func_optimize));
19629 /* Save the current target options to restore at the end. */
19630 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19632 /* If fndecl already has some target attributes applied to it, unpack
19633 them so that we add this attribute on top of them, rather than
19634 overwriting them. */
19635 if (existing_target)
19637 struct cl_target_option *existing_options
19638 = TREE_TARGET_OPTION (existing_target);
19640 if (existing_options)
19641 cl_target_option_restore (&global_options, &global_options_set,
19642 existing_options);
19644 else
19645 cl_target_option_restore (&global_options, &global_options_set,
19646 TREE_TARGET_OPTION (target_option_current_node));
19648 ret = aarch64_process_target_attr (args);
19649 if (ret)
19651 tree version_attr = lookup_attribute ("target_version",
19652 DECL_ATTRIBUTES (fndecl));
19653 if (version_attr != NULL_TREE)
19655 /* Reapply any target_version attribute after target attribute.
19656 This should be equivalent to applying the target_version once
19657 after processing all target attributes. */
19658 tree version_args = TREE_VALUE (version_attr);
19659 ret = aarch64_process_target_version_attr (version_args);
19663 /* Set up any additional state. */
19664 if (ret)
19666 aarch64_override_options_internal (&global_options);
19667 new_target = build_target_option_node (&global_options,
19668 &global_options_set);
19670 else
19671 new_target = NULL;
19673 new_optimize = build_optimization_node (&global_options,
19674 &global_options_set);
19676 if (fndecl && ret)
19678 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19680 if (old_optimize != new_optimize)
19681 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19684 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19686 if (old_optimize != new_optimize)
19687 cl_optimization_restore (&global_options, &global_options_set,
19688 TREE_OPTIMIZATION (old_optimize));
19689 return ret;
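/* Because the code above restores the options from an existing target node
   (or from target_option_current_node) before parsing ARGS, an attribute is
   layered on top of the surrounding state rather than replacing it. For
   example, something like

     #pragma GCC target ("arch=armv8.2-a")
     __attribute__ ((target ("+sve")))
     void g (void);

   should leave g compiled for Armv8.2-A with SVE enabled in addition. */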
19692 typedef unsigned long long aarch64_fmv_feature_mask;
19694 typedef struct
19696 const char *name;
19697 aarch64_fmv_feature_mask feature_mask;
19698 aarch64_feature_flags opt_flags;
19699 } aarch64_fmv_feature_datum;
19701 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19702 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19704 /* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
19705 feature_deps name. */
19706 #define FEAT_RDMA FEAT_RDM
19708 /* FMV features are listed in priority order, to make it easier to sort target
19709 strings. */
19710 static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
19711 #include "config/aarch64/aarch64-option-extensions.def"
19714 /* Parse a function multiversioning feature string STR, as found in a
19715 target_version or target_clones attribute.
19717 If ISA_FLAGS is nonnull, then update it with the specified architecture
19718 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19719 representing the set of features explicitly specified in the feature string.
19720 Return an aarch_parse_opt_result describing the result.
19722 When STR contains an invalid or duplicate extension, a copy of
19723 the extension string is created and stored in INVALID_EXTENSION. */
19725 static enum aarch_parse_opt_result
19726 aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
19727 aarch64_fmv_feature_mask *feature_mask,
19728 std::string *invalid_extension)
19730 if (feature_mask)
19731 *feature_mask = 0ULL;
19733 if (strcmp (str, "default") == 0)
19734 return AARCH_PARSE_OK;
19736 while (str != NULL && *str != 0)
19738 const char *ext;
19739 size_t len;
19741 ext = strchr (str, '+');
19743 if (ext != NULL)
19744 len = ext - str;
19745 else
19746 len = strlen (str);
19748 if (len == 0)
19749 return AARCH_PARSE_MISSING_ARG;
19751 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19752 int i;
19753 for (i = 0; i < num_features; i++)
19755 if (strlen (aarch64_fmv_feature_data[i].name) == len
19756 && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
19758 if (isa_flags)
19759 *isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
19760 if (feature_mask)
19762 auto old_feature_mask = *feature_mask;
19763 *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
19764 if (*feature_mask == old_feature_mask)
19766 /* Duplicate feature. */
19767 if (invalid_extension)
19768 *invalid_extension = std::string (str, len);
19769 return AARCH_PARSE_DUPLICATE_FEATURE;
19772 break;
19776 if (i == num_features)
19778 /* Feature not found in list. */
19779 if (invalid_extension)
19780 *invalid_extension = std::string (str, len);
19781 return AARCH_PARSE_INVALID_FEATURE;
19784 str = ext;
19785 if (str)
19786 /* Skip over the next '+'. */
19787 str++;
19790 return AARCH_PARSE_OK;
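/* As an example of the strings accepted here: "default" selects the default
   version, while other versions list one or more FMV feature names joined
   by '+', as in

     __attribute__ ((target_version ("sve2"))) int f (void);
     __attribute__ ((target_version ("dotprod+fp16"))) int f (void);

   Repeating a feature (e.g. "sve+sve") is reported as a duplicate, and an
   unknown name as an invalid feature, via the callers' diagnostics. */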
19793 /* Parse the tree in ARGS that contains the target_version attribute
19794 information and update the global target options space. */
19796 static bool
19797 aarch64_process_target_version_attr (tree args)
19799 if (TREE_CODE (args) == TREE_LIST)
19801 if (TREE_CHAIN (args))
19803 error ("attribute %<target_version%> has multiple values");
19804 return false;
19806 args = TREE_VALUE (args);
19809 if (!args || TREE_CODE (args) != STRING_CST)
19811 error ("attribute %<target_version%> argument not a string");
19812 return false;
19815 const char *str = TREE_STRING_POINTER (args);
19817 enum aarch_parse_opt_result parse_res;
19818 auto isa_flags = aarch64_asm_isa_flags;
19820 std::string invalid_extension;
19821 parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
19822 &invalid_extension);
19824 if (parse_res == AARCH_PARSE_OK)
19826 aarch64_set_asm_isa_flags (isa_flags);
19827 return true;
19830 switch (parse_res)
19832 case AARCH_PARSE_MISSING_ARG:
19833 error ("missing value in %<target_version%> attribute");
19834 break;
19836 case AARCH_PARSE_INVALID_FEATURE:
19837 error ("invalid feature modifier %qs of value %qs in "
19838 "%<target_version%> attribute", invalid_extension.c_str (),
19839 str);
19840 break;
19842 case AARCH_PARSE_DUPLICATE_FEATURE:
19843 error ("duplicate feature modifier %qs of value %qs in "
19844 "%<target_version%> attribute", invalid_extension.c_str (),
19845 str);
19846 break;
19848 default:
19849 gcc_unreachable ();
19852 return false;
19855 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
19856 process attribute ((target_version ("..."))). */
19858 static bool
19859 aarch64_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
19861 struct cl_target_option cur_target;
19862 bool ret;
19863 tree new_target;
19864 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19866 /* Save the current target options to restore at the end. */
19867 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19869 /* If fndecl already has some target attributes applied to it, unpack
19870 them so that we add this attribute on top of them, rather than
19871 overwriting them. */
19872 if (existing_target)
19874 struct cl_target_option *existing_options
19875 = TREE_TARGET_OPTION (existing_target);
19877 if (existing_options)
19878 cl_target_option_restore (&global_options, &global_options_set,
19879 existing_options);
19881 else
19882 cl_target_option_restore (&global_options, &global_options_set,
19883 TREE_TARGET_OPTION (target_option_current_node));
19885 ret = aarch64_process_target_version_attr (args);
19887 /* Set up any additional state. */
19888 if (ret)
19890 aarch64_override_options_internal (&global_options);
19891 new_target = build_target_option_node (&global_options,
19892 &global_options_set);
19894 else
19895 new_target = NULL;
19897 if (fndecl && ret)
19898 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19900 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19902 return ret;
19905 /* This parses the attribute arguments to target_version in DECL and returns the
19906 feature mask required to select those targets. No adjustments are made to
19907 add or remove redundant feature requirements. */
19909 static aarch64_fmv_feature_mask
19910 get_feature_mask_for_version (tree decl)
19912 tree version_attr = lookup_attribute ("target_version",
19913 DECL_ATTRIBUTES (decl));
19914 if (version_attr == NULL)
19915 return 0;
19917 const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
19918 (version_attr)));
19919 enum aarch_parse_opt_result parse_res;
19920 aarch64_fmv_feature_mask feature_mask;
19922 parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
19923 NULL);
19925 /* We should have detected any errors before getting here. */
19926 gcc_assert (parse_res == AARCH_PARSE_OK);
19928 return feature_mask;
19931 /* Compare priorities of two feature masks. Return:
19932 1: mask1 is higher priority
19933 -1: mask2 is higher priority
19934 0: masks are equal. */
19936 static int
19937 compare_feature_masks (aarch64_fmv_feature_mask mask1,
19938 aarch64_fmv_feature_mask mask2)
19940 int pop1 = popcount_hwi (mask1);
19941 int pop2 = popcount_hwi (mask2);
19942 if (pop1 > pop2)
19943 return 1;
19944 if (pop2 > pop1)
19945 return -1;
19947 auto diff_mask = mask1 ^ mask2;
19948 if (diff_mask == 0ULL)
19949 return 0;
19950 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19951 for (int i = num_features - 1; i >= 0; i--)
19953 auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
19954 if (diff_mask & bit_mask)
19955 return (mask1 & bit_mask) ? 1 : -1;
19957 gcc_unreachable();
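/* For instance, a mask with bits for {sve, sve2} (population count 2)
   outranks a mask with only {crc} (population count 1); when two masks have
   the same population count, the one containing the differing feature that
   appears latest in aarch64_fmv_feature_data (i.e. the highest-priority
   feature) wins. */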
19960 /* Compare priorities of two version decls. */
19962 int
19963 aarch64_compare_version_priority (tree decl1, tree decl2)
19965 auto mask1 = get_feature_mask_for_version (decl1);
19966 auto mask2 = get_feature_mask_for_version (decl2);
19968 return compare_feature_masks (mask1, mask2);
19971 /* Build the struct __ifunc_arg_t type:
19973 struct __ifunc_arg_t
19975 unsigned long _size; // Size of the struct, so it can grow.
19976 unsigned long _hwcap;
19977 unsigned long _hwcap2;
19981 static tree
19982 build_ifunc_arg_type ()
19984 tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
19985 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19986 get_identifier ("_size"),
19987 long_unsigned_type_node);
19988 tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19989 get_identifier ("_hwcap"),
19990 long_unsigned_type_node);
19991 tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19992 get_identifier ("_hwcap2"),
19993 long_unsigned_type_node);
19995 DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
19996 DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
19997 DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
19999 TYPE_FIELDS (ifunc_arg_type) = field1;
20000 DECL_CHAIN (field1) = field2;
20001 DECL_CHAIN (field2) = field3;
20003 layout_type (ifunc_arg_type);
20005 tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
20006 tree pointer_type = build_pointer_type (const_type);
20008 return pointer_type;
20011 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20012 suffixes. */
20014 tree
20015 aarch64_mangle_decl_assembler_name (tree decl, tree id)
20017 /* For function version, add the target suffix to the assembler name. */
20018 if (TREE_CODE (decl) == FUNCTION_DECL
20019 && DECL_FUNCTION_VERSIONED (decl))
20021 aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);
20023 std::string name = IDENTIFIER_POINTER (id);
20025 /* For the default version, append ".default". */
20026 if (feature_mask == 0ULL)
20028 name += ".default";
20029 return get_identifier (name.c_str());
20032 name += "._";
20034 int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
20035 for (int i = 0; i < num_features; i++)
20037 if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
20039 name += "M";
20040 name += aarch64_fmv_feature_data[i].name;
20044 if (DECL_ASSEMBLER_NAME_SET_P (decl))
20045 SET_DECL_RTL (decl, NULL);
20047 id = get_identifier (name.c_str());
20049 return id;
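/* To illustrate the scheme implemented above: for a versioned function whose
   base assembler name is "foo", the default version becomes "foo.default",
   while a version declared with target_version ("sve") becomes "foo._Msve".
   When several features are required, each one contributes an "M<name>"
   chunk, emitted in the order the features appear in
   aarch64_fmv_feature_data rather than in the attribute string. */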
20052 /* Return an identifier formed by appending SUFFIX to the base assembler
20053 name of a versioned function. The base name is the default version's
20054 assembler name with any existing ".default" suffix stripped off. */
20056 static tree
20057 get_suffixed_assembler_name (tree default_decl, const char *suffix)
20059 std::string name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl));
20061 auto size = name.size ();
20062 if (size >= 8 && name.compare (size - 8, 8, ".default") == 0)
20063 name.resize (size - 8);
20064 name += suffix;
20065 return get_identifier (name.c_str());
20068 /* Make the resolver function decl to dispatch the versions of
20069 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20070 ifunc alias that will point to the created resolver. Create an
20071 empty basic block in the resolver and store the pointer in
20072 EMPTY_BB. Return the decl of the resolver function. */
20074 static tree
20075 make_resolver_func (const tree default_decl,
20076 const tree ifunc_alias_decl,
20077 basic_block *empty_bb)
20079 tree decl, type, t;
20081 /* Create resolver function name based on default_decl. We need to remove an
20082 existing ".default" suffix if this has already been appended. */
20083 tree decl_name = get_suffixed_assembler_name (default_decl, ".resolver");
20084 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
20086 /* The resolver function should have signature
20087 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20088 type = build_function_type_list (ptr_type_node,
20089 uint64_type_node,
20090 build_ifunc_arg_type (),
20091 NULL_TREE);
20093 decl = build_fn_decl (resolver_name, type);
20094 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
20096 DECL_NAME (decl) = decl_name;
20097 TREE_USED (decl) = 1;
20098 DECL_ARTIFICIAL (decl) = 1;
20099 DECL_IGNORED_P (decl) = 1;
20100 TREE_PUBLIC (decl) = 0;
20101 DECL_UNINLINABLE (decl) = 1;
20103 /* Resolver is not external, body is generated. */
20104 DECL_EXTERNAL (decl) = 0;
20105 DECL_EXTERNAL (ifunc_alias_decl) = 0;
20107 DECL_CONTEXT (decl) = NULL_TREE;
20108 DECL_INITIAL (decl) = make_node (BLOCK);
20109 DECL_STATIC_CONSTRUCTOR (decl) = 0;
20111 if (DECL_COMDAT_GROUP (default_decl)
20112 || TREE_PUBLIC (default_decl))
20114 /* In this case, each translation unit with a call to this
20115 versioned function will put out a resolver. Ensure it
20116 is comdat to keep just one copy. */
20117 DECL_COMDAT (decl) = 1;
20118 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
20120 else
20121 TREE_PUBLIC (ifunc_alias_decl) = 0;
20123 /* Build result decl and add to function_decl. */
20124 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
20125 DECL_CONTEXT (t) = decl;
20126 DECL_ARTIFICIAL (t) = 1;
20127 DECL_IGNORED_P (t) = 1;
20128 DECL_RESULT (decl) = t;
20130 /* Build parameter decls and add to function_decl. */
20131 tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20132 get_identifier ("hwcap"),
20133 uint64_type_node);
20134 tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
20135 get_identifier ("arg"),
20136 build_ifunc_arg_type());
20137 DECL_CONTEXT (arg1) = decl;
20138 DECL_CONTEXT (arg2) = decl;
20139 DECL_ARTIFICIAL (arg1) = 1;
20140 DECL_ARTIFICIAL (arg2) = 1;
20141 DECL_IGNORED_P (arg1) = 1;
20142 DECL_IGNORED_P (arg2) = 1;
20143 DECL_ARG_TYPE (arg1) = uint64_type_node;
20144 DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
20145 DECL_ARGUMENTS (decl) = arg1;
20146 TREE_CHAIN (arg1) = arg2;
20148 gimplify_function_tree (decl);
20149 push_cfun (DECL_STRUCT_FUNCTION (decl));
20150 *empty_bb = init_lowered_empty_function (decl, false,
20151 profile_count::uninitialized ());
20153 cgraph_node::add_new_function (decl, true);
20154 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
20156 pop_cfun ();
20158 gcc_assert (ifunc_alias_decl != NULL);
20159 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20160 DECL_ATTRIBUTES (ifunc_alias_decl)
20161 = make_attribute ("ifunc", resolver_name,
20162 DECL_ATTRIBUTES (ifunc_alias_decl));
20164 /* Create the alias for dispatch to resolver here. */
20165 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
20166 return decl;
20169 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20170 to return a pointer to VERSION_DECL if none of the feature bits specified in
20171 FEATURE_MASK are set in MASK_VAR. This function will be called during
20172 version dispatch to decide which function version to execute. It returns
20173 the basic block at the end, to which more conditions can be added. */
20174 static basic_block
20175 add_condition_to_bb (tree function_decl, tree version_decl,
20176 aarch64_fmv_feature_mask feature_mask,
20177 tree mask_var, basic_block new_bb)
20179 gimple *return_stmt;
20180 tree convert_expr, result_var;
20181 gimple *convert_stmt;
20182 gimple *if_else_stmt;
20184 basic_block bb1, bb2, bb3;
20185 edge e12, e23;
20187 gimple_seq gseq;
20189 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
20191 gcc_assert (new_bb != NULL);
20192 gseq = bb_seq (new_bb);
20194 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
20195 build_fold_addr_expr (version_decl));
20196 result_var = create_tmp_var (ptr_type_node);
20197 convert_stmt = gimple_build_assign (result_var, convert_expr);
20198 return_stmt = gimple_build_return (result_var);
20200 if (feature_mask == 0ULL)
20202 /* Default version. */
20203 gimple_seq_add_stmt (&gseq, convert_stmt);
20204 gimple_seq_add_stmt (&gseq, return_stmt);
20205 set_bb_seq (new_bb, gseq);
20206 gimple_set_bb (convert_stmt, new_bb);
20207 gimple_set_bb (return_stmt, new_bb);
20208 pop_cfun ();
20209 return new_bb;
20212 tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
20213 tree and_expr = build2 (BIT_AND_EXPR,
20214 long_long_unsigned_type_node,
20215 mask_var,
20216 build_int_cst (long_long_unsigned_type_node,
20217 feature_mask));
20218 gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
20219 gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
20220 gimple_set_bb (and_stmt, new_bb);
20221 gimple_seq_add_stmt (&gseq, and_stmt);
20223 tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
20224 if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
20225 NULL_TREE, NULL_TREE);
20226 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
20227 gimple_set_bb (if_else_stmt, new_bb);
20228 gimple_seq_add_stmt (&gseq, if_else_stmt);
20230 gimple_seq_add_stmt (&gseq, convert_stmt);
20231 gimple_seq_add_stmt (&gseq, return_stmt);
20232 set_bb_seq (new_bb, gseq);
20234 bb1 = new_bb;
20235 e12 = split_block (bb1, if_else_stmt);
20236 bb2 = e12->dest;
20237 e12->flags &= ~EDGE_FALLTHRU;
20238 e12->flags |= EDGE_TRUE_VALUE;
20240 e23 = split_block (bb2, return_stmt);
20242 gimple_set_bb (convert_stmt, bb2);
20243 gimple_set_bb (return_stmt, bb2);
20245 bb3 = e23->dest;
20246 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
20248 remove_edge (e23);
20249 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
20251 pop_cfun ();
20253 return bb3;
20256 /* This function generates the dispatch function for
20257 multi-versioned functions. DISPATCH_DECL is the function which will
20258 contain the dispatch logic. FNDECLS are the function choices for
20259 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20260 in DISPATCH_DECL in which the dispatch code is generated. */
20262 static int
20263 dispatch_function_versions (tree dispatch_decl,
20264 void *fndecls_p,
20265 basic_block *empty_bb)
20267 gimple *ifunc_cpu_init_stmt;
20268 gimple_seq gseq;
20269 vec<tree> *fndecls;
20271 gcc_assert (dispatch_decl != NULL
20272 && fndecls_p != NULL
20273 && empty_bb != NULL);
20275 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
20277 gseq = bb_seq (*empty_bb);
20278 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20279 constructors, so explicitly call __init_cpu_features_resolver here. */
20280 tree init_fn_type = build_function_type_list (void_type_node,
20281 long_unsigned_type_node,
20282 build_ifunc_arg_type(),
20283 NULL);
20284 tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
20285 tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
20286 init_fn_id, init_fn_type);
20287 tree arg1 = DECL_ARGUMENTS (dispatch_decl);
20288 tree arg2 = TREE_CHAIN (arg1);
20289 ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
20290 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
20291 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
20293 /* Build the struct type for __aarch64_cpu_features. */
20294 tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
20295 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20296 get_identifier ("features"),
20297 long_long_unsigned_type_node);
20298 DECL_FIELD_CONTEXT (field1) = global_type;
20299 TYPE_FIELDS (global_type) = field1;
20300 layout_type (global_type);
20302 tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
20303 get_identifier ("__aarch64_cpu_features"),
20304 global_type);
20305 DECL_EXTERNAL (global_var) = 1;
20306 tree mask_var = create_tmp_var (long_long_unsigned_type_node);
20308 tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
20309 global_var, field1, NULL_TREE);
20310 gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
20311 gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
20312 gimple_set_bb (component_stmt, *empty_bb);
20313 gimple_seq_add_stmt (&gseq, component_stmt);
20315 tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
20316 gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
20317 gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
20318 gimple_set_bb (not_stmt, *empty_bb);
20319 gimple_seq_add_stmt (&gseq, not_stmt);
20321 set_bb_seq (*empty_bb, gseq);
20323 pop_cfun ();
20325 /* fndecls_p is actually a vector. */
20326 fndecls = static_cast<vec<tree> *> (fndecls_p);
20328 /* At least one more version other than the default. */
20329 unsigned int num_versions = fndecls->length ();
20330 gcc_assert (num_versions >= 2);
20332 struct function_version_info
20334 tree version_decl;
20335 aarch64_fmv_feature_mask feature_mask;
20336 } *function_versions;
20338 function_versions = (struct function_version_info *)
20339 XNEWVEC (struct function_version_info, (num_versions));
20341 unsigned int actual_versions = 0;
20343 for (tree version_decl : *fndecls)
20345 aarch64_fmv_feature_mask feature_mask;
20346 /* Get attribute string, parse it and find the right features. */
20347 feature_mask = get_feature_mask_for_version (version_decl);
20348 function_versions [actual_versions].version_decl = version_decl;
20349 function_versions [actual_versions].feature_mask = feature_mask;
20350 actual_versions++;
20353 auto compare_feature_version_info = [](const void *p1, const void *p2) {
20354 const function_version_info v1 = *(const function_version_info *)p1;
20355 const function_version_info v2 = *(const function_version_info *)p2;
20356 return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
20359 /* Sort the versions according to descending order of dispatch priority. */
20360 qsort (function_versions, actual_versions,
20361 sizeof (struct function_version_info), compare_feature_version_info);
20363 for (unsigned int i = 0; i < actual_versions; ++i)
20364 *empty_bb = add_condition_to_bb (dispatch_decl,
20365 function_versions[i].version_decl,
20366 function_versions[i].feature_mask,
20367 mask_var,
20368 *empty_bb);
20370 free (function_versions);
20371 return 0;
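/* Roughly speaking, the GIMPLE built above and by add_condition_to_bb
   corresponds to a resolver body of the form

     __init_cpu_features_resolver (hwcap, arg);
     t = ~__aarch64_cpu_features.features;
     if ((t & MASK_OF_HIGHEST_PRIORITY_VERSION) == 0)
       return <that version>;
     if ((t & MASK_OF_NEXT_VERSION) == 0)
       return <next version>;
     ...
     return <default version>;

   where each mask is the feature mask computed from that version's
   target_version/target_clones string. */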
20374 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20376 tree
20377 aarch64_generate_version_dispatcher_body (void *node_p)
20379 tree resolver_decl;
20380 basic_block empty_bb;
20381 tree default_ver_decl;
20382 struct cgraph_node *versn;
20383 struct cgraph_node *node;
20385 struct cgraph_function_version_info *node_version_info = NULL;
20386 struct cgraph_function_version_info *versn_info = NULL;
20388 node = (cgraph_node *)node_p;
20390 node_version_info = node->function_version ();
20391 gcc_assert (node->dispatcher_function
20392 && node_version_info != NULL);
20394 if (node_version_info->dispatcher_resolver)
20395 return node_version_info->dispatcher_resolver;
20397 /* The first version in the chain corresponds to the default version. */
20398 default_ver_decl = node_version_info->next->this_node->decl;
20400 /* node is going to be an alias, so remove the finalized bit. */
20401 node->definition = false;
20403 resolver_decl = make_resolver_func (default_ver_decl,
20404 node->decl, &empty_bb);
20406 node_version_info->dispatcher_resolver = resolver_decl;
20408 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
20410 auto_vec<tree, 2> fn_ver_vec;
20412 for (versn_info = node_version_info->next; versn_info;
20413 versn_info = versn_info->next)
20415 versn = versn_info->this_node;
20416 /* Check for virtual functions here again, as by this time it should
20417 have been determined if this function needs a vtable index or
20418 not. This happens for methods in derived classes that override
20419 virtual methods in base classes but are not explicitly marked as
20420 virtual. */
20421 if (DECL_VINDEX (versn->decl))
20422 sorry ("virtual function multiversioning not supported");
20424 fn_ver_vec.safe_push (versn->decl);
20427 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
20428 cgraph_edge::rebuild_edges ();
20429 pop_cfun ();
20431 /* Fix up symbol names. First we need to obtain the base name, which may
20432 have already been mangled. */
20433 tree base_name = get_suffixed_assembler_name (default_ver_decl, "");
20435 /* We need to redo the version mangling on the non-default versions for the
20436 target_clones case. Redoing the mangling for the target_version case is
20437 redundant but does no harm. We need to skip the default version, because
20438 expand_clones will append ".default" later; fortunately that suffix is the
20439 one we want anyway. */
20440 for (versn_info = node_version_info->next->next; versn_info;
20441 versn_info = versn_info->next)
20443 tree version_decl = versn_info->this_node->decl;
20444 tree name = aarch64_mangle_decl_assembler_name (version_decl,
20445 base_name);
20446 symtab->change_decl_assembler_name (version_decl, name);
20449 /* We also need to use the base name for the ifunc declaration. */
20450 symtab->change_decl_assembler_name (node->decl, base_name);
20452 return resolver_decl;
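/* As a concrete picture of the result: for a function foo with a default
   version and an "sve" version, this ends up emitting foo.default and
   foo._Msve (the version bodies), a comdat foo.resolver containing the
   dispatch code built above, and foo itself as an ifunc whose resolver is
   foo.resolver, so ordinary calls to foo bind through the ifunc. */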
20455 /* Make a dispatcher declaration for the multi-versioned function DECL.
20456 Calls to DECL function will be replaced with calls to the dispatcher
20457 by the front-end. Returns the decl of the dispatcher function. */
20459 tree
20460 aarch64_get_function_versions_dispatcher (void *decl)
20462 tree fn = (tree) decl;
20463 struct cgraph_node *node = NULL;
20464 struct cgraph_node *default_node = NULL;
20465 struct cgraph_function_version_info *node_v = NULL;
20466 struct cgraph_function_version_info *first_v = NULL;
20468 tree dispatch_decl = NULL;
20470 struct cgraph_function_version_info *default_version_info = NULL;
20472 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
20474 node = cgraph_node::get (fn);
20475 gcc_assert (node != NULL);
20477 node_v = node->function_version ();
20478 gcc_assert (node_v != NULL);
20480 if (node_v->dispatcher_resolver != NULL)
20481 return node_v->dispatcher_resolver;
20483 /* Find the default version and make it the first node. */
20484 first_v = node_v;
20485 /* Go to the beginning of the chain. */
20486 while (first_v->prev != NULL)
20487 first_v = first_v->prev;
20488 default_version_info = first_v;
20489 while (default_version_info != NULL)
20491 if (get_feature_mask_for_version
20492 (default_version_info->this_node->decl) == 0ULL)
20493 break;
20494 default_version_info = default_version_info->next;
20497 /* If there is no default node, just return NULL. */
20498 if (default_version_info == NULL)
20499 return NULL;
20501 /* Make default info the first node. */
20502 if (first_v != default_version_info)
20504 default_version_info->prev->next = default_version_info->next;
20505 if (default_version_info->next)
20506 default_version_info->next->prev = default_version_info->prev;
20507 first_v->prev = default_version_info;
20508 default_version_info->next = first_v;
20509 default_version_info->prev = NULL;
20512 default_node = default_version_info->this_node;
20514 if (targetm.has_ifunc_p ())
20516 struct cgraph_function_version_info *it_v = NULL;
20517 struct cgraph_node *dispatcher_node = NULL;
20518 struct cgraph_function_version_info *dispatcher_version_info = NULL;
20520 /* Right now, the dispatching is done via ifunc. */
20521 dispatch_decl = make_dispatcher_decl (default_node->decl);
20522 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
20524 dispatcher_node = cgraph_node::get_create (dispatch_decl);
20525 gcc_assert (dispatcher_node != NULL);
20526 dispatcher_node->dispatcher_function = 1;
20527 dispatcher_version_info
20528 = dispatcher_node->insert_new_function_version ();
20529 dispatcher_version_info->next = default_version_info;
20530 dispatcher_node->definition = 1;
20532 /* Set the dispatcher for all the versions. */
20533 it_v = default_version_info;
20534 while (it_v != NULL)
20536 it_v->dispatcher_resolver = dispatch_decl;
20537 it_v = it_v->next;
20540 else
20542 error_at (DECL_SOURCE_LOCATION (default_node->decl),
20543 "multiversioning needs %<ifunc%> which is not supported "
20544 "on this target");
20547 return dispatch_decl;
20550 /* This function returns true if FN1 and FN2 are versions of the same function,
20551 that is, the target_version attributes of the function decls are different.
20552 This assumes that FN1 and FN2 have the same signature. */
20554 bool
20555 aarch64_common_function_versions (tree fn1, tree fn2)
20557 if (TREE_CODE (fn1) != FUNCTION_DECL
20558 || TREE_CODE (fn2) != FUNCTION_DECL)
20559 return false;
20561 return (aarch64_compare_version_priority (fn1, fn2) != 0);
20564 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20565 rather than an opt-in list. */
20567 static bool
20568 aarch64_function_attribute_inlinable_p (const_tree fndecl)
20570 /* A function that has local SME state cannot be inlined into its caller,
20571 since we only support managing PSTATE.ZA switches at function scope. */
20572 return (!aarch64_fndecl_has_new_state (fndecl, "za")
20573 && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
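/* For example, a function defined with the ACLE SME keyword attribute
   __arm_new("za") creates new ZA state, so it is reported as non-inlinable
   here; the same applies to new ZT0 state. */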
20576 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20577 tri-bool options (yes, no, don't care) and the default value is
20578 DEF, determine whether to reject inlining. */
20580 static bool
20581 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
20582 int dont_care, int def)
20584 /* If the callee doesn't care, always allow inlining. */
20585 if (callee == dont_care)
20586 return true;
20588 /* If the caller doesn't care, always allow inlining. */
20589 if (caller == dont_care)
20590 return true;
20592 /* Otherwise, allow inlining if either the callee and caller values
20593 agree, or if the callee is using the default value. */
20594 return (callee == caller || callee == def);
20597 /* Bit allocations for ipa_fn_summary::target_info. */
20599 /* Set if the function contains a stmt that relies on the function's
20600 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20601 Not meaningful for streaming-compatible functions. */
20602 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
20604 /* Set if the function clobbers ZA or ZT0 (one bit for each). Not meaningful for functions that
20605 have ZA state. */
20606 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
20607 constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20609 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20611 static bool
20612 aarch64_need_ipa_fn_target_info (const_tree, unsigned int &)
20614 /* We could in principle skip this for streaming-compatible functions
20615 that have ZA state, but that's a rare combination. */
20616 return true;
20619 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20621 static bool
20622 aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
20624 if (auto *ga = dyn_cast<const gasm *> (stmt))
20626 /* We don't know what the asm does, so conservatively assume that
20627 it requires the function's current SM mode. */
20628 info |= AARCH64_IPA_SM_FIXED;
20629 for (unsigned int i = 0; i < gimple_asm_nclobbers (ga); ++i)
20631 tree op = gimple_asm_clobber_op (ga, i);
20632 const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
20633 if (strcmp (clobber, "za") == 0)
20634 info |= AARCH64_IPA_CLOBBERS_ZA;
20635 if (strcmp (clobber, "zt0") == 0)
20636 info |= AARCH64_IPA_CLOBBERS_ZT0;
20639 if (auto *call = dyn_cast<const gcall *> (stmt))
20641 if (gimple_call_builtin_p (call, BUILT_IN_MD))
20643 /* The attributes on AArch64 builtins are supposed to be accurate.
20644 If the function isn't marked streaming-compatible then it
20645 needs whichever SM mode it selects. */
20646 tree decl = gimple_call_fndecl (call);
20647 if (aarch64_fndecl_pstate_sm (decl) != 0)
20648 info |= AARCH64_IPA_SM_FIXED;
20651 return true;
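/* For instance, an inline asm that lists "za" in its clobbers, such as

     asm volatile ("..." ::: "za");

   causes AARCH64_IPA_CLOBBERS_ZA (and AARCH64_IPA_SM_FIXED, since any asm
   is assumed to depend on the function's current PSTATE.SM) to be recorded
   for the containing function. */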
20654 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20655 to inline CALLEE into CALLER based on target-specific info.
20656 Make sure that the caller and callee have compatible architectural
20657 features. Then go through the other possible target attributes
20658 and see if they can block inlining. Try not to reject always_inline
20659 callees unless they are incompatible architecturally. */
20661 static bool
20662 aarch64_can_inline_p (tree caller, tree callee)
20664 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
20665 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
20667 struct cl_target_option *caller_opts
20668 = TREE_TARGET_OPTION (caller_tree ? caller_tree
20669 : target_option_default_node);
20671 struct cl_target_option *callee_opts
20672 = TREE_TARGET_OPTION (callee_tree ? callee_tree
20673 : target_option_default_node);
20675 /* Callee's ISA flags should be a subset of the caller's. */
20676 auto caller_asm_isa = (caller_opts->x_aarch64_asm_isa_flags
20677 & ~AARCH64_FL_ISA_MODES);
20678 auto callee_asm_isa = (callee_opts->x_aarch64_asm_isa_flags
20679 & ~AARCH64_FL_ISA_MODES);
20680 if (callee_asm_isa & ~caller_asm_isa)
20681 return false;
20683 auto caller_isa = (caller_opts->x_aarch64_isa_flags
20684 & ~AARCH64_FL_ISA_MODES);
20685 auto callee_isa = (callee_opts->x_aarch64_isa_flags
20686 & ~AARCH64_FL_ISA_MODES);
20687 if (callee_isa & ~caller_isa)
20688 return false;
20690 /* Return true if the callee might have target_info property PROPERTY.
20691 The answer must be true unless we have positive proof to the contrary. */
20692 auto callee_has_property = [&](unsigned int property)
20694 if (ipa_fn_summaries)
20695 if (auto *summary = ipa_fn_summaries->get (cgraph_node::get (callee)))
20696 if (!(summary->target_info & property))
20697 return false;
20698 return true;
20701 /* Streaming-compatible code can be inlined into functions with any
20702 PSTATE.SM mode. Otherwise the caller and callee must agree on
20703 PSTATE.SM mode, unless we can prove that the callee is naturally
20704 streaming-compatible. */
20705 auto caller_sm = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20706 auto callee_sm = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20707 if (callee_sm
20708 && caller_sm != callee_sm
20709 && callee_has_property (AARCH64_IPA_SM_FIXED))
20710 return false;
20712 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20713 functions from being inlined into others. We also need to prevent
20714 inlining of shared-ZA functions into functions without ZA state,
20715 since this is an error condition.
20717 The only other problematic case for ZA is inlining a function that
20718 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20719 auto caller_za = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20720 auto callee_za = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20721 if (!caller_za && callee_za)
20722 return false;
20723 if (!callee_za
20724 && aarch64_fndecl_has_state (caller, "za")
20725 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
20726 return false;
20727 if (!callee_za
20728 && aarch64_fndecl_has_state (caller, "zt0")
20729 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
20730 return false;
20732 /* Allow inlining a non-strict-aligned callee into a strict-aligned
20733 caller, but not the other way around. */
20734 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
20735 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
20736 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
20737 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
20738 return false;
20740 bool always_inline = lookup_attribute ("always_inline",
20741 DECL_ATTRIBUTES (callee));
20743 /* If the architectural features match up and the callee is always_inline
20744 then the other attributes don't matter. */
20745 if (always_inline)
20746 return true;
20748 if (caller_opts->x_aarch64_cmodel_var
20749 != callee_opts->x_aarch64_cmodel_var)
20750 return false;
20752 if (caller_opts->x_aarch64_tls_dialect
20753 != callee_opts->x_aarch64_tls_dialect)
20754 return false;
20756 /* Honour explicit requests to workaround errata. */
20757 if (!aarch64_tribools_ok_for_inlining_p (
20758 caller_opts->x_aarch64_fix_a53_err835769,
20759 callee_opts->x_aarch64_fix_a53_err835769,
20760 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
20761 return false;
20763 if (!aarch64_tribools_ok_for_inlining_p (
20764 caller_opts->x_aarch64_fix_a53_err843419,
20765 callee_opts->x_aarch64_fix_a53_err843419,
20766 2, TARGET_FIX_ERR_A53_843419))
20767 return false;
20769 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20770 caller and callee and they don't match up, reject inlining. */
20771 if (!aarch64_tribools_ok_for_inlining_p (
20772 caller_opts->x_flag_omit_leaf_frame_pointer,
20773 callee_opts->x_flag_omit_leaf_frame_pointer,
20774 2, 1))
20775 return false;
20777 /* If the callee has specific tuning overrides, respect them. */
20778 if (callee_opts->x_aarch64_override_tune_string != NULL
20779 && caller_opts->x_aarch64_override_tune_string == NULL)
20780 return false;
20782 /* If the user specified tuning override strings for the
20783 caller and callee and they don't match up, reject inlining.
20784 We just do a string compare here, we don't analyze the meaning
20785 of the string, as it would be too costly for little gain. */
20786 if (callee_opts->x_aarch64_override_tune_string
20787 && caller_opts->x_aarch64_override_tune_string
20788 && (strcmp (callee_opts->x_aarch64_override_tune_string,
20789 caller_opts->x_aarch64_override_tune_string) != 0))
20790 return false;
20792 return true;
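/* As an illustration of the ISA-subset rule above: a callee such as

     __attribute__ ((target ("+sve")))
     static inline int use_sve (void) { ... }

   is not inlined into a caller that does not itself enable SVE (via the
   command line, a pragma or its own target attribute), even if the callee
   is marked always_inline, because that check precedes the always_inline
   shortcut. */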
20795 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
20796 hasn't been initialized already. */
20798 arm_pcs
20799 aarch64_tlsdesc_abi_id ()
20801 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
20802 if (!tlsdesc_abi.initialized_p ())
20804 HARD_REG_SET full_reg_clobbers;
20805 CLEAR_HARD_REG_SET (full_reg_clobbers);
20806 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
20807 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
20808 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
20809 SET_HARD_REG_BIT (full_reg_clobbers, regno);
20810 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
20812 return ARM_PCS_TLSDESC;
20815 /* Return true if SYMBOL_REF X binds locally. */
20817 static bool
20818 aarch64_symbol_binds_local_p (const_rtx x)
20820 return (SYMBOL_REF_DECL (x)
20821 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
20822 : SYMBOL_REF_LOCAL_P (x));
20825 /* Return true if SYMBOL_REF X is thread-local. */
20826 static bool
20827 aarch64_tls_symbol_p (rtx x)
20829 if (! TARGET_HAVE_TLS)
20830 return false;
20832 x = strip_salt (x);
20833 if (!SYMBOL_REF_P (x))
20834 return false;
20836 return SYMBOL_REF_TLS_MODEL (x) != 0;
20839 /* Classify a TLS symbol into one of the TLS kinds. */
20840 enum aarch64_symbol_type
20841 aarch64_classify_tls_symbol (rtx x)
20843 enum tls_model tls_kind = tls_symbolic_operand_type (x);
20845 switch (tls_kind)
20847 case TLS_MODEL_GLOBAL_DYNAMIC:
20848 case TLS_MODEL_LOCAL_DYNAMIC:
20849 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
20851 case TLS_MODEL_INITIAL_EXEC:
20852 switch (aarch64_cmodel)
20854 case AARCH64_CMODEL_TINY:
20855 case AARCH64_CMODEL_TINY_PIC:
20856 return SYMBOL_TINY_TLSIE;
20857 default:
20858 return SYMBOL_SMALL_TLSIE;
20861 case TLS_MODEL_LOCAL_EXEC:
20862 if (aarch64_tls_size == 12)
20863 return SYMBOL_TLSLE12;
20864 else if (aarch64_tls_size == 24)
20865 return SYMBOL_TLSLE24;
20866 else if (aarch64_tls_size == 32)
20867 return SYMBOL_TLSLE32;
20868 else if (aarch64_tls_size == 48)
20869 return SYMBOL_TLSLE48;
20870 else
20871 gcc_unreachable ();
20873 case TLS_MODEL_EMULATED:
20874 case TLS_MODEL_NONE:
20875 return SYMBOL_FORCE_TO_MEM;
20877 default:
20878 gcc_unreachable ();
20882 /* Return the correct method for accessing X + OFFSET, where X is either
20883 a SYMBOL_REF or LABEL_REF. */
20885 enum aarch64_symbol_type
20886 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
20888 x = strip_salt (x);
20890 if (LABEL_REF_P (x))
20892 switch (aarch64_cmodel)
20894 case AARCH64_CMODEL_LARGE:
20895 return SYMBOL_FORCE_TO_MEM;
20897 case AARCH64_CMODEL_TINY_PIC:
20898 case AARCH64_CMODEL_TINY:
20899 return SYMBOL_TINY_ABSOLUTE;
20901 case AARCH64_CMODEL_SMALL_SPIC:
20902 case AARCH64_CMODEL_SMALL_PIC:
20903 case AARCH64_CMODEL_SMALL:
20904 return SYMBOL_SMALL_ABSOLUTE;
20906 default:
20907 gcc_unreachable ();
20911 if (SYMBOL_REF_P (x))
20913 if (aarch64_tls_symbol_p (x))
20914 return aarch64_classify_tls_symbol (x);
20916 switch (aarch64_cmodel)
20918 case AARCH64_CMODEL_TINY_PIC:
20919 case AARCH64_CMODEL_TINY:
20920 /* With -fPIC non-local symbols use the GOT. For orthogonality
20921 always use the GOT for extern weak symbols. */
20922 if ((flag_pic || SYMBOL_REF_WEAK (x))
20923 && !aarch64_symbol_binds_local_p (x))
20924 return SYMBOL_TINY_GOT;
20926 /* When we retrieve symbol + offset address, we have to make sure
20927 the offset does not cause overflow of the final address. But
20928 we have no way of knowing the address of the symbol at compile time,
20929 so we can't accurately say if the distance between the PC and
20930 symbol + offset is outside the addressable range of +/-1MB in the
20931 TINY code model. So we limit the maximum offset to +/-64KB and
20932 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
20933 If offset_within_block_p is true we allow larger offsets. */
20934 if (!(IN_RANGE (offset, -0x10000, 0x10000)
20935 || offset_within_block_p (x, offset)))
20936 return SYMBOL_FORCE_TO_MEM;
20938 return SYMBOL_TINY_ABSOLUTE;
20941 case AARCH64_CMODEL_SMALL_SPIC:
20942 case AARCH64_CMODEL_SMALL_PIC:
20943 case AARCH64_CMODEL_SMALL:
20944 if ((flag_pic || SYMBOL_REF_WEAK (x))
20945 && !aarch64_symbol_binds_local_p (x))
20946 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
20947 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
20949 /* Same reasoning as the tiny code model, but the offset cap here is
20950 1MB, allowing +/-3.9GB for the offset to the symbol. */
20951 if (!(IN_RANGE (offset, -0x100000, 0x100000)
20952 || offset_within_block_p (x, offset)))
20953 return SYMBOL_FORCE_TO_MEM;
20955 return SYMBOL_SMALL_ABSOLUTE;
20957 case AARCH64_CMODEL_LARGE:
20958 /* This is alright even in PIC code as the constant
20959 pool reference is always PC relative and within
20960 the same translation unit. */
20961 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
20962 return SYMBOL_SMALL_ABSOLUTE;
20963 else
20964 return SYMBOL_FORCE_TO_MEM;
20966 default:
20967 gcc_unreachable ();
20971 /* By default push everything into the constant pool. */
20972 return SYMBOL_FORCE_TO_MEM;
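/* As a worked example of the offset limits above: with -mcmodel=tiny,
   &sym + 0x4000 (16KiB) stays within the +/-64KiB cap and can be
   materialized as SYMBOL_TINY_ABSOLUTE, whereas &sym + 0x20000 (128KiB)
   is forced to the constant pool unless the offset is known to stay within
   sym's own block (offset_within_block_p). */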
20975 bool
20976 aarch64_constant_address_p (rtx x)
20978 return (CONSTANT_P (x) && memory_address_p (DImode, x));
20981 bool
20982 aarch64_legitimate_pic_operand_p (rtx x)
20984 poly_int64 offset;
20985 x = strip_offset_and_salt (x, &offset);
20986 if (SYMBOL_REF_P (x))
20987 return false;
20989 return true;
20992 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
20993 that should be rematerialized rather than spilled. */
20995 static bool
20996 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
20998 /* Support CSE and rematerialization of common constants. */
20999 if (CONST_INT_P (x)
21000 || CONST_DOUBLE_P (x))
21001 return true;
21003 /* Only accept variable-length vector constants if they can be
21004 handled directly.
21006 ??? It would be possible (but complex) to handle rematerialization
21007 of other constants via secondary reloads. */
21008 if (!GET_MODE_SIZE (mode).is_constant ())
21009 return aarch64_simd_valid_immediate (x, NULL);
21011 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21012 least be forced to memory and loaded from there. */
21013 if (CONST_VECTOR_P (x))
21014 return !targetm.cannot_force_const_mem (mode, x);
21016 /* Do not allow vector struct mode constants for Advanced SIMD.
21017 We could support 0 and -1 easily, but they need support in
21018 aarch64-simd.md. */
21019 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21020 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21021 return false;
21023 if (GET_CODE (x) == HIGH)
21024 x = XEXP (x, 0);
21026 /* Accept polynomial constants that can be calculated by using the
21027 destination of a move as the sole temporary. Constants that
21028 require a second temporary cannot be rematerialized (they can't be
21029 forced to memory and also aren't legitimate constants). */
21030 poly_int64 offset;
21031 if (poly_int_rtx_p (x, &offset))
21032 return aarch64_offset_temporaries (false, offset) <= 1;
21034 /* If an offset is being added to something else, we need to allow the
21035 base to be moved into the destination register, meaning that there
21036 are no free temporaries for the offset. */
21037 x = strip_offset_and_salt (x, &offset);
21038 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
21039 return false;
21041 /* Do not allow const (plus (anchor_symbol, const_int)). */
21042 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
21043 return false;
21045 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21046 so spilling them is better than rematerialization. */
21047 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
21048 return true;
21050 /* Label references are always constant. */
21051 if (LABEL_REF_P (x))
21052 return true;
21054 return false;
21057 rtx
21058 aarch64_load_tp (rtx target)
21060 if (!target
21061 || GET_MODE (target) != Pmode
21062 || !register_operand (target, Pmode))
21063 target = gen_reg_rtx (Pmode);
21065 /* Can return in any reg. */
21066 emit_insn (gen_aarch64_load_tp_hard (target));
21067 return target;
21070 /* On AAPCS systems, this is the "struct __va_list". */
21071 static GTY(()) tree va_list_type;
21073 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
21074 Return the type to use as __builtin_va_list.
21076 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
21078 struct __va_list
21080 void *__stack;
21081 void *__gr_top;
21082 void *__vr_top;
21083 int __gr_offs;
21084 int __vr_offs;
21085 }; */
21087 static tree
21088 aarch64_build_builtin_va_list (void)
21090 tree va_list_name;
21091 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21093 /* Create the type. */
21094 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
21095 /* Give it the required name. */
21096 va_list_name = build_decl (BUILTINS_LOCATION,
21097 TYPE_DECL,
21098 get_identifier ("__va_list"),
21099 va_list_type);
21100 DECL_ARTIFICIAL (va_list_name) = 1;
21101 TYPE_NAME (va_list_type) = va_list_name;
21102 TYPE_STUB_DECL (va_list_type) = va_list_name;
21104 /* Create the fields. */
21105 f_stack = build_decl (BUILTINS_LOCATION,
21106 FIELD_DECL, get_identifier ("__stack"),
21107 ptr_type_node);
21108 f_grtop = build_decl (BUILTINS_LOCATION,
21109 FIELD_DECL, get_identifier ("__gr_top"),
21110 ptr_type_node);
21111 f_vrtop = build_decl (BUILTINS_LOCATION,
21112 FIELD_DECL, get_identifier ("__vr_top"),
21113 ptr_type_node);
21114 f_groff = build_decl (BUILTINS_LOCATION,
21115 FIELD_DECL, get_identifier ("__gr_offs"),
21116 integer_type_node);
21117 f_vroff = build_decl (BUILTINS_LOCATION,
21118 FIELD_DECL, get_identifier ("__vr_offs"),
21119 integer_type_node);
21121 /* Tell the tree-stdarg pass about our internal offset fields.
21122 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
21123 purposes, to identify whether the code is updating the va_list internal
21124 offset fields in an irregular way. */
21125 va_list_gpr_counter_field = f_groff;
21126 va_list_fpr_counter_field = f_vroff;
21128 DECL_ARTIFICIAL (f_stack) = 1;
21129 DECL_ARTIFICIAL (f_grtop) = 1;
21130 DECL_ARTIFICIAL (f_vrtop) = 1;
21131 DECL_ARTIFICIAL (f_groff) = 1;
21132 DECL_ARTIFICIAL (f_vroff) = 1;
21134 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
21135 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
21136 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
21137 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
21138 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
21140 TYPE_FIELDS (va_list_type) = f_stack;
21141 DECL_CHAIN (f_stack) = f_grtop;
21142 DECL_CHAIN (f_grtop) = f_vrtop;
21143 DECL_CHAIN (f_vrtop) = f_groff;
21144 DECL_CHAIN (f_groff) = f_vroff;
21146 /* Compute its layout. */
21147 layout_type (va_list_type);
21149 return va_list_type;
21152 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21153 static void
21154 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
21156 const CUMULATIVE_ARGS *cum;
21157 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21158 tree stack, grtop, vrtop, groff, vroff;
21159 tree t;
21160 int gr_save_area_size = cfun->va_list_gpr_size;
21161 int vr_save_area_size = cfun->va_list_fpr_size;
21162 int vr_offset;
21164 cum = &crtl->args.info;
21165 if (cfun->va_list_gpr_size)
21166 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
21167 cfun->va_list_gpr_size);
21168 if (cfun->va_list_fpr_size)
21169 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
21170 * UNITS_PER_VREG, cfun->va_list_fpr_size);
21172 if (!TARGET_FLOAT)
21174 gcc_assert (cum->aapcs_nvrn == 0);
21175 vr_save_area_size = 0;
21178 f_stack = TYPE_FIELDS (va_list_type_node);
21179 f_grtop = DECL_CHAIN (f_stack);
21180 f_vrtop = DECL_CHAIN (f_grtop);
21181 f_groff = DECL_CHAIN (f_vrtop);
21182 f_vroff = DECL_CHAIN (f_groff);
21184 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
21185 NULL_TREE);
21186 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
21187 NULL_TREE);
21188 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
21189 NULL_TREE);
21190 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
21191 NULL_TREE);
21192 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
21193 NULL_TREE);
21195 /* Emit code to initialize STACK, which points to the next varargs stack
21196 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21197 by named arguments. STACK is 8-byte aligned. */
21198 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
21199 if (cum->aapcs_stack_size > 0)
21200 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
21201 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
21202 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21204 /* Emit code to initialize GRTOP, the top of the GR save area.
21205 virtual_incoming_args_rtx should have been 16 byte aligned. */
21206 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
21207 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
21208 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21210 /* Emit code to initialize VRTOP, the top of the VR save area.
21211 This address is gr_save_area_bytes below GRTOP, rounded
21212 down to the next 16-byte boundary. */
21213 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
21214 vr_offset = ROUND_UP (gr_save_area_size,
21215 STACK_BOUNDARY / BITS_PER_UNIT);
21217 if (vr_offset)
21218 t = fold_build_pointer_plus_hwi (t, -vr_offset);
21219 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
21220 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21222 /* Emit code to initialize GROFF, the offset from GRTOP of the
21223 next GPR argument. */
21224 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
21225 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
21226 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21228 /* Likewise emit code to initialize VROFF, the offset from VRTOP
21229 of the next VR argument. */
21230 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
21231 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
21232 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
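/* A worked example of the values set up above, assuming FP/SIMD is enabled
   and the full save areas are used: for "int f (int a, int b, ...)" the two
   named arguments use x0 and x1, so gr_save_area_size is (8 - 2) * 8 = 48
   and vr_save_area_size is 8 * 16 = 128. __gr_offs is therefore initialized
   to -48 and __vr_offs to -128, while __stack and __gr_top both point at
   the incoming-argument area and __vr_top sits 48 bytes below it. */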
21235 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21237 static tree
21238 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
21239 gimple_seq *post_p ATTRIBUTE_UNUSED)
21241 tree addr;
21242 bool indirect_p;
21243 bool is_ha; /* is HFA or HVA. */
21244 bool dw_align; /* double-word align. */
21245 machine_mode ag_mode = VOIDmode;
21246 int nregs;
21247 machine_mode mode;
21249 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21250 tree stack, f_top, f_off, off, arg, roundup, on_stack;
21251 HOST_WIDE_INT size, rsize, adjust, align;
21252 tree t, u, cond1, cond2;
21254 indirect_p = pass_va_arg_by_reference (type);
21255 if (indirect_p)
21256 type = build_pointer_type (type);
21258 mode = TYPE_MODE (type);
21260 f_stack = TYPE_FIELDS (va_list_type_node);
21261 f_grtop = DECL_CHAIN (f_stack);
21262 f_vrtop = DECL_CHAIN (f_grtop);
21263 f_groff = DECL_CHAIN (f_vrtop);
21264 f_vroff = DECL_CHAIN (f_groff);
21266 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
21267 f_stack, NULL_TREE);
21268 size = int_size_in_bytes (type);
21270 unsigned int abi_break_gcc_9;
21271 unsigned int abi_break_gcc_13;
21272 unsigned int abi_break_gcc_14;
21273 align
21274 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
21275 &abi_break_gcc_13, &abi_break_gcc_14)
21276 / BITS_PER_UNIT;
21278 dw_align = false;
21279 adjust = 0;
21280 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
21281 &is_ha, false))
21283 /* No frontends can create types with variable-sized modes, so we
21284 shouldn't be asked to pass or return them. */
21285 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
21287 /* TYPE passed in fp/simd registers. */
21288 if (!TARGET_FLOAT)
21289 aarch64_err_no_fpadvsimd (mode);
21291 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
21292 unshare_expr (valist), f_vrtop, NULL_TREE);
21293 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
21294 unshare_expr (valist), f_vroff, NULL_TREE);
21296 rsize = nregs * UNITS_PER_VREG;
21298 if (is_ha)
21300 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
21301 adjust = UNITS_PER_VREG - ag_size;
21303 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21304 && size < UNITS_PER_VREG)
21306 adjust = UNITS_PER_VREG - size;
21309 else
21311 /* TYPE passed in general registers. */
21312 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
21313 unshare_expr (valist), f_grtop, NULL_TREE);
21314 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
21315 unshare_expr (valist), f_groff, NULL_TREE);
21316 rsize = ROUND_UP (size, UNITS_PER_WORD);
21317 nregs = rsize / UNITS_PER_WORD;
21319 if (align <= 8
21320 && abi_break_gcc_13
21321 && warn_psabi
21322 && !bitint_or_aggr_of_bitint_p (type))
21323 inform (input_location, "parameter passing for argument of type "
21324 "%qT changed in GCC 13.1", type);
21326 if (warn_psabi
21327 && abi_break_gcc_14
21328 && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8)
21329 && !bitint_or_aggr_of_bitint_p (type))
21330 inform (input_location, "parameter passing for argument of type "
21331 "%qT changed in GCC 14.1", type);
21333 if (align > 8)
21335 if (abi_break_gcc_9
21336 && warn_psabi
21337 && !bitint_or_aggr_of_bitint_p (type))
21338 inform (input_location, "parameter passing for argument of type "
21339 "%qT changed in GCC 9.1", type);
21340 dw_align = true;
21343 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21344 && size < UNITS_PER_WORD)
21346 adjust = UNITS_PER_WORD - size;
21350 /* Get a local temporary for the field value. */
21351 off = get_initialized_tmp_var (f_off, pre_p, NULL);
21353 /* Emit code to branch if off >= 0. */
21354 t = build2 (GE_EXPR, boolean_type_node, off,
21355 build_int_cst (TREE_TYPE (off), 0));
21356 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
21358 if (dw_align)
21360 /* Emit: offs = (offs + 15) & -16. */
21361 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21362 build_int_cst (TREE_TYPE (off), 15));
21363 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
21364 build_int_cst (TREE_TYPE (off), -16));
21365 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
21367 else
21368 roundup = NULL;
21370 /* Update ap.__[g|v]r_offs */
21371 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21372 build_int_cst (TREE_TYPE (off), rsize));
21373 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
21375 /* String up. */
21376 if (roundup)
21377 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21379 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21380 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
21381 build_int_cst (TREE_TYPE (f_off), 0));
21382 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
21384 /* String up: make sure the assignment happens before the use. */
21385 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
21386 COND_EXPR_ELSE (cond1) = t;
21388 /* Prepare the trees handling the argument that is passed on the stack;
21389 the top level node will store in ON_STACK. */
21390 arg = get_initialized_tmp_var (stack, pre_p, NULL);
21391 if (align > 8)
21393 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21394 t = fold_build_pointer_plus_hwi (arg, 15);
21395 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21396 build_int_cst (TREE_TYPE (t), -16));
21397 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
21399 else
21400 roundup = NULL;
21401 /* Advance ap.__stack */
21402 t = fold_build_pointer_plus_hwi (arg, size + 7);
21403 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21404 build_int_cst (TREE_TYPE (t), -8));
21405 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
21406 /* String up roundup and advance. */
21407 if (roundup)
21408 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21409 /* String up with arg */
21410 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
21411 /* Big-endianness related address adjustment. */
21412 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21413 && size < UNITS_PER_WORD)
21415 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
21416 size_int (UNITS_PER_WORD - size));
21417 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
21420 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
21421 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
21423 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21424 t = off;
21425 if (adjust)
21426 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
21427 build_int_cst (TREE_TYPE (off), adjust));
21429 t = fold_convert (sizetype, t);
21430 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
21432 if (is_ha)
21434 /* type ha; // treat as "struct {ftype field[n];}"
21435 ... [computing offs]
21436 for (i = 0; i < nregs; ++i, offs += 16)
21437 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21438 return ha; */
21439 int i;
21440 tree tmp_ha, field_t, field_ptr_t;
21442 /* Declare a local variable. */
21443 tmp_ha = create_tmp_var_raw (type, "ha");
21444 gimple_add_tmp_var (tmp_ha);
21446 /* Establish the base type. */
21447 switch (ag_mode)
21449 case E_SFmode:
21450 field_t = float_type_node;
21451 field_ptr_t = float_ptr_type_node;
21452 break;
21453 case E_DFmode:
21454 field_t = double_type_node;
21455 field_ptr_t = double_ptr_type_node;
21456 break;
21457 case E_TFmode:
21458 field_t = long_double_type_node;
21459 field_ptr_t = long_double_ptr_type_node;
21460 break;
21461 case E_SDmode:
21462 field_t = dfloat32_type_node;
21463 field_ptr_t = build_pointer_type (dfloat32_type_node);
21464 break;
21465 case E_DDmode:
21466 field_t = dfloat64_type_node;
21467 field_ptr_t = build_pointer_type (dfloat64_type_node);
21468 break;
21469 case E_TDmode:
21470 field_t = dfloat128_type_node;
21471 field_ptr_t = build_pointer_type (dfloat128_type_node);
21472 break;
21473 case E_HFmode:
21474 field_t = aarch64_fp16_type_node;
21475 field_ptr_t = aarch64_fp16_ptr_type_node;
21476 break;
21477 case E_BFmode:
21478 field_t = bfloat16_type_node;
21479 field_ptr_t = aarch64_bf16_ptr_type_node;
21480 break;
21481 case E_V2SImode:
21482 case E_V4SImode:
21484 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
21485 field_t = build_vector_type_for_mode (innertype, ag_mode);
21486 field_ptr_t = build_pointer_type (field_t);
21488 break;
21489 default:
21490 gcc_assert (0);
21493 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
21494 TREE_ADDRESSABLE (tmp_ha) = 1;
21495 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
21496 addr = t;
21497 t = fold_convert (field_ptr_t, addr);
21498 t = build2 (MODIFY_EXPR, field_t,
21499 build1 (INDIRECT_REF, field_t, tmp_ha),
21500 build1 (INDIRECT_REF, field_t, t));
21502 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21503 for (i = 1; i < nregs; ++i)
21505 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
21506 u = fold_convert (field_ptr_t, addr);
21507 u = build2 (MODIFY_EXPR, field_t,
21508 build2 (MEM_REF, field_t, tmp_ha,
21509 build_int_cst (field_ptr_t,
21510 (i *
21511 int_size_in_bytes (field_t)))),
21512 build1 (INDIRECT_REF, field_t, u));
21513 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
21516 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
21517 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
21520 COND_EXPR_ELSE (cond2) = t;
21521 addr = fold_convert (build_pointer_type (type), cond1);
21522 addr = build_va_arg_indirect_ref (addr);
21524 if (indirect_p)
21525 addr = build_va_arg_indirect_ref (addr);
21527 return addr;
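/* Illustrative sketch of what the trees built above expand to (not a
   verbatim transcript of the generated GIMPLE):

     off = ap.__gr_offs;                 // __vr_offs for FP/SIMD candidates
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;           // plus any big-endian adjustment
     goto done;
   on_stack:
     addr = ap.__stack;                  // realigned to 16 if align > 8
     ap.__stack = (char *) (((intptr_t) addr + size + 7) & -8);
   done:
     result = *(type *) addr;            // extra dereference if indirect  */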
21530 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21532 static void
21533 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
21534 const function_arg_info &arg,
21535 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
21537 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
21538 CUMULATIVE_ARGS local_cum;
21539 int gr_saved = cfun->va_list_gpr_size;
21540 int vr_saved = cfun->va_list_fpr_size;
21542 /* The caller has advanced CUM up to, but not beyond, the last named
21543 argument. Advance a local copy of CUM past the last "real" named
21544 argument, to find out how many registers are left over. */
21545 local_cum = *cum;
21546 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
21547 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
21549 /* Find out how many registers we need to save.
21550 Honor the tree-stdarg analysis results. */
21551 if (cfun->va_list_gpr_size)
21552 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
21553 cfun->va_list_gpr_size / UNITS_PER_WORD);
21554 if (cfun->va_list_fpr_size)
21555 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
21556 cfun->va_list_fpr_size / UNITS_PER_VREG);
21558 if (!TARGET_FLOAT)
21560 gcc_assert (local_cum.aapcs_nvrn == 0);
21561 vr_saved = 0;
21564 if (!no_rtl)
21566 if (gr_saved > 0)
21568 rtx ptr, mem;
21570 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21571 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
21572 - gr_saved * UNITS_PER_WORD);
21573 mem = gen_frame_mem (BLKmode, ptr);
21574 set_mem_alias_set (mem, get_varargs_alias_set ());
21576 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
21577 mem, gr_saved);
21579 if (vr_saved > 0)
21581 /* We can't use move_block_from_reg, because it will use
21582 the wrong mode, storing D regs only. */
21583 machine_mode mode = TImode;
21584 int off, i, vr_start;
21586 /* Set OFF to the offset from virtual_incoming_args_rtx of
21587 the first vector register. The VR save area lies below
21588 the GR one, and is aligned to 16 bytes. */
21589 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
21590 STACK_BOUNDARY / BITS_PER_UNIT);
21591 off -= vr_saved * UNITS_PER_VREG;
21593 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
21594 for (i = 0; i < vr_saved; ++i)
21596 rtx ptr, mem;
21598 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
21599 mem = gen_frame_mem (mode, ptr);
21600 set_mem_alias_set (mem, get_varargs_alias_set ());
21601 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
21602 off += UNITS_PER_VREG;
21607 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21608 any complication of having crtl->args.pretend_args_size changed. */
21609 cfun->machine->frame.saved_varargs_size
21610 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
21611 STACK_BOUNDARY / BITS_PER_UNIT)
21612 + vr_saved * UNITS_PER_VREG);
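/* Worked example (illustrative): if the named arguments of a variadic
   function consume two X registers and one V register, and the stdarg
   analysis does not shrink the save areas, then gr_saved = 8 - 2 = 6 and
   vr_saved = 8 - 1 = 7, so saved_varargs_size is
   ROUND_UP (6 * 8, 16) + 7 * 16 = 48 + 112 = 160 bytes.  */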
21615 static void
21616 aarch64_conditional_register_usage (void)
21618 int i;
21619 if (!TARGET_FLOAT)
21621 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
21623 fixed_regs[i] = 1;
21624 call_used_regs[i] = 1;
21625 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21628 if (!TARGET_SVE)
21629 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
21631 fixed_regs[i] = 1;
21632 call_used_regs[i] = 1;
21635 /* Only allow these registers to be accessed via special patterns. */
21636 CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
21637 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
21638 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
21639 for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
21640 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21642 /* When tracking speculation, we need a couple of call-clobbered registers
21643 to track the speculation state. It would be nice to just use
21644 IP0 and IP1, but currently there are numerous places that just
21645 assume these registers are free for other uses (e.g. pointer
21646 authentication). */
21647 if (aarch64_track_speculation)
21649 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
21650 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
21651 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21652 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21656 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21658 bool
21659 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
21661 /* For records we're passed a FIELD_DECL, for arrays we're passed
21662 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21663 const_tree type = TREE_TYPE (field_or_array);
21665 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21666 For structures, the "multiple" case is indicated by MODE being
21667 VOIDmode. */
21668 unsigned int num_zr, num_pr;
21669 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
21671 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
21672 return !simple_cst_equal (TYPE_SIZE (field_or_array),
21673 TYPE_SIZE (type));
21674 return mode == VOIDmode;
21677 return default_member_type_forces_blk (field_or_array, mode);
21680 /* Bitmasks that indicate whether earlier versions of GCC would have
21681 taken a different path through the ABI logic. This should result in
21682 a -Wpsabi warning if the earlier path led to a different ABI decision.
21684 WARN_PSABI_EMPTY_CXX17_BASE
21685 Indicates that the type includes an artificial empty C++17 base field
21686 that, prior to GCC 10.1, would prevent the type from being treated as
21687 a HFA or HVA. See PR94383 for details.
21689 WARN_PSABI_NO_UNIQUE_ADDRESS
21690 Indicates that the type includes an empty [[no_unique_address]] field
21691 that, prior to GCC 10.1, would prevent the type from being treated as
21692 a HFA or HVA. */
21693 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
21694 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
21695 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
21697 /* Walk down the type tree of TYPE counting consecutive base elements.
21698 If *MODEP is VOIDmode, then set it to the first valid floating point
21699 type. If a non-floating point type is found, or if a floating point
21700 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21701 otherwise return the count in the sub-tree.
21703 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21704 function has changed its behavior relative to earlier versions of GCC.
21705 Normally the argument should be nonnull and point to a zero-initialized
21706 variable. The function then records whether the ABI decision might
21707 be affected by a known fix to the ABI logic, setting the associated
21708 WARN_PSABI_* bits if so.
21710 When the argument is instead a null pointer, the function tries to
21711 simulate the behavior of GCC before all such ABI fixes were made.
21712 This is useful to check whether the function returns something
21713 different after the ABI fixes. */
21714 static int
21715 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
21716 unsigned int *warn_psabi_flags)
21718 machine_mode mode;
21719 HOST_WIDE_INT size;
21721 if (aarch64_sve::builtin_type_p (type))
21722 return -1;
21724 switch (TREE_CODE (type))
21726 case REAL_TYPE:
21727 mode = TYPE_MODE (type);
21728 if (mode != DFmode && mode != SFmode
21729 && mode != TFmode && mode != HFmode
21730 && mode != SDmode && mode != DDmode && mode != TDmode)
21731 return -1;
21733 if (*modep == VOIDmode)
21734 *modep = mode;
21736 if (*modep == mode)
21737 return 1;
21739 break;
21741 case COMPLEX_TYPE:
21742 mode = TYPE_MODE (TREE_TYPE (type));
21743 if (mode != DFmode && mode != SFmode
21744 && mode != TFmode && mode != HFmode)
21745 return -1;
21747 if (*modep == VOIDmode)
21748 *modep = mode;
21750 if (*modep == mode)
21751 return 2;
21753 break;
21755 case VECTOR_TYPE:
21756 /* Use V2SImode and V4SImode as representatives of all 64-bit
21757 and 128-bit vector types. */
21758 size = int_size_in_bytes (type);
21759 switch (size)
21761 case 8:
21762 mode = V2SImode;
21763 break;
21764 case 16:
21765 mode = V4SImode;
21766 break;
21767 default:
21768 return -1;
21771 if (*modep == VOIDmode)
21772 *modep = mode;
21774 /* Vector modes are considered to be opaque: two vectors are
21775 equivalent for the purposes of being homogeneous aggregates
21776 if they are the same size. */
21777 if (*modep == mode)
21778 return 1;
21780 break;
21782 case ARRAY_TYPE:
21784 int count;
21785 tree index = TYPE_DOMAIN (type);
21787 /* Can't handle incomplete types nor sizes that are not
21788 fixed. */
21789 if (!COMPLETE_TYPE_P (type)
21790 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21791 return -1;
21793 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
21794 warn_psabi_flags);
21795 if (count == -1
21796 || !index
21797 || !TYPE_MAX_VALUE (index)
21798 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
21799 || !TYPE_MIN_VALUE (index)
21800 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
21801 || count < 0)
21802 return -1;
21804 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
21805 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
21807 /* There must be no padding. */
21808 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21809 count * GET_MODE_BITSIZE (*modep)))
21810 return -1;
21812 return count;
21815 case RECORD_TYPE:
21817 int count = 0;
21818 int sub_count;
21819 tree field;
21821 /* Can't handle incomplete types nor sizes that are not
21822 fixed. */
21823 if (!COMPLETE_TYPE_P (type)
21824 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21825 return -1;
21827 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21829 if (TREE_CODE (field) != FIELD_DECL)
21830 continue;
21832 if (DECL_FIELD_ABI_IGNORED (field))
21834 /* See whether this is something that earlier versions of
21835 GCC failed to ignore. */
21836 unsigned int flag;
21837 if (lookup_attribute ("no_unique_address",
21838 DECL_ATTRIBUTES (field)))
21839 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
21840 else if (cxx17_empty_base_field_p (field))
21841 flag = WARN_PSABI_EMPTY_CXX17_BASE;
21842 else
21843 /* No compatibility problem. */
21844 continue;
21846 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
21847 if (warn_psabi_flags)
21849 *warn_psabi_flags |= flag;
21850 continue;
21853 /* A zero-width bitfield may affect layout in some
21854 circumstances, but adds no members. The determination
21855 of whether or not a type is an HFA is performed after
21856 layout is complete, so if the type still looks like an
21857 HFA afterwards, it is still classed as one. This is
21858 potentially an ABI break for the hard-float ABI. */
21859 else if (DECL_BIT_FIELD (field)
21860 && integer_zerop (DECL_SIZE (field)))
21862 /* Prior to GCC 12 these fields were stripped early,
21863 hiding them from the back-end entirely and
21864 resulting in the correct behaviour for argument
21865 passing. Simulate that old behaviour without
21866 generating a warning. */
21867 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
21868 continue;
21869 if (warn_psabi_flags)
21871 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
21872 continue;
21876 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21877 warn_psabi_flags);
21878 if (sub_count < 0)
21879 return -1;
21880 count += sub_count;
21883 /* There must be no padding. */
21884 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21885 count * GET_MODE_BITSIZE (*modep)))
21886 return -1;
21888 return count;
21891 case UNION_TYPE:
21892 case QUAL_UNION_TYPE:
21894 /* These aren't very interesting except in a degenerate case. */
21895 int count = 0;
21896 int sub_count;
21897 tree field;
21899 /* Can't handle incomplete types nor sizes that are not
21900 fixed. */
21901 if (!COMPLETE_TYPE_P (type)
21902 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21903 return -1;
21905 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21907 if (TREE_CODE (field) != FIELD_DECL)
21908 continue;
21910 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21911 warn_psabi_flags);
21912 if (sub_count < 0)
21913 return -1;
21914 count = count > sub_count ? count : sub_count;
21917 /* There must be no padding. */
21918 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21919 count * GET_MODE_BITSIZE (*modep)))
21920 return -1;
21922 return count;
21925 default:
21926 break;
21929 return -1;
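/* Example (illustrative): for struct { float x, y, z; } the walk above
   leaves *MODEP == SFmode and returns 3, making the struct a candidate
   homogeneous aggregate; struct { float x; double y; } returns -1 because
   the second field does not match *MODEP.  */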
21932 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
21933 type as described in AAPCS64 \S 4.1.2.
21935 See the comment above aarch64_composite_type_p for the notes on MODE. */
21937 static bool
21938 aarch64_short_vector_p (const_tree type,
21939 machine_mode mode)
21941 poly_int64 size = -1;
21943 if (type && VECTOR_TYPE_P (type))
21945 if (aarch64_sve::builtin_type_p (type))
21946 return false;
21947 size = int_size_in_bytes (type);
21949 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
21950 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
21952 /* The containing "else if" is too loose: it means that we look at TYPE
21953 if the type is a vector type (good), but that we otherwise ignore TYPE
21954 and look only at the mode. This is wrong because the type describes
21955 the language-level information whereas the mode is purely an internal
21956 GCC concept. We can therefore reach here for types that are not
21957 vectors in the AAPCS64 sense.
21959 We can't "fix" that for the traditional Advanced SIMD vector modes
21960 without breaking backwards compatibility. However, there's no such
21961 baggage for the structure modes, which were introduced in GCC 12. */
21962 if (aarch64_advsimd_struct_mode_p (mode))
21963 return false;
21965 /* For similar reasons, rely only on the type, not the mode, when
21966 processing SVE types. */
21967 if (type && aarch64_some_values_include_pst_objects_p (type))
21968 /* Leave later code to report an error if SVE is disabled. */
21969 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
21970 else
21971 size = GET_MODE_SIZE (mode);
21973 if (known_eq (size, 8) || known_eq (size, 16))
21975 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
21976 they are being treated as scalable AAPCS64 types. */
21977 gcc_assert (!aarch64_sve_mode_p (mode)
21978 && !aarch64_advsimd_struct_mode_p (mode));
21979 return true;
21981 return false;
21984 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
21985 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
21986 array types. The C99 floating-point complex types are also considered
21987 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
21988 types, which are GCC extensions and out of the scope of AAPCS64, are
21989 treated as composite types here as well.
21991 Note that MODE itself is not sufficient in determining whether a type
21992 is such a composite type or not. This is because
21993 stor-layout.cc:compute_record_mode may have already changed the MODE
21994 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
21995 structure with only one field may have its MODE set to the mode of the
21996 field. Also an integer mode whose size matches the size of the
21997 RECORD_TYPE type may be used to substitute the original mode
21998 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
21999 solely relied on. */
22001 static bool
22002 aarch64_composite_type_p (const_tree type,
22003 machine_mode mode)
22005 if (aarch64_short_vector_p (type, mode))
22006 return false;
22008 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
22009 return true;
22011 if (type
22012 && TREE_CODE (type) == BITINT_TYPE
22013 && int_size_in_bytes (type) > 16)
22014 return true;
22016 if (mode == BLKmode
22017 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
22018 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
22019 return true;
22021 return false;
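/* Examples (illustrative): struct { int x; }, _Complex double and
   _BitInt(256) are composite types for AAPCS64 purposes; __int128 and the
   64-bit/128-bit Advanced SIMD vector types are not.  */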
22024 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22025 shall be passed or returned in simd/fp register(s) (providing these
22026 parameter passing registers are available).
22028 Upon successful return, *COUNT returns the number of needed registers,
22029 *BASE_MODE returns the mode of the individual register and when IS_HA
22030 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22031 floating-point aggregate or a homogeneous short-vector aggregate.
22033 SILENT_P is true if the function should refrain from reporting any
22034 diagnostics. This should only be used if the caller is certain that
22035 any ABI decisions would eventually come through this function with
22036 SILENT_P set to false. */
22038 static bool
22039 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
22040 const_tree type,
22041 machine_mode *base_mode,
22042 int *count,
22043 bool *is_ha,
22044 bool silent_p)
22046 if (is_ha != NULL) *is_ha = false;
22048 machine_mode new_mode = VOIDmode;
22049 bool composite_p = aarch64_composite_type_p (type, mode);
22051 if ((!composite_p
22052 && (GET_MODE_CLASS (mode) == MODE_FLOAT
22053 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
22054 || aarch64_short_vector_p (type, mode))
22056 *count = 1;
22057 new_mode = mode;
22059 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
22061 if (is_ha != NULL) *is_ha = true;
22062 *count = 2;
22063 new_mode = GET_MODE_INNER (mode);
22065 else if (type && composite_p)
22067 unsigned int warn_psabi_flags = 0;
22068 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
22069 &warn_psabi_flags);
22070 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
22072 static unsigned last_reported_type_uid;
22073 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
22074 int alt;
22075 if (!silent_p
22076 && warn_psabi
22077 && warn_psabi_flags
22078 && uid != last_reported_type_uid
22079 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
22080 != ag_count))
22082 const char *url10
22083 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
22084 const char *url12
22085 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
22086 gcc_assert (alt == -1);
22087 last_reported_type_uid = uid;
22088 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22089 qualification. */
22090 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
22091 inform (input_location, "parameter passing for argument of "
22092 "type %qT with %<[[no_unique_address]]%> members "
22093 "changed %{in GCC 10.1%}",
22094 TYPE_MAIN_VARIANT (type), url10);
22095 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
22096 inform (input_location, "parameter passing for argument of "
22097 "type %qT when C++17 is enabled changed to match "
22098 "C++14 %{in GCC 10.1%}",
22099 TYPE_MAIN_VARIANT (type), url10);
22100 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
22101 inform (input_location, "parameter passing for argument of "
22102 "type %qT changed %{in GCC 12.1%}",
22103 TYPE_MAIN_VARIANT (type), url12);
22106 if (is_ha != NULL) *is_ha = true;
22107 *count = ag_count;
22109 else
22110 return false;
22112 else
22113 return false;
22115 gcc_assert (!aarch64_sve_mode_p (new_mode));
22116 *base_mode = new_mode;
22117 return true;
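/* Example (illustrative): struct { double d[4]; } yields *BASE_MODE ==
   DFmode, *COUNT == 4 and *IS_HA set, i.e. an HFA that is passed in four
   consecutive D registers when enough of them remain unallocated.  */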
22120 /* Implement TARGET_STRUCT_VALUE_RTX. */
22122 static rtx
22123 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
22124 int incoming ATTRIBUTE_UNUSED)
22126 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
22129 /* Implements target hook vector_mode_supported_p. */
22130 static bool
22131 aarch64_vector_mode_supported_p (machine_mode mode)
22133 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22134 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22137 /* Implements target hook vector_mode_supported_any_target_p. */
22138 static bool
22139 aarch64_vector_mode_supported_any_target_p (machine_mode mode)
22141 unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
22142 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
22145 /* Return the full-width SVE vector mode for element mode MODE, if one
22146 exists. */
22147 opt_machine_mode
22148 aarch64_full_sve_mode (scalar_mode mode)
22150 switch (mode)
22152 case E_DFmode:
22153 return VNx2DFmode;
22154 case E_SFmode:
22155 return VNx4SFmode;
22156 case E_HFmode:
22157 return VNx8HFmode;
22158 case E_BFmode:
22159 return VNx8BFmode;
22160 case E_DImode:
22161 return VNx2DImode;
22162 case E_SImode:
22163 return VNx4SImode;
22164 case E_HImode:
22165 return VNx8HImode;
22166 case E_QImode:
22167 return VNx16QImode;
22168 default:
22169 return opt_machine_mode ();
22173 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
22174 if it exists. */
22175 opt_machine_mode
22176 aarch64_vq_mode (scalar_mode mode)
22178 switch (mode)
22180 case E_DFmode:
22181 return V2DFmode;
22182 case E_SFmode:
22183 return V4SFmode;
22184 case E_HFmode:
22185 return V8HFmode;
22186 case E_BFmode:
22187 return V8BFmode;
22188 case E_SImode:
22189 return V4SImode;
22190 case E_HImode:
22191 return V8HImode;
22192 case E_QImode:
22193 return V16QImode;
22194 case E_DImode:
22195 return V2DImode;
22196 default:
22197 return opt_machine_mode ();
22201 /* Return the appropriate SIMD container mode
22202 for MODE within a vector of WIDTH bits. */
22203 static machine_mode
22204 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
22206 if (TARGET_SVE
22207 && maybe_ne (width, 128)
22208 && known_eq (width, BITS_PER_SVE_VECTOR))
22209 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22211 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
22212 if (TARGET_BASE_SIMD)
22214 if (known_eq (width, 128))
22215 return aarch64_vq_mode (mode).else_mode (word_mode);
22216 else
22217 switch (mode)
22219 case E_SFmode:
22220 return V2SFmode;
22221 case E_HFmode:
22222 return V4HFmode;
22223 case E_BFmode:
22224 return V4BFmode;
22225 case E_SImode:
22226 return V2SImode;
22227 case E_HImode:
22228 return V4HImode;
22229 case E_QImode:
22230 return V8QImode;
22231 default:
22232 break;
22235 return word_mode;
22238 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22239 and return whether the SVE mode should be preferred over the
22240 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22241 static bool
22242 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
22244 /* Take into account the aarch64-autovec-preference param if non-zero. */
22245 bool only_asimd_p = aarch64_autovec_preference == 1;
22246 bool only_sve_p = aarch64_autovec_preference == 2;
22248 if (only_asimd_p)
22249 return false;
22250 if (only_sve_p)
22251 return true;
22253 /* The preference in case of a tie in costs. */
22254 bool prefer_asimd = aarch64_autovec_preference == 3;
22255 bool prefer_sve = aarch64_autovec_preference == 4;
22257 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
22258 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
22259 /* If the CPU information does not have an SVE width registered use the
22260 generic poly_int comparison that prefers SVE. If a preference is
22261 explicitly requested avoid this path. */
22262 if (aarch64_tune_params.sve_width == SVE_SCALABLE
22263 && !prefer_asimd
22264 && !prefer_sve)
22265 return maybe_gt (nunits_sve, nunits_asimd);
22267 /* Otherwise estimate the runtime width of the modes involved. */
22268 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
22269 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
22271 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22272 is clearly wider. */
22273 if (prefer_sve)
22274 return est_sve >= est_asimd;
22275 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22276 is clearly wider. */
22277 if (prefer_asimd)
22278 return est_sve > est_asimd;
22280 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22281 return est_sve > est_asimd;
22284 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22285 static machine_mode
22286 aarch64_preferred_simd_mode (scalar_mode mode)
22288 /* Take into account explicit auto-vectorization ISA preferences through
22289 aarch64_cmp_autovec_modes. */
22290 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
22291 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22292 if (TARGET_SIMD)
22293 return aarch64_vq_mode (mode).else_mode (word_mode);
22294 return word_mode;
22297 /* Return a list of possible vector sizes for the vectorizer
22298 to iterate over. */
22299 static unsigned int
22300 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
22302 static const machine_mode sve_modes[] = {
22303 /* Try using full vectors for all element types. */
22304 VNx16QImode,
22306 /* Try using 16-bit containers for 8-bit elements and full vectors
22307 for wider elements. */
22308 VNx8QImode,
22310 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22311 full vectors for wider elements. */
22312 VNx4QImode,
22314 /* Try using 64-bit containers for all element types. */
22315 VNx2QImode
22318 static const machine_mode advsimd_modes[] = {
22319 /* Try using 128-bit vectors for all element types. */
22320 V16QImode,
22322 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22323 for wider elements. */
22324 V8QImode,
22326 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22327 for wider elements.
22329 TODO: We could support a limited form of V4QImode too, so that
22330 we use 32-bit vectors for 8-bit elements. */
22331 V4HImode,
22333 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22334 for 64-bit elements.
22336 TODO: We could similarly support limited forms of V2QImode and V2HImode
22337 for this case. */
22338 V2SImode
22341 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22342 This is because:
22344 - If we can't use N-byte Advanced SIMD vectors then the placement
22345 doesn't matter; we'll just continue as though the Advanced SIMD
22346 entry didn't exist.
22348 - If an SVE main loop with N bytes ends up being cheaper than an
22349 Advanced SIMD main loop with N bytes then by default we'll replace
22350 the Advanced SIMD version with the SVE one.
22352 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22353 than an SVE main loop with N bytes then by default we'll try to
22354 use the SVE loop to vectorize the epilogue instead. */
22356 bool only_asimd_p = aarch64_autovec_preference == 1;
22357 bool only_sve_p = aarch64_autovec_preference == 2;
22359 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
22360 unsigned int advsimd_i = 0;
22362 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
22364 if (sve_i < ARRAY_SIZE (sve_modes)
22365 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
22366 advsimd_modes[advsimd_i]))
22367 modes->safe_push (sve_modes[sve_i++]);
22368 else
22369 modes->safe_push (advsimd_modes[advsimd_i++]);
22371 while (sve_i < ARRAY_SIZE (sve_modes))
22372 modes->safe_push (sve_modes[sve_i++]);
22374 unsigned int flags = 0;
22375 if (aarch64_vect_compare_costs)
22376 flags |= VECT_COMPARE_COSTS;
22377 return flags;
22380 /* Implement TARGET_MANGLE_TYPE. */
22382 static const char *
22383 aarch64_mangle_type (const_tree type)
22385 /* The AArch64 ABI documents say that "__va_list" has to be
22386 mangled as if it is in the "std" namespace. */
22387 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
22388 return "St9__va_list";
22390 /* Half-precision floating point types. */
22391 if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
22393 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
22394 return NULL;
22395 if (TYPE_MODE (type) == BFmode)
22396 return "u6__bf16";
22397 else
22398 return "Dh";
22401 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22402 builtin types. */
22403 if (TYPE_NAME (type) != NULL)
22405 const char *res;
22406 if ((res = aarch64_general_mangle_builtin_type (type))
22407 || (res = aarch64_sve::mangle_builtin_type (type)))
22408 return res;
22411 /* Use the default mangling. */
22412 return NULL;
22415 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22417 static bool
22418 aarch64_verify_type_context (location_t loc, type_context_kind context,
22419 const_tree type, bool silent_p)
22421 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
22424 /* Find the first rtx_insn before insn that will generate an assembly
22425 instruction. */
22427 static rtx_insn *
22428 aarch64_prev_real_insn (rtx_insn *insn)
22430 if (!insn)
22431 return NULL;
22433 do
22435 insn = prev_real_insn (insn);
22437 while (insn && recog_memoized (insn) < 0);
22439 return insn;
22442 static bool
22443 is_madd_op (enum attr_type t1)
22445 unsigned int i;
22446 /* A number of these may be AArch32 only. */
22447 enum attr_type mlatypes[] = {
22448 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
22449 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
22450 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
22453 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
22455 if (t1 == mlatypes[i])
22456 return true;
22459 return false;
22462 /* Check if there is a register dependency between a load and the insn
22463 for which we hold recog_data. */
22465 static bool
22466 dep_between_memop_and_curr (rtx memop)
22468 rtx load_reg;
22469 int opno;
22471 gcc_assert (GET_CODE (memop) == SET);
22473 if (!REG_P (SET_DEST (memop)))
22474 return false;
22476 load_reg = SET_DEST (memop);
22477 for (opno = 1; opno < recog_data.n_operands; opno++)
22479 rtx operand = recog_data.operand[opno];
22480 if (REG_P (operand)
22481 && reg_overlap_mentioned_p (load_reg, operand))
22482 return true;
22485 return false;
22489 /* When working around the Cortex-A53 erratum 835769,
22490 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22491 instruction and has a preceding memory instruction such that a NOP
22492 should be inserted between them. */
22494 bool
22495 aarch64_madd_needs_nop (rtx_insn* insn)
22497 enum attr_type attr_type;
22498 rtx_insn *prev;
22499 rtx body;
22501 if (!TARGET_FIX_ERR_A53_835769)
22502 return false;
22504 if (!INSN_P (insn) || recog_memoized (insn) < 0)
22505 return false;
22507 attr_type = get_attr_type (insn);
22508 if (!is_madd_op (attr_type))
22509 return false;
22511 prev = aarch64_prev_real_insn (insn);
22512 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22513 Restore recog state to INSN to avoid state corruption. */
22514 extract_constrain_insn_cached (insn);
22516 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
22517 return false;
22519 body = single_set (prev);
22521 /* If the previous insn is a memory op and there is no dependency between
22522 it and the DImode madd, emit a NOP between them. If body is NULL then we
22523 have a complex memory operation, probably a load/store pair.
22524 Be conservative for now and emit a NOP. */
22525 if (GET_MODE (recog_data.operand[0]) == DImode
22526 && (!body || !dep_between_memop_and_curr (body)))
22527 return true;
22529 return false;
22534 /* Implement FINAL_PRESCAN_INSN. */
22536 void
22537 aarch64_final_prescan_insn (rtx_insn *insn)
22539 if (aarch64_madd_needs_nop (insn))
22540 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
22544 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22545 instruction. */
22547 bool
22548 aarch64_sve_index_immediate_p (rtx base_or_step)
22550 return (CONST_INT_P (base_or_step)
22551 && IN_RANGE (INTVAL (base_or_step), -16, 15));
22554 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22555 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22557 bool
22558 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
22560 rtx elt = unwrap_const_vec_duplicate (x);
22561 if (!CONST_INT_P (elt))
22562 return false;
22564 HOST_WIDE_INT val = INTVAL (elt);
22565 if (negate_p)
22566 val = -val;
22567 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
22569 if (val & 0xff)
22570 return IN_RANGE (val, 0, 0xff);
22571 return IN_RANGE (val, 0, 0xff00);
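/* Examples (illustrative): after masking to the element width, values
   0..255 and multiples of 256 up to 65280 are accepted, matching the
   "#imm8" and "#imm8, LSL #8" forms; 257 is rejected because it would
   need both a low and a high byte.  */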
22574 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22575 instructions when applied to mode MODE. Negate X first if NEGATE_P
22576 is true. */
22578 bool
22579 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
22581 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
22582 return false;
22584 /* After the optional negation, the immediate must be nonnegative.
22585 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22586 instead of SQADD Zn.B, Zn.B, #129. */
22587 rtx elt = unwrap_const_vec_duplicate (x);
22588 return negate_p == (INTVAL (elt) < 0);
22591 /* Return true if X is a valid immediate operand for an SVE logical
22592 instruction such as AND. */
22594 bool
22595 aarch64_sve_bitmask_immediate_p (rtx x)
22597 rtx elt;
22599 return (const_vec_duplicate_p (x, &elt)
22600 && CONST_INT_P (elt)
22601 && aarch64_bitmask_imm (INTVAL (elt),
22602 GET_MODE_INNER (GET_MODE (x))));
22605 /* Return true if X is a valid immediate for the SVE DUP and CPY
22606 instructions. */
22608 bool
22609 aarch64_sve_dup_immediate_p (rtx x)
22611 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
22612 if (!CONST_INT_P (x))
22613 return false;
22615 HOST_WIDE_INT val = INTVAL (x);
22616 if (val & 0xff)
22617 return IN_RANGE (val, -0x80, 0x7f);
22618 return IN_RANGE (val, -0x8000, 0x7f00);
22621 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22622 SIGNED_P says whether the operand is signed rather than unsigned. */
22624 bool
22625 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
22627 x = unwrap_const_vec_duplicate (x);
22628 return (CONST_INT_P (x)
22629 && (signed_p
22630 ? IN_RANGE (INTVAL (x), -16, 15)
22631 : IN_RANGE (INTVAL (x), 0, 127)));
22634 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22635 instruction. Negate X first if NEGATE_P is true. */
22637 bool
22638 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
22640 rtx elt;
22641 REAL_VALUE_TYPE r;
22643 if (!const_vec_duplicate_p (x, &elt)
22644 || !CONST_DOUBLE_P (elt))
22645 return false;
22647 r = *CONST_DOUBLE_REAL_VALUE (elt);
22649 if (negate_p)
22650 r = real_value_negate (&r);
22652 if (real_equal (&r, &dconst1))
22653 return true;
22654 if (real_equal (&r, &dconsthalf))
22655 return true;
22656 return false;
22659 /* Return true if X is a valid immediate operand for an SVE FMUL
22660 instruction. */
22662 bool
22663 aarch64_sve_float_mul_immediate_p (rtx x)
22665 rtx elt;
22667 return (const_vec_duplicate_p (x, &elt)
22668 && CONST_DOUBLE_P (elt)
22669 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
22670 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
22673 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22674 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22675 is nonnull, use it to describe valid immediates. */
22676 static bool
22677 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
22678 simd_immediate_info *info,
22679 enum simd_immediate_check which,
22680 simd_immediate_info::insn_type insn)
22682 /* Try a 4-byte immediate with LSL. */
22683 for (unsigned int shift = 0; shift < 32; shift += 8)
22684 if ((val32 & (0xff << shift)) == val32)
22686 if (info)
22687 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22688 simd_immediate_info::LSL, shift);
22689 return true;
22692 /* Try a 2-byte immediate with LSL. */
22693 unsigned int imm16 = val32 & 0xffff;
22694 if (imm16 == (val32 >> 16))
22695 for (unsigned int shift = 0; shift < 16; shift += 8)
22696 if ((imm16 & (0xff << shift)) == imm16)
22698 if (info)
22699 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
22700 simd_immediate_info::LSL, shift);
22701 return true;
22704 /* Try a 4-byte immediate with MSL, except for cases that MVN
22705 can handle. */
22706 if (which == AARCH64_CHECK_MOV)
22707 for (unsigned int shift = 8; shift < 24; shift += 8)
22709 unsigned int low = (1 << shift) - 1;
22710 if (((val32 & (0xff << shift)) | low) == val32)
22712 if (info)
22713 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22714 simd_immediate_info::MSL, shift);
22715 return true;
22719 return false;
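/* Examples (illustrative): 0x00ab0000 is matched by the 4-byte form as
   value 0xab with LSL #16; 0x0000abff is matched by the MSL form as
   value 0xab with MSL #8 (MOV checks only).  */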
22722 /* Return true if replicating VAL64 is a valid immediate for the
22723 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22724 use it to describe valid immediates. */
22725 static bool
22726 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
22727 simd_immediate_info *info,
22728 enum simd_immediate_check which)
22730 unsigned int val32 = val64 & 0xffffffff;
22731 unsigned int val16 = val64 & 0xffff;
22732 unsigned int val8 = val64 & 0xff;
22734 if (val32 == (val64 >> 32))
22736 if ((which & AARCH64_CHECK_ORR) != 0
22737 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
22738 simd_immediate_info::MOV))
22739 return true;
22741 if ((which & AARCH64_CHECK_BIC) != 0
22742 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
22743 simd_immediate_info::MVN))
22744 return true;
22746 /* Try using a replicated byte. */
22747 if (which == AARCH64_CHECK_MOV
22748 && val16 == (val32 >> 16)
22749 && val8 == (val16 >> 8))
22751 if (info)
22752 *info = simd_immediate_info (QImode, val8);
22753 return true;
22757 /* Try using a bit-to-bytemask. */
22758 if (which == AARCH64_CHECK_MOV)
22760 unsigned int i;
22761 for (i = 0; i < 64; i += 8)
22763 unsigned char byte = (val64 >> i) & 0xff;
22764 if (byte != 0 && byte != 0xff)
22765 break;
22767 if (i == 64)
22769 if (info)
22770 *info = simd_immediate_info (DImode, val64);
22771 return true;
22774 return false;
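/* Example (illustrative): 0xffffffff00000000 fails the 32-bit replication
   test, but every byte is either 0x00 or 0xff, so it is accepted by the
   bit-to-bytemask case when WHICH is AARCH64_CHECK_MOV.  */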
22777 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22778 instruction. If INFO is nonnull, use it to describe valid immediates. */
22780 static bool
22781 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
22782 simd_immediate_info *info)
22784 scalar_int_mode mode = DImode;
22785 unsigned int val32 = val64 & 0xffffffff;
22786 if (val32 == (val64 >> 32))
22788 mode = SImode;
22789 unsigned int val16 = val32 & 0xffff;
22790 if (val16 == (val32 >> 16))
22792 mode = HImode;
22793 unsigned int val8 = val16 & 0xff;
22794 if (val8 == (val16 >> 8))
22795 mode = QImode;
22798 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
22799 if (IN_RANGE (val, -0x80, 0x7f))
22801 /* DUP with no shift. */
22802 if (info)
22803 *info = simd_immediate_info (mode, val);
22804 return true;
22806 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
22808 /* DUP with LSL #8. */
22809 if (info)
22810 *info = simd_immediate_info (mode, val);
22811 return true;
22813 if (aarch64_bitmask_imm (val64, mode))
22815 /* DUPM. */
22816 if (info)
22817 *info = simd_immediate_info (mode, val);
22818 return true;
22820 return false;
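/* Examples (illustrative): a VNx8HI constant with every element equal to
   0x1f00 is matched as DUP with LSL #8, while one with every element
   equal to 0x00ff falls through to the DUPM (bitmask) case.  */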
22823 /* Return true if X is an UNSPEC_PTRUE constant of the form:
22825 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
22827 where PATTERN is the svpattern as a CONST_INT and where ZERO
22828 is a zero constant of the required PTRUE mode (which can have
22829 fewer elements than X's mode, if zero bits are significant).
22831 If so, and if INFO is nonnull, describe the immediate in INFO. */
22832 bool
22833 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
22835 if (GET_CODE (x) != CONST)
22836 return false;
22838 x = XEXP (x, 0);
22839 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
22840 return false;
22842 if (info)
22844 aarch64_svpattern pattern
22845 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
22846 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
22847 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
22848 *info = simd_immediate_info (int_mode, pattern);
22850 return true;
22853 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
22854 it to describe valid immediates. */
22856 static bool
22857 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
22859 if (aarch64_sve_ptrue_svpattern_p (x, info))
22860 return true;
22862 if (x == CONST0_RTX (GET_MODE (x)))
22864 if (info)
22865 *info = simd_immediate_info (DImode, 0);
22866 return true;
22869 /* Analyze the value as a VNx16BImode. This should be relatively
22870 efficient, since rtx_vector_builder has enough built-in capacity
22871 to store all VLA predicate constants without needing the heap. */
22872 rtx_vector_builder builder;
22873 if (!aarch64_get_sve_pred_bits (builder, x))
22874 return false;
22876 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
22877 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
22879 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
22880 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
22881 if (pattern != AARCH64_NUM_SVPATTERNS)
22883 if (info)
22885 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
22886 *info = simd_immediate_info (int_mode, pattern);
22888 return true;
22891 return false;
22894 /* Return true if OP is a valid SIMD immediate for the operation
22895 described by WHICH. If INFO is nonnull, use it to describe valid
22896 immediates. */
22897 bool
22898 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
22899 enum simd_immediate_check which)
22901 machine_mode mode = GET_MODE (op);
22902 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22903 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
22904 return false;
22906 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
22907 return false;
22909 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
22910 return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
22912 if (vec_flags & VEC_SVE_PRED)
22913 return aarch64_sve_pred_valid_immediate (op, info);
22915 scalar_mode elt_mode = GET_MODE_INNER (mode);
22916 rtx base, step;
22917 unsigned int n_elts;
22918 if (CONST_VECTOR_P (op)
22919 && CONST_VECTOR_DUPLICATE_P (op))
22920 n_elts = CONST_VECTOR_NPATTERNS (op);
22921 else if ((vec_flags & VEC_SVE_DATA)
22922 && const_vec_series_p (op, &base, &step))
22924 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
22925 if (!aarch64_sve_index_immediate_p (base)
22926 || !aarch64_sve_index_immediate_p (step))
22927 return false;
22929 if (info)
22931 /* Get the corresponding container mode. E.g. an INDEX on V2SI
22932 should yield two integer values per 128-bit block, meaning
22933 that we need to treat it in the same way as V2DI and then
22934 ignore the upper 32 bits of each element. */
22935 elt_mode = aarch64_sve_container_int_mode (mode);
22936 *info = simd_immediate_info (elt_mode, base, step);
22938 return true;
22940 else if (CONST_VECTOR_P (op)
22941 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
22942 /* N_ELTS set above. */;
22943 else
22944 return false;
22946 scalar_float_mode elt_float_mode;
22947 if (n_elts == 1
22948 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
22950 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
22951 if (aarch64_float_const_zero_rtx_p (elt)
22952 || aarch64_float_const_representable_p (elt))
22954 if (info)
22955 *info = simd_immediate_info (elt_float_mode, elt);
22956 return true;
22960 /* If all elements in an SVE vector have the same value, we have a free
22961 choice between using the element mode and using the container mode.
22962 Using the element mode means that unused parts of the vector are
22963 duplicates of the used elements, while using the container mode means
22964 that the unused parts are an extension of the used elements. Using the
22965 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
22966 for its container mode VNx4SI while 0x00000101 isn't.
22968 If not all elements in an SVE vector have the same value, we need the
22969 transition from one element to the next to occur at container boundaries.
22970 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
22971 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
22972 scalar_int_mode elt_int_mode;
22973 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
22974 elt_int_mode = aarch64_sve_container_int_mode (mode);
22975 else
22976 elt_int_mode = int_mode_for_mode (elt_mode).require ();
22978 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
22979 if (elt_size > 8)
22980 return false;
22982 /* Expand the vector constant out into a byte vector, with the least
22983 significant byte of the register first. */
22984 auto_vec<unsigned char, 16> bytes;
22985 bytes.reserve (n_elts * elt_size);
22986 for (unsigned int i = 0; i < n_elts; i++)
22988 /* The vector is provided in gcc endian-neutral fashion.
22989 For aarch64_be Advanced SIMD, it must be laid out in the vector
22990 register in reverse order. */
22991 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
22992 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
22994 if (elt_mode != elt_int_mode)
22995 elt = gen_lowpart (elt_int_mode, elt);
22997 if (!CONST_INT_P (elt))
22998 return false;
23000 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
23001 for (unsigned int byte = 0; byte < elt_size; byte++)
23003 bytes.quick_push (elt_val & 0xff);
23004 elt_val >>= BITS_PER_UNIT;
23008 /* The immediate must repeat every eight bytes. */
23009 unsigned int nbytes = bytes.length ();
23010 for (unsigned i = 8; i < nbytes; ++i)
23011 if (bytes[i] != bytes[i - 8])
23012 return false;
23014 /* Get the repeating 8-byte value as an integer. No endian correction
23015 is needed here because bytes is already in lsb-first order. */
23016 unsigned HOST_WIDE_INT val64 = 0;
23017 for (unsigned int i = 0; i < 8; i++)
23018 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
23019 << (i * BITS_PER_UNIT));
23021 if (vec_flags & VEC_SVE_DATA)
23022 return aarch64_sve_valid_immediate (val64, info);
23023 else
23024 return aarch64_advsimd_valid_immediate (val64, info, which);
23027 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
23028 has a step in the range of INDEX. Return the index expression if so,
23029 otherwise return null. */
23030 rtx
23031 aarch64_check_zero_based_sve_index_immediate (rtx x)
23033 rtx base, step;
23034 if (const_vec_series_p (x, &base, &step)
23035 && base == const0_rtx
23036 && aarch64_sve_index_immediate_p (step))
23037 return step;
23038 return NULL_RTX;
23041 /* Check if immediate shift constants are within range. */
23042 bool
23043 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
23045 x = unwrap_const_vec_duplicate (x);
23046 if (!CONST_INT_P (x))
23047 return false;
23048 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
23049 if (left)
23050 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
23051 else
23052 return IN_RANGE (INTVAL (x), 1, bit_width);
23055 /* Return the bitmask CONST_INT to select the bits required by a zero extract
23056 operation of width WIDTH at bit position POS. */
23058 rtx
23059 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
23061 gcc_assert (CONST_INT_P (width));
23062 gcc_assert (CONST_INT_P (pos));
23064 unsigned HOST_WIDE_INT mask
23065 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
23066 return GEN_INT (mask << UINTVAL (pos));
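/* Example (illustrative): WIDTH == 8 and POS == 16 give a mask of
   ((1 << 8) - 1) << 16 == 0xff0000.  */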
23069 bool
23070 aarch64_mov_operand_p (rtx x, machine_mode mode)
23072 if (GET_CODE (x) == HIGH
23073 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
23074 return true;
23076 if (CONST_INT_P (x))
23077 return true;
23079 if (VECTOR_MODE_P (GET_MODE (x)))
23081 /* Require predicate constants to be VNx16BI before RA, so that we
23082 force everything to have a canonical form. */
23083 if (!lra_in_progress
23084 && !reload_completed
23085 && aarch64_sve_pred_mode_p (GET_MODE (x))
23086 && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
23087 && GET_MODE (x) != VNx16BImode)
23088 return false;
23090 return aarch64_simd_valid_immediate (x, NULL);
23093 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
23094 x = strip_salt (x);
23096 /* GOT accesses are valid moves. */
23097 if (SYMBOL_REF_P (x)
23098 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
23099 return true;
23101 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
23102 return true;
23104 if (TARGET_SVE
23105 && (aarch64_sve_cnt_immediate_p (x)
23106 || aarch64_sve_rdvl_immediate_p (x)))
23107 return true;
23109 if (aarch64_rdsvl_immediate_p (x))
23110 return true;
23112 return aarch64_classify_symbolic_expression (x)
23113 == SYMBOL_TINY_ABSOLUTE;
23116 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
23117 caches instructions that set up such registers, so that they can be
23118 reused by future calls. */
23120 static rtx
23121 aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
23123 rtx_insn *insn = *cached_insn;
23124 if (insn && INSN_P (insn) && !insn->deleted ())
23126 rtx pat = PATTERN (insn);
23127 if (GET_CODE (pat) == SET)
23129 rtx dest = SET_DEST (pat);
23130 if (REG_P (dest)
23131 && !HARD_REGISTER_P (dest)
23132 && rtx_equal_p (SET_SRC (pat), value))
23133 return dest;
23136 rtx reg = gen_reg_rtx (GET_MODE (value));
23137 *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
23138 function_beg_insn);
23139 return reg;
23142 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
23143 the constant creation. */
23146 aarch64_gen_shareable_zero (machine_mode mode)
23148 rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
23149 CONST0_RTX (V4SImode));
23150 return lowpart_subreg (mode, reg, GET_MODE (reg));
23153 /* INSN is some form of extension or shift that can be split into a
23154 permutation involving a shared zero. Return true if we should
23155 perform such a split.
23157 ??? For now, make sure that the split instruction executes more
23158 frequently than the zero that feeds it. In future it would be good
23159 to split without that restriction and instead recombine shared zeros
23160 if they turn out not to be worthwhile. This would allow splits in
23161 single-block functions and would also cope more naturally with
23162 rematerialization. */
23164 bool
23165 aarch64_split_simd_shift_p (rtx_insn *insn)
23167 return (can_create_pseudo_p ()
23168 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
23169 && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
23170 < BLOCK_FOR_INSN (insn)->count));
23173 /* Return a const_int vector of VAL. */
23175 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
23177 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
23178 return gen_const_vec_duplicate (mode, c);
23181 /* Check OP is a legal scalar immediate for the MOVI instruction. */
23183 bool
23184 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
23186 machine_mode vmode;
23188 vmode = aarch64_simd_container_mode (mode, 64);
23189 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
23190 return aarch64_simd_valid_immediate (op_v, NULL);
23193 /* Construct and return a PARALLEL RTX vector with elements numbering the
23194 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
23195 the vector - from the perspective of the architecture. This does not
23196 line up with GCC's perspective on lane numbers, so we end up with
23197 different masks depending on our target endian-ness. The diagram
23198 below may help. We must draw the distinction when building masks
23199 which select one half of the vector. An instruction selecting
23200 architectural low-lanes for a big-endian target must be described using
23201 a mask selecting GCC high-lanes.
23203 Big-Endian Little-Endian
23205 GCC 0 1 2 3 3 2 1 0
23206 | x | x | x | x | | x | x | x | x |
23207 Architecture 3 2 1 0 3 2 1 0
23209 Low Mask: { 2, 3 } { 0, 1 }
23210 High Mask: { 0, 1 } { 2, 3 }
23212 MODE Is the mode of the vector and NUNITS is the number of units in it. */
23215 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
23217 rtvec v = rtvec_alloc (nunits / 2);
23218 int high_base = nunits / 2;
23219 int low_base = 0;
23220 int base;
23221 rtx t1;
23222 int i;
23224 if (BYTES_BIG_ENDIAN)
23225 base = high ? low_base : high_base;
23226 else
23227 base = high ? high_base : low_base;
23229 for (i = 0; i < nunits / 2; i++)
23230 RTVEC_ELT (v, i) = GEN_INT (base + i);
23232 t1 = gen_rtx_PARALLEL (mode, v);
23233 return t1;
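/* Extending the diagram above to an 8-element vector such as V8HImode:
   on little-endian, HIGH selects { 4, 5, 6, 7 } and !HIGH selects
   { 0, 1, 2, 3 }; on big-endian the two masks are swapped, because GCC
   lane numbers run in the opposite direction to the architectural ones.  */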
23236 /* Check OP for validity as a PARALLEL RTX vector with elements
23237 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
23238 from the perspective of the architecture. See the diagram above
23239 aarch64_simd_vect_par_cnst_half for more details. */
23241 bool
23242 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
23243 bool high)
23245 int nelts;
23246 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
23247 return false;
23249 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
23250 HOST_WIDE_INT count_op = XVECLEN (op, 0);
23251 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
23252 int i = 0;
23254 if (count_op != count_ideal)
23255 return false;
23257 for (i = 0; i < count_ideal; i++)
23259 rtx elt_op = XVECEXP (op, 0, i);
23260 rtx elt_ideal = XVECEXP (ideal, 0, i);
23262 if (!CONST_INT_P (elt_op)
23263 || INTVAL (elt_ideal) != INTVAL (elt_op))
23264 return false;
23266 return true;
23269 /* Return a PARALLEL containing NELTS elements, with element I equal
23270 to BASE + I * STEP. */
23273 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
23275 rtvec vec = rtvec_alloc (nelts);
23276 for (unsigned int i = 0; i < nelts; ++i)
23277 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
23278 return gen_rtx_PARALLEL (VOIDmode, vec);
23281 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23282 series with step STEP. */
23284 bool
23285 aarch64_stepped_int_parallel_p (rtx op, int step)
23287 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
23288 return false;
23290 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
23291 for (int i = 1; i < XVECLEN (op, 0); ++i)
23292 if (!CONST_INT_P (XVECEXP (op, 0, i))
23293 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
23294 return false;
23296 return true;
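/* For example, a PARALLEL of (const_int 3) (const_int 5) (const_int 7)
   satisfies aarch64_stepped_int_parallel_p with STEP == 2, since every
   element equals the first element plus its index times the step.  */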
23299 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23300 sequence of strided registers, with the stride being equal to STRIDE.
23301 The operands are already known to be FPRs. */
23302 bool
23303 aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
23304 unsigned int stride)
23306 for (unsigned int i = 1; i < num_operands; ++i)
23307 if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
23308 return false;
23309 return true;
23312 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23313 HIGH (exclusive). */
23314 void
23315 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
23316 const_tree exp)
23318 HOST_WIDE_INT lane;
23319 gcc_assert (CONST_INT_P (operand));
23320 lane = INTVAL (operand);
23322 if (lane < low || lane >= high)
23324 if (exp)
23325 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
23326 lane, low, high - 1);
23327 else
23328 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
23332 /* Perform endian correction on lane number N, which indexes a vector
23333 of mode MODE, and return the result as an SImode rtx. */
23336 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
23338 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
23341 /* Return TRUE if OP is a valid vector addressing mode. */
23343 bool
23344 aarch64_simd_mem_operand_p (rtx op)
23346 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
23347 || REG_P (XEXP (op, 0)));
23350 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23352 bool
23353 aarch64_sve_ld1r_operand_p (rtx op)
23355 struct aarch64_address_info addr;
23356 scalar_mode mode;
23358 return (MEM_P (op)
23359 && is_a <scalar_mode> (GET_MODE (op), &mode)
23360 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
23361 && addr.type == ADDRESS_REG_IMM
23362 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
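/* In other words, the address must be a base register plus an immediate
   that is a multiple of the element size, with the multiple in [0, 63];
   e.g. for 4-byte elements the offsets 0, 4, ..., 252 are accepted
   (an illustrative reading of offset_6bit_unsigned_scaled_p).  */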
23365 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23366 where the size of the read data is specified by `mode` and the size of the
23367 vector elements is specified by `elem_mode`.
23368 bool
23369 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
23370 scalar_mode elem_mode)
23372 struct aarch64_address_info addr;
23373 if (!MEM_P (op)
23374 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
23375 return false;
23377 if (addr.type == ADDRESS_REG_IMM)
23378 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
23380 if (addr.type == ADDRESS_REG_REG)
23381 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
23383 return false;
23386 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23387 bool
23388 aarch64_sve_ld1rq_operand_p (rtx op)
23390 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
23391 GET_MODE_INNER (GET_MODE (op)));
23394 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23395 accessing a vector where the element size is specified by `elem_mode`. */
23396 bool
23397 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
23399 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
23402 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23403 bool
23404 aarch64_sve_ldff1_operand_p (rtx op)
23406 if (!MEM_P (op))
23407 return false;
23409 struct aarch64_address_info addr;
23410 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
23411 return false;
23413 if (addr.type == ADDRESS_REG_IMM)
23414 return known_eq (addr.const_offset, 0);
23416 return addr.type == ADDRESS_REG_REG;
23419 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23420 bool
23421 aarch64_sve_ldnf1_operand_p (rtx op)
23423 struct aarch64_address_info addr;
23425 return (MEM_P (op)
23426 && aarch64_classify_address (&addr, XEXP (op, 0),
23427 GET_MODE (op), false)
23428 && addr.type == ADDRESS_REG_IMM);
23431 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23432 The conditions for STR are the same. */
23433 bool
23434 aarch64_sve_ldr_operand_p (rtx op)
23436 struct aarch64_address_info addr;
23438 return (MEM_P (op)
23439 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
23440 false, ADDR_QUERY_ANY)
23441 && addr.type == ADDRESS_REG_IMM);
23444 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23445 addressing memory of mode MODE. */
23446 bool
23447 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
23449 struct aarch64_address_info addr;
23450 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
23451 return false;
23453 if (addr.type == ADDRESS_REG_IMM)
23454 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
23456 return addr.type == ADDRESS_REG_REG;
23459 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23460 We need to be able to access the individual pieces, so the range
23461 is different from LD[234] and ST[234]. */
23462 bool
23463 aarch64_sve_struct_memory_operand_p (rtx op)
23465 if (!MEM_P (op))
23466 return false;
23468 machine_mode mode = GET_MODE (op);
23469 struct aarch64_address_info addr;
23470 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
23471 ADDR_QUERY_ANY)
23472 || addr.type != ADDRESS_REG_IMM)
23473 return false;
23475 poly_int64 first = addr.const_offset;
23476 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
23477 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
23478 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
23481 /* Return true if OFFSET is a constant integer and if VNUM is
23482 OFFSET * the number of bytes in an SVE vector. This is the requirement
23483 that exists in SME LDR and STR instructions, where the VL offset must
23484 equal the ZA slice offset. */
23485 bool
23486 aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
23488 if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
23489 return false;
23491 if (TARGET_STREAMING)
23493 poly_int64 const_vnum;
23494 return (poly_int_rtx_p (vnum, &const_vnum)
23495 && known_eq (const_vnum,
23496 INTVAL (offset) * BYTES_PER_SVE_VECTOR));
23498 else
23500 HOST_WIDE_INT factor;
23501 return (aarch64_sme_vq_unspec_p (vnum, &factor)
23502 && factor == INTVAL (offset) * 16);
23506 /* Emit a register copy from operand to operand, taking care not to
23507 early-clobber source registers in the process.
23509 COUNT is the number of components into which the copy needs to be
23510 decomposed. */
23511 void
23512 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
23513 unsigned int count)
23515 unsigned int i;
23516 int rdest = REGNO (operands[0]);
23517 int rsrc = REGNO (operands[1]);
23519 if (!reg_overlap_mentioned_p (operands[0], operands[1])
23520 || rdest < rsrc)
23521 for (i = 0; i < count; i++)
23522 emit_move_insn (gen_rtx_REG (mode, rdest + i),
23523 gen_rtx_REG (mode, rsrc + i));
23524 else
23525 for (i = 0; i < count; i++)
23526 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
23527 gen_rtx_REG (mode, rsrc + count - i - 1));
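/* For example, copying a two-register tuple from { V0, V1 } to { V1, V2 }
   overlaps with RDEST > RSRC, so the loop runs backwards: V2 is copied
   from V1 before V1 is overwritten with V0.  Copying forwards would
   clobber V1 before it had been read.  */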
23530 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23531 one of the VSTRUCT modes: OI, CI, or XI. */
23533 aarch64_simd_attr_length_rglist (machine_mode mode)
23535 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23536 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
23539 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23540 alignment of a vector to 128 bits. SVE predicates have an alignment of
23541 16 bits. */
23542 static HOST_WIDE_INT
23543 aarch64_simd_vector_alignment (const_tree type)
23545 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23546 be set for non-predicate vectors of booleans. Modes are the most
23547 direct way we have of identifying real SVE predicate types. */
23548 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
23549 return 16;
23550 widest_int min_size
23551 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
23552 return wi::umin (min_size, 128).to_uhwi ();
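/* So, for instance, a 256-bit generic vector type is given the AAPCS64
   maximum alignment of 128 bits, a 64-bit Advanced SIMD vector keeps its
   natural 64-bit alignment, and SVE predicate types are aligned to
   16 bits.  */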
23555 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23556 static poly_uint64
23557 aarch64_vectorize_preferred_vector_alignment (const_tree type)
23559 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
23561 /* If the length of the vector is a fixed power of 2, try to align
23562 to that length, otherwise don't try to align at all. */
23563 HOST_WIDE_INT result;
23564 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
23565 || !pow2p_hwi (result))
23566 result = TYPE_ALIGN (TREE_TYPE (type));
23567 return result;
23569 return TYPE_ALIGN (type);
23572 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23573 static bool
23574 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
23576 if (is_packed)
23577 return false;
23579 /* For fixed-length vectors, check that the vectorizer will aim for
23580 full-vector alignment. This isn't true for generic GCC vectors
23581 that are wider than the ABI maximum of 128 bits. */
23582 poly_uint64 preferred_alignment =
23583 aarch64_vectorize_preferred_vector_alignment (type);
23584 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23585 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
23586 preferred_alignment))
23587 return false;
23589 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23590 return true;
23593 /* Return true if the vector misalignment factor is supported by the
23594 target. */
23595 static bool
23596 aarch64_builtin_support_vector_misalignment (machine_mode mode,
23597 const_tree type, int misalignment,
23598 bool is_packed)
23600 if (TARGET_SIMD && STRICT_ALIGNMENT)
23602 /* Return if movmisalign pattern is not supported for this mode. */
23603 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
23604 return false;
23606 /* Misalignment factor is unknown at compile time. */
23607 if (misalignment == -1)
23608 return false;
23610 return default_builtin_support_vector_misalignment (mode, type, misalignment,
23611 is_packed);
23614 /* If VALS is a vector constant that can be loaded into a register
23615 using DUP, generate instructions to do so and return an RTX to
23616 assign to the register. Otherwise return NULL_RTX. */
23617 static rtx
23618 aarch64_simd_dup_constant (rtx vals)
23620 machine_mode mode = GET_MODE (vals);
23621 machine_mode inner_mode = GET_MODE_INNER (mode);
23622 rtx x;
23624 if (!const_vec_duplicate_p (vals, &x))
23625 return NULL_RTX;
23627 /* We can load this constant by using DUP and a constant in a
23628 single ARM register. This will be cheaper than a vector
23629 load. */
23630 x = force_reg (inner_mode, x);
23631 return gen_vec_duplicate (mode, x);
23635 /* Generate code to load VALS, which is a PARALLEL containing only
23636 constants (for vec_init) or CONST_VECTOR, efficiently into a
23637 register. Returns an RTX to copy into the register, or NULL_RTX
23638 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23639 static rtx
23640 aarch64_simd_make_constant (rtx vals)
23642 machine_mode mode = GET_MODE (vals);
23643 rtx const_dup;
23644 rtx const_vec = NULL_RTX;
23645 int n_const = 0;
23646 int i;
23648 if (CONST_VECTOR_P (vals))
23649 const_vec = vals;
23650 else if (GET_CODE (vals) == PARALLEL)
23652 /* A CONST_VECTOR must contain only CONST_INTs and
23653 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
23654 Only store valid constants in a CONST_VECTOR. */
23655 int n_elts = XVECLEN (vals, 0);
23656 for (i = 0; i < n_elts; ++i)
23658 rtx x = XVECEXP (vals, 0, i);
23659 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23660 n_const++;
23662 if (n_const == n_elts)
23663 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
23665 else
23666 gcc_unreachable ();
23668 if (const_vec != NULL_RTX
23669 && aarch64_simd_valid_immediate (const_vec, NULL))
23670 /* Load using MOVI/MVNI. */
23671 return const_vec;
23672 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
23673 /* Loaded using DUP. */
23674 return const_dup;
23675 else if (const_vec != NULL_RTX)
23676 /* Load from constant pool. We cannot take advantage of single-cycle
23677 LD1 because we need a PC-relative addressing mode. */
23678 return const_vec;
23679 else
23680 /* A PARALLEL containing something not valid inside CONST_VECTOR.
23681 We cannot construct an initializer. */
23682 return NULL_RTX;
23685 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23686 The caller has already tried a divide-and-conquer approach, so do
23687 not consider that case here. */
23689 void
23690 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
23692 machine_mode mode = GET_MODE (target);
23693 scalar_mode inner_mode = GET_MODE_INNER (mode);
23694 /* The number of vector elements. */
23695 int n_elts = XVECLEN (vals, 0);
23696 /* The number of vector elements which are not constant. */
23697 int n_var = 0;
23698 rtx any_const = NULL_RTX;
23699 /* The first element of vals. */
23700 rtx v0 = XVECEXP (vals, 0, 0);
23701 bool all_same = true;
23703 /* This is a special vec_init<M><N> where N is not an element mode but a
23704 vector mode with half the elements of M. We expect to find two entries
23705 of mode N in VALS and we must put their concatenation into TARGET. */
23706 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
23708 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
23709 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
23710 && known_eq (GET_MODE_SIZE (mode),
23711 2 * GET_MODE_SIZE (narrow_mode)));
23712 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
23713 XVECEXP (vals, 0, 0),
23714 XVECEXP (vals, 0, 1)));
23715 return;
23718 /* Count the number of variable elements to initialise. */
23719 for (int i = 0; i < n_elts; ++i)
23721 rtx x = XVECEXP (vals, 0, i);
23722 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
23723 ++n_var;
23724 else
23725 any_const = x;
23727 all_same &= rtx_equal_p (x, v0);
23730 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23731 how best to handle this. */
23732 if (n_var == 0)
23734 rtx constant = aarch64_simd_make_constant (vals);
23735 if (constant != NULL_RTX)
23737 emit_move_insn (target, constant);
23738 return;
23742 /* Splat a single non-constant element if we can. */
23743 if (all_same)
23745 rtx x = force_reg (inner_mode, v0);
23746 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23747 return;
23750 enum insn_code icode = optab_handler (vec_set_optab, mode);
23751 gcc_assert (icode != CODE_FOR_nothing);
23753 /* If there are only variable elements, try to optimize
23754 the insertion using dup for the most common element
23755 followed by insertions. */
23757 /* The algorithm will fill matches[*][0] with the earliest matching element,
23758 and matches[X][1] with the count of duplicate elements (if X is the
23759 earliest element which has duplicates). */
23761 if (n_var >= n_elts - 1 && n_elts <= 16)
23763 int matches[16][2] = {0};
23764 for (int i = 0; i < n_elts; i++)
23766 for (int j = 0; j <= i; j++)
23768 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
23770 matches[i][0] = j;
23771 matches[j][1]++;
23772 break;
23776 int maxelement = 0;
23777 int maxv = 0;
23778 rtx const_elem = NULL_RTX;
23779 int const_elem_pos = 0;
23781 for (int i = 0; i < n_elts; i++)
23783 if (matches[i][1] > maxv)
23785 maxelement = i;
23786 maxv = matches[i][1];
23788 if (CONST_INT_P (XVECEXP (vals, 0, i))
23789 || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
23791 const_elem_pos = i;
23792 const_elem = XVECEXP (vals, 0, i);
23796 /* Create a duplicate of the most common element, unless all elements
23797 are equally useless to us, in which case just immediately set the
23798 vector register using the first element. */
23800 if (maxv == 1)
23802 /* For vectors of two 64-bit elements, we can do even better. */
23803 if (n_elts == 2
23804 && (inner_mode == E_DImode
23805 || inner_mode == E_DFmode))
23808 rtx x0 = XVECEXP (vals, 0, 0);
23809 rtx x1 = XVECEXP (vals, 0, 1);
23810 /* Combine can pick up this case, but handling it directly
23811 here leaves clearer RTL.
23813 This is load_pair_lanes<mode>, and also gives us a clean-up
23814 for store_pair_lanes<mode>. */
23815 if (memory_operand (x0, inner_mode)
23816 && memory_operand (x1, inner_mode)
23817 && aarch64_mergeable_load_pair_p (mode, x0, x1))
23819 rtx t;
23820 if (inner_mode == DFmode)
23821 t = gen_load_pair_lanesdf (target, x0, x1);
23822 else
23823 t = gen_load_pair_lanesdi (target, x0, x1);
23824 emit_insn (t);
23825 return;
23828 /* The subreg-move sequence below will move into lane zero of the
23829 vector register. For big-endian we want that position to hold
23830 the last element of VALS. */
23831 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
23833 /* If we have a single constant element, use that for duplicating
23834 instead. */
23835 if (const_elem)
23837 maxelement = const_elem_pos;
23838 aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
23840 else
23842 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23843 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
23846 else
23848 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23849 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23852 /* Insert the rest. */
23853 for (int i = 0; i < n_elts; i++)
23855 rtx x = XVECEXP (vals, 0, i);
23856 if (matches[i][0] == maxelement)
23857 continue;
23858 x = force_reg (inner_mode, x);
23859 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23861 return;
23864 /* Initialise a vector which is part-variable. We want to first try
23865 to build those lanes which are constant in the most efficient way we
23866 can. */
23867 if (n_var != n_elts)
23869 rtx copy = copy_rtx (vals);
23871 /* Load constant part of vector. We really don't care what goes into the
23872 parts we will overwrite, but we're more likely to be able to load the
23873 constant efficiently if it has fewer, larger, repeating parts
23874 (see aarch64_simd_valid_immediate). */
23875 for (int i = 0; i < n_elts; i++)
23877 rtx x = XVECEXP (vals, 0, i);
23878 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23879 continue;
23880 rtx subst = any_const;
23881 for (int bit = n_elts / 2; bit > 0; bit /= 2)
23883 /* Look in the copied vector, as more elements are const. */
23884 rtx test = XVECEXP (copy, 0, i ^ bit);
23885 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
23887 subst = test;
23888 break;
23891 XVECEXP (copy, 0, i) = subst;
23893 aarch64_expand_vector_init_fallback (target, copy);
23896 /* Insert the variable lanes directly. */
23897 for (int i = 0; i < n_elts; i++)
23899 rtx x = XVECEXP (vals, 0, i);
23900 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23901 continue;
23902 x = force_reg (inner_mode, x);
23903 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
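/* A short example of the duplicate-and-insert path above (illustrative):
   for a V4SImode initializer { x, y, x, x } with X and Y in registers,
   the matches array records three copies of the element at index 0, so
   the code emits a DUP of X into the vector register followed by a
   single vec_set insertion of Y into lane 1.  */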
23907 /* Return the even or odd half of VALS, depending on EVEN_P. */
23909 static rtx
23910 aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
23912 int n = XVECLEN (vals, 0);
23913 machine_mode new_mode
23914 = aarch64_simd_container_mode (GET_MODE_INNER (mode),
23915 GET_MODE_BITSIZE (mode).to_constant () / 2);
23916 rtvec vec = rtvec_alloc (n / 2);
23917 for (int i = 0; i < n / 2; i++)
23918 RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
23919 : XVECEXP (vals, 0, 2 * i + 1);
23920 return gen_rtx_PARALLEL (new_mode, vec);
23923 /* Return true if SET is a scalar move. */
23925 static bool
23926 scalar_move_insn_p (rtx set)
23928 rtx src = SET_SRC (set);
23929 rtx dest = SET_DEST (set);
23930 return (is_a<scalar_mode> (GET_MODE (dest))
23931 && aarch64_mov_operand (src, GET_MODE (dest)));
23934 /* Similar to seq_cost, but ignore cost for scalar moves. */
23936 static unsigned
23937 seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed)
23939 unsigned cost = 0;
23941 for (; seq; seq = NEXT_INSN (seq))
23942 if (NONDEBUG_INSN_P (seq))
23944 if (rtx set = single_set (seq))
23946 if (!scalar_move_insn_p (set))
23947 cost += set_rtx_cost (set, speed);
23949 else
23951 int this_cost = insn_cost (CONST_CAST_RTX_INSN (seq), speed);
23952 if (this_cost > 0)
23953 cost += this_cost;
23954 else
23955 cost++;
23959 return cost;
23962 /* Expand a vector initialization sequence, such that TARGET is
23963 initialized to contain VALS. */
23965 void
23966 aarch64_expand_vector_init (rtx target, rtx vals)
23968 /* Try decomposing the initializer into even and odd halves and
23969 then ZIP them together. Use the resulting sequence if it is
23970 strictly cheaper than loading VALS directly.
23972 Prefer the fallback sequence in the event of a tie, since it
23973 will tend to use fewer registers. */
23975 machine_mode mode = GET_MODE (target);
23976 int n_elts = XVECLEN (vals, 0);
23978 if (n_elts < 4
23979 || maybe_ne (GET_MODE_BITSIZE (mode), 128))
23981 aarch64_expand_vector_init_fallback (target, vals);
23982 return;
23985 start_sequence ();
23986 rtx halves[2];
23987 unsigned costs[2];
23988 for (int i = 0; i < 2; i++)
23990 start_sequence ();
23991 rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
23992 rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
23993 aarch64_expand_vector_init (tmp_reg, new_vals);
23994 halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
23995 rtx_insn *rec_seq = get_insns ();
23996 end_sequence ();
23997 costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size);
23998 emit_insn (rec_seq);
24001 rtvec v = gen_rtvec (2, halves[0], halves[1]);
24002 rtx_insn *zip1_insn
24003 = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24004 unsigned seq_total_cost
24005 = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
24006 seq_total_cost += insn_cost (zip1_insn, !optimize_size);
24008 rtx_insn *seq = get_insns ();
24009 end_sequence ();
24011 start_sequence ();
24012 aarch64_expand_vector_init_fallback (target, vals);
24013 rtx_insn *fallback_seq = get_insns ();
24014 unsigned fallback_seq_cost
24015 = seq_cost_ignoring_scalar_moves (fallback_seq, !optimize_size);
24016 end_sequence ();
24018 emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
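/* As an illustration, a V4SImode initializer { a, 1, b, 2 } is split
   into an even half { a, b } and an odd half { 1, 2 }.  Each half is
   built as a V2SImode value, viewed as V4SImode through a subreg, and
   the two halves are interleaved with ZIP1.  That sequence is used only
   if its cost (ignoring scalar moves) is strictly lower than the cost
   of the fallback expansion.  */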
24021 /* Emit RTL corresponding to:
24022 insr TARGET, ELEM. */
24024 static void
24025 emit_insr (rtx target, rtx elem)
24027 machine_mode mode = GET_MODE (target);
24028 scalar_mode elem_mode = GET_MODE_INNER (mode);
24029 elem = force_reg (elem_mode, elem);
24031 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
24032 gcc_assert (icode != CODE_FOR_nothing);
24033 emit_insn (GEN_FCN (icode) (target, target, elem));
24036 /* Subroutine of aarch64_sve_expand_vector_init for handling
24037 trailing constants.
24038 This function works as follows:
24039 (a) Create a new vector consisting of trailing constants.
24040 (b) Initialize TARGET with the constant vector using emit_move_insn.
24041 (c) Insert remaining elements in TARGET using insr.
24042 NELTS is the total number of elements in the original vector, while
24043 NELTS_REQD is the number of elements that are actually
24044 significant.
24046 ??? The heuristic used is to do the above only if the number of constants
24047 is at least half the total number of elements. May need fine tuning. */
24049 static bool
24050 aarch64_sve_expand_vector_init_handle_trailing_constants
24051 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
24053 machine_mode mode = GET_MODE (target);
24054 scalar_mode elem_mode = GET_MODE_INNER (mode);
24055 int n_trailing_constants = 0;
24057 for (int i = nelts_reqd - 1;
24058 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
24059 i--)
24060 n_trailing_constants++;
24062 if (n_trailing_constants >= nelts_reqd / 2)
24064 /* Try to use the natural pattern of BUILDER to extend the trailing
24065 constant elements to a full vector. Replace any variables in the
24066 extra elements with zeros.
24068 ??? It would be better if the builders supported "don't care"
24069 elements, with the builder filling in whichever elements
24070 give the most compact encoding. */
24071 rtx_vector_builder v (mode, nelts, 1);
24072 for (int i = 0; i < nelts; i++)
24074 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
24075 if (!valid_for_const_vector_p (elem_mode, x))
24076 x = CONST0_RTX (elem_mode);
24077 v.quick_push (x);
24079 rtx const_vec = v.build ();
24080 emit_move_insn (target, const_vec);
24082 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
24083 emit_insr (target, builder.elt (i));
24085 return true;
24088 return false;
24091 /* Subroutine of aarch64_sve_expand_vector_init.
24092 Works as follows:
24093 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24094 (b) Skip trailing elements from BUILDER, which are the same as
24095 element NELTS_REQD - 1.
24096 (c) Insert earlier elements in reverse order in TARGET using insr. */
24098 static void
24099 aarch64_sve_expand_vector_init_insert_elems (rtx target,
24100 const rtx_vector_builder &builder,
24101 int nelts_reqd)
24103 machine_mode mode = GET_MODE (target);
24104 scalar_mode elem_mode = GET_MODE_INNER (mode);
24106 struct expand_operand ops[2];
24107 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
24108 gcc_assert (icode != CODE_FOR_nothing);
24110 create_output_operand (&ops[0], target, mode);
24111 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
24112 expand_insn (icode, 2, ops);
24114 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24115 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
24116 emit_insr (target, builder.elt (i));
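/* For example, with BUILDER = { a, b, c, c } and NELTS_REQD == 4, the
   last element C is broadcast with a DUP, which also covers the
   duplicate at index 2, and the remaining elements are then inserted in
   reverse order: INSR of B followed by INSR of A, leaving
   { a, b, c, c, ... } in TARGET.  */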
24119 /* Subroutine of aarch64_sve_expand_vector_init to handle case
24120 when all trailing elements of builder are same.
24121 This works as follows:
24122 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24123 (b) Insert remaining elements in TARGET using insr.
24125 ??? The heuristic used is to do the above if the number of identical trailing
24126 elements is at least 3/4 of the total number of elements, loosely based on the
24127 heuristic from mostly_zeros_p. May need fine-tuning. */
24129 static bool
24130 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24131 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
24133 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
24134 if (ndups >= (3 * nelts_reqd) / 4)
24136 aarch64_sve_expand_vector_init_insert_elems (target, builder,
24137 nelts_reqd - ndups + 1);
24138 return true;
24141 return false;
24144 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24145 of elements in BUILDER.
24147 The function tries to initialize TARGET from BUILDER if it fits one
24148 of the special cases outlined below.
24150 Failing that, the function divides BUILDER into two sub-vectors:
24151 v_even = even elements of BUILDER;
24152 v_odd = odd elements of BUILDER;
24154 and recursively calls itself with v_even and v_odd.
24156 if (recursive call succeeded for v_even or v_odd)
24157 TARGET = zip (v_even, v_odd)
24159 The function returns true if it managed to build TARGET from BUILDER
24160 with one of the special cases, false otherwise.
24162 Example: {a, 1, b, 2, c, 3, d, 4}
24164 The vector gets divided into:
24165 v_even = {a, b, c, d}
24166 v_odd = {1, 2, 3, 4}
24168 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24169 initializes tmp2 from the constant vector v_odd using emit_move_insn.
24171 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24172 4 elements, so we construct tmp1 from v_even using insr:
24173 tmp1 = dup(d)
24174 insr tmp1, c
24175 insr tmp1, b
24176 insr tmp1, a
24178 And finally:
24179 TARGET = zip (tmp1, tmp2)
24180 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24182 static bool
24183 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
24184 int nelts, int nelts_reqd)
24186 machine_mode mode = GET_MODE (target);
24188 /* Case 1: Vector contains trailing constants. */
24190 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24191 (target, builder, nelts, nelts_reqd))
24192 return true;
24194 /* Case 2: Vector contains leading constants. */
24196 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
24197 for (int i = 0; i < nelts_reqd; i++)
24198 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
24199 rev_builder.finalize ();
24201 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24202 (target, rev_builder, nelts, nelts_reqd))
24204 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24205 return true;
24208 /* Case 3: Vector contains trailing same element. */
24210 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24211 (target, builder, nelts_reqd))
24212 return true;
24214 /* Case 4: Vector contains leading same element. */
24216 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24217 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
24219 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24220 return true;
24223 /* Avoid recursing below 4-elements.
24224 ??? The threshold 4 may need fine-tuning. */
24226 if (nelts_reqd <= 4)
24227 return false;
24229 rtx_vector_builder v_even (mode, nelts, 1);
24230 rtx_vector_builder v_odd (mode, nelts, 1);
24232 for (int i = 0; i < nelts * 2; i += 2)
24234 v_even.quick_push (builder.elt (i));
24235 v_odd.quick_push (builder.elt (i + 1));
24238 v_even.finalize ();
24239 v_odd.finalize ();
24241 rtx tmp1 = gen_reg_rtx (mode);
24242 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
24243 nelts, nelts_reqd / 2);
24245 rtx tmp2 = gen_reg_rtx (mode);
24246 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
24247 nelts, nelts_reqd / 2);
24249 if (!did_even_p && !did_odd_p)
24250 return false;
24252 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24253 special cases and zip v_even, v_odd. */
24255 if (!did_even_p)
24256 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
24258 if (!did_odd_p)
24259 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
24261 rtvec v = gen_rtvec (2, tmp1, tmp2);
24262 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24263 return true;
24266 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24268 void
24269 aarch64_sve_expand_vector_init (rtx target, rtx vals)
24271 machine_mode mode = GET_MODE (target);
24272 int nelts = XVECLEN (vals, 0);
24274 rtx_vector_builder v (mode, nelts, 1);
24275 for (int i = 0; i < nelts; i++)
24276 v.quick_push (XVECEXP (vals, 0, i));
24277 v.finalize ();
24279 /* If neither sub-vector of v could be initialized specially,
24280 then use INSR to insert all elements from v into TARGET.
24281 ??? This might not be optimal for vectors with large
24282 initializers like 16-element or above.
24283 For nelts < 4, it probably isn't useful to handle specially. */
24285 if (nelts < 4
24286 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
24287 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
24290 /* Check whether VALUE is a vector constant in which every element
24291 is either a power of 2 or a negated power of 2. If so, return
24292 a constant vector of log2s, and flip CODE between PLUS and MINUS
24293 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24295 static rtx
24296 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
24298 if (!CONST_VECTOR_P (value))
24299 return NULL_RTX;
24301 rtx_vector_builder builder;
24302 if (!builder.new_unary_operation (GET_MODE (value), value, false))
24303 return NULL_RTX;
24305 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
24306 /* 1 if the result of the multiplication must be negated,
24307 0 if it mustn't, or -1 if we don't yet care. */
24308 int negate = -1;
24309 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
24310 for (unsigned int i = 0; i < encoded_nelts; ++i)
24312 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
24313 if (!CONST_SCALAR_INT_P (elt))
24314 return NULL_RTX;
24315 rtx_mode_t val (elt, int_mode);
24316 wide_int pow2 = wi::neg (val);
24317 if (val != pow2)
24319 /* It matters whether we negate or not. Make that choice,
24320 and make sure that it's consistent with previous elements. */
24321 if (negate == !wi::neg_p (val))
24322 return NULL_RTX;
24323 negate = wi::neg_p (val);
24324 if (!negate)
24325 pow2 = val;
24327 /* POW2 is now the value that we want to be a power of 2. */
24328 int shift = wi::exact_log2 (pow2);
24329 if (shift < 0)
24330 return NULL_RTX;
24331 builder.quick_push (gen_int_mode (shift, int_mode));
24333 if (negate == -1)
24334 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24335 code = PLUS;
24336 else if (negate == 1)
24337 code = code == PLUS ? MINUS : PLUS;
24338 return builder.build ();
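/* Two illustrative cases: a multiplier vector of { 8, 8, 8, 8 } becomes
   the shift vector { 3, 3, 3, 3 } with CODE left unchanged, while
   { -4, -4 } becomes { 2, 2 } and CODE is flipped between PLUS and
   MINUS so that the negation folds into the surrounding add or
   subtract.  */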
24341 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24342 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24343 operands array, in the same order as for fma_optab. Return true if
24344 the function emitted all the necessary instructions, false if the caller
24345 should generate the pattern normally with the new OPERANDS array. */
24347 bool
24348 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
24350 machine_mode mode = GET_MODE (operands[0]);
24351 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
24353 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
24354 NULL_RTX, true, OPTAB_DIRECT);
24355 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
24356 operands[3], product, operands[0], true,
24357 OPTAB_DIRECT);
24358 return true;
24360 operands[2] = force_reg (mode, operands[2]);
24361 return false;
24364 /* Likewise, but for a conditional pattern. */
24366 bool
24367 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
24369 machine_mode mode = GET_MODE (operands[0]);
24370 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
24372 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
24373 NULL_RTX, true, OPTAB_DIRECT);
24374 emit_insn (gen_cond (code, mode, operands[0], operands[1],
24375 operands[4], product, operands[5]));
24376 return true;
24378 operands[3] = force_reg (mode, operands[3]);
24379 return false;
24382 static unsigned HOST_WIDE_INT
24383 aarch64_shift_truncation_mask (machine_mode mode)
24385 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
24386 return 0;
24387 return GET_MODE_UNIT_BITSIZE (mode) - 1;
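/* For example, scalar SImode yields a mask of 31 and DImode a mask of
   63 when SHIFT_COUNT_TRUNCATED is in effect, whereas vector data modes
   always yield 0, meaning that no truncation of the shift amount is
   guaranteed for them.  */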
24390 /* Select a format to encode pointers in exception handling data. */
24392 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
24394 int type;
24395 switch (aarch64_cmodel)
24397 case AARCH64_CMODEL_TINY:
24398 case AARCH64_CMODEL_TINY_PIC:
24399 case AARCH64_CMODEL_SMALL:
24400 case AARCH64_CMODEL_SMALL_PIC:
24401 case AARCH64_CMODEL_SMALL_SPIC:
24402 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24403 for everything. */
24404 type = DW_EH_PE_sdata4;
24405 break;
24406 default:
24407 /* No assumptions here. 8-byte relocs required. */
24408 type = DW_EH_PE_sdata8;
24409 break;
24411 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
24414 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24416 static void
24417 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
24419 if (TREE_CODE (decl) == FUNCTION_DECL)
24421 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
24422 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
24424 fprintf (stream, "\t.variant_pcs\t");
24425 assemble_name (stream, name);
24426 fprintf (stream, "\n");
24431 /* The last .arch and .tune assembly strings that we printed. */
24432 static std::string aarch64_last_printed_arch_string;
24433 static std::string aarch64_last_printed_tune_string;
24435 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24436 by the function fndecl. */
24438 void
24439 aarch64_declare_function_name (FILE *stream, const char* name,
24440 tree fndecl)
24442 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
24444 struct cl_target_option *targ_options;
24445 if (target_parts)
24446 targ_options = TREE_TARGET_OPTION (target_parts);
24447 else
24448 targ_options = TREE_TARGET_OPTION (target_option_current_node);
24449 gcc_assert (targ_options);
24451 const struct processor *this_arch
24452 = aarch64_get_arch (targ_options->x_selected_arch);
24454 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
24455 std::string extension
24456 = aarch64_get_extension_string_for_isa_flags (isa_flags,
24457 this_arch->flags);
24458 /* Only update the assembler .arch string if it is distinct from the last
24459 such string we printed. */
24460 std::string to_print = this_arch->name + extension;
24461 if (to_print != aarch64_last_printed_arch_string)
24463 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
24464 aarch64_last_printed_arch_string = to_print;
24467 /* Print the cpu name we're tuning for in the comments; it might be
24468 useful to readers of the generated asm. Do it only when it changes
24469 from function to function and verbose assembly is requested. */
24470 const struct processor *this_tune
24471 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
24473 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
24475 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
24476 this_tune->name);
24477 aarch64_last_printed_tune_string = this_tune->name;
24480 aarch64_asm_output_variant_pcs (stream, fndecl, name);
24482 /* Don't forget the type directive for ELF. */
24483 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
24484 ASM_OUTPUT_FUNCTION_LABEL (stream, name, fndecl);
24486 cfun->machine->label_is_assembled = true;
24489 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24491 void
24492 aarch64_print_patchable_function_entry (FILE *file,
24493 unsigned HOST_WIDE_INT patch_area_size,
24494 bool record_p)
24496 if (!cfun->machine->label_is_assembled)
24498 /* Emit the patching area before the entry label, if any. */
24499 default_print_patchable_function_entry (file, patch_area_size,
24500 record_p);
24501 return;
24504 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
24505 GEN_INT (record_p));
24506 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
24508 if (!aarch_bti_enabled ()
24509 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
24511 /* Emit the patchable_area at the beginning of the function. */
24512 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
24513 INSN_ADDRESSES_NEW (insn, -1);
24514 return;
24517 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
24518 if (!insn
24519 || !INSN_P (insn)
24520 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
24521 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
24523 /* Emit a BTI_C. */
24524 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
24527 /* Emit the patchable_area after BTI_C. */
24528 insn = emit_insn_after (pa, insn);
24529 INSN_ADDRESSES_NEW (insn, -1);
24532 /* Output a patchable area. */
24534 void
24535 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
24537 default_print_patchable_function_entry (asm_out_file, patch_area_size,
24538 record_p);
24541 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24543 void
24544 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
24546 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
24547 const char *value = IDENTIFIER_POINTER (target);
24548 aarch64_asm_output_variant_pcs (stream, decl, name);
24549 ASM_OUTPUT_DEF (stream, name, value);
24552 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24553 function symbol references. */
24555 void
24556 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
24558 default_elf_asm_output_external (stream, decl, name);
24559 aarch64_asm_output_variant_pcs (stream, decl, name);
24562 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24563 Used to output the .cfi_b_key_frame directive when signing the current
24564 function with the B key. */
24566 void
24567 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
24569 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
24570 && aarch64_ra_sign_key == AARCH64_KEY_B)
24571 asm_fprintf (f, "\t.cfi_b_key_frame\n");
24574 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24576 static void
24577 aarch64_start_file (void)
24579 struct cl_target_option *default_options
24580 = TREE_TARGET_OPTION (target_option_default_node);
24582 const struct processor *default_arch
24583 = aarch64_get_arch (default_options->x_selected_arch);
24584 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
24585 std::string extension
24586 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
24587 default_arch->flags);
24589 aarch64_last_printed_arch_string = default_arch->name + extension;
24590 aarch64_last_printed_tune_string = "";
24591 asm_fprintf (asm_out_file, "\t.arch %s\n",
24592 aarch64_last_printed_arch_string.c_str ());
24594 default_file_start ();
24597 /* Emit load exclusive. */
24599 static void
24600 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
24601 rtx mem, rtx model_rtx)
24603 if (mode == TImode)
24604 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
24605 gen_highpart (DImode, rval),
24606 mem, model_rtx));
24607 else
24608 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
24611 /* Emit store exclusive. */
24613 static void
24614 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
24615 rtx mem, rtx rval, rtx model_rtx)
24617 if (mode == TImode)
24618 emit_insn (gen_aarch64_store_exclusive_pair
24619 (bval, mem, operand_subword (rval, 0, 0, TImode),
24620 operand_subword (rval, 1, 0, TImode), model_rtx));
24621 else
24622 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
24625 /* Mark the previous jump instruction as unlikely. */
24627 static void
24628 aarch64_emit_unlikely_jump (rtx insn)
24630 rtx_insn *jump = emit_jump_insn (insn);
24631 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
24634 /* We store the names of the various atomic helpers in a 5x5 array.
24635 Return the libcall function given MODE, MODEL and NAMES. */
24638 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
24639 const atomic_ool_names *names)
24641 memmodel model = memmodel_from_int (INTVAL (model_rtx));
24642 int mode_idx, model_idx;
24644 switch (mode)
24646 case E_QImode:
24647 mode_idx = 0;
24648 break;
24649 case E_HImode:
24650 mode_idx = 1;
24651 break;
24652 case E_SImode:
24653 mode_idx = 2;
24654 break;
24655 case E_DImode:
24656 mode_idx = 3;
24657 break;
24658 case E_TImode:
24659 mode_idx = 4;
24660 break;
24661 default:
24662 gcc_unreachable ();
24665 switch (model)
24667 case MEMMODEL_RELAXED:
24668 model_idx = 0;
24669 break;
24670 case MEMMODEL_CONSUME:
24671 case MEMMODEL_ACQUIRE:
24672 model_idx = 1;
24673 break;
24674 case MEMMODEL_RELEASE:
24675 model_idx = 2;
24676 break;
24677 case MEMMODEL_ACQ_REL:
24678 case MEMMODEL_SEQ_CST:
24679 model_idx = 3;
24680 break;
24681 case MEMMODEL_SYNC_ACQUIRE:
24682 case MEMMODEL_SYNC_RELEASE:
24683 case MEMMODEL_SYNC_SEQ_CST:
24684 model_idx = 4;
24685 break;
24686 default:
24687 gcc_unreachable ();
24690 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
24691 VISIBILITY_HIDDEN);
24694 #define DEF0(B, N) \
24695 { "__aarch64_" #B #N "_relax", \
24696 "__aarch64_" #B #N "_acq", \
24697 "__aarch64_" #B #N "_rel", \
24698 "__aarch64_" #B #N "_acq_rel", \
24699 "__aarch64_" #B #N "_sync" }
24701 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24702 { NULL, NULL, NULL, NULL }
24703 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24705 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
24706 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
24707 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
24708 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
24709 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
24710 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
24712 #undef DEF0
24713 #undef DEF4
24714 #undef DEF5
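/* Putting the two tables together: an SImode compare-and-swap with an
   ACQUIRE memory model maps to mode_idx 2 and model_idx 1, so
   aarch64_atomic_ool_func returns the libfunc for "__aarch64_cas4_acq".
   Only CAS uses DEF5; the 16-byte row of the DEF4 tables is left as
   NULLs.  */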
24716 /* Expand a compare and swap pattern. */
24718 void
24719 aarch64_expand_compare_and_swap (rtx operands[])
24721 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
24722 machine_mode mode, r_mode;
24724 bval = operands[0];
24725 rval = operands[1];
24726 mem = operands[2];
24727 oldval = operands[3];
24728 newval = operands[4];
24729 is_weak = operands[5];
24730 mod_s = operands[6];
24731 mod_f = operands[7];
24732 mode = GET_MODE (mem);
24734 /* Normally the succ memory model must be stronger than fail, but in the
24735 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24736 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24737 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
24738 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
24739 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
24741 r_mode = mode;
24742 if (mode == QImode || mode == HImode)
24744 r_mode = SImode;
24745 rval = gen_reg_rtx (r_mode);
24748 if (TARGET_LSE)
24750 /* The CAS insn requires oldval and rval overlap, but we need to
24751 have a copy of oldval saved across the operation to tell if
24752 the operation is successful. */
24753 if (reg_overlap_mentioned_p (rval, oldval))
24754 rval = copy_to_mode_reg (r_mode, oldval);
24755 else
24756 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
24757 if (mode == TImode)
24758 newval = force_reg (mode, newval);
24760 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
24761 newval, mod_s));
24762 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24764 else if (TARGET_OUTLINE_ATOMICS)
24766 /* Oldval must satisfy compare afterward. */
24767 if (!aarch64_plus_operand (oldval, mode))
24768 oldval = force_reg (mode, oldval);
24769 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
24770 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
24771 oldval, mode, newval, mode,
24772 XEXP (mem, 0), Pmode);
24773 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24775 else
24777 /* The oldval predicate varies by mode. Test it and force to reg. */
24778 insn_code code = code_for_aarch64_compare_and_swap (mode);
24779 if (!insn_data[code].operand[2].predicate (oldval, mode))
24780 oldval = force_reg (mode, oldval);
24782 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
24783 is_weak, mod_s, mod_f));
24784 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
24787 if (r_mode != mode)
24788 rval = gen_lowpart (mode, rval);
24789 emit_move_insn (operands[1], rval);
24791 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
24792 emit_insn (gen_rtx_SET (bval, x));
24795 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
24796 sequence implementing an atomic operation. */
24798 static void
24799 aarch64_emit_post_barrier (enum memmodel model)
24801 const enum memmodel base_model = memmodel_base (model);
24803 if (is_mm_sync (model)
24804 && (base_model == MEMMODEL_ACQUIRE
24805 || base_model == MEMMODEL_ACQ_REL
24806 || base_model == MEMMODEL_SEQ_CST))
24808 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
24812 /* Split a compare and swap pattern. */
24814 void
24815 aarch64_split_compare_and_swap (rtx operands[])
24817 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24818 gcc_assert (epilogue_completed);
24820 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
24821 machine_mode mode;
24822 bool is_weak;
24823 rtx_code_label *label1, *label2;
24824 enum memmodel model;
24826 rval = operands[0];
24827 mem = operands[1];
24828 oldval = operands[2];
24829 newval = operands[3];
24830 model_rtx = operands[5];
24831 scratch = operands[7];
24832 mode = GET_MODE (mem);
24833 model = memmodel_from_int (INTVAL (model_rtx));
24834 is_weak = operands[4] != const0_rtx && mode != TImode;
24836 /* When OLDVAL is zero and we want the strong version we can emit a tighter
24837 loop:
24838 .label1:
24839 LD[A]XR rval, [mem]
24840 CBNZ rval, .label2
24841 ST[L]XR scratch, newval, [mem]
24842 CBNZ scratch, .label1
24843 .label2:
24844 CMP rval, 0. */
24845 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
24846 oldval == const0_rtx && mode != TImode);
24848 label1 = NULL;
24849 if (!is_weak)
24851 label1 = gen_label_rtx ();
24852 emit_label (label1);
24854 label2 = gen_label_rtx ();
24856 /* The initial load can be relaxed for a __sync operation since a final
24857 barrier will be emitted to stop code hoisting. */
24858 if (is_mm_sync (model))
24859 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
24860 else
24861 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
24863 if (strong_zero_p)
24864 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
24865 else
24867 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24868 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
24870 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24871 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
24872 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24874 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
24876 if (!is_weak)
24878 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
24879 aarch64_emit_unlikely_jump (x);
24881 else
24882 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24884 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
24885 store the returned value and loop if the STLXP fails. */
24886 if (mode == TImode)
24888 rtx_code_label *label3 = gen_label_rtx ();
24889 emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
24890 emit_barrier ();
24892 emit_label (label2);
24893 aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
24895 x = aarch64_gen_compare_zero_and_branch (NE, scratch, label1);
24896 aarch64_emit_unlikely_jump (x);
24898 label2 = label3;
24901 emit_label (label2);
24903 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
24904 to set the condition flags. If this is not used, it will be removed by
24905 later passes. */
24906 if (strong_zero_p)
24907 aarch64_gen_compare_reg (NE, rval, const0_rtx);
24909 /* Emit any final barrier needed for a __sync operation. */
24910 if (is_mm_sync (model))
24911 aarch64_emit_post_barrier (model);
24914 /* Split an atomic operation. */
24916 void
24917 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
24918 rtx value, rtx model_rtx, rtx cond)
24920 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24921 gcc_assert (epilogue_completed);
24923 machine_mode mode = GET_MODE (mem);
24924 machine_mode wmode = (mode == DImode ? DImode : SImode);
24925 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
24926 const bool is_sync = is_mm_sync (model);
24927 rtx_code_label *label;
24928 rtx x;
24930 /* Split the atomic operation into a sequence. */
24931 label = gen_label_rtx ();
24932 emit_label (label);
24934 if (new_out)
24935 new_out = gen_lowpart (wmode, new_out);
24936 if (old_out)
24937 old_out = gen_lowpart (wmode, old_out);
24938 else
24939 old_out = new_out;
24940 value = simplify_gen_subreg (wmode, value, mode, 0);
24942 /* The initial load can be relaxed for a __sync operation since a final
24943 barrier will be emitted to stop code hoisting. */
24944 if (is_sync)
24945 aarch64_emit_load_exclusive (mode, old_out, mem,
24946 GEN_INT (MEMMODEL_RELAXED));
24947 else
24948 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
24950 switch (code)
24952 case SET:
24953 new_out = value;
24954 break;
24956 case NOT:
24957 x = gen_rtx_AND (wmode, old_out, value);
24958 emit_insn (gen_rtx_SET (new_out, x));
24959 x = gen_rtx_NOT (wmode, new_out);
24960 emit_insn (gen_rtx_SET (new_out, x));
24961 break;
24963 case MINUS:
24964 if (CONST_INT_P (value))
24966 value = GEN_INT (-UINTVAL (value));
24967 code = PLUS;
24969 /* Fall through. */
24971 default:
24972 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
24973 emit_insn (gen_rtx_SET (new_out, x));
24974 break;
24977 aarch64_emit_store_exclusive (mode, cond, mem,
24978 gen_lowpart (mode, new_out), model_rtx);
24980 x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
24981 aarch64_emit_unlikely_jump (x);
24983 /* Emit any final barrier needed for a __sync operation. */
24984 if (is_sync)
24985 aarch64_emit_post_barrier (model);
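/* Editorial sketch, not part of the port: the MINUS case above rewrites an
   atomic subtraction of a constant as an addition of its negation, so only
   the add form has to be handled by the exclusive-store loop.  The identity
   it relies on, shown on plain unsigned integers (wrap-around arithmetic):  */
static unsigned long
sub_as_add_sketch (unsigned long old_value, unsigned long imm)
{
  /* old_value - imm == old_value + (-imm) modulo 2^N, which is what
     GEN_INT (-UINTVAL (value)) followed by code = PLUS expresses.  */
  return old_value + (0ul - imm);
}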
24988 static void
24989 aarch64_init_libfuncs (void)
24991 /* Half-precision float operations. The compiler handles all operations
24992 with NULL libfuncs by converting to SFmode. */
24994 /* Conversions. */
24995 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
24996 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
24998 /* Arithmetic. */
24999 set_optab_libfunc (add_optab, HFmode, NULL);
25000 set_optab_libfunc (sdiv_optab, HFmode, NULL);
25001 set_optab_libfunc (smul_optab, HFmode, NULL);
25002 set_optab_libfunc (neg_optab, HFmode, NULL);
25003 set_optab_libfunc (sub_optab, HFmode, NULL);
25005 /* Comparisons. */
25006 set_optab_libfunc (eq_optab, HFmode, NULL);
25007 set_optab_libfunc (ne_optab, HFmode, NULL);
25008 set_optab_libfunc (lt_optab, HFmode, NULL);
25009 set_optab_libfunc (le_optab, HFmode, NULL);
25010 set_optab_libfunc (ge_optab, HFmode, NULL);
25011 set_optab_libfunc (gt_optab, HFmode, NULL);
25012 set_optab_libfunc (unord_optab, HFmode, NULL);
25015 /* Target hook for c_mode_for_suffix. */
25016 static machine_mode
25017 aarch64_c_mode_for_suffix (char suffix)
25019 if (suffix == 'q')
25020 return TFmode;
25022 return VOIDmode;
25025 /* We can only represent floating point constants which will fit in
25026 "quarter-precision" values. These values are characterised by
25027 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
25030 (-1)^s * (n/16) * 2^r
25032 Where:
25033 's' is the sign bit.
25034 'n' is an integer in the range 16 <= n <= 31.
25035 'r' is an integer in the range -3 <= r <= 4. */
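/* Editorial sketch, not part of the port: two worked instances of the
   encoding above are 3.0 = (+1) * (24/16) * 2^1 and -0.125 =
   (-1) * (16/16) * 2^-3.  A hypothetical decoder, assuming <cmath> is
   available:  */
static double
quarter_precision_value_sketch (bool s, unsigned int n, int r)
{
  /* Only meaningful for 16 <= n <= 31 and -3 <= r <= 4.  */
  return (s ? -1.0 : 1.0) * ((double) n / 16.0) * std::ldexp (1.0, r);
}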
25037 /* Return true iff X can be represented by a quarter-precision
25038 floating point immediate operand. Note, we cannot represent 0.0. */
25039 bool
25040 aarch64_float_const_representable_p (rtx x)
25042 /* This represents our current view of how many bits
25043 make up the mantissa. */
25044 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
25045 int exponent;
25046 unsigned HOST_WIDE_INT mantissa, mask;
25047 REAL_VALUE_TYPE r, m;
25048 bool fail;
25050 x = unwrap_const_vec_duplicate (x);
25051 if (!CONST_DOUBLE_P (x))
25052 return false;
25054 if (GET_MODE (x) == VOIDmode
25055 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
25056 return false;
25058 r = *CONST_DOUBLE_REAL_VALUE (x);
25060 /* We cannot represent infinities, NaNs or +/-zero. We won't
25061 know if we have +zero until we analyse the mantissa, but we
25062 can reject the other invalid values. */
25063 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
25064 || REAL_VALUE_MINUS_ZERO (r))
25065 return false;
25067 /* For BFmode, only handle 0.0. */
25068 if (GET_MODE (x) == BFmode)
25069 return real_iszero (&r, false);
25071 /* Extract exponent. */
25072 r = real_value_abs (&r);
25073 exponent = REAL_EXP (&r);
25075 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
25076 highest (sign) bit, with a fixed binary point at bit point_pos.
25077 The low element of W holds the low part of the mantissa, the high element the high part.
25078 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
25079 bits for the mantissa, this can fail (low bits will be lost). */
25080 real_ldexp (&m, &r, point_pos - exponent);
25081 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
25083 /* If the low part of the mantissa has bits set we cannot represent
25084 the value. */
25085 if (w.ulow () != 0)
25086 return false;
25087 /* We have rejected the lower HOST_WIDE_INT, so update our
25088 understanding of how many bits lie in the mantissa and
25089 look only at the high HOST_WIDE_INT. */
25090 mantissa = w.elt (1);
25091 point_pos -= HOST_BITS_PER_WIDE_INT;
25093 /* We can only represent values with a mantissa of the form 1.xxxx. */
25094 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
25095 if ((mantissa & mask) != 0)
25096 return false;
25098 /* Having filtered unrepresentable values, we may now remove all
25099 but the highest 5 bits. */
25100 mantissa >>= point_pos - 5;
25102 /* We cannot represent the value 0.0, so reject it. This is handled
25103 elsewhere. */
25104 if (mantissa == 0)
25105 return false;
25107 /* Then, as bit 4 is always set, we can mask it off, leaving
25108 the mantissa in the range [0, 15]. */
25109 mantissa &= ~(1 << 4);
25110 gcc_assert (mantissa <= 15);
25112 /* GCC internally does not use IEEE754-like encoding (where normalized
25113 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
25114 Our mantissa values are shifted 4 places to the left relative to
25115 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
25116 by 5 places to correct for GCC's representation. */
25117 exponent = 5 - exponent;
25119 return (exponent >= 0 && exponent <= 7);
25122 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
25123 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
25124 output MOVI/MVNI, ORR or BIC immediate. */
25125 char*
25126 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
25127 enum simd_immediate_check which)
25129 bool is_valid;
25130 static char templ[40];
25131 const char *mnemonic;
25132 const char *shift_op;
25133 unsigned int lane_count = 0;
25134 char element_char;
25136 struct simd_immediate_info info;
25138 /* This will return true to show const_vector is legal for use as either
25139 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
25140 It will also update INFO to show how the immediate should be generated.
25141 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
25142 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
25143 gcc_assert (is_valid);
25145 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25146 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
25148 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25150 gcc_assert (info.insn == simd_immediate_info::MOV
25151 && info.u.mov.shift == 0);
25152 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25153 move immediate path. */
25154 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25155 info.u.mov.value = GEN_INT (0);
25156 else
25158 const unsigned int buf_size = 20;
25159 char float_buf[buf_size] = {'\0'};
25160 real_to_decimal_for_mode (float_buf,
25161 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25162 buf_size, buf_size, 1, info.elt_mode);
25164 if (lane_count == 1)
25165 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
25166 else
25167 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
25168 lane_count, element_char, float_buf);
25169 return templ;
25173 gcc_assert (CONST_INT_P (info.u.mov.value));
25175 if (which == AARCH64_CHECK_MOV)
25177 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
25178 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
25179 ? "msl" : "lsl");
25180 if (lane_count == 1)
25181 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
25182 mnemonic, UINTVAL (info.u.mov.value));
25183 else if (info.u.mov.shift)
25184 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25185 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
25186 element_char, UINTVAL (info.u.mov.value), shift_op,
25187 info.u.mov.shift);
25188 else
25189 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25190 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
25191 element_char, UINTVAL (info.u.mov.value));
25193 else
25195 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25196 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
25197 if (info.u.mov.shift)
25198 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25199 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
25200 element_char, UINTVAL (info.u.mov.value), "lsl",
25201 info.u.mov.shift);
25202 else
25203 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25204 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
25205 element_char, UINTVAL (info.u.mov.value));
25207 return templ;
25210 char*
25211 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
25214 /* If a floating-point number was passed and we want to use it in an
25215 integer mode, do the conversion to integer. */
25216 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
25218 unsigned HOST_WIDE_INT ival;
25219 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
25220 gcc_unreachable ();
25221 immediate = gen_int_mode (ival, mode);
25224 machine_mode vmode;
25225 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
25226 use a 128-bit vector mode. */
25227 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
25229 vmode = aarch64_simd_container_mode (mode, width);
25230 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
25231 return aarch64_output_simd_mov_immediate (v_op, width);
25234 /* Return the output string to use for moving immediate CONST_VECTOR
25235 into an SVE register. */
25237 char *
25238 aarch64_output_sve_mov_immediate (rtx const_vector)
25240 static char templ[40];
25241 struct simd_immediate_info info;
25242 char element_char;
25244 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
25245 gcc_assert (is_valid);
25247 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25249 machine_mode vec_mode = GET_MODE (const_vector);
25250 if (aarch64_sve_pred_mode_p (vec_mode))
25252 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
25253 if (info.insn == simd_immediate_info::MOV)
25255 gcc_assert (info.u.mov.value == const0_rtx);
25256 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
25258 else
25260 gcc_assert (info.insn == simd_immediate_info::PTRUE);
25261 unsigned int total_bytes;
25262 if (info.u.pattern == AARCH64_SV_ALL
25263 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
25264 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
25265 total_bytes / GET_MODE_SIZE (info.elt_mode));
25266 else
25267 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
25268 svpattern_token (info.u.pattern));
25270 return buf;
25273 if (info.insn == simd_immediate_info::INDEX)
25275 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
25276 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25277 element_char, INTVAL (info.u.index.base),
25278 INTVAL (info.u.index.step));
25279 return templ;
25282 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25284 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25285 info.u.mov.value = GEN_INT (0);
25286 else
25288 const int buf_size = 20;
25289 char float_buf[buf_size] = {};
25290 real_to_decimal_for_mode (float_buf,
25291 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25292 buf_size, buf_size, 1, info.elt_mode);
25294 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
25295 element_char, float_buf);
25296 return templ;
25300 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25301 element_char, INTVAL (info.u.mov.value));
25302 return templ;
25305 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25306 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25307 pattern. */
25309 char *
25310 aarch64_output_sve_ptrues (rtx const_unspec)
25312 static char templ[40];
25314 struct simd_immediate_info info;
25315 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
25316 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
25318 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25319 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
25320 svpattern_token (info.u.pattern));
25321 return templ;
25324 /* Split operands into moves from op[1] + op[2] into op[0]. */
25326 void
25327 aarch64_split_combinev16qi (rtx operands[3])
25329 machine_mode halfmode = GET_MODE (operands[1]);
25331 gcc_assert (halfmode == V16QImode);
25333 rtx destlo = simplify_gen_subreg (halfmode, operands[0],
25334 GET_MODE (operands[0]), 0);
25335 rtx desthi = simplify_gen_subreg (halfmode, operands[0],
25336 GET_MODE (operands[0]),
25337 GET_MODE_SIZE (halfmode));
25339 bool skiplo = rtx_equal_p (destlo, operands[1]);
25340 bool skiphi = rtx_equal_p (desthi, operands[2]);
25342 if (skiplo && skiphi)
25344 /* No-op move. Can't split to nothing; emit something. */
25345 emit_note (NOTE_INSN_DELETED);
25346 return;
25349 /* Special case of reversed high/low parts. */
25350 if (reg_overlap_mentioned_p (operands[2], destlo)
25351 && reg_overlap_mentioned_p (operands[1], desthi))
25353 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25354 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
25355 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25357 else if (!reg_overlap_mentioned_p (operands[2], destlo))
25359 /* Try to avoid unnecessary moves if part of the result
25360 is in the right place already. */
25361 if (!skiplo)
25362 emit_move_insn (destlo, operands[1]);
25363 if (!skiphi)
25364 emit_move_insn (desthi, operands[2]);
25366 else
25368 if (!skiphi)
25369 emit_move_insn (desthi, operands[2]);
25370 if (!skiplo)
25371 emit_move_insn (destlo, operands[1]);
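/* Editorial sketch, not part of the port: the reversed high/low case above
   swaps the two source registers without a scratch by using the classic
   three-XOR exchange, which is what the three gen_xorv16qi3 calls emit at
   the vector level.  On plain integers:  */
static void
xor_swap_sketch (unsigned int &a, unsigned int &b)
{
  a ^= b;	/* a = a0 ^ b0 */
  b ^= a;	/* b = a0 */
  a ^= b;	/* a = b0 */
}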
25375 /* vec_perm support. */
25377 struct expand_vec_perm_d
25379 rtx target, op0, op1;
25380 vec_perm_indices perm;
25381 machine_mode vmode;
25382 machine_mode op_mode;
25383 unsigned int vec_flags;
25384 unsigned int op_vec_flags;
25385 bool one_vector_p;
25386 bool testing_p;
25389 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25391 /* Generate a variable permutation. */
25393 static void
25394 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
25396 machine_mode vmode = GET_MODE (target);
25397 bool one_vector_p = rtx_equal_p (op0, op1);
25399 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
25400 gcc_checking_assert (GET_MODE (op0) == vmode);
25401 gcc_checking_assert (GET_MODE (op1) == vmode);
25402 gcc_checking_assert (GET_MODE (sel) == vmode);
25403 gcc_checking_assert (TARGET_SIMD);
25405 if (one_vector_p)
25407 if (vmode == V8QImode)
25409 /* Expand the argument to a V16QI mode by duplicating it. */
25410 rtx pair = gen_reg_rtx (V16QImode);
25411 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
25412 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25414 else
25416 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
25419 else
25421 rtx pair;
25423 if (vmode == V8QImode)
25425 pair = gen_reg_rtx (V16QImode);
25426 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
25427 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25429 else
25431 pair = gen_reg_rtx (V2x16QImode);
25432 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
25433 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
25438 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25439 NELT is the number of elements in the vector. */
25441 void
25442 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
25443 unsigned int nelt)
25445 machine_mode vmode = GET_MODE (target);
25446 bool one_vector_p = rtx_equal_p (op0, op1);
25447 rtx mask;
25449 /* The TBL instruction does not use a modulo index, so we must take care
25450 of that ourselves. */
25451 mask = aarch64_simd_gen_const_vector_dup (vmode,
25452 one_vector_p ? nelt - 1 : 2 * nelt - 1);
25453 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
25455 /* For big-endian, we also need to reverse the index within the vector
25456 (but not which vector). */
25457 if (BYTES_BIG_ENDIAN)
25459 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25460 if (!one_vector_p)
25461 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
25462 sel = expand_simple_binop (vmode, XOR, sel, mask,
25463 NULL, 0, OPTAB_LIB_WIDEN);
25465 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
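/* Editorial sketch, not part of the port: a scalar model of the index
   massaging above.  vec_perm indices are defined modulo the number of input
   lanes, whereas TBL returns zero for out-of-range indices, so the selector
   is masked first; on big-endian the lane order within each register is
   reversed by XORing with NELT - 1, without disturbing the bit that selects
   which register.  */
static unsigned int
tbl_index_sketch (unsigned int sel, unsigned int nelt, bool one_vector_p,
		  bool big_endian_p)
{
  sel &= one_vector_p ? nelt - 1 : 2 * nelt - 1;
  if (big_endian_p)
    sel ^= nelt - 1;
  return sel;
}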
25468 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
25470 static void
25471 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
25473 emit_insn (gen_rtx_SET (target,
25474 gen_rtx_UNSPEC (GET_MODE (target),
25475 gen_rtvec (2, op0, op1), code)));
25478 /* Expand an SVE vec_perm with the given operands. */
25480 void
25481 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
25483 machine_mode data_mode = GET_MODE (target);
25484 machine_mode sel_mode = GET_MODE (sel);
25485 /* Enforced by the pattern condition. */
25486 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
25488 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25489 size of the two value vectors, i.e. the upper bits of the indices
25490 are effectively ignored. SVE TBL instead produces 0 for any
25491 out-of-range indices, so we need to modulo all the vec_perm indices
25492 to ensure they are all in range. */
25493 rtx sel_reg = force_reg (sel_mode, sel);
25495 /* Check if the sel only references the first values vector. */
25496 if (CONST_VECTOR_P (sel)
25497 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
25499 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
25500 return;
25503 /* Check if the two values vectors are the same. */
25504 if (rtx_equal_p (op0, op1))
25506 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
25507 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25508 NULL, 0, OPTAB_DIRECT);
25509 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
25510 return;
25513 /* Run TBL on each value vector and combine the results. */
25515 rtx res0 = gen_reg_rtx (data_mode);
25516 rtx res1 = gen_reg_rtx (data_mode);
25517 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
25518 if (!CONST_VECTOR_P (sel)
25519 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
25521 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
25522 2 * nunits - 1);
25523 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25524 NULL, 0, OPTAB_DIRECT);
25526 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
25527 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
25528 NULL, 0, OPTAB_DIRECT);
25529 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
25530 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
25531 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
25532 else
25533 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
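/* Editorial sketch, not part of the port: a scalar model of the two-vector
   case above.  SVE TBL yields zero for out-of-range indices, so selecting
   from two vectors can be done as TBL (op0, sel) ORred with
   TBL (op1, sel - nunits): exactly one of the two lookups is in range for
   any masked selector value.  */
static unsigned int
sve_two_vector_tbl_sketch (const unsigned int *op0, const unsigned int *op1,
			   unsigned int nunits, unsigned int sel)
{
  sel &= 2 * nunits - 1;
  unsigned int from_op0 = sel < nunits ? op0[sel] : 0;
  unsigned int shifted = sel - nunits;	/* Wraps for sel < nunits.  */
  unsigned int from_op1 = shifted < nunits ? op1[shifted] : 0;
  return from_op0 | from_op1;
}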
25536 /* Recognize patterns suitable for the TRN instructions. */
25537 static bool
25538 aarch64_evpc_trn (struct expand_vec_perm_d *d)
25540 HOST_WIDE_INT odd;
25541 poly_uint64 nelt = d->perm.length ();
25542 rtx out, in0, in1;
25543 machine_mode vmode = d->vmode;
25545 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25546 return false;
25548 /* Note that these are little-endian tests.
25549 We correct for big-endian later. */
25550 if (!d->perm[0].is_constant (&odd)
25551 || (odd != 0 && odd != 1)
25552 || !d->perm.series_p (0, 2, odd, 2)
25553 || !d->perm.series_p (1, 2, nelt + odd, 2))
25554 return false;
25556 /* Success! */
25557 if (d->testing_p)
25558 return true;
25560 in0 = d->op0;
25561 in1 = d->op1;
25562 /* We don't need a big-endian lane correction for SVE; see the comment
25563 at the head of aarch64-sve.md for details. */
25564 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25566 std::swap (in0, in1);
25567 odd = !odd;
25569 out = d->target;
25571 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25572 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
25573 return true;
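/* Editorial sketch, not part of the port: the index patterns matched above,
   written out for two V4SI inputs a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}.
   TRN1 interleaves the even lanes, TRN2 the odd lanes.  */
static void
trn_perm_sketch (unsigned int nelt, bool odd, unsigned int perm[])
{
  /* nelt == 4, odd == false gives { 0, 4, 2, 6 } -> {a0,b0,a2,b2} (TRN1);
     nelt == 4, odd == true  gives { 1, 5, 3, 7 } -> {a1,b1,a3,b3} (TRN2).  */
  for (unsigned int i = 0; i < nelt; i += 2)
    {
      perm[i] = i + odd;
      perm[i + 1] = nelt + i + odd;
    }
}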
25576 /* Try to re-encode the PERM constant so it combines odd and even elements.
25577 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25578 We retry with this new constant with the full suite of patterns. */
25579 static bool
25580 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
25582 expand_vec_perm_d newd;
25584 if (d->vec_flags != VEC_ADVSIMD)
25585 return false;
25587 /* Get the new mode. Always twice the size of the inner
25588 and half the elements. */
25589 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
25590 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
25591 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
25592 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
25594 if (new_mode == word_mode)
25595 return false;
25597 vec_perm_indices newpermindices;
25599 if (!newpermindices.new_shrunk_vector (d->perm, 2))
25600 return false;
25602 newd.vmode = new_mode;
25603 newd.vec_flags = VEC_ADVSIMD;
25604 newd.op_mode = newd.vmode;
25605 newd.op_vec_flags = newd.vec_flags;
25606 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
25607 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
25608 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
25609 newd.testing_p = d->testing_p;
25610 newd.one_vector_p = d->one_vector_p;
25612 newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
25613 newpermindices.nelts_per_input ());
25614 return aarch64_expand_vec_perm_const_1 (&newd);
25617 /* Recognize patterns suitable for the UZP instructions. */
25618 static bool
25619 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
25621 HOST_WIDE_INT odd;
25622 rtx out, in0, in1;
25623 machine_mode vmode = d->vmode;
25625 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25626 return false;
25628 /* Note that these are little-endian tests.
25629 We correct for big-endian later. */
25630 if (!d->perm[0].is_constant (&odd)
25631 || (odd != 0 && odd != 1)
25632 || !d->perm.series_p (0, 1, odd, 2))
25633 return false;
25635 /* Success! */
25636 if (d->testing_p)
25637 return true;
25639 in0 = d->op0;
25640 in1 = d->op1;
25641 /* We don't need a big-endian lane correction for SVE; see the comment
25642 at the head of aarch64-sve.md for details. */
25643 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25645 std::swap (in0, in1);
25646 odd = !odd;
25648 out = d->target;
25650 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25651 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
25652 return true;
25655 /* Recognize patterns suitable for the ZIP instructions. */
25656 static bool
25657 aarch64_evpc_zip (struct expand_vec_perm_d *d)
25659 unsigned int high;
25660 poly_uint64 nelt = d->perm.length ();
25661 rtx out, in0, in1;
25662 machine_mode vmode = d->vmode;
25664 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25665 return false;
25667 /* Note that these are little-endian tests.
25668 We correct for big-endian later. */
25669 poly_uint64 first = d->perm[0];
25670 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
25671 || !d->perm.series_p (0, 2, first, 1)
25672 || !d->perm.series_p (1, 2, first + nelt, 1))
25673 return false;
25674 high = maybe_ne (first, 0U);
25676 /* Success! */
25677 if (d->testing_p)
25678 return true;
25680 in0 = d->op0;
25681 in1 = d->op1;
25682 /* We don't need a big-endian lane correction for SVE; see the comment
25683 at the head of aarch64-sve.md for details. */
25684 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25686 std::swap (in0, in1);
25687 high = !high;
25689 out = d->target;
25691 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25692 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
25693 return true;
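/* Editorial sketch, not part of the port: the index patterns matched above,
   for two V4SI inputs a and b.  ZIP1 interleaves the low halves of the two
   vectors, ZIP2 the high halves.  */
static void
zip_perm_sketch (unsigned int nelt, bool high, unsigned int perm[])
{
  /* nelt == 4, high == false gives { 0, 4, 1, 5 } -> {a0,b0,a1,b1} (ZIP1);
     nelt == 4, high == true  gives { 2, 6, 3, 7 } -> {a2,b2,a3,b3} (ZIP2).  */
  unsigned int first = high ? nelt / 2 : 0;
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      perm[2 * i] = first + i;
      perm[2 * i + 1] = nelt + first + i;
    }
}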
25696 /* Recognize patterns for the EXT insn. */
25698 static bool
25699 aarch64_evpc_ext (struct expand_vec_perm_d *d)
25701 HOST_WIDE_INT location;
25702 rtx offset;
25704 /* The first element always refers to the first vector.
25705 Check if the extracted indices are increasing by one. */
25706 if ((d->vec_flags & VEC_SVE_PRED)
25707 || !d->perm[0].is_constant (&location)
25708 || !d->perm.series_p (0, 1, location, 1))
25709 return false;
25711 /* Success! */
25712 if (d->testing_p)
25713 return true;
25715 /* The case where (location == 0) is a no-op for both big- and little-endian,
25716 and is removed by the mid-end at optimization levels -O1 and higher.
25718 We don't need a big-endian lane correction for SVE; see the comment
25719 at the head of aarch64-sve.md for details. */
25720 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
25722 /* After setup, we want the high elements of the first vector (stored
25723 at the LSB end of the register), and the low elements of the second
25724 vector (stored at the MSB end of the register). So swap. */
25725 std::swap (d->op0, d->op1);
25726 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25727 to_constant () is safe since this is restricted to Advanced SIMD
25728 vectors. */
25729 location = d->perm.length ().to_constant () - location;
25732 offset = GEN_INT (location);
25733 emit_set_insn (d->target,
25734 gen_rtx_UNSPEC (d->vmode,
25735 gen_rtvec (3, d->op0, d->op1, offset),
25736 UNSPEC_EXT));
25737 return true;
25740 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25741 within each 64-bit, 32-bit or 16-bit granule. */
25743 static bool
25744 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
25746 HOST_WIDE_INT diff;
25747 unsigned int i, size, unspec;
25748 machine_mode pred_mode;
25750 if ((d->vec_flags & VEC_SVE_PRED)
25751 || !d->one_vector_p
25752 || !d->perm[0].is_constant (&diff)
25753 || !diff)
25754 return false;
25756 if (d->vec_flags & VEC_SVE_DATA)
25757 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
25758 else
25759 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
25760 if (size == 64)
25762 unspec = UNSPEC_REV64;
25763 pred_mode = VNx2BImode;
25765 else if (size == 32)
25767 unspec = UNSPEC_REV32;
25768 pred_mode = VNx4BImode;
25770 else if (size == 16)
25772 unspec = UNSPEC_REV16;
25773 pred_mode = VNx8BImode;
25775 else
25776 return false;
25778 unsigned int step = diff + 1;
25779 for (i = 0; i < step; ++i)
25780 if (!d->perm.series_p (i, step, diff - i, step))
25781 return false;
25783 /* Success! */
25784 if (d->testing_p)
25785 return true;
25787 if (d->vec_flags & VEC_SVE_DATA)
25789 rtx pred = aarch64_ptrue_reg (pred_mode);
25790 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
25791 d->target, pred, d->op0));
25792 return true;
25794 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
25795 emit_set_insn (d->target, src);
25796 return true;
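/* Editorial sketch, not part of the port: the permutations matched above
   mirror each element within its own granule of STEP = diff + 1 elements.
   For example, STEP == 2 on an 8-element vector of 16-bit lanes gives
   { 1, 0, 3, 2, 5, 4, 7, 6 }, i.e. REV32 on V8HI.  */
static void
rev_local_perm_sketch (unsigned int nelt, unsigned int step,
		       unsigned int perm[])
{
  for (unsigned int i = 0; i < nelt; i++)
    perm[i] = (i / step) * step + (step - 1 - i % step);
}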
25799 /* Recognize patterns for the REV insn, which reverses elements within
25800 a full vector. */
25802 static bool
25803 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
25805 poly_uint64 nelt = d->perm.length ();
25807 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
25808 return false;
25810 if (!d->perm.series_p (0, 1, nelt - 1, -1))
25811 return false;
25813 /* Success! */
25814 if (d->testing_p)
25815 return true;
25817 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
25818 emit_set_insn (d->target, src);
25819 return true;
25822 static bool
25823 aarch64_evpc_dup (struct expand_vec_perm_d *d)
25825 rtx out = d->target;
25826 rtx in0;
25827 HOST_WIDE_INT elt;
25828 machine_mode vmode = d->vmode;
25829 rtx lane;
25831 if ((d->vec_flags & VEC_SVE_PRED)
25832 || d->perm.encoding ().encoded_nelts () != 1
25833 || !d->perm[0].is_constant (&elt))
25834 return false;
25836 if ((d->vec_flags & VEC_SVE_DATA)
25837 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
25838 return false;
25840 /* Success! */
25841 if (d->testing_p)
25842 return true;
25844 /* The generic preparation in aarch64_expand_vec_perm_const_1
25845 swaps the operand order and the permute indices if it finds
25846 d->perm[0] to be in the second operand. Thus, we can always
25847 use d->op0 and need not do any extra arithmetic to get the
25848 correct lane number. */
25849 in0 = d->op0;
25850 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
25852 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
25853 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
25854 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
25855 return true;
25858 static bool
25859 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
25861 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
25862 machine_mode vmode = d->vmode;
25864 /* Make sure that the indices are constant. */
25865 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
25866 for (unsigned int i = 0; i < encoded_nelts; ++i)
25867 if (!d->perm[i].is_constant ())
25868 return false;
25870 if (d->testing_p)
25871 return true;
25873 /* Generic code will try constant permutation twice. Once with the
25874 original mode and again with the elements lowered to QImode.
25875 So wait and don't do the selector expansion ourselves. */
25876 if (vmode != V8QImode && vmode != V16QImode)
25877 return false;
25879 /* to_constant is safe since this routine is specific to Advanced SIMD
25880 vectors. */
25881 unsigned int nelt = d->perm.length ().to_constant ();
25882 for (unsigned int i = 0; i < nelt; ++i)
25883 /* If big-endian and two vectors we end up with a weird mixed-endian
25884 mode on NEON. Reverse the index within each word but not the word
25885 itself. to_constant is safe because we checked is_constant above. */
25886 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
25887 ? d->perm[i].to_constant () ^ (nelt - 1)
25888 : d->perm[i].to_constant ());
25890 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
25891 sel = force_reg (vmode, sel);
25893 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
25894 return true;
25897 /* Try to implement D using an SVE TBL instruction. */
25899 static bool
25900 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
25902 unsigned HOST_WIDE_INT nelt;
25904 /* Permuting two variable-length vectors could overflow the
25905 index range. */
25906 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
25907 return false;
25909 if (d->testing_p)
25910 return true;
25912 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
25913 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
25914 if (d->one_vector_p)
25915 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
25916 else
25917 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
25918 return true;
25921 /* Try to implement D using SVE dup instruction. */
25923 static bool
25924 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
25926 if (BYTES_BIG_ENDIAN
25927 || !d->one_vector_p
25928 || d->vec_flags != VEC_SVE_DATA
25929 || d->op_vec_flags != VEC_ADVSIMD
25930 || d->perm.encoding ().nelts_per_pattern () != 1
25931 || !known_eq (d->perm.encoding ().npatterns (),
25932 GET_MODE_NUNITS (d->op_mode))
25933 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
25934 return false;
25936 int npatterns = d->perm.encoding ().npatterns ();
25937 for (int i = 0; i < npatterns; i++)
25938 if (!known_eq (d->perm[i], i))
25939 return false;
25941 if (d->testing_p)
25942 return true;
25944 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
25945 return true;
25948 /* Try to implement D using SVE SEL instruction. */
25950 static bool
25951 aarch64_evpc_sel (struct expand_vec_perm_d *d)
25953 machine_mode vmode = d->vmode;
25954 int unit_size = GET_MODE_UNIT_SIZE (vmode);
25956 if (d->vec_flags != VEC_SVE_DATA
25957 || unit_size > 8)
25958 return false;
25960 int n_patterns = d->perm.encoding ().npatterns ();
25961 poly_int64 vec_len = d->perm.length ();
25963 for (int i = 0; i < n_patterns; ++i)
25964 if (!known_eq (d->perm[i], i)
25965 && !known_eq (d->perm[i], vec_len + i))
25966 return false;
25968 for (int i = n_patterns; i < n_patterns * 2; i++)
25969 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
25970 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
25971 return false;
25973 if (d->testing_p)
25974 return true;
25976 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
25978 /* Build a predicate that is true when op0 elements should be used. */
25979 rtx_vector_builder builder (pred_mode, n_patterns, 2);
25980 for (int i = 0; i < n_patterns * 2; i++)
25982 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
25983 : CONST0_RTX (BImode);
25984 builder.quick_push (elem);
25987 rtx const_vec = builder.build ();
25988 rtx pred = force_reg (pred_mode, const_vec);
25989 /* TARGET = PRED ? OP0 : OP1. */
25990 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
25991 return true;
25994 /* Recognize patterns suitable for the INS instructions. */
25995 static bool
25996 aarch64_evpc_ins (struct expand_vec_perm_d *d)
25998 machine_mode mode = d->vmode;
25999 unsigned HOST_WIDE_INT nelt;
26001 if (d->vec_flags != VEC_ADVSIMD)
26002 return false;
26004 /* to_constant is safe since this routine is specific to Advanced SIMD
26005 vectors. */
26006 nelt = d->perm.length ().to_constant ();
26007 rtx insv = d->op0;
26009 HOST_WIDE_INT idx = -1;
26011 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26013 HOST_WIDE_INT elt;
26014 if (!d->perm[i].is_constant (&elt))
26015 return false;
26016 if (elt == (HOST_WIDE_INT) i)
26017 continue;
26018 if (idx != -1)
26020 idx = -1;
26021 break;
26023 idx = i;
26026 if (idx == -1)
26028 insv = d->op1;
26029 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
26031 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
26032 continue;
26033 if (idx != -1)
26034 return false;
26035 idx = i;
26038 if (idx == -1)
26039 return false;
26042 if (d->testing_p)
26043 return true;
26045 gcc_assert (idx != -1);
26047 unsigned extractindex = d->perm[idx].to_constant ();
26048 rtx extractv = d->op0;
26049 if (extractindex >= nelt)
26051 extractv = d->op1;
26052 extractindex -= nelt;
26054 gcc_assert (extractindex < nelt);
26056 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
26057 expand_operand ops[5];
26058 create_output_operand (&ops[0], d->target, mode);
26059 create_input_operand (&ops[1], insv, mode);
26060 create_integer_operand (&ops[2], 1 << idx);
26061 create_input_operand (&ops[3], extractv, mode);
26062 create_integer_operand (&ops[4], extractindex);
26063 expand_insn (icode, 5, ops);
26065 return true;
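/* Editorial sketch, not part of the port: a scalar model of the single-lane
   insertion matched above.  For example, on two V4SI inputs the selector
   { 0, 5, 2, 3 } copies op0 and replaces lane 1 with lane 1 of op1, which
   amounts to a single INS of that lane.  */
static void
ins_perm_sketch (const unsigned int *insv, const unsigned int *extractv,
		 unsigned int nelt, unsigned int idx,
		 unsigned int extractindex, unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = i == idx ? extractv[extractindex] : insv[i];
}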
26068 static bool
26069 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
26071 gcc_assert (d->op_mode != E_VOIDmode);
26073 /* The pattern matching functions above are written to look for a small
26074 number to begin the sequence (0, 1, N/2). If we begin with an index
26075 from the second operand, we can swap the operands. */
26076 poly_int64 nelt = d->perm.length ();
26077 if (known_ge (d->perm[0], nelt))
26079 d->perm.rotate_inputs (1);
26080 std::swap (d->op0, d->op1);
26083 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
26084 || d->vec_flags == VEC_SVE_DATA
26085 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
26086 || d->vec_flags == VEC_SVE_PRED)
26087 && known_gt (nelt, 1))
26089 if (d->vmode == d->op_mode)
26091 if (aarch64_evpc_rev_local (d))
26092 return true;
26093 else if (aarch64_evpc_rev_global (d))
26094 return true;
26095 else if (aarch64_evpc_ext (d))
26096 return true;
26097 else if (aarch64_evpc_dup (d))
26098 return true;
26099 else if (aarch64_evpc_zip (d))
26100 return true;
26101 else if (aarch64_evpc_uzp (d))
26102 return true;
26103 else if (aarch64_evpc_trn (d))
26104 return true;
26105 else if (aarch64_evpc_sel (d))
26106 return true;
26107 else if (aarch64_evpc_ins (d))
26108 return true;
26109 else if (aarch64_evpc_reencode (d))
26110 return true;
26112 if (d->vec_flags == VEC_SVE_DATA)
26113 return aarch64_evpc_sve_tbl (d);
26114 else if (d->vec_flags == VEC_ADVSIMD)
26115 return aarch64_evpc_tbl (d);
26117 else
26119 if (aarch64_evpc_sve_dup (d))
26120 return true;
26123 return false;
26126 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26128 static bool
26129 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
26130 rtx target, rtx op0, rtx op1,
26131 const vec_perm_indices &sel)
26133 struct expand_vec_perm_d d;
26135 /* Check whether the mask can be applied to a single vector. */
26136 if (sel.ninputs () == 1
26137 || (op0 && rtx_equal_p (op0, op1)))
26138 d.one_vector_p = true;
26139 else if (sel.all_from_input_p (0))
26141 d.one_vector_p = true;
26142 op1 = op0;
26144 else if (sel.all_from_input_p (1))
26146 d.one_vector_p = true;
26147 op0 = op1;
26149 else
26150 d.one_vector_p = false;
26152 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
26153 sel.nelts_per_input ());
26154 d.vmode = vmode;
26155 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
26156 d.op_mode = op_mode;
26157 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
26158 d.target = target;
26159 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
26160 if (op0 == op1)
26161 d.op1 = d.op0;
26162 else
26163 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
26164 d.testing_p = !target;
26166 if (!d.testing_p)
26167 return aarch64_expand_vec_perm_const_1 (&d);
26169 rtx_insn *last = get_last_insn ();
26170 bool ret = aarch64_expand_vec_perm_const_1 (&d);
26171 gcc_assert (last == get_last_insn ());
26173 return ret;
26175 /* Generate a byte permute mask for a register of mode MODE,
26176 which has NUNITS units. */
26178 rtx
26179 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
26181 /* We have to reverse each vector because we don't have
26182 a permuted load that can reverse-load according to ABI rules. */
26183 rtx mask;
26184 rtvec v = rtvec_alloc (16);
26185 unsigned int i, j;
26186 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
26188 gcc_assert (BYTES_BIG_ENDIAN);
26189 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
26191 for (i = 0; i < nunits; i++)
26192 for (j = 0; j < usize; j++)
26193 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
26194 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
26195 return force_reg (V16QImode, mask);
26198 /* Expand an SVE integer comparison using the SVE equivalent of:
26200 (set TARGET (CODE OP0 OP1)). */
26202 void
26203 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
26205 machine_mode pred_mode = GET_MODE (target);
26206 machine_mode data_mode = GET_MODE (op0);
26207 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
26208 op0, op1);
26209 if (!rtx_equal_p (target, res))
26210 emit_move_insn (target, res);
26213 /* Return the UNSPEC_COND_* code for comparison CODE. */
26215 static unsigned int
26216 aarch64_unspec_cond_code (rtx_code code)
26218 switch (code)
26220 case NE:
26221 return UNSPEC_COND_FCMNE;
26222 case EQ:
26223 return UNSPEC_COND_FCMEQ;
26224 case LT:
26225 return UNSPEC_COND_FCMLT;
26226 case GT:
26227 return UNSPEC_COND_FCMGT;
26228 case LE:
26229 return UNSPEC_COND_FCMLE;
26230 case GE:
26231 return UNSPEC_COND_FCMGE;
26232 case UNORDERED:
26233 return UNSPEC_COND_FCMUO;
26234 default:
26235 gcc_unreachable ();
26239 /* Emit:
26241 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26243 where <X> is the operation associated with comparison CODE.
26244 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26246 static void
26247 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
26248 bool known_ptrue_p, rtx op0, rtx op1)
26250 rtx flag = gen_int_mode (known_ptrue_p, SImode);
26251 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
26252 gen_rtvec (4, pred, flag, op0, op1),
26253 aarch64_unspec_cond_code (code));
26254 emit_set_insn (target, unspec);
26257 /* Emit the SVE equivalent of:
26259 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26260 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26261 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26263 where <Xi> is the operation associated with comparison CODEi.
26264 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26266 static void
26267 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
26268 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
26270 machine_mode pred_mode = GET_MODE (pred);
26271 rtx tmp1 = gen_reg_rtx (pred_mode);
26272 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
26273 rtx tmp2 = gen_reg_rtx (pred_mode);
26274 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
26275 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
26278 /* Emit the SVE equivalent of:
26280 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26281 (set TARGET (not TMP))
26283 where <X> is the operation associated with comparison CODE.
26284 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26286 static void
26287 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
26288 bool known_ptrue_p, rtx op0, rtx op1)
26290 machine_mode pred_mode = GET_MODE (pred);
26291 rtx tmp = gen_reg_rtx (pred_mode);
26292 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
26293 aarch64_emit_unop (target, one_cmpl_optab, tmp);
26296 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26298 (set TARGET (CODE OP0 OP1))
26300 If CAN_INVERT_P is true, the caller can also handle inverted results;
26301 return true if the result is in fact inverted. */
26303 bool
26304 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
26305 rtx op0, rtx op1, bool can_invert_p)
26307 machine_mode pred_mode = GET_MODE (target);
26308 machine_mode data_mode = GET_MODE (op0);
26310 rtx ptrue = aarch64_ptrue_reg (pred_mode);
26311 switch (code)
26313 case UNORDERED:
26314 /* UNORDERED has no immediate form. */
26315 op1 = force_reg (data_mode, op1);
26316 /* fall through */
26317 case LT:
26318 case LE:
26319 case GT:
26320 case GE:
26321 case EQ:
26322 case NE:
26324 /* There is native support for the comparison. */
26325 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26326 return false;
26329 case LTGT:
26330 /* This is a trapping operation (LT or GT). */
26331 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
26332 return false;
26334 case UNEQ:
26335 if (!flag_trapping_math)
26337 /* This would trap for signaling NaNs. */
26338 op1 = force_reg (data_mode, op1);
26339 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
26340 ptrue, true, op0, op1);
26341 return false;
26343 /* fall through */
26344 case UNLT:
26345 case UNLE:
26346 case UNGT:
26347 case UNGE:
26348 if (flag_trapping_math)
26350 /* Work out which elements are ordered. */
26351 rtx ordered = gen_reg_rtx (pred_mode);
26352 op1 = force_reg (data_mode, op1);
26353 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
26354 ptrue, true, op0, op1);
26356 /* Test the opposite condition for the ordered elements,
26357 then invert the result. */
26358 if (code == UNEQ)
26359 code = NE;
26360 else
26361 code = reverse_condition_maybe_unordered (code);
26362 if (can_invert_p)
26364 aarch64_emit_sve_fp_cond (target, code,
26365 ordered, false, op0, op1);
26366 return true;
26368 aarch64_emit_sve_invert_fp_cond (target, code,
26369 ordered, false, op0, op1);
26370 return false;
26372 break;
26374 case ORDERED:
26375 /* ORDERED has no immediate form. */
26376 op1 = force_reg (data_mode, op1);
26377 break;
26379 default:
26380 gcc_unreachable ();
26383 /* There is native support for the inverse comparison. */
26384 code = reverse_condition_maybe_unordered (code);
26385 if (can_invert_p)
26387 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26388 return true;
26390 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
26391 return false;
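/* Editorial sketch, not part of the port: a scalar model of the
   flag_trapping_math path above for UNGE.  Only the lanes known to be
   ordered are compared with the reversed condition (LT), and the result is
   then inverted, giving "unordered or GE" without evaluating a comparison
   that could trap on a NaN lane.  */
static bool
unge_lane_sketch (double x, double y)
{
  bool ordered = !(x != x || y != y);		/* inverted UNORDERED */
  bool lt_if_ordered = ordered && x < y;	/* FCMLT under that predicate */
  return !lt_if_ordered;			/* invert -> UNGE */
}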
26394 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26395 of the data being selected and CMP_MODE is the mode of the values being
26396 compared. */
26398 void
26399 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
26400 rtx *ops)
26402 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
26403 rtx pred = gen_reg_rtx (pred_mode);
26404 if (FLOAT_MODE_P (cmp_mode))
26406 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
26407 ops[4], ops[5], true))
26408 std::swap (ops[1], ops[2]);
26410 else
26411 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
26413 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
26414 ops[1] = force_reg (data_mode, ops[1]);
26415 /* The "false" value can only be zero if the "true" value is a constant. */
26416 if (register_operand (ops[1], data_mode)
26417 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
26418 ops[2] = force_reg (data_mode, ops[2]);
26420 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
26421 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
26424 /* Return true if:
26426 (a) MODE1 and MODE2 use the same layout for bytes that are common
26427 to both modes;
26429 (b) subregs involving the two modes behave as the target-independent
26430 subreg rules require; and
26432 (c) there is at least one register that can hold both modes.
26434 Return false otherwise. */
26436 static bool
26437 aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
26439 unsigned int flags1 = aarch64_classify_vector_mode (mode1);
26440 unsigned int flags2 = aarch64_classify_vector_mode (mode2);
26442 bool sve1_p = (flags1 & VEC_ANY_SVE);
26443 bool sve2_p = (flags2 & VEC_ANY_SVE);
26445 bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
26446 bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
26448 bool pred1_p = (flags1 & VEC_SVE_PRED);
26449 bool pred2_p = (flags2 & VEC_SVE_PRED);
26451 bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
26452 | VEC_PARTIAL));
26453 bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
26454 | VEC_PARTIAL));
26456 /* Don't allow changes between predicate modes and other modes.
26457 Only predicate registers can hold predicate modes and only
26458 non-predicate registers can hold non-predicate modes, so any
26459 attempt to mix them would require a round trip through memory. */
26460 if (pred1_p != pred2_p)
26461 return false;
26463 /* The contents of partial SVE modes are distributed evenly across
26464 the register, whereas GCC expects them to be clustered together.
26465 We therefore need to be careful about mode changes involving them. */
26466 if (partial_sve1_p && partial_sve2_p)
26468 /* Reject changes between partial SVE modes that have different
26469 patterns of significant and insignificant bits. */
26470 if ((aarch64_sve_container_bits (mode1)
26471 != aarch64_sve_container_bits (mode2))
26472 || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26473 return false;
26475 else if (partial_sve1_p)
26477 /* The first lane of MODE1 is where GCC expects it, but anything
26478 bigger than that is not. */
26479 if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
26480 return false;
26482 else if (partial_sve2_p)
26484 /* Similarly in reverse. */
26485 if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
26486 return false;
26489 /* Don't allow changes between partial Advanced SIMD structure modes
26490 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26491 are the same size, but the former occupies one Q register while the
26492 latter occupies two D registers. */
26493 if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
26494 && maybe_gt (GET_MODE_SIZE (mode1), 8)
26495 && maybe_gt (GET_MODE_SIZE (mode2), 8))
26496 return false;
26498 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26500 /* Don't allow changes between SVE modes and other modes that might
26501 be bigger than 128 bits. In particular, OImode, CImode and XImode
26502 divide into 128-bit quantities while SVE modes divide into
26503 BITS_PER_SVE_VECTOR quantities. */
26504 if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
26505 return false;
26506 if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
26507 return false;
26510 if (BYTES_BIG_ENDIAN)
26512 /* Don't allow changes between SVE data modes and non-SVE modes.
26513 See the comment at the head of aarch64-sve.md for details. */
26514 if (sve1_p != sve2_p)
26515 return false;
26517 /* Don't allow changes in element size: lane 0 of the new vector
26518 would not then be lane 0 of the old vector. See the comment
26519 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26520 description.
26522 In the worst case, this forces a register to be spilled in
26523 one mode and reloaded in the other, which handles the
26524 endianness correctly. */
26525 if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26526 return false;
26528 return true;
26531 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
26532 to aarch64_modes_compatible_p. However due to issues with register
26533 allocation it is preferable to avoid tying integer scalar and FP
26534 scalar modes. Executing integer operations in general registers is
26535 better than treating them as scalar vector operations. This reduces
26536 latency and avoids redundant int<->FP moves. So tie modes if they
26537 are either the same class, or one of them is a vector mode. */
26539 static bool
26540 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
26542 if (aarch64_modes_compatible_p (mode1, mode2))
26544 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
26545 return true;
26546 if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
26547 return true;
26549 return false;
26552 /* Return a new RTX holding the result of moving POINTER forward by
26553 AMOUNT bytes. */
26555 static rtx
26556 aarch64_move_pointer (rtx pointer, poly_int64 amount)
26558 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
26560 return adjust_automodify_address (pointer, GET_MODE (pointer),
26561 next, amount);
26564 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
26565 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
26566 rather than memcpy. Return true iff we succeeded. */
26567 bool
26568 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
26570 if (!TARGET_MOPS)
26571 return false;
26573 /* All three registers are changed by the instruction, so each one
26574 must be a fresh pseudo. */
26575 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26576 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
26577 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26578 rtx src_mem = replace_equiv_address (operands[1], src_addr);
26579 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
26580 if (is_memmove)
26581 emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
26582 else
26583 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
26584 return true;
26587 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26588 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26589 if this is a memmove rather than memcpy. Return true if we succeed,
26590 otherwise return false, indicating that a libcall should be emitted. */
26591 bool
26592 aarch64_expand_cpymem (rtx *operands, bool is_memmove)
26594 int mode_bytes;
26595 rtx dst = operands[0];
26596 rtx src = operands[1];
26597 unsigned align = UINTVAL (operands[3]);
26598 rtx base;
26599 machine_mode mode = BLKmode, next_mode;
26601 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26602 if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
26603 return aarch64_expand_cpymem_mops (operands, is_memmove);
26605 unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
26607 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26608 unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
26609 unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
26610 : aarch64_mops_memcpy_size_threshold;
26612 /* Reduce the maximum size with -Os. */
26613 if (optimize_function_for_size_p (cfun))
26614 max_copy_size /= 4;
26616 /* Large copies use MOPS when available or a library call. */
26617 if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
26618 return aarch64_expand_cpymem_mops (operands, is_memmove);
26620 /* Default to 32-byte LDP/STP on large copies; small copies or targets
26621 without SIMD support fall back to 16-byte chunks.
26622 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26623 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26624 whether that would improve performance. */
26625 bool use_qregs = size > 24 && TARGET_SIMD;
26627 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26628 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26630 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
26631 src = adjust_automodify_address (src, VOIDmode, base, 0);
26633 auto_vec<std::pair<rtx, rtx>, 16> ops;
26634 int offset = 0;
26636 while (size > 0)
26638 /* Find the largest mode in which to do the copy without over-reading
26639 or over-writing. */
26640 opt_scalar_int_mode mode_iter;
26641 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26642 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
26643 mode = mode_iter.require ();
26645 gcc_assert (mode != BLKmode);
26647 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
26649 /* Prefer Q-register accesses. */
26650 if (mode_bytes == 16 && use_qregs)
26651 mode = V4SImode;
26653 rtx reg = gen_reg_rtx (mode);
26654 rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
26655 rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
26656 ops.safe_push ({ load, store });
26657 size -= mode_bytes;
26658 offset += mode_bytes;
26660 /* Emit trailing copies using overlapping unaligned accesses
26661 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26662 if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
26664 next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
26665 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26666 gcc_assert (n_bytes <= mode_bytes);
26667 offset -= n_bytes - size;
26668 size = n_bytes;
26672 /* Memcpy interleaves loads with stores; memmove emits all loads first. */
26673 int nops = ops.length();
26674 int inc = is_memmove || nops <= 8 ? nops : 6;
26676 for (int i = 0; i < nops; i += inc)
26678 int m = MIN (nops, i + inc);
26679 /* Emit loads. */
26680 for (int j = i; j < m; j++)
26681 emit_insn (ops[j].first);
26682 /* Emit stores. */
26683 for (int j = i; j < m; j++)
26684 emit_insn (ops[j].second);
26686 return true;
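/* Editorial sketch, not part of the port: the chunking scheme above on plain
   integers, for the !STRICT_ALIGNMENT case.  Accesses are capped at 16 bytes
   and the final access is slid backwards so that it overlaps the previous
   one instead of running past the end; for example, a 27-byte copy becomes a
   16-byte access at offset 0 followed by a 16-byte access at offset 11.  The
   caller-provided arrays are assumed to be large enough (a copy of at most
   max_copy_size bytes needs only a handful of chunks).  */
static unsigned int
copy_chunks_sketch (unsigned int size, unsigned int chunk_bytes[],
		    unsigned int chunk_offset[])
{
  unsigned int n = 0, offset = 0;
  while (size > 0)
    {
      /* Largest power-of-two access no bigger than min (size, 16).  */
      unsigned int bytes = 1;
      while (bytes * 2 <= (size < 16 ? size : 16))
	bytes *= 2;
      chunk_bytes[n] = bytes;
      chunk_offset[n] = offset;
      n++;
      size -= bytes;
      offset += bytes;
      /* Turn a short tail into one overlapping access.  */
      if (size > 0 && size < 16)
	{
	  unsigned int tail = 1;
	  while (tail < size)
	    tail *= 2;
	  offset -= tail - size;
	  size = tail;
	}
    }
  return n;
}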
26689 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
26690 as for the setmem pattern. Return true iff we succeed. */
26691 static bool
26692 aarch64_expand_setmem_mops (rtx *operands)
26694 if (!TARGET_MOPS)
26695 return false;
26697 /* The first two registers are changed by the instruction, so both
26698 of them must be a fresh pseudo. */
26699 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26700 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26701 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
26702 rtx val = operands[2];
26703 if (val != CONST0_RTX (QImode))
26704 val = force_reg (QImode, val);
26705 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
26706 return true;
26709 /* Expand setmem, as if from a __builtin_memset. Return true if
26710 we succeed, otherwise return false. */
26712 bool
26713 aarch64_expand_setmem (rtx *operands)
26715 int mode_bytes;
26716 unsigned HOST_WIDE_INT len;
26717 rtx dst = operands[0];
26718 rtx val = operands[2], src;
26719 unsigned align = UINTVAL (operands[3]);
26720 rtx base;
26721 machine_mode mode = BLKmode, next_mode;
26723 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26724 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
26725 || (STRICT_ALIGNMENT && align < 16))
26726 return aarch64_expand_setmem_mops (operands);
26728 /* Set inline limits for memset. MOPS has a separate threshold. */
26729 unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
26730 unsigned mops_threshold = aarch64_mops_memset_size_threshold;
26732 len = UINTVAL (operands[1]);
26734 /* Large memset uses MOPS when available or a library call. */
26735 if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
26736 return aarch64_expand_setmem_mops (operands);
26738 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26739 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26741 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26742 val = expand_vector_broadcast (V16QImode, val);
26743 val = force_reg (V16QImode, val);
26745 int offset = 0;
26746 while (len > 0)
26748 /* Find the largest mode in which to do the copy without
26749 over-writing. */
26750 opt_scalar_int_mode mode_iter;
26751 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26752 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
26753 mode = mode_iter.require ();
26755 gcc_assert (mode != BLKmode);
26757 mode_bytes = GET_MODE_SIZE (mode).to_constant ();
26759 src = val;
26761 /* Prefer Q-register accesses. */
26762 if (mode_bytes == 16)
26763 mode = V16QImode;
26764 else
26765 src = lowpart_subreg (mode, src, GET_MODE (val));
26767 emit_move_insn (adjust_address (dst, mode, offset), src);
26768 len -= mode_bytes;
26769 offset += mode_bytes;
26771 /* Emit trailing writes using overlapping unaligned accesses
26772 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26773 if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
26775 next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
26776 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26777 gcc_assert (n_bytes <= mode_bytes);
26778 offset -= n_bytes - len;
26779 len = n_bytes;
26783 return true;
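/* Editor's note (illustrative sketch, not part of the original source):
   for a constant 28-byte memset of a non-zero byte held in w2, with
   !STRICT_ALIGNMENT, the loop above would emit roughly

     dup  v0.16b, w2
     str  q0, [dst]
     str  q0, [dst, 12]

   where the second store deliberately overlaps the first by 4 bytes
   rather than writing past the end of the region.  */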
26787 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
26788 SImode stores. Handle the case when the constant has identical
26789 bottom and top halves. This is beneficial when the two stores can be
26790 merged into an STP and we avoid synthesising potentially expensive
26791 immediates twice. Return true if such a split is possible. */
26793 bool
26794 aarch64_split_dimode_const_store (rtx dst, rtx src)
26796 rtx lo = gen_lowpart (SImode, src);
26797 rtx hi = gen_highpart_mode (SImode, DImode, src);
26799 if (!rtx_equal_p (lo, hi))
26800 return false;
26802 unsigned int orig_cost
26803 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
26804 unsigned int lo_cost
26805 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
26807 /* We want to transform:
26808 MOV x1, 49370
26809 MOVK x1, 0x140, lsl 16
26810 MOVK x1, 0xc0da, lsl 32
26811 MOVK x1, 0x140, lsl 48
26812 STR x1, [x0]
26813 into:
26814 MOV w1, 49370
26815 MOVK w1, 0x140, lsl 16
26816 STP w1, w1, [x0]
26817 So we want to perform this when we save at least one instruction. */
26818 if (orig_cost <= lo_cost)
26819 return false;
26821 rtx mem_lo = adjust_address (dst, SImode, 0);
26822 if (!aarch64_mem_pair_operand (mem_lo, SImode))
26823 return false;
26825 rtx tmp_reg = gen_reg_rtx (SImode);
26826 aarch64_expand_mov_immediate (tmp_reg, lo);
26827 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
26828 /* Don't emit an explicit store pair as this may not always be profitable.
26829 Let the sched-fusion logic decide whether to merge them. */
26830 emit_move_insn (mem_lo, tmp_reg);
26831 emit_move_insn (mem_hi, tmp_reg);
26833 return true;
26836 /* Generate RTL for a conditional branch with rtx comparison CODE in
26837 mode CC_MODE. The destination of the unlikely conditional branch
26838 is LABEL_REF. */
26840 void
26841 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
26842 rtx label_ref)
26844 rtx x;
26845 x = gen_rtx_fmt_ee (code, VOIDmode,
26846 gen_rtx_REG (cc_mode, CC_REGNUM),
26847 const0_rtx);
26849 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
26850 gen_rtx_LABEL_REF (VOIDmode, label_ref),
26851 pc_rtx);
26852 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
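/* Editor's note (clarification, not part of the original source): the
   jump emitted above has the usual conditional-branch RTL shape
     (set (pc) (if_then_else (CODE (reg CC) 0) (label_ref L) (pc)))
   and aarch64_emit_unlikely_jump presumably attaches a very low branch
   probability so that later passes lay the branch out as not taken.  */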
26855 /* Generate DImode scratch registers for 128-bit (TImode) addition.
26857 OP1 represents the TImode source operand 1
26858 OP2 represents the TImode source operand 2
26859 LOW_DEST represents the low half (DImode) of TImode operand 0
26860 LOW_IN1 represents the low half (DImode) of TImode operand 1
26861 LOW_IN2 represents the low half (DImode) of TImode operand 2
26862 HIGH_DEST represents the high half (DImode) of TImode operand 0
26863 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26864 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26866 void
26867 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26868 rtx *low_in1, rtx *low_in2,
26869 rtx *high_dest, rtx *high_in1,
26870 rtx *high_in2)
26872 *low_dest = gen_reg_rtx (DImode);
26873 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
26874 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
26875 *high_dest = gen_reg_rtx (DImode);
26876 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
26877 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
26880 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
26882 OP1 represents the TImode source operand 1
26883 OP2 represents the TImode source operand 2
26884 LOW_DEST represents the low half (DImode) of TImode operand 0
26885 LOW_IN1 represents the low half (DImode) of TImode operand 1
26886 LOW_IN2 represents the low half (DImode) of TImode operand 2
26887 HIGH_DEST represents the high half (DImode) of TImode operand 0
26888 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26889 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26892 void
26893 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26894 rtx *low_in1, rtx *low_in2,
26895 rtx *high_dest, rtx *high_in1,
26896 rtx *high_in2)
26898 *low_dest = gen_reg_rtx (DImode);
26899 *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
26900 *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
26901 *high_dest = gen_reg_rtx (DImode);
26903 *high_in1 = force_highpart_subreg (DImode, op1, TImode);
26904 *high_in2 = force_highpart_subreg (DImode, op2, TImode);
26907 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
26909 OP0 represents the TImode destination operand 0
26910 LOW_DEST represents the low half (DImode) of TImode operand 0
26911 LOW_IN1 represents the low half (DImode) of TImode operand 1
26912 LOW_IN2 represents the low half (DImode) of TImode operand 2
26913 HIGH_DEST represents the high half (DImode) of TImode operand 0
26914 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26915 HIGH_IN2 represents the high half (DImode) of TImode operand 2
26916 UNSIGNED_P is true if the operation is being performed on unsigned
26917 values. */
26918 void
26919 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
26920 rtx low_in2, rtx high_dest, rtx high_in1,
26921 rtx high_in2, bool unsigned_p)
26923 if (low_in2 == const0_rtx)
26925 low_dest = low_in1;
26926 high_in2 = force_reg (DImode, high_in2);
26927 if (unsigned_p)
26928 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
26929 else
26930 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
26932 else
26934 if (aarch64_plus_immediate (low_in2, DImode))
26935 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
26936 GEN_INT (-UINTVAL (low_in2))));
26937 else
26939 low_in2 = force_reg (DImode, low_in2);
26940 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
26942 high_in2 = force_reg (DImode, high_in2);
26944 if (unsigned_p)
26945 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
26946 else
26947 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
26950 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
26951 emit_move_insn (gen_highpart (DImode, op0), high_dest);
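/* Editor's note (illustrative sketch, not part of the original source):
   for the general case above, the emitted sequence corresponds roughly to

     subs x_lo, x1_lo, x2_lo    // low halves, setting the carry/borrow
     sbcs x_hi, x1_hi, x2_hi    // high halves, consuming the borrow

   with the caller testing the resulting C (unsigned) or V (signed) flag
   to detect overflow.  */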
26955 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
26957 static unsigned HOST_WIDE_INT
26958 aarch64_asan_shadow_offset (void)
26960 if (TARGET_ILP32)
26961 return (HOST_WIDE_INT_1 << 29);
26962 else
26963 return (HOST_WIDE_INT_1 << 36);
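/* Editor's note (background, not part of the original source): ASan
   computes a shadow address roughly as (addr >> 3) + offset, so the
   values above place the shadow region at 1<<29 for ILP32 and 1<<36 for
   LP64; e.g. for an LP64 address A the shadow byte would live at
   (A >> 3) + 0x1000000000.  */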
26966 static rtx
26967 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
26968 rtx_code code, tree treeop0, tree treeop1)
26970 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
26971 rtx op0, op1;
26972 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
26973 insn_code icode;
26974 struct expand_operand ops[4];
26976 start_sequence ();
26977 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
26979 op_mode = GET_MODE (op0);
26980 if (op_mode == VOIDmode)
26981 op_mode = GET_MODE (op1);
26983 switch (op_mode)
26985 case E_QImode:
26986 case E_HImode:
26987 case E_SImode:
26988 cmp_mode = SImode;
26989 icode = CODE_FOR_cmpsi;
26990 break;
26992 case E_DImode:
26993 cmp_mode = DImode;
26994 icode = CODE_FOR_cmpdi;
26995 break;
26997 case E_SFmode:
26998 cmp_mode = SFmode;
26999 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
27000 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
27001 break;
27003 case E_DFmode:
27004 cmp_mode = DFmode;
27005 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
27006 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
27007 break;
27009 default:
27010 end_sequence ();
27011 return NULL_RTX;
27014 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
27015 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
27016 if (!op0 || !op1)
27018 end_sequence ();
27019 return NULL_RTX;
27021 *prep_seq = get_insns ();
27022 end_sequence ();
27024 create_fixed_operand (&ops[0], op0);
27025 create_fixed_operand (&ops[1], op1);
27027 start_sequence ();
27028 if (!maybe_expand_insn (icode, 2, ops))
27030 end_sequence ();
27031 return NULL_RTX;
27033 *gen_seq = get_insns ();
27034 end_sequence ();
27036 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
27037 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
27040 static rtx
27041 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27042 rtx_code cmp_code, tree treeop0, tree treeop1,
27043 rtx_code bit_code)
27045 rtx op0, op1, target;
27046 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27047 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27048 insn_code icode;
27049 struct expand_operand ops[6];
27050 int aarch64_cond;
27052 push_to_sequence (*prep_seq);
27053 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27055 op_mode = GET_MODE (op0);
27056 if (op_mode == VOIDmode)
27057 op_mode = GET_MODE (op1);
27059 switch (op_mode)
27061 case E_QImode:
27062 case E_HImode:
27063 case E_SImode:
27064 cmp_mode = SImode;
27065 break;
27067 case E_DImode:
27068 cmp_mode = DImode;
27069 break;
27071 case E_SFmode:
27072 cmp_mode = SFmode;
27073 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27074 break;
27076 case E_DFmode:
27077 cmp_mode = DFmode;
27078 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27079 break;
27081 default:
27082 end_sequence ();
27083 return NULL_RTX;
27086 icode = code_for_ccmp (cc_mode, cmp_mode);
27088 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27089 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27090 if (!op0 || !op1)
27092 end_sequence ();
27093 return NULL_RTX;
27095 *prep_seq = get_insns ();
27096 end_sequence ();
27098 target = gen_rtx_REG (cc_mode, CC_REGNUM);
27099 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
27101 if (bit_code != AND)
27103 /* Treat the ccmp patterns as canonical and use them where possible,
27104 but fall back to ccmp_rev patterns if there's no other option. */
27105 rtx_code prev_code = GET_CODE (prev);
27106 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
27107 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
27108 && !(prev_code == EQ
27109 || prev_code == NE
27110 || prev_code == ORDERED
27111 || prev_code == UNORDERED))
27112 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
27113 else
27115 rtx_code code = reverse_condition (prev_code);
27116 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
27118 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
27121 create_fixed_operand (&ops[0], XEXP (prev, 0));
27122 create_fixed_operand (&ops[1], target);
27123 create_fixed_operand (&ops[2], op0);
27124 create_fixed_operand (&ops[3], op1);
27125 create_fixed_operand (&ops[4], prev);
27126 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
27128 push_to_sequence (*gen_seq);
27129 if (!maybe_expand_insn (icode, 6, ops))
27131 end_sequence ();
27132 return NULL_RTX;
27135 *gen_seq = get_insns ();
27136 end_sequence ();
27138 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
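/* Editor's note (illustrative sketch, not part of the original source):
   together the two hooks above let a condition such as
     if (a == b && c < d)
   be expanded as a CMP followed by a CCMP, roughly

     cmp   w0, w1
     ccmp  w2, w3, #<nzcv>, eq
     b.lt  .Ltaken

   where the #<nzcv> immediate supplies the flag values to use when the
   first comparison already decides the result.  */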
27141 #undef TARGET_GEN_CCMP_FIRST
27142 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27144 #undef TARGET_GEN_CCMP_NEXT
27145 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27147 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27148 instruction fusion of some sort. */
27150 static bool
27151 aarch64_macro_fusion_p (void)
27153 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
27157 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27158 should be kept together during scheduling. */
27160 static bool
27161 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
27163 rtx set_dest;
27164 rtx prev_set = single_set (prev);
27165 rtx curr_set = single_set (curr);
27166 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27167 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
27169 if (!aarch64_macro_fusion_p ())
27170 return false;
27172 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
27174 /* We are trying to match:
27175 prev (mov) == (set (reg r0) (const_int imm16))
27176 curr (movk) == (set (zero_extract (reg r0)
27177 (const_int 16)
27178 (const_int 16))
27179 (const_int imm16_1)) */
27181 set_dest = SET_DEST (curr_set);
27183 if (GET_CODE (set_dest) == ZERO_EXTRACT
27184 && CONST_INT_P (SET_SRC (curr_set))
27185 && CONST_INT_P (SET_SRC (prev_set))
27186 && CONST_INT_P (XEXP (set_dest, 2))
27187 && INTVAL (XEXP (set_dest, 2)) == 16
27188 && REG_P (XEXP (set_dest, 0))
27189 && REG_P (SET_DEST (prev_set))
27190 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
27192 return true;
27196 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
27199 /* We're trying to match:
27200 prev (adrp) == (set (reg r1)
27201 (high (symbol_ref ("SYM"))))
27202 curr (add) == (set (reg r0)
27203 (lo_sum (reg r1)
27204 (symbol_ref ("SYM"))))
27205 Note that r0 need not necessarily be the same as r1, especially
27206 during pre-regalloc scheduling. */
27208 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27209 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27211 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
27212 && REG_P (XEXP (SET_SRC (curr_set), 0))
27213 && REGNO (XEXP (SET_SRC (curr_set), 0))
27214 == REGNO (SET_DEST (prev_set))
27215 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
27216 XEXP (SET_SRC (curr_set), 1)))
27217 return true;
27221 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
27224 /* We're trying to match:
27225 prev (movk) == (set (zero_extract (reg r0)
27226 (const_int 16)
27227 (const_int 32))
27228 (const_int imm16_1))
27229 curr (movk) == (set (zero_extract (reg r0)
27230 (const_int 16)
27231 (const_int 48))
27232 (const_int imm16_2)) */
27234 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
27235 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
27236 && REG_P (XEXP (SET_DEST (prev_set), 0))
27237 && REG_P (XEXP (SET_DEST (curr_set), 0))
27238 && REGNO (XEXP (SET_DEST (prev_set), 0))
27239 == REGNO (XEXP (SET_DEST (curr_set), 0))
27240 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
27241 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
27242 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
27243 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
27244 && CONST_INT_P (SET_SRC (prev_set))
27245 && CONST_INT_P (SET_SRC (curr_set)))
27246 return true;
27249 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
27251 /* We're trying to match:
27252 prev (adrp) == (set (reg r0)
27253 (high (symbol_ref ("SYM"))))
27254 curr (ldr) == (set (reg r1)
27255 (mem (lo_sum (reg r0)
27256 (symbol_ref ("SYM")))))
27258 curr (ldr) == (set (reg r1)
27259 (zero_extend (mem
27260 (lo_sum (reg r0)
27261 (symbol_ref ("SYM")))))) */
27262 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27263 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27265 rtx curr_src = SET_SRC (curr_set);
27267 if (GET_CODE (curr_src) == ZERO_EXTEND)
27268 curr_src = XEXP (curr_src, 0);
27270 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
27271 && REG_P (XEXP (XEXP (curr_src, 0), 0))
27272 && REGNO (XEXP (XEXP (curr_src, 0), 0))
27273 == REGNO (SET_DEST (prev_set))
27274 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
27275 XEXP (SET_SRC (prev_set), 0)))
27276 return true;
27280 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27281 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
27282 && prev_set && curr_set && any_condjump_p (curr)
27283 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27284 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27285 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27286 return true;
27288 /* Fuse flag-setting ALU instructions and conditional branch. */
27289 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
27290 && any_condjump_p (curr))
27292 unsigned int condreg1, condreg2;
27293 rtx cc_reg_1;
27294 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
27295 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
27297 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
27298 && prev
27299 && modified_in_p (cc_reg_1, prev))
27301 enum attr_type prev_type = get_attr_type (prev);
27303 /* FIXME: this misses some instructions which are considered simple
27304 arithmetic for ThunderX. Simple shifts are missed here. */
27305 if (prev_type == TYPE_ALUS_SREG
27306 || prev_type == TYPE_ALUS_IMM
27307 || prev_type == TYPE_LOGICS_REG
27308 || prev_type == TYPE_LOGICS_IMM)
27309 return true;
27313 /* Fuse ALU instructions and CBZ/CBNZ. */
27314 if (prev_set
27315 && curr_set
27316 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
27317 && any_condjump_p (curr))
27319 /* We're trying to match:
27320 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27321 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27322 (const_int 0))
27323 (label_ref ("SYM"))
27324 (pc)) */
27325 if (SET_DEST (curr_set) == (pc_rtx)
27326 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27327 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
27328 && REG_P (SET_DEST (prev_set))
27329 && REGNO (SET_DEST (prev_set))
27330 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
27332 /* Fuse ALU operations followed by conditional branch instruction. */
27333 switch (get_attr_type (prev))
27335 case TYPE_ALU_IMM:
27336 case TYPE_ALU_SREG:
27337 case TYPE_ADC_REG:
27338 case TYPE_ADC_IMM:
27339 case TYPE_ADCS_REG:
27340 case TYPE_ADCS_IMM:
27341 case TYPE_LOGIC_REG:
27342 case TYPE_LOGIC_IMM:
27343 case TYPE_CSEL:
27344 case TYPE_ADR:
27345 case TYPE_MOV_IMM:
27346 case TYPE_SHIFT_REG:
27347 case TYPE_SHIFT_IMM:
27348 case TYPE_BFM:
27349 case TYPE_RBIT:
27350 case TYPE_REV:
27351 case TYPE_EXTEND:
27352 return true;
27354 default:;
27359 /* Fuse A+B+1 and A-B-1 */
27360 if (simple_sets_p
27361 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
27363 /* We're trying to match:
27364 prev == (set (r0) (plus (r0) (r1)))
27365 curr == (set (r0) (plus (r0) (const_int 1)))
27367 prev == (set (r0) (minus (r0) (r1)))
27368 curr == (set (r0) (plus (r0) (const_int -1))) */
27370 rtx prev_src = SET_SRC (prev_set);
27371 rtx curr_src = SET_SRC (curr_set);
27373 int polarity = 1;
27374 if (GET_CODE (prev_src) == MINUS)
27375 polarity = -1;
27377 if (GET_CODE (curr_src) == PLUS
27378 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
27379 && CONST_INT_P (XEXP (curr_src, 1))
27380 && INTVAL (XEXP (curr_src, 1)) == polarity
27381 && REG_P (XEXP (curr_src, 0))
27382 && REG_P (SET_DEST (prev_set))
27383 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
27384 return true;
27387 return false;
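/* Editor's note (illustrative, not part of the original source): typical
   pairs accepted above include, roughly,

     mov  x0, 0x1234         then  movk x0, 0x5678, lsl 16   (MOV+MOVK)
     adrp x1, sym            then  add  x0, x1, :lo12:sym    (ADRP+ADD)
     subs w0, w0, w1         then  b.ne .L1                  (CMP+BRANCH)

   assuming the corresponding AARCH64_FUSE_* bit is set in the current
   tuning's fusible_ops.  */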
27390 /* Return true iff the instruction fusion described by OP is enabled. */
27392 bool
27393 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
27395 return (aarch64_tune_params.fusible_ops & op) != 0;
27398 /* If MEM is in the form of [base+offset], extract the two parts
27399 of the address into BASE and OFFSET; otherwise return false
27400 after clearing BASE and OFFSET. */
27402 bool
27403 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
27405 rtx addr;
27407 gcc_assert (MEM_P (mem));
27409 addr = XEXP (mem, 0);
27411 if (REG_P (addr))
27413 *base = addr;
27414 *offset = const0_rtx;
27415 return true;
27418 if (GET_CODE (addr) == PLUS
27419 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
27421 *base = XEXP (addr, 0);
27422 *offset = XEXP (addr, 1);
27423 return true;
27426 *base = NULL_RTX;
27427 *offset = NULL_RTX;
27429 return false;
27432 /* Types for scheduling fusion. */
27433 enum sched_fusion_type
27435 SCHED_FUSION_NONE = 0,
27436 SCHED_FUSION_LD_SIGN_EXTEND,
27437 SCHED_FUSION_LD_ZERO_EXTEND,
27438 SCHED_FUSION_LD,
27439 SCHED_FUSION_ST,
27440 SCHED_FUSION_NUM
27443 /* If INSN is a load or store whose address is in the form [base+offset],
27444 extract the two parts into BASE and OFFSET. Return the scheduling
27445 fusion type of this INSN. */
27447 static enum sched_fusion_type
27448 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
27450 rtx x, dest, src;
27451 enum sched_fusion_type fusion = SCHED_FUSION_LD;
27453 gcc_assert (INSN_P (insn));
27454 x = PATTERN (insn);
27455 if (GET_CODE (x) != SET)
27456 return SCHED_FUSION_NONE;
27458 src = SET_SRC (x);
27459 dest = SET_DEST (x);
27461 machine_mode dest_mode = GET_MODE (dest);
27463 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
27464 return SCHED_FUSION_NONE;
27466 if (GET_CODE (src) == SIGN_EXTEND)
27468 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
27469 src = XEXP (src, 0);
27470 if (!MEM_P (src) || GET_MODE (src) != SImode)
27471 return SCHED_FUSION_NONE;
27473 else if (GET_CODE (src) == ZERO_EXTEND)
27475 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
27476 src = XEXP (src, 0);
27477 if (!MEM_P (src) || GET_MODE (src) != SImode)
27478 return SCHED_FUSION_NONE;
27481 if (MEM_P (src) && REG_P (dest))
27482 extract_base_offset_in_addr (src, base, offset);
27483 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
27485 fusion = SCHED_FUSION_ST;
27486 extract_base_offset_in_addr (dest, base, offset);
27488 else
27489 return SCHED_FUSION_NONE;
27491 if (*base == NULL_RTX || *offset == NULL_RTX)
27492 fusion = SCHED_FUSION_NONE;
27494 return fusion;
27497 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
27499 Currently we only support fusing ldr and str instructions, so FUSION_PRI
27500 and PRI are only calculated for these instructions. For other instructions,
27501 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
27502 types of instruction fusion can be added by returning different priorities.
27504 It's important that irrelevant instructions get the largest FUSION_PRI. */
27506 static void
27507 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
27508 int *fusion_pri, int *pri)
27510 int tmp, off_val;
27511 rtx base, offset;
27512 enum sched_fusion_type fusion;
27514 gcc_assert (INSN_P (insn));
27516 tmp = max_pri - 1;
27517 fusion = fusion_load_store (insn, &base, &offset);
27518 if (fusion == SCHED_FUSION_NONE)
27520 *pri = tmp;
27521 *fusion_pri = tmp;
27522 return;
27525 /* Set FUSION_PRI according to fusion type and base register. */
27526 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
27528 /* Calculate PRI. */
27529 tmp /= 2;
27531 /* INSN with smaller offset goes first. */
27532 off_val = (int)(INTVAL (offset));
27533 if (off_val >= 0)
27534 tmp -= (off_val & 0xfffff);
27535 else
27536 tmp += ((- off_val) & 0xfffff);
27538 *pri = tmp;
27539 return;
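/* Editor's note (worked example, not part of the original source): two
   loads such as "ldr w1, [x2, 4]" and "ldr w3, [x2, 8]" receive the same
   FUSION_PRI (same fusion type and same base register x2), while the one
   with the smaller offset gets the larger PRI and is therefore scheduled
   first, leaving the pair adjacent for the ldp peepholes.  */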
27542 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27543 Adjust priority of sha1h instructions so they are scheduled before
27544 other SHA1 instructions. */
27546 static int
27547 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
27549 rtx x = PATTERN (insn);
27551 if (GET_CODE (x) == SET)
27553 x = SET_SRC (x);
27555 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
27556 return priority + 10;
27559 return priority;
27562 /* If REVERSED is null, return true if memory reference *MEM2 comes
27563 immediately after memory reference *MEM1. Do not change the references
27564 in this case.
27566 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27567 if they are, try to make them use constant offsets from the same base
27568 register. Return true on success. When returning true, set *REVERSED
27569 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27570 static bool
27571 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
27573 if (reversed)
27574 *reversed = false;
27576 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
27577 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
27578 return false;
27580 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
27581 return false;
27583 auto size1 = MEM_SIZE (*mem1);
27584 auto size2 = MEM_SIZE (*mem2);
27586 rtx base1, base2, offset1, offset2;
27587 extract_base_offset_in_addr (*mem1, &base1, &offset1);
27588 extract_base_offset_in_addr (*mem2, &base2, &offset2);
27590 /* Make sure at least one memory is in base+offset form. */
27591 if (!(base1 && offset1) && !(base2 && offset2))
27592 return false;
27594 /* If both mems already use the same base register, just check the
27595 offsets. */
27596 if (base1 && base2 && rtx_equal_p (base1, base2))
27598 if (!offset1 || !offset2)
27599 return false;
27601 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
27602 return true;
27604 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
27606 *reversed = true;
27607 return true;
27610 return false;
27613 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27614 guarantee that the values are consecutive. */
27615 if (MEM_EXPR (*mem1)
27616 && MEM_EXPR (*mem2)
27617 && MEM_OFFSET_KNOWN_P (*mem1)
27618 && MEM_OFFSET_KNOWN_P (*mem2))
27620 poly_int64 expr_offset1;
27621 poly_int64 expr_offset2;
27622 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
27623 &expr_offset1);
27624 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
27625 &expr_offset2);
27626 if (!expr_base1
27627 || !expr_base2
27628 || !DECL_P (expr_base1)
27629 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
27630 return false;
27632 expr_offset1 += MEM_OFFSET (*mem1);
27633 expr_offset2 += MEM_OFFSET (*mem2);
27635 if (known_eq (expr_offset1 + size1, expr_offset2))
27637 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
27638 *reversed = true;
27639 else
27640 return false;
27642 if (reversed)
27644 if (base2)
27646 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
27647 expr_offset1 - expr_offset2);
27648 *mem1 = replace_equiv_address_nv (*mem1, addr1);
27650 else
27652 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
27653 expr_offset2 - expr_offset1);
27654 *mem2 = replace_equiv_address_nv (*mem2, addr2);
27657 return true;
27660 return false;
27663 /* Test if MODE is suitable for a single transfer register in an ldp or stp
27664 instruction. */
27666 bool
27667 aarch64_ldpstp_operand_mode_p (machine_mode mode)
27669 if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
27670 || hard_regno_nregs (V0_REGNUM, mode) > 1)
27671 return false;
27673 const auto size = GET_MODE_SIZE (mode);
27674 return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
27677 /* Return true if MEM1 and MEM2 can be combined into a single access
27678 of mode MODE, with the combined access having the same address as MEM1. */
27680 bool
27681 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
27683 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
27684 return false;
27685 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
27688 /* Return true if MEM agrees with the ldp-stp policy model.
27689 Otherwise, false. */
27691 bool
27692 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
27694 auto policy = (load
27695 ? aarch64_tune_params.ldp_policy_model
27696 : aarch64_tune_params.stp_policy_model);
27698 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
27699 if (policy == AARCH64_LDP_STP_POLICY_NEVER)
27700 return false;
27702 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
27703 do not emit the load pair unless the alignment is checked to be
27704 at least double the alignment of the type. */
27705 if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
27706 && !optimize_function_for_size_p (cfun)
27707 && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
27708 return false;
27710 return true;
27713 /* Given OPERANDS of consecutive load/store, check if we can merge
27714 them into ldp/stp. LOAD is true if they are load instructions. */
27716 bool
27717 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load)
27719 enum reg_class rclass_1, rclass_2;
27720 rtx mem_1, mem_2, reg_1, reg_2;
27722 if (load)
27724 mem_1 = operands[1];
27725 mem_2 = operands[3];
27726 reg_1 = operands[0];
27727 reg_2 = operands[2];
27728 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
27729 if (REGNO (reg_1) == REGNO (reg_2))
27730 return false;
27731 if (reg_overlap_mentioned_p (reg_1, mem_2))
27732 return false;
27734 else
27736 mem_1 = operands[0];
27737 mem_2 = operands[2];
27738 reg_1 = operands[1];
27739 reg_2 = operands[3];
27742 /* The mems cannot be volatile. */
27743 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
27744 return false;
27746 /* Check if the addresses are in the form of [base+offset]. */
27747 bool reversed = false;
27748 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
27749 return false;
27751 /* The operands must be of the same size. */
27752 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
27753 GET_MODE_SIZE (GET_MODE (mem_2))));
27755 /* The lower memory access must be a mem-pair operand. */
27756 rtx lower_mem = reversed ? mem_2 : mem_1;
27757 machine_mode lower_mem_mode = GET_MODE (lower_mem);
27758 if (!aarch64_mem_pair_operand (lower_mem, lower_mem_mode))
27759 return false;
27761 /* Check if lower_mem is ok with the ldp-stp policy model. */
27762 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem, load,
27763 lower_mem_mode))
27764 return false;
27766 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
27767 rclass_1 = FP_REGS;
27768 else
27769 rclass_1 = GENERAL_REGS;
27771 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
27772 rclass_2 = FP_REGS;
27773 else
27774 rclass_2 = GENERAL_REGS;
27776 /* Check if the registers are of the same class. */
27777 if (rclass_1 != rclass_2)
27778 return false;
27780 return true;
27783 /* Given OPERANDS of consecutive load/store that can be merged,
27784 swap them if they are not in ascending order. */
27785 void
27786 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
27788 int mem_op = load ? 1 : 0;
27789 bool reversed = false;
27790 if (!aarch64_check_consecutive_mems (operands + mem_op,
27791 operands + mem_op + 2, &reversed))
27792 gcc_unreachable ();
27794 if (reversed)
27796 /* Irrespective of whether this is a load or a store,
27797 we do the same swap. */
27798 std::swap (operands[0], operands[2]);
27799 std::swap (operands[1], operands[3]);
27803 /* Helper function used for generation of load/store pair instructions, called
27804 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
27805 operands as matched by the peepholes in that file. LOAD_P is true if we're
27806 generating a load pair, otherwise we're generating a store pair. CODE is
27807 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
27808 standard load/store pair. */
27810 void
27811 aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
27813 aarch64_swap_ldrstr_operands (operands, load_p);
27815 if (load_p)
27816 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
27817 operands[1], code));
27818 else
27820 gcc_assert (code == UNKNOWN);
27821 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
27822 operands[3]));
27826 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
27827 comparison between the two. */
27829 aarch64_host_wide_int_compare (const void *x, const void *y)
27831 return wi::cmps (* ((const HOST_WIDE_INT *) x),
27832 * ((const HOST_WIDE_INT *) y));
27835 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
27836 other pointing to a REG rtx containing an offset, compare the offsets
27837 of the two pairs.
27839 Return:
27841 1 iff offset (X) > offset (Y)
27842 0 iff offset (X) == offset (Y)
27843 -1 iff offset (X) < offset (Y) */
27845 aarch64_ldrstr_offset_compare (const void *x, const void *y)
27847 const rtx * operands_1 = (const rtx *) x;
27848 const rtx * operands_2 = (const rtx *) y;
27849 rtx mem_1, mem_2, base, offset_1, offset_2;
27851 if (MEM_P (operands_1[0]))
27852 mem_1 = operands_1[0];
27853 else
27854 mem_1 = operands_1[1];
27856 if (MEM_P (operands_2[0]))
27857 mem_2 = operands_2[0];
27858 else
27859 mem_2 = operands_2[1];
27861 /* Extract the offsets. */
27862 extract_base_offset_in_addr (mem_1, &base, &offset_1);
27863 extract_base_offset_in_addr (mem_2, &base, &offset_2);
27865 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
27867 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
27870 /* Given OPERANDS of consecutive load/store, check if we can merge
27871 them into ldp/stp by adjusting the offset. LOAD is true if they
27872 are load instructions. MODE is the mode of memory operands.
27874 Given below consecutive stores:
27876 str w1, [xb, 0x100]
27877 str w1, [xb, 0x104]
27878 str w1, [xb, 0x108]
27879 str w1, [xb, 0x10c]
27881 Though the offsets are out of the range supported by stp, we can
27882 still pair them after adjusting the offset, like:
27884 add scratch, xb, 0x100
27885 stp w1, w1, [scratch]
27886 stp w1, w1, [scratch, 0x8]
27888 The peephole patterns detecting this opportunity should guarantee
27889 the scratch register is available. */
27891 bool
27892 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
27893 machine_mode mode)
27895 const int num_insns = 4;
27896 enum reg_class rclass;
27897 HOST_WIDE_INT offvals[num_insns], msize;
27898 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
27900 if (load)
27902 for (int i = 0; i < num_insns; i++)
27904 reg[i] = operands[2 * i];
27905 mem[i] = operands[2 * i + 1];
27907 gcc_assert (REG_P (reg[i]));
27910 /* Do not attempt to merge the loads if the loads clobber each other. */
27911 for (int i = 0; i < 8; i += 2)
27912 for (int j = i + 2; j < 8; j += 2)
27913 if (reg_overlap_mentioned_p (operands[i], operands[j]))
27914 return false;
27916 else
27917 for (int i = 0; i < num_insns; i++)
27919 mem[i] = operands[2 * i];
27920 reg[i] = operands[2 * i + 1];
27923 /* Skip if memory operand is by itself valid for ldp/stp. */
27924 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
27925 return false;
27927 for (int i = 0; i < num_insns; i++)
27929 /* The mems cannot be volatile. */
27930 if (MEM_VOLATILE_P (mem[i]))
27931 return false;
27933 /* Check if the addresses are in the form of [base+offset]. */
27934 extract_base_offset_in_addr (mem[i], base + i, offset + i);
27935 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
27936 return false;
27939 /* Check if the registers are of the same class. */
27940 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
27941 ? FP_REGS : GENERAL_REGS;
27943 for (int i = 1; i < num_insns; i++)
27944 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
27946 if (rclass != FP_REGS)
27947 return false;
27949 else
27951 if (rclass != GENERAL_REGS)
27952 return false;
27955 /* Only the last register in the order in which they occur
27956 may be clobbered by the load. */
27957 if (rclass == GENERAL_REGS && load)
27958 for (int i = 0; i < num_insns - 1; i++)
27959 if (reg_mentioned_p (reg[i], mem[i]))
27960 return false;
27962 /* Check if the bases are the same. */
27963 for (int i = 0; i < num_insns - 1; i++)
27964 if (!rtx_equal_p (base[i], base[i + 1]))
27965 return false;
27967 for (int i = 0; i < num_insns; i++)
27968 offvals[i] = INTVAL (offset[i]);
27970 msize = GET_MODE_SIZE (mode).to_constant ();
27972 /* Check if the offsets can be put in the right order to do a ldp/stp. */
27973 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
27974 aarch64_host_wide_int_compare);
27976 if (!(offvals[1] == offvals[0] + msize
27977 && offvals[3] == offvals[2] + msize))
27978 return false;
27980 /* Check that offsets are within range of each other. The ldp/stp
27981 instructions have 7-bit immediate offsets, so use 0x80. */
27982 if (offvals[2] - offvals[0] >= msize * 0x80)
27983 return false;
27985 /* The offsets must be aligned with respect to each other. */
27986 if (offvals[0] % msize != offvals[2] % msize)
27987 return false;
27989 /* Check if mem[0] is ok with the ldp-stp policy model. */
27990 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem[0], load, mode))
27991 return false;
27993 return true;
27996 /* Given OPERANDS of consecutive load/store, this function pairs them
27997 into LDP/STP after adjusting the offset. It depends on the fact
27998 that the operands can be sorted so the offsets are correct for STP.
27999 MODE is the mode of memory operands. CODE is the rtl operator
28000 which should be applied to all memory operands; it is SIGN_EXTEND,
28001 ZERO_EXTEND or UNKNOWN. */
28003 bool
28004 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
28005 machine_mode mode, RTX_CODE code)
28007 rtx base, offset_1, offset_2;
28008 rtx mem_1, mem_2;
28009 rtx temp_operands[8];
28010 HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
28011 stp_off_upper_limit, stp_off_lower_limit, msize;
28013 /* We make changes on a copy as we may still bail out. */
28014 for (int i = 0; i < 8; i ++)
28015 temp_operands[i] = operands[i];
28017 /* Sort the operands. Note for cases as below:
28018 [base + 0x310] = A
28019 [base + 0x320] = B
28020 [base + 0x330] = C
28021 [base + 0x320] = D
28022 We need a stable sort, otherwise wrong data may be stored to offset 0x320.
28023 Also note the dead store in the above case should be optimized away, but
28024 there are no guarantees here. */
28025 gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
28026 aarch64_ldrstr_offset_compare);
28028 /* Copy the memory operands so that if we have to bail for some
28029 reason the original addresses are unchanged. */
28030 if (load)
28032 mem_1 = copy_rtx (temp_operands[1]);
28033 mem_2 = copy_rtx (temp_operands[5]);
28035 else
28037 mem_1 = copy_rtx (temp_operands[0]);
28038 mem_2 = copy_rtx (temp_operands[4]);
28039 gcc_assert (code == UNKNOWN);
28042 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28043 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28044 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
28045 && offset_2 != NULL_RTX);
28047 /* Adjust offset so it can fit in LDP/STP instruction. */
28048 msize = GET_MODE_SIZE (mode).to_constant ();
28049 stp_off_upper_limit = msize * (0x40 - 1);
28050 stp_off_lower_limit = - msize * 0x40;
28052 off_val_1 = INTVAL (offset_1);
28053 off_val_2 = INTVAL (offset_2);
28055 /* The base offset is optimally half way between the two STP/LDP offsets. */
28056 if (msize <= 4)
28057 base_off = (off_val_1 + off_val_2) / 2;
28058 else
28059 /* However, due to issues with negative LDP/STP offset generation for
28060 larger modes (DF, DD, DI and vector modes), we must not use negative
28061 addresses smaller than 9 signed unadjusted bits can store. This
28062 provides the most range in this case. */
28063 base_off = off_val_1;
28065 /* Adjust the base so that it is aligned with the addresses but still
28066 optimal. */
28067 if (base_off % msize != off_val_1 % msize)
28068 /* Fix the offset, bearing in mind we want to make it bigger not
28069 smaller. */
28070 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28071 else if (msize <= 4)
28072 /* The negative range of LDP/STP is one larger than the positive range. */
28073 base_off += msize;
28075 /* Check if base offset is too big or too small. We can attempt to resolve
28076 this issue by setting it to the maximum value and seeing if the offsets
28077 still fit. */
28078 if (base_off >= 0x1000)
28080 base_off = 0x1000 - 1;
28081 /* We must still make sure that the base offset is aligned with respect
28082 to the address. But it may not be made any bigger. */
28083 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28086 /* Likewise for the case where the base is too small. */
28087 if (base_off <= -0x1000)
28089 base_off = -0x1000 + 1;
28090 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28093 /* Offset of the first STP/LDP. */
28094 new_off_1 = off_val_1 - base_off;
28096 /* Offset of the second STP/LDP. */
28097 new_off_2 = off_val_2 - base_off;
28099 /* The offsets must be within the range of the LDP/STP instructions. */
28100 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
28101 || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
28102 return false;
28104 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
28105 new_off_1), true);
28106 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
28107 new_off_2), true);
28109 if (!aarch64_mem_pair_operand (mem_1, mode)
28110 || !aarch64_mem_pair_operand (mem_2, mode))
28111 return false;
28113 if (load)
28115 operands[0] = temp_operands[0];
28116 operands[1] = mem_1;
28117 operands[2] = temp_operands[2];
28118 operands[4] = temp_operands[4];
28119 operands[5] = mem_2;
28120 operands[6] = temp_operands[6];
28122 else
28124 operands[0] = mem_1;
28125 operands[1] = temp_operands[1];
28126 operands[3] = temp_operands[3];
28127 operands[4] = mem_2;
28128 operands[5] = temp_operands[5];
28129 operands[7] = temp_operands[7];
28132 /* Emit adjusting instruction. */
28133 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
28134 /* Emit ldp/stp instructions. */
28135 if (load)
28137 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28138 operands[1], code));
28139 emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
28140 operands[5], code));
28142 else
28144 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28145 operands[3]));
28146 emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
28147 operands[7]));
28149 return true;
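/* Editor's note (worked instance of the limits above, not part of the
   original source): for 4-byte accesses msize == 4, so the accepted
   LDP/STP immediate range is [-0x100, 0xfc]; offsets such as 0x100..0x10c
   are therefore rewritten relative to a scratch base as shown in the
   comment before aarch64_operands_adjust_ok_for_ldpstp.  */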
28152 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28153 it isn't worth branching around empty masked ops (including masked
28154 stores). */
28156 static bool
28157 aarch64_empty_mask_is_expensive (unsigned)
28159 return false;
28162 /* Return true if a pseudo register should be created and used to hold
28163 the GOT address for PIC code. */
28165 bool
28166 aarch64_use_pseudo_pic_reg (void)
28168 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
28171 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28173 static int
28174 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
28176 switch (XINT (x, 1))
28178 case UNSPEC_GOTSMALLPIC:
28179 case UNSPEC_GOTSMALLPIC28K:
28180 case UNSPEC_GOTTINYPIC:
28181 return 0;
28182 default:
28183 break;
28186 return default_unspec_may_trap_p (x, flags);
28190 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28191 return the log2 of that value. Otherwise return -1. */
28194 aarch64_fpconst_pow_of_2 (rtx x)
28196 const REAL_VALUE_TYPE *r;
28198 if (!CONST_DOUBLE_P (x))
28199 return -1;
28201 r = CONST_DOUBLE_REAL_VALUE (x);
28203 if (REAL_VALUE_NEGATIVE (*r)
28204 || REAL_VALUE_ISNAN (*r)
28205 || REAL_VALUE_ISINF (*r)
28206 || !real_isinteger (r, DFmode))
28207 return -1;
28209 return exact_log2 (real_to_integer (r));
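/* Editor's note (worked examples, not part of the original source):
   8.0 -> 3, 1.0 -> 0, 0.5 -> -1 (not an integer value) and -4.0 -> -1
   (negative), matching the checks above.  */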
28212 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28213 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x == 1/2^n
28214 return n. Otherwise return -1. */
28217 aarch64_fpconst_pow2_recip (rtx x)
28219 REAL_VALUE_TYPE r0;
28221 if (!CONST_DOUBLE_P (x))
28222 return -1;
28224 r0 = *CONST_DOUBLE_REAL_VALUE (x);
28225 if (exact_real_inverse (DFmode, &r0)
28226 && !REAL_VALUE_NEGATIVE (r0))
28228 int ret = exact_log2 (real_to_integer (&r0));
28229 if (ret >= 1 && ret <= 32)
28230 return ret;
28232 return -1;
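/* Editor's note (worked examples, not part of the original source):
   0.125 -> 3 (1/0.125 == 2^3), 0.25 -> 2, while 1.0 -> -1 because the
   log2 of its reciprocal is 0, outside the accepted 1..32 range.  */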
28235 /* If X is a vector of equal CONST_DOUBLE values and that value is
28236 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28239 aarch64_vec_fpconst_pow_of_2 (rtx x)
28241 int nelts;
28242 if (!CONST_VECTOR_P (x)
28243 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
28244 return -1;
28246 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
28247 return -1;
28249 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
28250 if (firstval <= 0)
28251 return -1;
28253 for (int i = 1; i < nelts; i++)
28254 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
28255 return -1;
28257 return firstval;
28260 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28261 to float.
28263 __fp16 always promotes through this hook.
28264 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28265 through the generic excess precision logic rather than here. */
28267 static tree
28268 aarch64_promoted_type (const_tree t)
28270 if (SCALAR_FLOAT_TYPE_P (t)
28271 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
28272 return float_type_node;
28274 return NULL_TREE;
28277 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28279 static bool
28280 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
28281 optimization_type opt_type)
28283 switch (op)
28285 case rsqrt_optab:
28286 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
28288 default:
28289 return true;
28293 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28295 static unsigned int
28296 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
28297 int *offset)
28299 /* Polynomial invariant 1 == (VG / 2) - 1. */
28300 gcc_assert (i == 1);
28301 *factor = 2;
28302 *offset = 1;
28303 return AARCH64_DWARF_VG;
28306 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28307 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28309 static bool
28310 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
28312 return ((mode == HFmode || mode == BFmode)
28313 ? true
28314 : default_libgcc_floating_mode_supported_p (mode));
28317 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28318 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28320 static bool
28321 aarch64_scalar_mode_supported_p (scalar_mode mode)
28323 if (DECIMAL_FLOAT_MODE_P (mode))
28324 return default_decimal_float_supported_p ();
28326 return ((mode == HFmode || mode == BFmode)
28327 ? true
28328 : default_scalar_mode_supported_p (mode));
28331 /* Set the value of FLT_EVAL_METHOD.
28332 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28334 0: evaluate all operations and constants, whose semantic type has at
28335 most the range and precision of type float, to the range and
28336 precision of float; evaluate all other operations and constants to
28337 the range and precision of the semantic type;
28339 N, where _FloatN is a supported interchange floating type
28340 evaluate all operations and constants, whose semantic type has at
28341 most the range and precision of _FloatN type, to the range and
28342 precision of the _FloatN type; evaluate all other operations and
28343 constants to the range and precision of the semantic type;
28345 If we have the ARMv8.2-A extensions then we support _Float16 in native
28346 precision, so we should set this to 16. Otherwise, we support the type,
28347 but want to evaluate expressions in float precision, so set this to
28348 0. */
28350 static enum flt_eval_method
28351 aarch64_excess_precision (enum excess_precision_type type)
28353 switch (type)
28355 case EXCESS_PRECISION_TYPE_FAST:
28356 case EXCESS_PRECISION_TYPE_STANDARD:
28357 /* We can calculate either in 16-bit range and precision or
28358 32-bit range and precision. Make that decision based on whether
28359 we have native support for the ARMv8.2-A 16-bit floating-point
28360 instructions or not. */
28361 return (TARGET_FP_F16INST
28362 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28363 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
28364 case EXCESS_PRECISION_TYPE_IMPLICIT:
28365 case EXCESS_PRECISION_TYPE_FLOAT16:
28366 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
28367 default:
28368 gcc_unreachable ();
28370 return FLT_EVAL_METHOD_UNPREDICTABLE;
28373 /* Implement TARGET_C_BITINT_TYPE_INFO.
28374 Return true if _BitInt(N) is supported and fill its details into *INFO. */
28375 bool
28376 aarch64_bitint_type_info (int n, struct bitint_info *info)
28378 if (TARGET_BIG_END)
28379 return false;
28381 if (n <= 8)
28382 info->limb_mode = QImode;
28383 else if (n <= 16)
28384 info->limb_mode = HImode;
28385 else if (n <= 32)
28386 info->limb_mode = SImode;
28387 else if (n <= 64)
28388 info->limb_mode = DImode;
28389 else if (n <= 128)
28390 info->limb_mode = TImode;
28391 else
28392 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28393 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28394 able to use libgcc's implementation to support large _BitInts, we need
28395 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28396 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28397 be TImode to ensure we are ABI compliant. */
28398 info->limb_mode = DImode;
28400 if (n > 128)
28401 info->abi_limb_mode = TImode;
28402 else
28403 info->abi_limb_mode = info->limb_mode;
28404 info->big_endian = TARGET_BIG_END;
28405 info->extended = false;
28406 return true;
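/* Editor's note (worked example, not part of the original source): for
   _BitInt(40) both limb_mode and abi_limb_mode are DImode; for
   _BitInt(200) limb_mode is DImode while abi_limb_mode is TImode, so the
   ABI sees an array of __int128 even though libgcc works on 64-bit
   limbs.  */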
28409 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28410 scheduled for speculative execution. Reject the long-running division
28411 and square-root instructions. */
28413 static bool
28414 aarch64_sched_can_speculate_insn (rtx_insn *insn)
28416 switch (get_attr_type (insn))
28418 case TYPE_SDIV:
28419 case TYPE_UDIV:
28420 case TYPE_FDIVS:
28421 case TYPE_FDIVD:
28422 case TYPE_FSQRTS:
28423 case TYPE_FSQRTD:
28424 case TYPE_NEON_FP_SQRT_S:
28425 case TYPE_NEON_FP_SQRT_D:
28426 case TYPE_NEON_FP_SQRT_S_Q:
28427 case TYPE_NEON_FP_SQRT_D_Q:
28428 case TYPE_NEON_FP_DIV_S:
28429 case TYPE_NEON_FP_DIV_D:
28430 case TYPE_NEON_FP_DIV_S_Q:
28431 case TYPE_NEON_FP_DIV_D_Q:
28432 return false;
28433 default:
28434 return true;
28438 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28440 static int
28441 aarch64_compute_pressure_classes (reg_class *classes)
28443 int i = 0;
28444 classes[i++] = GENERAL_REGS;
28445 classes[i++] = FP_REGS;
28446 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28447 registers need to go in PR_LO_REGS at some point during their
28448 lifetime. Splitting it into two halves has the effect of making
28449 all predicates count against PR_LO_REGS, so that we try whenever
28450 possible to restrict the number of live predicates to 8. This
28451 greatly reduces the amount of spilling in certain loops. */
28452 classes[i++] = PR_LO_REGS;
28453 classes[i++] = PR_HI_REGS;
28454 return i;
28457 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28459 static bool
28460 aarch64_can_change_mode_class (machine_mode from,
28461 machine_mode to, reg_class_t)
28463 return aarch64_modes_compatible_p (from, to);
28466 /* Implement TARGET_EARLY_REMAT_MODES. */
28468 static void
28469 aarch64_select_early_remat_modes (sbitmap modes)
28471 /* SVE values are not normally live across a call, so it should be
28472 worth doing early rematerialization even in VL-specific mode. */
28473 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
28474 if (aarch64_sve_mode_p ((machine_mode) i))
28475 bitmap_set_bit (modes, i);
28478 /* Override the default target speculation_safe_value. */
28479 static rtx
28480 aarch64_speculation_safe_value (machine_mode mode,
28481 rtx result, rtx val, rtx failval)
28483 /* Maybe we should warn if falling back to hard barriers. They are
28484 likely to be noticeably more expensive than the alternative below. */
28485 if (!aarch64_track_speculation)
28486 return default_speculation_safe_value (mode, result, val, failval);
28488 if (!REG_P (val))
28489 val = copy_to_mode_reg (mode, val);
28491 if (!aarch64_reg_or_zero (failval, mode))
28492 failval = copy_to_mode_reg (mode, failval);
28494 emit_insn (gen_despeculate_copy (mode, result, val, failval));
28495 return result;
28498 /* Implement TARGET_ESTIMATED_POLY_VALUE.
28499 Look into the tuning structure for an estimate.
28500 KIND specifies the type of requested estimate: min, max or likely.
28501 For cores with a known SVE width all three estimates are the same.
28502 For generic SVE tuning we want to distinguish the maximum estimate from
28503 the minimum and likely ones.
28504 The likely estimate is the same as the minimum in that case to give a
28505 conservative behavior of auto-vectorizing with SVE when it is a win
28506 even for 128-bit SVE.
28507 When SVE width information is available VAL.coeffs[1] is multiplied by
28508 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
28510 static HOST_WIDE_INT
28511 aarch64_estimated_poly_value (poly_int64 val,
28512 poly_value_estimate_kind kind
28513 = POLY_VALUE_LIKELY)
28515 unsigned int width_source = aarch64_tune_params.sve_width;
28517 /* If there is no core-specific information then the minimum and likely
28518 values are based on 128-bit vectors and the maximum is based on
28519 the architectural maximum of 2048 bits. */
28520 if (width_source == SVE_SCALABLE)
28521 switch (kind)
28523 case POLY_VALUE_MIN:
28524 case POLY_VALUE_LIKELY:
28525 return val.coeffs[0];
28526 case POLY_VALUE_MAX:
28527 return val.coeffs[0] + val.coeffs[1] * 15;
28530 /* Allow sve_width to be a bitmask of different VLs, treating the lowest
28531 as likely. This could be made more general if future -mtune options
28532 need it to be. */
28533 if (kind == POLY_VALUE_MAX)
28534 width_source = 1 << floor_log2 (width_source);
28535 else
28536 width_source = least_bit_hwi (width_source);
28538 /* If the core provides width information, use that. */
28539 HOST_WIDE_INT over_128 = width_source - 128;
28540 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
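/* Editor's note (worked example, not part of the original source): for a
   poly_int64 of 4 + 4x (e.g. the number of 32-bit elements in an SVE
   vector), generic SVE tuning gives min/likely == 4 and max == 4 + 4 * 15
   == 64, whereas a core with sve_width == 256 gives
   4 + 4 * (256 - 128) / 128 == 8.  */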
28544 /* Return true for types that could be supported as SIMD return or
28545 argument types. */
28547 static bool
28548 supported_simd_type (tree t)
28550 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
28552 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
28553 return s == 1 || s == 2 || s == 4 || s == 8;
28555 return false;
28558 /* Determine the lane size for the clone argument/return type. This follows
28559 the LS(P) rule in the VFABIA64. */
28561 static unsigned
28562 lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
28564 gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
28566 /* For non map-to-vector types that are pointers we use the type they
28567 point to. */
28568 if (POINTER_TYPE_P (type))
28569 switch (clone_arg_type)
28571 default:
28572 break;
28573 case SIMD_CLONE_ARG_TYPE_UNIFORM:
28574 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
28575 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
28576 type = TREE_TYPE (type);
28577 break;
28580 /* For types that are integers or floating point (or, for pointers that are
28581 not map-to-vector arguments, the integer or floating-point type they point
28582 to), we use their size if it is 1, 2, 4 or 8 bytes. */
28583 if (INTEGRAL_TYPE_P (type)
28584 || SCALAR_FLOAT_TYPE_P (type))
28585 switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
28587 default:
28588 break;
28589 case 1:
28590 case 2:
28591 case 4:
28592 case 8:
28593 return TYPE_PRECISION (type);
28595 /* For any other type we use the size of uintptr_t. For map-to-vector types
28596 that are pointers, using the size of uintptr_t is the same as using the
28597 size of their type, since all pointers are the same size as uintptr_t. */
28598 return POINTER_SIZE;
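/* Illustrative applications of the rule above, assuming an LP64 target:
   - a 'short' argument or return value has a lane size of 16 bits;
   - a uniform 'int *' argument uses the pointee type and so has 32 bits;
   - a vector (map-to-vector) 'int *' argument uses the pointer itself
     and so has 64 bits;
   - anything whose size is not 1, 2, 4 or 8 bytes falls back to
     POINTER_SIZE (64 bits).  */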
28602 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28604 static int
28605 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
28606 struct cgraph_simd_clone *clonei,
28607 tree base_type ATTRIBUTE_UNUSED,
28608 int num, bool explicit_p)
28610 tree t, ret_type;
28611 unsigned int nds_elt_bits;
28612 unsigned HOST_WIDE_INT const_simdlen;
28614 if (!TARGET_SIMD)
28615 return 0;
28617 /* For now, SVE simdclones won't produce an illegal simdlen, so we only
28618 check constant simdlens here. */
28619 if (maybe_ne (clonei->simdlen, 0U)
28620 && clonei->simdlen.is_constant (&const_simdlen)
28621 && (const_simdlen < 2
28622 || const_simdlen > 1024
28623 || (const_simdlen & (const_simdlen - 1)) != 0))
28625 if (explicit_p)
28626 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28627 "unsupported simdlen %wd", const_simdlen);
28628 return 0;
28631 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
28632 /* According to AArch64's Vector ABI the type that determines the simdlen is
28633 the narrowest of types, so we ignore base_type for AArch64. */
28634 if (TREE_CODE (ret_type) != VOID_TYPE
28635 && !supported_simd_type (ret_type))
28637 if (!explicit_p)
28639 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
28640 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28641 "GCC does not currently support return type %qT "
28642 "for simd", ret_type);
28643 else
28644 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28645 "unsupported return type %qT for simd",
28646 ret_type);
28647 return 0;
28650 auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
28652 /* We are looking for the NDS type here according to the VFABIA64. */
28653 if (TREE_CODE (ret_type) != VOID_TYPE)
28655 nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
28656 vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
28658 else
28659 nds_elt_bits = POINTER_SIZE;
28661 int i;
28662 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
28663 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
28664 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
28665 t && t != void_list_node; t = TREE_CHAIN (t), i++)
28667 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
28668 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
28669 && !supported_simd_type (arg_type))
28671 if (!explicit_p)
28673 else if (COMPLEX_FLOAT_TYPE_P (arg_type))
28674 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28675 "GCC does not currently support argument type %qT "
28676 "for simd", arg_type);
28677 else
28678 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28679 "unsupported argument type %qT for simd",
28680 arg_type);
28681 return 0;
28683 unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
28684 if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
28685 vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
28686 if (nds_elt_bits > lane_bits)
28687 nds_elt_bits = lane_bits;
28690 clonei->vecsize_mangle = 'n';
28691 clonei->mask_mode = VOIDmode;
28692 poly_uint64 simdlen;
28693 auto_vec<poly_uint64> simdlens (2);
28694 /* Keep track of the possible simdlens the clones of this function can have,
28695 and check them later to see if we support them. */
28696 if (known_eq (clonei->simdlen, 0U))
28698 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28699 if (maybe_ne (simdlen, 1U))
28700 simdlens.safe_push (simdlen);
28701 simdlens.safe_push (simdlen * 2);
28703 else
28704 simdlens.safe_push (clonei->simdlen);
28706 clonei->vecsize_int = 0;
28707 clonei->vecsize_float = 0;
28709 /* We currently do not support generating simdclones where vector arguments
28710 do not fit into a single vector register, i.e. vector types that are more
28711 than 128 bits wide. This is because of how we currently represent such
28712 types in ACLE, where we use a struct to allow us to pass them as arguments
28713 and return values.
28714 Hence we have to check whether any of the simdlens available for this
28715 simdclone would cause a vector type to be wider than 128 bits, and reject
28716 such a clone. */
28717 unsigned j = 0;
28718 while (j < simdlens.length ())
28720 bool remove_simdlen = false;
28721 for (auto elt : vec_elts)
28722 if (known_gt (simdlens[j] * elt.second, 128U))
28724 /* Don't issue a warning for every simdclone when there is no
28725 specific simdlen clause. */
28726 if (explicit_p && maybe_ne (clonei->simdlen, 0U))
28727 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28728 "GCC does not currently support simdlen %wd for "
28729 "type %qT",
28730 constant_lower_bound (simdlens[j]), elt.first);
28731 remove_simdlen = true;
28732 break;
28734 if (remove_simdlen)
28735 simdlens.ordered_remove (j);
28736 else
28737 j++;
28741 int count = simdlens.length ();
28742 if (count == 0)
28744 if (explicit_p && known_eq (clonei->simdlen, 0U))
28746 /* Warn the user if we can't generate any simdclone. */
28747 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28748 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28749 "GCC does not currently support a simdclone with simdlens"
28750 " %wd and %wd for these types.",
28751 constant_lower_bound (simdlen),
28752 constant_lower_bound (simdlen*2));
28754 return 0;
28757 gcc_assert (num < count);
28758 clonei->simdlen = simdlens[num];
28759 return count;
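/* Hypothetical example (not from the original source): for

     #pragma omp declare simd notinbranch
     float f (float x);

   the NDS is 32 bits, so with no explicit simdlen the candidate simdlens are
   64 / 32 = 2 and 4.  Both keep every vector argument within 128 bits, so
   two Advanced SIMD clones are advertised (roughly _ZGVnN2v_f and
   _ZGVnN4v_f in the Vector Function ABI naming).  */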
28762 /* Implement TARGET_SIMD_CLONE_ADJUST. */
28764 static void
28765 aarch64_simd_clone_adjust (struct cgraph_node *node)
28767 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
28768 use the correct ABI. */
28770 tree t = TREE_TYPE (node->decl);
28771 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
28772 TYPE_ATTRIBUTES (t));
28775 /* Implement TARGET_SIMD_CLONE_USABLE. */
28777 static int
28778 aarch64_simd_clone_usable (struct cgraph_node *node)
28780 switch (node->simdclone->vecsize_mangle)
28782 case 'n':
28783 if (!TARGET_SIMD)
28784 return -1;
28785 return 0;
28786 default:
28787 gcc_unreachable ();
28791 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
28793 static int
28794 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
28796 auto check_attr = [&](const char *ns, const char *name) {
28797 tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
28798 tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
28799 if (!attr1 && !attr2)
28800 return true;
28802 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
28805 if (!check_attr ("gnu", "aarch64_vector_pcs"))
28806 return 0;
28807 if (!check_attr ("gnu", "Advanced SIMD type"))
28808 return 0;
28809 if (!check_attr ("gnu", "SVE type"))
28810 return 0;
28811 if (!check_attr ("gnu", "SVE sizeless type"))
28812 return 0;
28813 if (!check_attr ("arm", "streaming"))
28814 return 0;
28815 if (!check_attr ("arm", "streaming_compatible"))
28816 return 0;
28817 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
28818 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
28819 return 0;
28820 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
28821 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
28822 return 0;
28823 return 1;
28826 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
28828 static tree
28829 aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
28831 tree old_attrs = DECL_ATTRIBUTES (olddecl);
28832 tree old_new = lookup_attribute ("arm", "new", old_attrs);
28834 tree new_attrs = DECL_ATTRIBUTES (newdecl);
28835 tree new_new = lookup_attribute ("arm", "new", new_attrs);
28837 if (DECL_INITIAL (olddecl) && new_new)
28839 error ("cannot apply attribute %qs to %q+D after the function"
28840 " has been defined", "new", newdecl);
28841 inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
28842 newdecl);
28844 else
28846 if (old_new && new_new)
28848 old_attrs = remove_attribute ("arm", "new", old_attrs);
28849 TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
28850 TREE_VALUE (old_new));
28852 if (new_new)
28853 aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
28856 return merge_attributes (old_attrs, new_attrs);
28859 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
28861 static const char *
28862 aarch64_get_multilib_abi_name (void)
28864 if (TARGET_BIG_END)
28865 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
28866 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
28869 /* Implement TARGET_STACK_PROTECT_GUARD. If the guard is a
28870 global variable, use the default implementation; otherwise
28871 return a null tree. */
28872 static tree
28873 aarch64_stack_protect_guard (void)
28875 if (aarch64_stack_protector_guard == SSP_GLOBAL)
28876 return default_stack_protect_guard ();
28878 return NULL_TREE;
28881 /* Return the diagnostic message string if the binary operation OP is
28882 not permitted on TYPE1 and TYPE2, NULL otherwise. */
28884 static const char *
28885 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
28886 const_tree type2)
28888 if (VECTOR_TYPE_P (type1)
28889 && VECTOR_TYPE_P (type2)
28890 && !TYPE_INDIVISIBLE_P (type1)
28891 && !TYPE_INDIVISIBLE_P (type2)
28892 && (aarch64_sve::builtin_type_p (type1)
28893 != aarch64_sve::builtin_type_p (type2)))
28894 return N_("cannot combine GNU and SVE vectors in a binary operation");
28896 /* Operation allowed. */
28897 return NULL;
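/* Illustrative example (assumed, not from the original source): with
   -msve-vector-bits=256,

     typedef int gnu_v8si __attribute__ ((vector_size (32)));
     gnu_v8si a; svint32_t b;
     ... a + b ...

   is rejected with the message above, since exactly one operand is an SVE
   ACLE builtin type.  */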
28900 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
28901 compiler that we automatically ignore the top byte of our pointers, which
28902 allows using -fsanitize=hwaddress. */
28903 bool
28904 aarch64_can_tag_addresses ()
28906 return !TARGET_ILP32;
28909 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
28910 section at the end if needed. */
28911 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
28912 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
28913 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
28914 void
28915 aarch64_file_end_indicate_exec_stack ()
28917 file_end_indicate_exec_stack ();
28919 unsigned feature_1_and = 0;
28920 if (aarch_bti_enabled ())
28921 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
28923 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
28924 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
28926 if (feature_1_and)
28928 /* Generate .note.gnu.property section. */
28929 switch_to_section (get_section (".note.gnu.property",
28930 SECTION_NOTYPE, NULL));
28932 /* PT_NOTE header: namesz, descsz, type.
28933 namesz = 4 ("GNU\0")
28934 descsz = 16 (Size of the program property array)
28935 [(12 + padding) * Number of array elements]
28936 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
28937 assemble_align (POINTER_SIZE);
28938 assemble_integer (GEN_INT (4), 4, 32, 1);
28939 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
28940 assemble_integer (GEN_INT (5), 4, 32, 1);
28942 /* PT_NOTE name. */
28943 assemble_string ("GNU", 4);
28945 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
28946 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
28947 datasz = 4
28948 data = feature_1_and. */
28949 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
28950 assemble_integer (GEN_INT (4), 4, 32, 1);
28951 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
28953 /* Pad the size of the note to the required alignment. */
28954 assemble_align (POINTER_SIZE);
28957 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
28958 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
28959 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
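/* For reference (illustrative; exact directives depend on the assembler):
   with -mbranch-protection=standard the code above emits roughly

	.section	.note.gnu.property,"a"
	.align	3
	.word	4		// namesz
	.word	16		// descsz
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.align	3
*/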
28961 /* Helper function for straight line speculation.
28962 Return what barrier should be emitted for straight line speculation
28963 mitigation.
28964 When not mitigating against straight line speculation this function returns
28965 an empty string.
28966 When mitigating against straight line speculation, use:
28967 * SB when the v8.5-A SB extension is enabled.
28968 * DSB+ISB otherwise. */
28969 const char *
28970 aarch64_sls_barrier (int mitigation_required)
28972 return mitigation_required
28973 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
28974 : "";
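/* Illustrative effect of -mharden-sls=blr (assumed example): an indirect
   call such as

	blr	x1

   is instead emitted as

	bl	__call_indirect_x1

   where the stub (emitted further below) contains

   __call_indirect_x1:
	mov	x16, x1
	br	x16
	sb			// or "dsb sy; isb" without the SB extension

   so that the speculation barrier stays off the architecturally executed
   path.  */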
28977 static GTY (()) tree aarch64_sls_shared_thunks[30];
28978 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
28979 const char *indirect_symbol_names[30] = {
28980 "__call_indirect_x0",
28981 "__call_indirect_x1",
28982 "__call_indirect_x2",
28983 "__call_indirect_x3",
28984 "__call_indirect_x4",
28985 "__call_indirect_x5",
28986 "__call_indirect_x6",
28987 "__call_indirect_x7",
28988 "__call_indirect_x8",
28989 "__call_indirect_x9",
28990 "__call_indirect_x10",
28991 "__call_indirect_x11",
28992 "__call_indirect_x12",
28993 "__call_indirect_x13",
28994 "__call_indirect_x14",
28995 "__call_indirect_x15",
28996 "", /* "__call_indirect_x16", */
28997 "", /* "__call_indirect_x17", */
28998 "__call_indirect_x18",
28999 "__call_indirect_x19",
29000 "__call_indirect_x20",
29001 "__call_indirect_x21",
29002 "__call_indirect_x22",
29003 "__call_indirect_x23",
29004 "__call_indirect_x24",
29005 "__call_indirect_x25",
29006 "__call_indirect_x26",
29007 "__call_indirect_x27",
29008 "__call_indirect_x28",
29009 "__call_indirect_x29",
29012 /* Function to create a BLR thunk. This thunk is used to mitigate straight
29013 line speculation. Instead of a simple BLR that can be speculated past,
29014 we emit a BL to this thunk, and this thunk contains a BR to the relevant
29015 register. These thunks have the relevant speculation barriers put after
29016 their indirect branch so that speculation is blocked.
29018 We use such a thunk so the speculation barriers are kept off the
29019 architecturally executed path in order to reduce the performance overhead.
29021 When optimizing for size we use stubs shared by the linked object.
29022 When optimizing for performance we emit stubs for each function in the hope
29023 that the branch predictor can better train on jumps specific for a given
29024 function. */
29025 rtx
29026 aarch64_sls_create_blr_label (int regnum)
29028 gcc_assert (STUB_REGNUM_P (regnum));
29029 if (optimize_function_for_size_p (cfun))
29031 /* For the thunks shared between different functions in this compilation
29032 unit we use a named symbol -- this is just so that users can more easily
29033 understand the generated assembly. */
29034 aarch64_sls_shared_thunks_needed = true;
29035 const char *thunk_name = indirect_symbol_names[regnum];
29036 if (aarch64_sls_shared_thunks[regnum] == NULL)
29038 /* Build a decl representing this function stub and record it for
29039 later. We build a decl here so we can use the GCC machinery for
29040 handling sections automatically (through `get_named_section` and
29041 `make_decl_one_only`). That saves us a lot of trouble handling
29042 the specifics of different output file formats. */
29043 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
29044 get_identifier (thunk_name),
29045 build_function_type_list (void_type_node,
29046 NULL_TREE));
29047 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
29048 NULL_TREE, void_type_node);
29049 TREE_PUBLIC (decl) = 1;
29050 TREE_STATIC (decl) = 1;
29051 DECL_IGNORED_P (decl) = 1;
29052 DECL_ARTIFICIAL (decl) = 1;
29053 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29054 resolve_unique_section (decl, 0, false);
29055 aarch64_sls_shared_thunks[regnum] = decl;
29058 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
29061 if (cfun->machine->call_via[regnum] == NULL)
29062 cfun->machine->call_via[regnum]
29063 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
29064 return cfun->machine->call_via[regnum];
29067 /* Helper function for aarch64_sls_emit_blr_function_thunks and
29068 aarch64_sls_emit_shared_blr_thunks below. */
29069 static void
29070 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
29072 /* Save in x16 and branch to that function so this transformation does
29073 not prevent jumping to `BTI c` instructions. */
29074 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
29075 asm_fprintf (out_file, "\tbr\tx16\n");
29078 /* Emit all BLR stubs for this particular function.
29079 Here we emit all the BLR stubs needed for the current function. Since we
29080 emit these stubs in a consecutive block we know there will be no speculation
29081 gadgets between each stub, and hence we only emit a speculation barrier at
29082 the end of the stub sequences.
29084 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29085 void
29086 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
29088 if (! aarch64_harden_sls_blr_p ())
29089 return;
29091 bool any_functions_emitted = false;
29092 /* We must save and restore the current function section since this assembly
29093 is emitted at the end of the function. This means it can be emitted *just
29094 after* the cold section of a function. That cold part would be emitted in
29095 a different section. That switch would trigger a `.cfi_endproc` directive
29096 to be emitted in the original section and a `.cfi_startproc` directive to
29097 be emitted in the new section. Switching to the original section without
29098 restoring would mean that the `.cfi_endproc` emitted as a function ends
29099 would happen in a different section -- leaving an unmatched
29100 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29101 in the standard text section. */
29102 section *save_text_section = in_section;
29103 switch_to_section (function_section (current_function_decl));
29104 for (int regnum = 0; regnum < 30; ++regnum)
29106 rtx specu_label = cfun->machine->call_via[regnum];
29107 if (specu_label == NULL)
29108 continue;
29110 targetm.asm_out.print_operand (out_file, specu_label, 0);
29111 asm_fprintf (out_file, ":\n");
29112 aarch64_sls_emit_function_stub (out_file, regnum);
29113 any_functions_emitted = true;
29115 if (any_functions_emitted)
29116 /* We can use the SB barrier here if need be, since this stub will only be
29117 used by the current function, and hence for the current target. */
29118 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
29119 switch_to_section (save_text_section);
29122 /* Emit shared BLR stubs for the current compilation unit.
29123 Over the course of compiling this unit we may have converted some BLR
29124 instructions to a BL to a shared stub function. This is where we emit those
29125 stub functions.
29126 This function is for the stubs shared between different functions in this
29127 compilation unit. We share when optimizing for size instead of speed.
29129 This function is called through the TARGET_ASM_FILE_END hook. */
29130 void
29131 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
29133 if (! aarch64_sls_shared_thunks_needed)
29134 return;
29136 for (int regnum = 0; regnum < 30; ++regnum)
29138 tree decl = aarch64_sls_shared_thunks[regnum];
29139 if (!decl)
29140 continue;
29142 const char *name = indirect_symbol_names[regnum];
29143 switch_to_section (get_named_section (decl, NULL, 0));
29144 ASM_OUTPUT_ALIGN (out_file, 2);
29145 targetm.asm_out.globalize_label (out_file, name);
29146 /* Only emits if the compiler is configured for an assembler that can
29147 handle visibility directives. */
29148 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
29149 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
29150 ASM_OUTPUT_LABEL (out_file, name);
29151 aarch64_sls_emit_function_stub (out_file, regnum);
29152 /* Use the most conservative target to ensure it can always be used by any
29153 function in the translation unit. */
29154 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
29155 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
29159 /* Implement TARGET_ASM_FILE_END. */
29160 void
29161 aarch64_asm_file_end ()
29163 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
29164 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29165 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29166 for FreeBSD) still gets called. */
29167 #ifdef TARGET_ASM_FILE_END
29168 TARGET_ASM_FILE_END ();
29169 #endif
29172 const char *
29173 aarch64_indirect_call_asm (rtx addr)
29175 gcc_assert (REG_P (addr));
29176 if (aarch64_harden_sls_blr_p ())
29178 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
29179 output_asm_insn ("bl\t%0", &stub_label);
29181 else
29182 output_asm_insn ("blr\t%0", &addr);
29183 return "";
29186 /* Emit the assembly instruction to load the thread pointer into DEST.
29187 Select between different tpidr_elN registers depending on -mtp= setting. */
29189 const char *
29190 aarch64_output_load_tp (rtx dest)
29192 const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29193 "tpidr_el3", "tpidrro_el0"};
29194 char buffer[64];
29195 snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
29196 tpidrs[aarch64_tpidr_register]);
29197 output_asm_insn (buffer, &dest);
29198 return "";
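/* Illustrative examples (assuming the -mtp= option values map directly onto
   the array above): -mtp=el0, the default, emits "mrs %0, tpidr_el0";
   -mtp=el2 emits "mrs %0, tpidr_el2"; -mtp=tpidrro_el0 emits
   "mrs %0, tpidrro_el0".  */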
29201 /* Set up the value of REG_ALLOC_ORDER from scratch.
29203 It was previously good practice to put call-clobbered registers ahead
29204 of call-preserved registers, but that isn't necessary these days.
29205 IRA's model of register save/restore costs is much more sophisticated
29206 than the model that a simple ordering could provide. We leave
29207 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29208 of IRA's model.
29210 However, it is still useful to list registers that are members of
29211 multiple classes after registers that are members of fewer classes.
29212 For example, we have:
29214 - FP_LO8_REGS: v0-v7
29215 - FP_LO_REGS: v0-v15
29216 - FP_REGS: v0-v31
29218 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29219 we run the risk of starving other (lower-priority) pseudos that
29220 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29221 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29222 Allocating downwards rather than upwards avoids this problem, at least
29223 in code that has reasonable register pressure.
29225 The situation for predicate registers is similar. */
29227 void
29228 aarch64_adjust_reg_alloc_order ()
29230 for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
29231 if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
29232 reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
29233 else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
29234 reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
29235 else
29236 reg_alloc_order[i] = i;
29239 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29240 of vector mode MODE to select half the elements of that vector.
29241 Allow any combination of indices except duplicates (or out of range of
29242 the mode units). */
29244 bool
29245 aarch64_parallel_select_half_p (machine_mode mode, rtx par)
29247 int nunits = XVECLEN (par, 0);
29248 if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
29249 return false;
29250 int mode_nunits = nunits * 2;
29251 /* Put all the elements of PAR into a hash_set and use its
29252 uniqueness guarantees to check that we don't try to insert the same
29253 element twice. */
29254 hash_set<rtx> parset;
29255 for (int i = 0; i < nunits; ++i)
29257 rtx elt = XVECEXP (par, 0, i);
29258 if (!CONST_INT_P (elt)
29259 || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
29260 || parset.add (elt))
29261 return false;
29263 return true;
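/* Examples for V4SImode (illustrative): (parallel [(const_int 0)
   (const_int 2)]) and (parallel [(const_int 3) (const_int 1)]) are both
   accepted; (parallel [(const_int 1) (const_int 1)]) is rejected because of
   the duplicate, and any PARALLEL whose length is not exactly half the
   number of mode units is rejected up front.  */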
29266 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29267 contain any common elements. */
29269 bool
29270 aarch64_pars_overlap_p (rtx par1, rtx par2)
29272 int len1 = XVECLEN (par1, 0);
29273 int len2 = XVECLEN (par2, 0);
29274 hash_set<rtx> parset;
29275 for (int i = 0; i < len1; ++i)
29276 parset.add (XVECEXP (par1, 0, i));
29277 for (int i = 0; i < len2; ++i)
29278 if (parset.contains (XVECEXP (par2, 0, i)))
29279 return true;
29280 return false;
29283 /* Implement OPTIMIZE_MODE_SWITCHING. */
29285 bool
29286 aarch64_optimize_mode_switching (aarch64_mode_entity entity)
29288 bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
29289 || (aarch64_cfun_has_new_state ("za")
29290 && df_regs_ever_live_p (ZA_REGNUM))
29291 || (aarch64_cfun_has_new_state ("zt0")
29292 && df_regs_ever_live_p (ZT0_REGNUM)));
29294 if (have_sme_state && nonlocal_goto_handler_labels)
29296 static bool reported;
29297 if (!reported)
29299 sorry ("non-local gotos in functions with SME state");
29300 reported = true;
29304 switch (entity)
29306 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29307 case aarch64_mode_entity::LOCAL_SME_STATE:
29308 return have_sme_state && !nonlocal_goto_handler_labels;
29310 gcc_unreachable ();
29313 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29315 static void
29316 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
29317 aarch64_tristate_mode prev_mode)
29319 if (mode == aarch64_tristate_mode::YES)
29321 gcc_assert (prev_mode == aarch64_tristate_mode::NO);
29322 aarch64_init_tpidr2_block ();
29324 else
29325 gcc_unreachable ();
29328 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29330 static void
29331 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
29332 aarch64_local_sme_state prev_mode)
29334 /* Back-propagation should ensure that we're always starting from
29335 a known mode. */
29336 gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
29338 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29340 /* Commit any uncommitted lazy save. This leaves ZA either active
29341 and zero (lazy save case) or off (normal case).
29343 The sequence is:
29345 mrs <temp>, tpidr2_el0
29346 cbz <temp>, no_save
29347 bl __arm_tpidr2_save
29348 msr tpidr2_el0, xzr
29349 zero { za } // Only if ZA is live
29350 zero { zt0 } // Only if ZT0 is live
29351 no_save: */
29352 auto tmp_reg = gen_reg_rtx (DImode);
29353 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
29354 auto label = gen_label_rtx ();
29355 rtx branch = aarch64_gen_compare_zero_and_branch (EQ, tmp_reg, label);
29356 auto jump = emit_jump_insn (branch);
29357 JUMP_LABEL (jump) = label;
29358 emit_insn (gen_aarch64_tpidr2_save ());
29359 emit_insn (gen_aarch64_clear_tpidr2 ());
29360 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29361 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29363 if (aarch64_cfun_has_state ("za"))
29364 emit_insn (gen_aarch64_initial_zero_za ());
29365 if (aarch64_cfun_has_state ("zt0"))
29366 emit_insn (gen_aarch64_sme_zero_zt0 ());
29368 emit_label (label);
29371 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29372 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29374 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29376 /* Make ZA active after being inactive.
29378 First handle the case in which the lazy save we set up was
29379 committed by a callee. If the function's source-level ZA state
29380 is live then we must conditionally restore it from the lazy
29381 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29382 if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
29383 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29384 else
29385 emit_insn (gen_aarch64_smstart_za ());
29387 /* Now handle the case in which the lazy save was not committed.
29388 In that case, ZA still contains the current function's ZA state,
29389 and we just need to cancel the lazy save. */
29390 emit_insn (gen_aarch64_clear_tpidr2 ());
29392 /* Restore the ZT0 state, if we have some. */
29393 if (aarch64_cfun_has_state ("zt0"))
29394 aarch64_restore_zt0 (true);
29396 return;
29399 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
29401 /* Retrieve the current function's ZA state from the lazy save
29402 buffer. */
29403 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29405 /* Restore the ZT0 state, if we have some. */
29406 if (aarch64_cfun_has_state ("zt0"))
29407 aarch64_restore_zt0 (true);
29408 return;
29411 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
29412 || prev_mode == aarch64_local_sme_state::OFF)
29414 /* INACTIVE_CALLER means that we are enabling ZA for the first
29415 time in this function. The code above means that ZA is either
29416 active and zero (if we committed a lazy save) or off. Handle
29417 the latter case by forcing ZA on.
29419 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29420 to force it to 1.
29422 Both cases leave ZA zeroed. */
29423 emit_insn (gen_aarch64_smstart_za ());
29425 /* Restore the ZT0 state, if we have some. */
29426 if (prev_mode == aarch64_local_sme_state::OFF
29427 && aarch64_cfun_has_state ("zt0"))
29428 aarch64_restore_zt0 (true);
29429 return;
29432 if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29433 || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
29434 /* A simple change in liveness, such as in a CFG structure where
29435 ZA is only conditionally defined. No code is needed. */
29436 return;
29438 gcc_unreachable ();
29441 if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29443 if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29444 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29445 || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29447 /* Save the ZT0 state, if we have some. */
29448 if (aarch64_cfun_has_state ("zt0"))
29449 aarch64_save_zt0 ();
29451 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29452 case of setting up a lazy save buffer before a call.
29453 A transition from INACTIVE_CALLER is similar, except that
29454 the contents of ZA are known to be zero.
29456 A transition from ACTIVE_DEAD means that ZA is live at the
29457 point of the transition, but is dead on at least one incoming
29458 edge. (That is, ZA is only conditionally initialized.)
29459 For efficiency, we want to set up a lazy save even for
29460 dead contents, since forcing ZA off would make later code
29461 restore ZA from the lazy save buffer. */
29462 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29463 return;
29466 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
29467 || prev_mode == aarch64_local_sme_state::OFF)
29468 /* We're simply discarding the information about which inactive
29469 state applies. */
29470 return;
29472 gcc_unreachable ();
29475 if (mode == aarch64_local_sme_state::INACTIVE_CALLER
29476 || mode == aarch64_local_sme_state::OFF)
29478 /* Save the ZT0 state, if we have some. */
29479 if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29480 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
29481 && mode == aarch64_local_sme_state::OFF
29482 && aarch64_cfun_has_state ("zt0"))
29483 aarch64_save_zt0 ();
29485 /* The transition to INACTIVE_CALLER is used before returning from
29486 new("za") functions. Any state in ZA belongs to the current
29487 function rather than a caller, but that state is no longer
29488 needed. Clear any pending lazy save and turn ZA off.
29490 The transition to OFF is used before calling a private-ZA function.
29491 We committed any incoming lazy save above, so at this point any
29492 contents in ZA belong to the current function. */
29493 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29494 emit_insn (gen_aarch64_clear_tpidr2 ());
29496 if (prev_mode != aarch64_local_sme_state::OFF
29497 && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
29498 emit_insn (gen_aarch64_smstop_za ());
29500 return;
29503 if (mode == aarch64_local_sme_state::SAVED_LOCAL)
29505 /* This is a transition to an exception handler. */
29506 gcc_assert (prev_mode == aarch64_local_sme_state::OFF
29507 || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
29508 return;
29511 gcc_unreachable ();
29514 /* Implement TARGET_MODE_EMIT. */
29516 static void
29517 aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
29519 if (mode == prev_mode)
29520 return;
29522 start_sequence ();
29523 switch (aarch64_mode_entity (entity))
29525 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29526 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
29527 aarch64_tristate_mode (prev_mode));
29528 break;
29530 case aarch64_mode_entity::LOCAL_SME_STATE:
29531 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
29532 aarch64_local_sme_state (prev_mode));
29533 break;
29535 rtx_insn *seq = get_insns ();
29536 end_sequence ();
29538 /* Get the set of clobbered registers that are currently live. */
29539 HARD_REG_SET clobbers = {};
29540 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
29542 if (!NONDEBUG_INSN_P (insn))
29543 continue;
29544 vec_rtx_properties properties;
29545 properties.add_insn (insn, false);
29546 for (rtx_obj_reference ref : properties.refs ())
29547 if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
29548 SET_HARD_REG_BIT (clobbers, ref.regno);
29550 clobbers &= live;
29552 /* Emit instructions to save clobbered registers to pseudos. Queue
29553 instructions to restore the registers afterwards.
29555 This should only be needed in rare situations. */
29556 auto_vec<rtx, 33> after;
29557 for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
29558 if (TEST_HARD_REG_BIT (clobbers, regno))
29560 rtx hard_reg = gen_rtx_REG (DImode, regno);
29561 rtx pseudo_reg = gen_reg_rtx (DImode);
29562 emit_move_insn (pseudo_reg, hard_reg);
29563 after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
29565 if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
29567 rtx pseudo_reg = gen_reg_rtx (DImode);
29568 emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
29569 after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
29572 /* Emit the transition instructions themselves. */
29573 emit_insn (seq);
29575 /* Restore the clobbered registers. */
29576 for (auto *insn : after)
29577 emit_insn (insn);
29580 /* Return true if INSN references the SME state represented by hard register
29581 REGNO. */
29583 static bool
29584 aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
29586 df_ref ref;
29587 FOR_EACH_INSN_DEF (ref, insn)
29588 if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
29589 && DF_REF_REGNO (ref) == regno)
29590 return true;
29591 FOR_EACH_INSN_USE (ref, insn)
29592 if (DF_REF_REGNO (ref) == regno)
29593 return true;
29594 return false;
29597 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29599 static aarch64_local_sme_state
29600 aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
29602 if (!CALL_P (insn)
29603 && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29605 static bool reported;
29606 if (!reported)
29608 sorry ("catching non-call exceptions in functions with SME state");
29609 reported = true;
29611 /* Aim for graceful error recovery by picking the value that is
29612 least likely to generate an ICE. */
29613 return aarch64_local_sme_state::INACTIVE_LOCAL;
29616 /* A non-local goto is equivalent to a return. We disallow non-local
29617 receivers in functions with SME state, so we know that the target
29618 expects ZA to be dormant or off. */
29619 if (JUMP_P (insn)
29620 && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
29621 return aarch64_local_sme_state::INACTIVE_CALLER;
29623 /* start_private_za_call and end_private_za_call bracket a sequence
29624 that calls a private-ZA function. Force ZA to be turned off if the
29625 function doesn't have any live ZA state, otherwise require ZA to be
29626 inactive. */
29627 auto icode = recog_memoized (insn);
29628 if (icode == CODE_FOR_aarch64_start_private_za_call
29629 || icode == CODE_FOR_aarch64_end_private_za_call)
29630 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29631 ? aarch64_local_sme_state::INACTIVE_LOCAL
29632 : aarch64_local_sme_state::OFF);
29634 /* Force ZA to contain the current function's ZA state if INSN wants
29635 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29636 are both controlled by PSTATE.ZA. */
29637 if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
29638 || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
29639 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29640 ? aarch64_local_sme_state::ACTIVE_LIVE
29641 : aarch64_local_sme_state::ACTIVE_DEAD);
29643 return aarch64_local_sme_state::ANY;
29646 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29648 static aarch64_tristate_mode
29649 aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
29651 /* We need to set up a lazy save buffer no later than the first
29652 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29653 if (aarch64_mode_needed_local_sme_state (insn, live)
29654 == aarch64_local_sme_state::INACTIVE_LOCAL)
29655 return aarch64_tristate_mode::YES;
29657 /* Also make sure that the lazy save buffer is set up before the first
29658 insn that throws internally. The exception handler will sometimes
29659 load from it. */
29660 if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29661 return aarch64_tristate_mode::YES;
29663 return aarch64_tristate_mode::MAYBE;
29666 /* Implement TARGET_MODE_NEEDED. */
29668 static int
29669 aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
29671 switch (aarch64_mode_entity (entity))
29673 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29674 return int (aarch64_mode_needed_za_save_buffer (insn, live));
29676 case aarch64_mode_entity::LOCAL_SME_STATE:
29677 return int (aarch64_mode_needed_local_sme_state (insn, live));
29679 gcc_unreachable ();
29682 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29684 static aarch64_local_sme_state
29685 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
29686 HARD_REG_SET live)
29688 /* Note places where ZA dies, so that we can try to avoid saving and
29689 restoring state that isn't needed. */
29690 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29691 && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
29692 return aarch64_local_sme_state::ACTIVE_DEAD;
29694 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29695 function. */
29696 if (mode == aarch64_local_sme_state::ACTIVE_DEAD
29697 && TEST_HARD_REG_BIT (live, ZA_REGNUM))
29698 return aarch64_local_sme_state::ACTIVE_LIVE;
29700 return mode;
29703 /* Implement TARGET_MODE_AFTER. */
29705 static int
29706 aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
29708 switch (aarch64_mode_entity (entity))
29710 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29711 return mode;
29713 case aarch64_mode_entity::LOCAL_SME_STATE:
29714 return int (aarch64_mode_after_local_sme_state
29715 (aarch64_local_sme_state (mode), live));
29717 gcc_unreachable ();
29720 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
29722 static aarch64_local_sme_state
29723 aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
29724 aarch64_local_sme_state mode2)
29726 /* Perform a symmetrical check for two values. */
29727 auto is_pair = [&](aarch64_local_sme_state val1,
29728 aarch64_local_sme_state val2)
29730 return ((mode1 == val1 && mode2 == val2)
29731 || (mode1 == val2 && mode2 == val1));
29734 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
29735 to a caller. OFF is one of the options. */
29736 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
29737 aarch64_local_sme_state::OFF))
29738 return aarch64_local_sme_state::INACTIVE_CALLER;
29740 /* Similarly for dormant contents belonging to the current function. */
29741 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
29742 aarch64_local_sme_state::OFF))
29743 return aarch64_local_sme_state::INACTIVE_LOCAL;
29745 /* Treat a conditionally-initialized value as a fully-initialized value. */
29746 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
29747 aarch64_local_sme_state::ACTIVE_DEAD))
29748 return aarch64_local_sme_state::ACTIVE_LIVE;
29750 return aarch64_local_sme_state::ANY;
29753 /* Implement TARGET_MODE_CONFLUENCE. */
29755 static int
29756 aarch64_mode_confluence (int entity, int mode1, int mode2)
29758 gcc_assert (mode1 != mode2);
29759 switch (aarch64_mode_entity (entity))
29761 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29762 return int (aarch64_tristate_mode::MAYBE);
29764 case aarch64_mode_entity::LOCAL_SME_STATE:
29765 return int (aarch64_local_sme_confluence
29766 (aarch64_local_sme_state (mode1),
29767 aarch64_local_sme_state (mode2)));
29769 gcc_unreachable ();
29772 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
29773 NO throughout, or makes one transition from NO to YES. */
29775 static aarch64_tristate_mode
29776 aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
29777 aarch64_tristate_mode mode2)
29779 /* Keep bringing the transition forward until it starts from NO. */
29780 if (mode1 == aarch64_tristate_mode::MAYBE
29781 && mode2 == aarch64_tristate_mode::YES)
29782 return mode2;
29784 return aarch64_tristate_mode::MAYBE;
29787 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
29789 static aarch64_local_sme_state
29790 aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
29791 aarch64_local_sme_state mode2)
29793 /* We always need to know what the current state is when transitioning
29794 to a new state. Force any location with indeterminate starting state
29795 to be active. */
29796 if (mode1 == aarch64_local_sme_state::ANY)
29797 switch (mode2)
29799 case aarch64_local_sme_state::INACTIVE_CALLER:
29800 case aarch64_local_sme_state::OFF:
29801 case aarch64_local_sme_state::ACTIVE_DEAD:
29802 /* The current function's ZA state is not live. */
29803 return aarch64_local_sme_state::ACTIVE_DEAD;
29805 case aarch64_local_sme_state::INACTIVE_LOCAL:
29806 case aarch64_local_sme_state::ACTIVE_LIVE:
29807 /* The current function's ZA state is live. */
29808 return aarch64_local_sme_state::ACTIVE_LIVE;
29810 case aarch64_local_sme_state::SAVED_LOCAL:
29811 /* This is a transition to an exception handler. Since we don't
29812 support non-call exceptions for SME functions, the source of
29813 the transition must be known. We'll assert later if that's
29814 not the case. */
29815 return aarch64_local_sme_state::ANY;
29817 case aarch64_local_sme_state::ANY:
29818 return aarch64_local_sme_state::ANY;
29821 return aarch64_local_sme_state::ANY;
29824 /* Implement TARGET_MODE_BACKPROP. */
29826 static int
29827 aarch64_mode_backprop (int entity, int mode1, int mode2)
29829 switch (aarch64_mode_entity (entity))
29831 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29832 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
29833 aarch64_tristate_mode (mode2)));
29835 case aarch64_mode_entity::LOCAL_SME_STATE:
29836 return int (aarch64_local_sme_backprop
29837 (aarch64_local_sme_state (mode1),
29838 aarch64_local_sme_state (mode2)));
29840 gcc_unreachable ();
29843 /* Implement TARGET_MODE_ENTRY. */
29845 static int
29846 aarch64_mode_entry (int entity)
29848 switch (aarch64_mode_entity (entity))
29850 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29851 return int (aarch64_tristate_mode::NO);
29853 case aarch64_mode_entity::LOCAL_SME_STATE:
29854 return int (aarch64_cfun_shared_flags ("za") != 0
29855 ? aarch64_local_sme_state::ACTIVE_LIVE
29856 : aarch64_cfun_incoming_pstate_za () != 0
29857 ? aarch64_local_sme_state::ACTIVE_DEAD
29858 : aarch64_local_sme_state::INACTIVE_CALLER);
29860 gcc_unreachable ();
29863 /* Implement TARGET_MODE_EXIT. */
29865 static int
29866 aarch64_mode_exit (int entity)
29868 switch (aarch64_mode_entity (entity))
29870 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29871 return int (aarch64_tristate_mode::MAYBE);
29873 case aarch64_mode_entity::LOCAL_SME_STATE:
29874 return int (aarch64_cfun_shared_flags ("za") != 0
29875 ? aarch64_local_sme_state::ACTIVE_LIVE
29876 : aarch64_cfun_incoming_pstate_za () != 0
29877 ? aarch64_local_sme_state::ACTIVE_DEAD
29878 : aarch64_local_sme_state::INACTIVE_CALLER);
29880 gcc_unreachable ();
29883 /* Implement TARGET_MODE_EH_HANDLER. */
29885 static int
29886 aarch64_mode_eh_handler (int entity)
29888 switch (aarch64_mode_entity (entity))
29890 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29891 /* Require a lazy save buffer to be allocated before the first
29892 insn that can throw. */
29893 return int (aarch64_tristate_mode::YES);
29895 case aarch64_mode_entity::LOCAL_SME_STATE:
29896 return int (aarch64_local_sme_state::SAVED_LOCAL);
29898 gcc_unreachable ();
29901 /* Implement TARGET_MODE_PRIORITY. */
29903 static int
29904 aarch64_mode_priority (int, int n)
29906 return n;
29909 /* Implement TARGET_MD_ASM_ADJUST. */
29911 static rtx_insn *
29912 aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
29913 vec<machine_mode> &input_modes,
29914 vec<const char *> &constraints,
29915 vec<rtx> &uses, vec<rtx> &clobbers,
29916 HARD_REG_SET &clobbered_regs, location_t loc)
29918 rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
29919 uses, clobbers, clobbered_regs, loc);
29921 /* "za" in the clobber list of a function with ZA state is defined to
29922 mean that the asm can read from and write to ZA. We can model the
29923 read using a USE, but unfortunately, it's not possible to model the
29924 write directly. Use a separate insn to model the effect.
29926 We must ensure that ZA is active on entry, which is enforced by using
29927 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
29929 The same thing applies to ZT0. */
29930 if (TARGET_ZA)
29931 for (unsigned int i = clobbers.length (); i-- > 0; )
29933 rtx x = clobbers[i];
29934 if (REG_P (x)
29935 && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
29937 auto id = cfun->machine->next_asm_update_za_id++;
29939 start_sequence ();
29940 if (seq)
29941 emit_insn (seq);
29942 rtx id_rtx = gen_int_mode (id, SImode);
29943 emit_insn (REGNO (x) == ZA_REGNUM
29944 ? gen_aarch64_asm_update_za (id_rtx)
29945 : gen_aarch64_asm_update_zt0 (id_rtx));
29946 seq = get_insns ();
29947 end_sequence ();
29949 auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
29950 uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
29951 uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
29953 clobbers.ordered_remove (i);
29954 CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
29957 return seq;
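/* Illustrative source-level trigger (assumed example): in a function with
   ZA state, e.g. one marked with the ACLE keyword __arm_inout("za"),

     asm volatile ("..." : : : "za");

   is rewritten so that the asm USEs ZA_REGNUM and SME_STATE_REGNUM and is
   followed by an aarch64_asm_update_za instruction that models the
   potential write to ZA.  */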
29960 /* BB is the target of an exception or nonlocal goto edge, which means
29961 that PSTATE.SM is known to be 0 on entry. Put it into the state that
29962 the current function requires. */
29964 static bool
29965 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb)
29967 if (TARGET_NON_STREAMING)
29968 return false;
29970 start_sequence ();
29971 rtx_insn *guard_label = nullptr;
29972 if (TARGET_STREAMING_COMPATIBLE)
29973 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29974 AARCH64_FL_SM_OFF);
29975 aarch64_sme_mode_switch_regs args_switch;
29976 args_switch.add_call_preserved_regs (df_get_live_in (bb));
29977 args_switch.emit_prologue ();
29978 aarch64_switch_pstate_sm (AARCH64_FL_SM_OFF, AARCH64_FL_SM_ON);
29979 args_switch.emit_epilogue ();
29980 if (guard_label)
29981 emit_label (guard_label);
29982 auto seq = get_insns ();
29983 end_sequence ();
29985 emit_insn_after (seq, bb_note (bb));
29986 return true;
29989 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
29990 so arrange to make it so. */
29992 static bool
29993 aarch64_switch_pstate_sm_for_jump (rtx_insn *jump)
29995 if (TARGET_NON_STREAMING)
29996 return false;
29998 start_sequence ();
29999 rtx_insn *guard_label = nullptr;
30000 if (TARGET_STREAMING_COMPATIBLE)
30001 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30002 AARCH64_FL_SM_OFF);
30003 aarch64_switch_pstate_sm (AARCH64_FL_SM_ON, AARCH64_FL_SM_OFF);
30004 if (guard_label)
30005 emit_label (guard_label);
30006 auto seq = get_insns ();
30007 end_sequence ();
30009 emit_insn_before (seq, jump);
30010 return true;
30013 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30014 to switch to the new mode and the instructions needed to restore the
30015 original mode. Return true if something changed. */
30016 static bool
30017 aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
30019 /* Mode switches for sibling calls are handled via the epilogue. */
30020 if (SIBLING_CALL_P (call))
30021 return false;
30023 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
30024 if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
30025 return false;
30027 /* Switch mode before the call, preserving any argument registers
30028 across the switch. */
30029 start_sequence ();
30030 rtx_insn *args_guard_label = nullptr;
30031 if (TARGET_STREAMING_COMPATIBLE)
30032 args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30033 callee_isa_mode);
30034 aarch64_sme_mode_switch_regs args_switch;
30035 args_switch.add_call_args (call);
30036 args_switch.emit_prologue ();
30037 aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
30038 args_switch.emit_epilogue ();
30039 if (args_guard_label)
30040 emit_label (args_guard_label);
30041 auto args_seq = get_insns ();
30042 end_sequence ();
30043 emit_insn_before (args_seq, call);
30045 if (find_reg_note (call, REG_NORETURN, NULL_RTX))
30046 return true;
30048 /* Switch mode after the call, preserving any return registers across
30049 the switch. */
30050 start_sequence ();
30051 rtx_insn *return_guard_label = nullptr;
30052 if (TARGET_STREAMING_COMPATIBLE)
30053 return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
30054 callee_isa_mode);
30055 aarch64_sme_mode_switch_regs return_switch;
30056 return_switch.add_call_result (call);
30057 return_switch.emit_prologue ();
30058 aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
30059 return_switch.emit_epilogue ();
30060 if (return_guard_label)
30061 emit_label (return_guard_label);
30062 auto result_seq = get_insns ();
30063 end_sequence ();
30064 emit_insn_after (result_seq, call);
30065 return true;
30068 namespace {
30070 const pass_data pass_data_switch_pstate_sm =
30072 RTL_PASS, // type
30073 "smstarts", // name
30074 OPTGROUP_NONE, // optinfo_flags
30075 TV_NONE, // tv_id
30076 0, // properties_required
30077 0, // properties_provided
30078 0, // properties_destroyed
30079 0, // todo_flags_start
30080 TODO_df_finish, // todo_flags_finish
30083 class pass_switch_pstate_sm : public rtl_opt_pass
30085 public:
30086 pass_switch_pstate_sm (gcc::context *ctxt)
30087 : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
30090 // opt_pass methods:
30091 bool gate (function *) override final;
30092 unsigned int execute (function *) override final;
30095 bool
30096 pass_switch_pstate_sm::gate (function *fn)
30098 return (aarch64_fndecl_pstate_sm (fn->decl) != AARCH64_FL_SM_OFF
30099 || cfun->machine->call_switches_pstate_sm);
30102 /* Emit any instructions needed to switch PSTATE.SM. */
30103 unsigned int
30104 pass_switch_pstate_sm::execute (function *fn)
30106 basic_block bb;
30108 auto_sbitmap blocks (last_basic_block_for_fn (cfun));
30109 bitmap_clear (blocks);
30110 FOR_EACH_BB_FN (bb, fn)
30112 if (has_abnormal_call_or_eh_pred_edge_p (bb)
30113 && aarch64_switch_pstate_sm_for_landing_pad (bb))
30114 bitmap_set_bit (blocks, bb->index);
30116 if (cfun->machine->call_switches_pstate_sm)
30118 rtx_insn *insn;
30119 FOR_BB_INSNS (bb, insn)
30120 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
30121 if (aarch64_switch_pstate_sm_for_call (call))
30122 bitmap_set_bit (blocks, bb->index);
30125 auto end = BB_END (bb);
30126 if (JUMP_P (end)
30127 && find_reg_note (end, REG_NON_LOCAL_GOTO, NULL_RTX)
30128 && aarch64_switch_pstate_sm_for_jump (end))
30129 bitmap_set_bit (blocks, bb->index);
30131 find_many_sub_basic_blocks (blocks);
30132 clear_aux_for_blocks ();
30133 return 0;
30138 rtl_opt_pass *
30139 make_pass_switch_pstate_sm (gcc::context *ctxt)
30141 return new pass_switch_pstate_sm (ctxt);
30144 /* Parse an implementation-defined system register name of
30145 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30146 Return true if the name matches the above pattern, false
30147 otherwise. */
30148 bool
30149 aarch64_is_implem_def_reg (const char *regname)
30151 unsigned pos = 0;
30152 unsigned name_len = strlen (regname);
30153 if (name_len < 12 || name_len > 14)
30154 return false;
30156 auto cterm_valid_p = [&]()
30158 bool leading_zero_p = false;
30159 unsigned i = 0;
30160 char n[3] = {0};
30162 if (regname[pos] != 'c')
30163 return false;
30164 pos++;
30165 while (regname[pos] != '_')
30167 if (leading_zero_p)
30168 return false;
30169 if (i == 0 && regname[pos] == '0')
30170 leading_zero_p = true;
30171 if (i > 2)
30172 return false;
30173 if (!ISDIGIT (regname[pos]))
30174 return false;
30175 n[i++] = regname[pos++];
30177 if (atoi (n) > 15)
30178 return false;
30179 return true;
30182 if (regname[pos] != 's')
30183 return false;
30184 pos++;
30185 if (regname[pos] < '0' || regname[pos] > '3')
30186 return false;
30187 pos++;
30188 if (regname[pos++] != '_')
30189 return false;
30190 if (regname[pos] < '0' || regname[pos] > '7')
30191 return false;
30192 pos++;
30193 if (regname[pos++] != '_')
30194 return false;
30195 if (!cterm_valid_p ())
30196 return false;
30197 if (regname[pos++] != '_')
30198 return false;
30199 if (!cterm_valid_p ())
30200 return false;
30201 if (regname[pos++] != '_')
30202 return false;
30203 if (regname[pos] < '0' || regname[pos] > '7')
30204 return false;
30205 return true;
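/* Examples (illustrative): "s3_0_c15_c2_0" and "s2_7_c0_c15_7" match the
   pattern above; "s4_0_c1_c0_0" (first field > 3), "s3_0_c16_c0_0"
   (CRn > 15) and "s3_0_c15_c2_8" (last field > 7) do not.  The name must be
   given in lower case.  */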
30208 /* Return true if REGNAME matches either a known permitted system
30209 register name, or a generic sysreg specification. For use in
30210 back-end predicate `aarch64_sysreg_string'. */
30211 bool
30212 aarch64_valid_sysreg_name_p (const char *regname)
30214 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30215 if (sysreg == NULL)
30216 return aarch64_is_implem_def_reg (regname);
30217 if (sysreg->arch_reqs)
30218 return (aarch64_isa_flags & sysreg->arch_reqs);
30219 return true;
30222 /* Return the generic sysreg specification for a valid system register
30223 name, otherwise NULL. WRITE_P is true iff the register is being
30224 written to. IS128OP indicates the requested system register should
30225 be checked for a 128-bit implementation. */
30226 const char *
30227 aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
30229 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30230 if (sysreg == NULL)
30232 if (aarch64_is_implem_def_reg (regname))
30233 return regname;
30234 else
30235 return NULL;
30237 if (is128op && !(sysreg->properties & F_REG_128))
30238 return NULL;
30239 if ((write_p && (sysreg->properties & F_REG_READ))
30240 || (!write_p && (sysreg->properties & F_REG_WRITE)))
30241 return NULL;
30242 if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
30243 return NULL;
30244 return sysreg->encoding;
30247 /* Target-specific selftests. */
30249 #if CHECKING_P
30251 namespace selftest {
30253 /* Selftest for the RTL loader.
30254 Verify that the RTL loader copes with a dump from
30255 print_rtx_function. This is essentially just a test that class
30256 function_reader can handle a real dump, but it also verifies
30257 that lookup_reg_by_dump_name correctly handles hard regs.
30258 The presence of hard reg names in the dump means that the test is
30259 target-specific, hence it is in this file. */
30261 static void
30262 aarch64_test_loading_full_dump ()
30264 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
30266 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
30268 rtx_insn *insn_1 = get_insn_by_uid (1);
30269 ASSERT_EQ (NOTE, GET_CODE (insn_1));
30271 rtx_insn *insn_15 = get_insn_by_uid (15);
30272 ASSERT_EQ (INSN, GET_CODE (insn_15));
30273 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
30275 /* Verify crtl->return_rtx. */
30276 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
30277 ASSERT_EQ (0, REGNO (crtl->return_rtx));
30278 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
30281 /* Test the fractional_cost class. */
30283 static void
30284 aarch64_test_fractional_cost ()
30286 using cf = fractional_cost;
30288 ASSERT_EQ (cf (0, 20), 0);
30290 ASSERT_EQ (cf (4, 2), 2);
30291 ASSERT_EQ (3, cf (9, 3));
30293 ASSERT_NE (cf (5, 2), 2);
30294 ASSERT_NE (3, cf (8, 3));
30296 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30297 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30298 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30300 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30301 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30302 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30303 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30304 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30305 ASSERT_EQ (3 - cf (10, 3), 0);
30307 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30308 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30310 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30311 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30312 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30313 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30314 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30315 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30316 ASSERT_TRUE (cf (239, 240) <= 1);
30317 ASSERT_TRUE (cf (240, 240) <= 1);
30318 ASSERT_FALSE (cf (241, 240) <= 1);
30319 ASSERT_FALSE (2 <= cf (207, 104));
30320 ASSERT_TRUE (2 <= cf (208, 104));
30321 ASSERT_TRUE (2 <= cf (209, 104));
30323 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30324 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30325 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30326 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30327 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30328 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30329 ASSERT_TRUE (cf (239, 240) < 1);
30330 ASSERT_FALSE (cf (240, 240) < 1);
30331 ASSERT_FALSE (cf (241, 240) < 1);
30332 ASSERT_FALSE (2 < cf (207, 104));
30333 ASSERT_FALSE (2 < cf (208, 104));
30334 ASSERT_TRUE (2 < cf (209, 104));
30336 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30337 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30338 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30339 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30340 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30341 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30342 ASSERT_FALSE (cf (239, 240) >= 1);
30343 ASSERT_TRUE (cf (240, 240) >= 1);
30344 ASSERT_TRUE (cf (241, 240) >= 1);
30345 ASSERT_TRUE (2 >= cf (207, 104));
30346 ASSERT_TRUE (2 >= cf (208, 104));
30347 ASSERT_FALSE (2 >= cf (209, 104));
30349 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30350 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30351 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30352 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30353 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30354 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30355 ASSERT_FALSE (cf (239, 240) > 1);
30356 ASSERT_FALSE (cf (240, 240) > 1);
30357 ASSERT_TRUE (cf (241, 240) > 1);
30358 ASSERT_TRUE (2 > cf (207, 104));
30359 ASSERT_FALSE (2 > cf (208, 104));
30360 ASSERT_FALSE (2 > cf (209, 104));
30362 ASSERT_EQ (cf (1, 2).ceil (), 1);
30363 ASSERT_EQ (cf (11, 7).ceil (), 2);
30364 ASSERT_EQ (cf (20, 1).ceil (), 20);
30365 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30366 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30367 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30368 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30369 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30371 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
30372 }
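/* A minimal usage sketch (the variable names are hypothetical):
   fractional_cost keeps issue-rate style costs exact until a final
   rounding, e.g. seven operations on a three-wide pipe:

     fractional_cost cycles (7, 3);          // 7/3 cycles
     unsigned int rounded = cycles.ceil ();  // rounds up to 3  */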
30374 /* Check whether our system register data, as imported from
30375 `aarch64-sys-regs.def', has any duplicate entries. */
30376 static void
30377 aarch64_test_sysreg_encoding_clashes (void)
30378 {
30379 using dup_instances_t = hash_map<nofree_string_hash,
30380 std::vector<const sysreg_t*>>;
30382 dup_instances_t duplicate_instances;
30384 /* Every time an encoding is found to occur more than once, we add it
30385 to a "clash-analysis queue", which is then used to extract the
30386 necessary information from our hash map when deciding whether the
30387 repeated encodings are valid. */
30389 /* 1) Collect recurrence information. */
30390 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
30391 {
30392 const sysreg_t *reg = aarch64_sysregs + i;
30394 std::vector<const sysreg_t*> *tmp
30395 = &duplicate_instances.get_or_insert (reg->encoding);
30397 tmp->push_back (reg);
30398 }
30400 /* 2) Carry out analysis on collected data. */
30401 for (auto instance : duplicate_instances)
30402 {
30403 unsigned nrep = instance.second.size ();
30404 if (nrep > 1)
30405 for (unsigned i = 0; i < nrep; i++)
30406 for (unsigned j = i + 1; j < nrep; j++)
30407 {
30408 const sysreg_t *a = instance.second[i];
30409 const sysreg_t *b = instance.second[j];
30410 ASSERT_TRUE ((a->properties != b->properties)
30411 || (a->arch_reqs != b->arch_reqs));
30412 }
30413 }
30414 }
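/* Note: a repeated encoding passes this check only when the clashing
   entries differ in their properties or in their architecture requirements,
   e.g. when the same underlying encoding is listed once as a read-only view
   (F_REG_READ) and once as a write-only view (F_REG_WRITE).  Two entries
   agreeing in both fields would trip the ASSERT_TRUE above.  */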
30416 /* Run all target-specific selftests. */
30418 static void
30419 aarch64_run_selftests (void)
30420 {
30421 aarch64_test_loading_full_dump ();
30422 aarch64_test_fractional_cost ();
30423 aarch64_test_sysreg_encoding_clashes ();
30424 }
30426 } // namespace selftest
30428 #endif /* #if CHECKING_P */
30430 #undef TARGET_STACK_PROTECT_GUARD
30431 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30433 #undef TARGET_ADDRESS_COST
30434 #define TARGET_ADDRESS_COST aarch64_address_cost
30436 /* This hook determines whether unnamed bitfields affect the alignment
30437 of the containing structure. The hook returns true if the structure
30438 should inherit the alignment requirements of an unnamed bitfield's
30439 type. */
30440 #undef TARGET_ALIGN_ANON_BITFIELD
30441 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
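/* Expected effect (a sketch, stated as an assumption rather than a
   guarantee): with the hook returning true, the unnamed bitfield's declared
   type contributes to the alignment of the enclosing struct, e.g.

     struct s { char c; int : 7; };   // alignof (struct s) expected to equal alignof (int)  */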
30443 #undef TARGET_ASM_ALIGNED_DI_OP
30444 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30446 #undef TARGET_ASM_ALIGNED_HI_OP
30447 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30449 #undef TARGET_ASM_ALIGNED_SI_OP
30450 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30452 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30453 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30454 hook_bool_const_tree_hwi_hwi_const_tree_true
30456 #undef TARGET_ASM_FILE_START
30457 #define TARGET_ASM_FILE_START aarch64_start_file
30459 #undef TARGET_ASM_OUTPUT_MI_THUNK
30460 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30462 #undef TARGET_ASM_SELECT_RTX_SECTION
30463 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30465 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30466 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30468 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30469 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30471 #undef TARGET_BUILD_BUILTIN_VA_LIST
30472 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30474 #undef TARGET_CALLEE_COPIES
30475 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30477 #undef TARGET_FRAME_POINTER_REQUIRED
30478 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30480 #undef TARGET_CAN_ELIMINATE
30481 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30483 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30484 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30485 aarch64_function_attribute_inlinable_p
30487 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30488 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30490 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30491 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30493 #undef TARGET_CAN_INLINE_P
30494 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30496 #undef TARGET_CANNOT_FORCE_CONST_MEM
30497 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30499 #undef TARGET_CASE_VALUES_THRESHOLD
30500 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30502 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30503 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30505 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30506 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30508 /* Only the least significant bit is used for initialization guard
30509 variables. */
30510 #undef TARGET_CXX_GUARD_MASK_BIT
30511 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
30513 #undef TARGET_C_MODE_FOR_SUFFIX
30514 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30516 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30517 #undef TARGET_DEFAULT_TARGET_FLAGS
30518 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30519 #endif
30521 #undef TARGET_CLASS_MAX_NREGS
30522 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30524 #undef TARGET_BUILTIN_DECL
30525 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30527 #undef TARGET_BUILTIN_RECIPROCAL
30528 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30530 #undef TARGET_C_EXCESS_PRECISION
30531 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30533 #undef TARGET_C_BITINT_TYPE_INFO
30534 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
30536 #undef TARGET_EXPAND_BUILTIN
30537 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30539 #undef TARGET_EXPAND_BUILTIN_VA_START
30540 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30542 #undef TARGET_FOLD_BUILTIN
30543 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30545 #undef TARGET_FUNCTION_ARG
30546 #define TARGET_FUNCTION_ARG aarch64_function_arg
30548 #undef TARGET_FUNCTION_ARG_ADVANCE
30549 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30551 #undef TARGET_FUNCTION_ARG_BOUNDARY
30552 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30554 #undef TARGET_FUNCTION_ARG_PADDING
30555 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30557 #undef TARGET_GET_RAW_RESULT_MODE
30558 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30559 #undef TARGET_GET_RAW_ARG_MODE
30560 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30562 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30563 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30565 #undef TARGET_FUNCTION_VALUE
30566 #define TARGET_FUNCTION_VALUE aarch64_function_value
30568 #undef TARGET_FUNCTION_VALUE_REGNO_P
30569 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30571 #undef TARGET_START_CALL_ARGS
30572 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30574 #undef TARGET_END_CALL_ARGS
30575 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30577 #undef TARGET_GIMPLE_FOLD_BUILTIN
30578 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30580 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30581 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30583 #undef TARGET_INIT_BUILTINS
30584 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30586 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30587 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30588 aarch64_ira_change_pseudo_allocno_class
30590 #undef TARGET_LEGITIMATE_ADDRESS_P
30591 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30593 #undef TARGET_LEGITIMATE_CONSTANT_P
30594 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30596 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30597 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30598 aarch64_legitimize_address_displacement
30600 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30601 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30603 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30604 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30605 aarch64_libgcc_floating_mode_supported_p
30607 #undef TARGET_MANGLE_TYPE
30608 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30610 #undef TARGET_INVALID_BINARY_OP
30611 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30613 #undef TARGET_VERIFY_TYPE_CONTEXT
30614 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30616 #undef TARGET_MEMORY_MOVE_COST
30617 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30619 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30620 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30622 #undef TARGET_MUST_PASS_IN_STACK
30623 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30625 /* This target hook should return true if accesses to volatile bitfields
30626 should use the narrowest mode possible. It should return false if these
30627 accesses should use the bitfield container type. */
30628 #undef TARGET_NARROW_VOLATILE_BITFIELD
30629 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
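/* Expected effect: with this hook returning false, a volatile bitfield
   access is expected to use the declared container type rather than the
   narrowest possible mode, e.g. reading x.f below with a 32-bit access:

     struct s { volatile unsigned int f : 8; } x;
     unsigned int v = x.f;  */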
30631 #undef TARGET_OPTION_OVERRIDE
30632 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30634 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30635 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30636 aarch64_override_options_after_change
30638 #undef TARGET_OFFLOAD_OPTIONS
30639 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30641 #undef TARGET_OPTION_RESTORE
30642 #define TARGET_OPTION_RESTORE aarch64_option_restore
30644 #undef TARGET_OPTION_PRINT
30645 #define TARGET_OPTION_PRINT aarch64_option_print
30647 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30648 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30650 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30651 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30652 aarch64_option_valid_version_attribute_p
30654 #undef TARGET_SET_CURRENT_FUNCTION
30655 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30657 #undef TARGET_PASS_BY_REFERENCE
30658 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30660 #undef TARGET_PREFERRED_RELOAD_CLASS
30661 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30663 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30664 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30666 #undef TARGET_DWARF_FRAME_REG_MODE
30667 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30669 #undef TARGET_PROMOTED_TYPE
30670 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30672 #undef TARGET_SECONDARY_RELOAD
30673 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30675 #undef TARGET_SECONDARY_MEMORY_NEEDED
30676 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30678 #undef TARGET_SHIFT_TRUNCATION_MASK
30679 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30681 #undef TARGET_SETUP_INCOMING_VARARGS
30682 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30684 #undef TARGET_STRUCT_VALUE_RTX
30685 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30687 #undef TARGET_REGISTER_MOVE_COST
30688 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30690 #undef TARGET_RETURN_IN_MEMORY
30691 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30693 #undef TARGET_RETURN_IN_MSB
30694 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30696 #undef TARGET_RTX_COSTS
30697 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30699 #undef TARGET_INSN_COST
30700 #define TARGET_INSN_COST aarch64_insn_cost
30702 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30703 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30705 #undef TARGET_SCHED_ISSUE_RATE
30706 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
30708 #undef TARGET_SCHED_VARIABLE_ISSUE
30709 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
30711 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
30712 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
30713 aarch64_sched_first_cycle_multipass_dfa_lookahead
30715 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
30716 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
30717 aarch64_first_cycle_multipass_dfa_lookahead_guard
30719 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
30720 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
30721 aarch64_get_separate_components
30723 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
30724 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
30725 aarch64_components_for_bb
30727 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
30728 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
30729 aarch64_disqualify_components
30731 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
30732 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
30733 aarch64_emit_prologue_components
30735 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
30736 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
30737 aarch64_emit_epilogue_components
30739 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
30740 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
30741 aarch64_set_handled_components
30743 #undef TARGET_TRAMPOLINE_INIT
30744 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
30746 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
30747 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
30749 #undef TARGET_VECTOR_MODE_SUPPORTED_P
30750 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
30752 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
30753 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
30755 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
30756 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
30758 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
30759 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
30760 aarch64_builtin_support_vector_misalignment
30762 #undef TARGET_ARRAY_MODE
30763 #define TARGET_ARRAY_MODE aarch64_array_mode
30765 #undef TARGET_ARRAY_MODE_SUPPORTED_P
30766 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
30768 #undef TARGET_VECTORIZE_CREATE_COSTS
30769 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
30771 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
30772 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
30773 aarch64_builtin_vectorization_cost
30775 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
30776 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
30778 #undef TARGET_VECTORIZE_BUILTINS
30779 #define TARGET_VECTORIZE_BUILTINS
30781 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
30782 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
30783 aarch64_autovectorize_vector_modes
30785 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
30786 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
30787 aarch64_atomic_assign_expand_fenv
30789 /* Section anchor support. */
30791 #undef TARGET_MIN_ANCHOR_OFFSET
30792 #define TARGET_MIN_ANCHOR_OFFSET -256
30794 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
30795 byte offset; we can do much more for larger data types, but have no way
30796 to determine the size of the access. We assume accesses are aligned. */
30797 #undef TARGET_MAX_ANCHOR_OFFSET
30798 #define TARGET_MAX_ANCHOR_OFFSET 4095
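/* These limits appear to match the offsets directly encodable in AArch64
   addressing modes (an assumption, for orientation only): -256..255 for the
   unscaled LDUR/STUR forms and 0..4095 for the scaled unsigned 12-bit
   immediate of a byte load/store, so anchored accesses need no extra
   address arithmetic.  */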
30800 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
30801 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
30802 aarch64_vectorize_preferred_div_as_shifts_over_mult
30804 #undef TARGET_VECTOR_ALIGNMENT
30805 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
30807 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
30808 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
30809 aarch64_vectorize_preferred_vector_alignment
30810 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
30811 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
30812 aarch64_simd_vector_alignment_reachable
30814 /* vec_perm support. */
30816 #undef TARGET_VECTORIZE_VEC_PERM_CONST
30817 #define TARGET_VECTORIZE_VEC_PERM_CONST \
30818 aarch64_vectorize_vec_perm_const
30820 #undef TARGET_VECTORIZE_RELATED_MODE
30821 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
30822 #undef TARGET_VECTORIZE_GET_MASK_MODE
30823 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
30824 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
30825 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
30826 aarch64_empty_mask_is_expensive
30827 #undef TARGET_PREFERRED_ELSE_VALUE
30828 #define TARGET_PREFERRED_ELSE_VALUE \
30829 aarch64_preferred_else_value
30831 #undef TARGET_INIT_LIBFUNCS
30832 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
30834 #undef TARGET_FIXED_CONDITION_CODE_REGS
30835 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
30837 #undef TARGET_FLAGS_REGNUM
30838 #define TARGET_FLAGS_REGNUM CC_REGNUM
30840 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
30841 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
30843 #undef TARGET_ASAN_SHADOW_OFFSET
30844 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
30846 #undef TARGET_LEGITIMIZE_ADDRESS
30847 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
30849 #undef TARGET_SCHED_CAN_SPECULATE_INSN
30850 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
30852 #undef TARGET_CAN_USE_DOLOOP_P
30853 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
30855 #undef TARGET_SCHED_ADJUST_PRIORITY
30856 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
30858 #undef TARGET_SCHED_MACRO_FUSION_P
30859 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
30861 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
30862 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
30864 #undef TARGET_SCHED_FUSION_PRIORITY
30865 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
30867 #undef TARGET_UNSPEC_MAY_TRAP_P
30868 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
30870 #undef TARGET_USE_PSEUDO_PIC_REG
30871 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
30873 #undef TARGET_PRINT_OPERAND
30874 #define TARGET_PRINT_OPERAND aarch64_print_operand
30876 #undef TARGET_PRINT_OPERAND_ADDRESS
30877 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
30879 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
30880 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
30882 #undef TARGET_OPTAB_SUPPORTED_P
30883 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
30885 #undef TARGET_OMIT_STRUCT_RETURN_REG
30886 #define TARGET_OMIT_STRUCT_RETURN_REG true
30888 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
30889 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
30890 aarch64_dwarf_poly_indeterminate_value
30892 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
30893 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
30894 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
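/* The value 4 is the mask with only bit 2 set (1 << 2), matching the
   comment above that bits 0 and 1 are reserved by the architecture.  */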
30896 #undef TARGET_HARD_REGNO_NREGS
30897 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
30898 #undef TARGET_HARD_REGNO_MODE_OK
30899 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
30901 #undef TARGET_MODES_TIEABLE_P
30902 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
30904 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
30905 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
30906 aarch64_hard_regno_call_part_clobbered
30908 #undef TARGET_INSN_CALLEE_ABI
30909 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
30911 #undef TARGET_CONSTANT_ALIGNMENT
30912 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
30914 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
30915 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
30916 aarch64_stack_clash_protection_alloca_probe_range
30918 #undef TARGET_COMPUTE_PRESSURE_CLASSES
30919 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
30921 #undef TARGET_CAN_CHANGE_MODE_CLASS
30922 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
30924 #undef TARGET_SELECT_EARLY_REMAT_MODES
30925 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
30927 #undef TARGET_SPECULATION_SAFE_VALUE
30928 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
30930 #undef TARGET_ESTIMATED_POLY_VALUE
30931 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
30933 #undef TARGET_ATTRIBUTE_TABLE
30934 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
30936 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
30937 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
30938 aarch64_simd_clone_compute_vecsize_and_simdlen
30940 #undef TARGET_SIMD_CLONE_ADJUST
30941 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
30943 #undef TARGET_SIMD_CLONE_USABLE
30944 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
30946 #undef TARGET_COMP_TYPE_ATTRIBUTES
30947 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
30949 #undef TARGET_MERGE_DECL_ATTRIBUTES
30950 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
30952 #undef TARGET_GET_MULTILIB_ABI_NAME
30953 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
30955 #undef TARGET_FNTYPE_ABI
30956 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
30958 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
30959 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
30961 #if CHECKING_P
30962 #undef TARGET_RUN_TARGET_SELFTESTS
30963 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
30964 #endif /* #if CHECKING_P */
30966 #undef TARGET_ASM_POST_CFI_STARTPROC
30967 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
30969 #undef TARGET_STRICT_ARGUMENT_NAMING
30970 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
30972 #undef TARGET_MODE_EMIT
30973 #define TARGET_MODE_EMIT aarch64_mode_emit
30975 #undef TARGET_MODE_NEEDED
30976 #define TARGET_MODE_NEEDED aarch64_mode_needed
30978 #undef TARGET_MODE_AFTER
30979 #define TARGET_MODE_AFTER aarch64_mode_after
30981 #undef TARGET_MODE_CONFLUENCE
30982 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
30984 #undef TARGET_MODE_BACKPROP
30985 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
30987 #undef TARGET_MODE_ENTRY
30988 #define TARGET_MODE_ENTRY aarch64_mode_entry
30990 #undef TARGET_MODE_EXIT
30991 #define TARGET_MODE_EXIT aarch64_mode_exit
30993 #undef TARGET_MODE_EH_HANDLER
30994 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
30996 #undef TARGET_MODE_PRIORITY
30997 #define TARGET_MODE_PRIORITY aarch64_mode_priority
30999 #undef TARGET_MD_ASM_ADJUST
31000 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31002 #undef TARGET_ASM_FILE_END
31003 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31005 #undef TARGET_ASM_FUNCTION_EPILOGUE
31006 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31008 #undef TARGET_HAVE_SHADOW_CALL_STACK
31009 #define TARGET_HAVE_SHADOW_CALL_STACK true
31011 #undef TARGET_CONST_ANCHOR
31012 #define TARGET_CONST_ANCHOR 0x1000000
31014 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31015 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31017 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31018 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31020 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31021 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31023 #undef TARGET_OPTION_FUNCTION_VERSIONS
31024 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31026 #undef TARGET_COMPARE_VERSION_PRIORITY
31027 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31029 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31030 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31031 aarch64_generate_version_dispatcher_body
31033 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31034 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31035 aarch64_get_function_versions_dispatcher
31037 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31038 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
31040 struct gcc_target targetm = TARGET_INITIALIZER;
31042 #include "gt-aarch64.h"