AArch64: Cleanup move immediate code
[official-gcc.git] / gcc / config / aarch64 / aarch64.cc
blob: 89bf0dff904b6b52b71841aec299541f01884f3d
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2022 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
84 #include "aarch64-feature-deps.h"
86 /* This file should be included last. */
87 #include "target-def.h"
89 /* Defined for convenience. */
90 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
92 /* Information about a legitimate vector immediate operand. */
93 struct simd_immediate_info
95 enum insn_type { MOV, MVN, INDEX, PTRUE };
96 enum modifier_type { LSL, MSL };
98 simd_immediate_info () {}
99 simd_immediate_info (scalar_float_mode, rtx);
100 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
101 insn_type = MOV, modifier_type = LSL,
102 unsigned int = 0);
103 simd_immediate_info (scalar_mode, rtx, rtx);
104 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
106 /* The mode of the elements. */
107 scalar_mode elt_mode;
109 /* The instruction to use to move the immediate into a vector. */
110 insn_type insn;
112 union
114 /* For MOV and MVN. */
115 struct
117 /* The value of each element. */
118 rtx value;
120 /* The kind of shift modifier to use, and the number of bits to shift.
121 This is (LSL, 0) if no shift is needed. */
122 modifier_type modifier;
123 unsigned int shift;
124 } mov;
126 /* For INDEX. */
127 struct
129 /* The value of the first element and the step to be added for each
130 subsequent element. */
131 rtx base, step;
132 } index;
134 /* For PTRUE. */
135 aarch64_svpattern pattern;
136 } u;
139 /* Construct a floating-point immediate in which each element has mode
140 ELT_MODE_IN and value VALUE_IN. */
141 inline simd_immediate_info
142 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
143 : elt_mode (elt_mode_in), insn (MOV)
145 u.mov.value = value_in;
146 u.mov.modifier = LSL;
147 u.mov.shift = 0;
150 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
151 and value VALUE_IN. The other parameters are as for the structure
152 fields. */
153 inline simd_immediate_info
154 ::simd_immediate_info (scalar_int_mode elt_mode_in,
155 unsigned HOST_WIDE_INT value_in,
156 insn_type insn_in, modifier_type modifier_in,
157 unsigned int shift_in)
158 : elt_mode (elt_mode_in), insn (insn_in)
160 u.mov.value = gen_int_mode (value_in, elt_mode_in);
161 u.mov.modifier = modifier_in;
162 u.mov.shift = shift_in;
165 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
166 and where element I is equal to BASE_IN + I * STEP_IN. */
167 inline simd_immediate_info
168 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
169 : elt_mode (elt_mode_in), insn (INDEX)
171 u.index.base = base_in;
172 u.index.step = step_in;
175 /* Construct a predicate that controls elements of mode ELT_MODE_IN
176 and has PTRUE pattern PATTERN_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_int_mode elt_mode_in,
179 aarch64_svpattern pattern_in)
180 : elt_mode (elt_mode_in), insn (PTRUE)
182 u.pattern = pattern_in;
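/* For illustration (a hypothetical value, not taken from the code below):
   a vector immediate in which every 16-bit element equals 0x1200 could be
   described as

     simd_immediate_info (HImode, 0x12, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 8)

   i.e. move 0x12 into each element with a left shift of 8 bits.  */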
185 namespace {
187 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
188 class pure_scalable_type_info
190 public:
191 /* Represents the result of analyzing a type. All values are nonzero,
192 in the possibly forlorn hope that accidental conversions to bool
193 trigger a warning. */
194 enum analysis_result
196 /* The type does not have an ABI identity; i.e. it doesn't contain
197 at least one object whose type is a Fundamental Data Type. */
198 NO_ABI_IDENTITY = 1,
200 /* The type is definitely a Pure Scalable Type. */
201 IS_PST,
203 /* The type is definitely not a Pure Scalable Type. */
204 ISNT_PST,
206 /* It doesn't matter for PCS purposes whether the type is a Pure
207 Scalable Type or not, since the type will be handled the same
208 way regardless.
210 Specifically, this means that if the type is a Pure Scalable Type,
211 there aren't enough argument registers to hold it, and so it will
212 need to be passed or returned in memory. If the type isn't a
213 Pure Scalable Type, it's too big to be passed or returned in core
214 or SIMD&FP registers, and so again will need to go in memory. */
215 DOESNT_MATTER
218 /* Aggregates of 17 bytes or more are normally passed and returned
219 in memory, so aggregates of that size can safely be analyzed as
220 DOESNT_MATTER. We need to be able to collect enough pieces to
221 represent a PST that is smaller than that. Since predicates are
222 2 bytes in size for -msve-vector-bits=128, that means we need to be
223 able to store at least 8 pieces.
225 We also need to be able to store enough pieces to represent
226 a single vector in each vector argument register and a single
227 predicate in each predicate argument register. This means that
228 we need at least 12 pieces. */
229 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
230 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
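/* Concretely (assuming the usual AAPCS64 argument registers, i.e.
   NUM_FP_ARG_REGS == 8 for v0-v7 and NUM_PR_ARG_REGS == 4 for p0-p3),
   MAX_PIECES is 12: enough for the 8 two-byte predicates of a
   sub-17-byte PST at -msve-vector-bits=128, and for one piece per
   vector and predicate argument register.  */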
232 /* Describes one piece of a PST. Each piece is one of:
234 - a single Scalable Vector Type (SVT)
235 - a single Scalable Predicate Type (SPT)
236 - a PST containing 2, 3 or 4 SVTs, with no padding
238 It either represents a single built-in type or a PST formed from
239 multiple homogeneous built-in types. */
240 struct piece
242 rtx get_rtx (unsigned int, unsigned int) const;
244 /* The number of vector and predicate registers that the piece
245 occupies. One of the two is always zero. */
246 unsigned int num_zr;
247 unsigned int num_pr;
249 /* The mode of the registers described above. */
250 machine_mode mode;
252 /* If this piece is formed from multiple homogeneous built-in types,
253 this is the mode of the built-in types, otherwise it is MODE. */
254 machine_mode orig_mode;
256 /* The offset in bytes of the piece from the start of the type. */
257 poly_uint64_pod offset;
260 /* Divides types analyzed as IS_PST into individual pieces. The pieces
261 are in memory order. */
262 auto_vec<piece, MAX_PIECES> pieces;
264 unsigned int num_zr () const;
265 unsigned int num_pr () const;
267 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
269 analysis_result analyze (const_tree);
270 bool analyze_registers (const_tree);
272 private:
273 analysis_result analyze_array (const_tree);
274 analysis_result analyze_record (const_tree);
275 void add_piece (const piece &);
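/* Rough usage sketch, inferred from the interface above rather than from
   any particular caller:

     pure_scalable_type_info pst_info;
     if (pst_info.analyze (type) == pure_scalable_type_info::IS_PST)
       ... pst_info.pieces now lists the constituent SVTs/SPTs in memory
	   order, ready for num_zr (), num_pr () and get_rtx () ...
*/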
279 /* The current code model. */
280 enum aarch64_code_model aarch64_cmodel;
282 /* The number of 64-bit elements in an SVE vector. */
283 poly_uint16 aarch64_sve_vg;
285 #ifdef HAVE_AS_TLS
286 #undef TARGET_HAVE_TLS
287 #define TARGET_HAVE_TLS 1
288 #endif
290 static bool aarch64_composite_type_p (const_tree, machine_mode);
291 static bool aarch64_return_in_memory_1 (const_tree);
292 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
293 const_tree,
294 machine_mode *, int *,
295 bool *, bool);
296 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
297 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
298 static void aarch64_override_options_after_change (void);
299 static bool aarch64_vector_mode_supported_p (machine_mode);
300 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
301 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
302 const_tree type,
303 int misalignment,
304 bool is_packed);
305 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
306 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
307 aarch64_addr_query_type);
309 /* The processor for which instructions should be scheduled. */
310 enum aarch64_processor aarch64_tune = cortexa53;
312 /* Mask to specify which instruction scheduling options should be used. */
313 uint64_t aarch64_tune_flags = 0;
315 /* Global flag for PC relative loads. */
316 bool aarch64_pcrelative_literal_loads;
318 /* Global flag for whether frame pointer is enabled. */
319 bool aarch64_use_frame_pointer;
321 #define BRANCH_PROTECT_STR_MAX 255
322 char *accepted_branch_protection_string = NULL;
324 static enum aarch64_parse_opt_result
325 aarch64_parse_branch_protection (const char*, char**);
327 /* Support for command line parsing of boolean flags in the tuning
328 structures. */
329 struct aarch64_flag_desc
331 const char* name;
332 unsigned int flag;
335 #define AARCH64_FUSION_PAIR(name, internal_name) \
336 { name, AARCH64_FUSE_##internal_name },
337 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
339 { "none", AARCH64_FUSE_NOTHING },
340 #include "aarch64-fusion-pairs.def"
341 { "all", AARCH64_FUSE_ALL },
342 { NULL, AARCH64_FUSE_NOTHING }
345 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
346 { name, AARCH64_EXTRA_TUNE_##internal_name },
347 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
349 { "none", AARCH64_EXTRA_TUNE_NONE },
350 #include "aarch64-tuning-flags.def"
351 { "all", AARCH64_EXTRA_TUNE_ALL },
352 { NULL, AARCH64_EXTRA_TUNE_NONE }
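/* Each AARCH64_EXTRA_TUNING_OPTION ("foo", FOO) entry in
   "aarch64-tuning-flags.def" expands through the macro above into
   { "foo", AARCH64_EXTRA_TUNE_FOO }, so this table maps the user-visible
   flag names onto their internal bit values, bracketed by the explicit
   "none" and "all" entries and a NULL terminator.  ("foo"/FOO here is a
   placeholder, not a real flag name.)  */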
355 /* Tuning parameters. */
357 static const struct cpu_addrcost_table generic_addrcost_table =
360 1, /* hi */
361 0, /* si */
362 0, /* di */
363 1, /* ti */
365 0, /* pre_modify */
366 0, /* post_modify */
367 0, /* post_modify_ld3_st3 */
368 0, /* post_modify_ld4_st4 */
369 0, /* register_offset */
370 0, /* register_sextend */
371 0, /* register_zextend */
372 0 /* imm_offset */
375 static const struct cpu_addrcost_table exynosm1_addrcost_table =
378 0, /* hi */
379 0, /* si */
380 0, /* di */
381 2, /* ti */
383 0, /* pre_modify */
384 0, /* post_modify */
385 0, /* post_modify_ld3_st3 */
386 0, /* post_modify_ld4_st4 */
387 1, /* register_offset */
388 1, /* register_sextend */
389 2, /* register_zextend */
390 0, /* imm_offset */
393 static const struct cpu_addrcost_table xgene1_addrcost_table =
396 1, /* hi */
397 0, /* si */
398 0, /* di */
399 1, /* ti */
401 1, /* pre_modify */
402 1, /* post_modify */
403 1, /* post_modify_ld3_st3 */
404 1, /* post_modify_ld4_st4 */
405 0, /* register_offset */
406 1, /* register_sextend */
407 1, /* register_zextend */
408 0, /* imm_offset */
411 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
414 1, /* hi */
415 1, /* si */
416 1, /* di */
417 2, /* ti */
419 0, /* pre_modify */
420 0, /* post_modify */
421 0, /* post_modify_ld3_st3 */
422 0, /* post_modify_ld4_st4 */
423 2, /* register_offset */
424 3, /* register_sextend */
425 3, /* register_zextend */
426 0, /* imm_offset */
429 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
432 1, /* hi */
433 1, /* si */
434 1, /* di */
435 2, /* ti */
437 0, /* pre_modify */
438 0, /* post_modify */
439 0, /* post_modify_ld3_st3 */
440 0, /* post_modify_ld4_st4 */
441 2, /* register_offset */
442 3, /* register_sextend */
443 3, /* register_zextend */
444 0, /* imm_offset */
447 static const struct cpu_addrcost_table tsv110_addrcost_table =
450 1, /* hi */
451 0, /* si */
452 0, /* di */
453 1, /* ti */
455 0, /* pre_modify */
456 0, /* post_modify */
457 0, /* post_modify_ld3_st3 */
458 0, /* post_modify_ld4_st4 */
459 0, /* register_offset */
460 1, /* register_sextend */
461 1, /* register_zextend */
462 0, /* imm_offset */
465 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
468 1, /* hi */
469 1, /* si */
470 1, /* di */
471 2, /* ti */
473 1, /* pre_modify */
474 1, /* post_modify */
475 1, /* post_modify_ld3_st3 */
476 1, /* post_modify_ld4_st4 */
477 3, /* register_offset */
478 3, /* register_sextend */
479 3, /* register_zextend */
480 2, /* imm_offset */
483 static const struct cpu_addrcost_table a64fx_addrcost_table =
486 1, /* hi */
487 1, /* si */
488 1, /* di */
489 2, /* ti */
491 0, /* pre_modify */
492 0, /* post_modify */
493 0, /* post_modify_ld3_st3 */
494 0, /* post_modify_ld4_st4 */
495 2, /* register_offset */
496 3, /* register_sextend */
497 3, /* register_zextend */
498 0, /* imm_offset */
501 static const struct cpu_addrcost_table neoversev1_addrcost_table =
504 1, /* hi */
505 0, /* si */
506 0, /* di */
507 1, /* ti */
509 0, /* pre_modify */
510 0, /* post_modify */
511 3, /* post_modify_ld3_st3 */
512 3, /* post_modify_ld4_st4 */
513 0, /* register_offset */
514 0, /* register_sextend */
515 0, /* register_zextend */
516 0 /* imm_offset */
519 static const struct cpu_addrcost_table neoversen2_addrcost_table =
522 1, /* hi */
523 0, /* si */
524 0, /* di */
525 1, /* ti */
527 0, /* pre_modify */
528 0, /* post_modify */
529 2, /* post_modify_ld3_st3 */
530 2, /* post_modify_ld4_st4 */
531 0, /* register_offset */
532 0, /* register_sextend */
533 0, /* register_zextend */
534 0 /* imm_offset */
537 static const struct cpu_addrcost_table neoversev2_addrcost_table =
540 1, /* hi */
541 0, /* si */
542 0, /* di */
543 1, /* ti */
545 0, /* pre_modify */
546 0, /* post_modify */
547 2, /* post_modify_ld3_st3 */
548 2, /* post_modify_ld4_st4 */
549 0, /* register_offset */
550 0, /* register_sextend */
551 0, /* register_zextend */
552 0 /* imm_offset */
555 static const struct cpu_regmove_cost generic_regmove_cost =
557 1, /* GP2GP */
558 /* Avoid the use of slow int<->fp moves for spilling by setting
559 their cost higher than memmov_cost. */
560 5, /* GP2FP */
561 5, /* FP2GP */
562 2 /* FP2FP */
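/* With the generic memmov_cost of 4 (see generic_tunings below), the
   GP2FP/FP2GP cost of 5 above is just high enough that the register
   allocator prefers spilling to memory over bouncing values through the
   other register file.  */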
565 static const struct cpu_regmove_cost cortexa57_regmove_cost =
567 1, /* GP2GP */
568 /* Avoid the use of slow int<->fp moves for spilling by setting
569 their cost higher than memmov_cost. */
570 5, /* GP2FP */
571 5, /* FP2GP */
572 2 /* FP2FP */
575 static const struct cpu_regmove_cost cortexa53_regmove_cost =
577 1, /* GP2GP */
578 /* Avoid the use of slow int<->fp moves for spilling by setting
579 their cost higher than memmov_cost. */
580 5, /* GP2FP */
581 5, /* FP2GP */
582 2 /* FP2FP */
585 static const struct cpu_regmove_cost exynosm1_regmove_cost =
587 1, /* GP2GP */
588 /* Avoid the use of slow int<->fp moves for spilling by setting
590 their cost higher than memmov_cost (the actual costs are 4 and 9). */
590 9, /* GP2FP */
591 9, /* FP2GP */
592 1 /* FP2FP */
595 static const struct cpu_regmove_cost thunderx_regmove_cost =
597 2, /* GP2GP */
598 2, /* GP2FP */
599 6, /* FP2GP */
600 4 /* FP2FP */
603 static const struct cpu_regmove_cost xgene1_regmove_cost =
605 1, /* GP2GP */
606 /* Avoid the use of slow int<->fp moves for spilling by setting
607 their cost higher than memmov_cost. */
608 8, /* GP2FP */
609 8, /* FP2GP */
610 2 /* FP2FP */
613 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
615 2, /* GP2GP */
616 /* Avoid the use of int<->fp moves for spilling. */
617 6, /* GP2FP */
618 6, /* FP2GP */
619 4 /* FP2FP */
622 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
624 1, /* GP2GP */
625 /* Avoid the use of int<->fp moves for spilling. */
626 5, /* GP2FP */
627 6, /* FP2GP */
628 3, /* FP2FP */
631 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
633 1, /* GP2GP */
634 /* Avoid the use of int<->fp moves for spilling. */
635 4, /* GP2FP */
636 5, /* FP2GP */
637 4 /* FP2FP */
640 static const struct cpu_regmove_cost tsv110_regmove_cost =
642 1, /* GP2GP */
643 /* Avoid the use of slow int<->fp moves for spilling by setting
644 their cost higher than memmov_cost. */
645 2, /* GP2FP */
646 3, /* FP2GP */
647 2 /* FP2FP */
650 static const struct cpu_regmove_cost a64fx_regmove_cost =
652 1, /* GP2GP */
653 /* Avoid the use of slow int<->fp moves for spilling by setting
654 their cost higher than memmov_cost. */
655 5, /* GP2FP */
656 7, /* FP2GP */
657 2 /* FP2FP */
660 static const struct cpu_regmove_cost neoversen2_regmove_cost =
662 1, /* GP2GP */
663 /* Spilling via int<->fp register moves rather than to memory is
664 preferable, so set realistic costs compared to memmov_cost. */
665 3, /* GP2FP */
666 2, /* FP2GP */
667 2 /* FP2FP */
670 static const struct cpu_regmove_cost neoversev1_regmove_cost =
672 1, /* GP2GP */
673 /* Spilling via int<->fp register moves rather than to memory is
674 preferable, so set realistic costs compared to memmov_cost. */
675 3, /* GP2FP */
676 2, /* FP2GP */
677 2 /* FP2FP */
680 static const struct cpu_regmove_cost neoversev2_regmove_cost =
682 1, /* GP2GP */
683 /* Spilling via int<->fp register moves rather than to memory is
684 preferable, so set realistic costs compared to memmov_cost. */
685 3, /* GP2FP */
686 2, /* FP2GP */
687 2 /* FP2FP */
690 /* Generic costs for Advanced SIMD vector operations. */
691 static const advsimd_vec_cost generic_advsimd_vector_cost =
693 1, /* int_stmt_cost */
694 1, /* fp_stmt_cost */
695 0, /* ld2_st2_permute_cost */
696 0, /* ld3_st3_permute_cost */
697 0, /* ld4_st4_permute_cost */
698 2, /* permute_cost */
699 2, /* reduc_i8_cost */
700 2, /* reduc_i16_cost */
701 2, /* reduc_i32_cost */
702 2, /* reduc_i64_cost */
703 2, /* reduc_f16_cost */
704 2, /* reduc_f32_cost */
705 2, /* reduc_f64_cost */
706 2, /* store_elt_extra_cost */
707 2, /* vec_to_scalar_cost */
708 1, /* scalar_to_vec_cost */
709 1, /* align_load_cost */
710 1, /* unalign_load_cost */
711 1, /* unalign_store_cost */
712 1 /* store_cost */
715 /* Generic costs for SVE vector operations. */
716 static const sve_vec_cost generic_sve_vector_cost =
719 1, /* int_stmt_cost */
720 1, /* fp_stmt_cost */
721 0, /* ld2_st2_permute_cost */
722 0, /* ld3_st3_permute_cost */
723 0, /* ld4_st4_permute_cost */
724 2, /* permute_cost */
725 2, /* reduc_i8_cost */
726 2, /* reduc_i16_cost */
727 2, /* reduc_i32_cost */
728 2, /* reduc_i64_cost */
729 2, /* reduc_f16_cost */
730 2, /* reduc_f32_cost */
731 2, /* reduc_f64_cost */
732 2, /* store_elt_extra_cost */
733 2, /* vec_to_scalar_cost */
734 1, /* scalar_to_vec_cost */
735 1, /* align_load_cost */
736 1, /* unalign_load_cost */
737 1, /* unalign_store_cost */
738 1 /* store_cost */
740 2, /* clast_cost */
741 2, /* fadda_f16_cost */
742 2, /* fadda_f32_cost */
743 2, /* fadda_f64_cost */
744 4, /* gather_load_x32_cost */
745 2, /* gather_load_x64_cost */
746 1 /* scatter_store_elt_cost */
749 /* Generic costs for vector insn classes. */
750 static const struct cpu_vector_cost generic_vector_cost =
752 1, /* scalar_int_stmt_cost */
753 1, /* scalar_fp_stmt_cost */
754 1, /* scalar_load_cost */
755 1, /* scalar_store_cost */
756 3, /* cond_taken_branch_cost */
757 1, /* cond_not_taken_branch_cost */
758 &generic_advsimd_vector_cost, /* advsimd */
759 &generic_sve_vector_cost, /* sve */
760 nullptr /* issue_info */
763 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
765 2, /* int_stmt_cost */
766 5, /* fp_stmt_cost */
767 0, /* ld2_st2_permute_cost */
768 0, /* ld3_st3_permute_cost */
769 0, /* ld4_st4_permute_cost */
770 3, /* permute_cost */
771 13, /* reduc_i8_cost */
772 13, /* reduc_i16_cost */
773 13, /* reduc_i32_cost */
774 13, /* reduc_i64_cost */
775 13, /* reduc_f16_cost */
776 13, /* reduc_f32_cost */
777 13, /* reduc_f64_cost */
778 13, /* store_elt_extra_cost */
779 13, /* vec_to_scalar_cost */
780 4, /* scalar_to_vec_cost */
781 6, /* align_load_cost */
782 6, /* unalign_load_cost */
783 1, /* unalign_store_cost */
784 1 /* store_cost */
787 static const sve_vec_cost a64fx_sve_vector_cost =
790 2, /* int_stmt_cost */
791 5, /* fp_stmt_cost */
792 0, /* ld2_st2_permute_cost */
793 0, /* ld3_st3_permute_cost */
794 0, /* ld4_st4_permute_cost */
795 3, /* permute_cost */
796 13, /* reduc_i8_cost */
797 13, /* reduc_i16_cost */
798 13, /* reduc_i32_cost */
799 13, /* reduc_i64_cost */
800 13, /* reduc_f16_cost */
801 13, /* reduc_f32_cost */
802 13, /* reduc_f64_cost */
803 13, /* store_elt_extra_cost */
804 13, /* vec_to_scalar_cost */
805 4, /* scalar_to_vec_cost */
806 6, /* align_load_cost */
807 6, /* unalign_load_cost */
808 1, /* unalign_store_cost */
809 1 /* store_cost */
811 13, /* clast_cost */
812 13, /* fadda_f16_cost */
813 13, /* fadda_f32_cost */
814 13, /* fadda_f64_cost */
815 64, /* gather_load_x32_cost */
816 32, /* gather_load_x64_cost */
817 1 /* scatter_store_elt_cost */
820 static const struct cpu_vector_cost a64fx_vector_cost =
822 1, /* scalar_int_stmt_cost */
823 5, /* scalar_fp_stmt_cost */
824 4, /* scalar_load_cost */
825 1, /* scalar_store_cost */
826 3, /* cond_taken_branch_cost */
827 1, /* cond_not_taken_branch_cost */
828 &a64fx_advsimd_vector_cost, /* advsimd */
829 &a64fx_sve_vector_cost, /* sve */
830 nullptr /* issue_info */
833 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
835 1, /* int_stmt_cost */
836 3, /* fp_stmt_cost */
837 0, /* ld2_st2_permute_cost */
838 0, /* ld3_st3_permute_cost */
839 0, /* ld4_st4_permute_cost */
840 2, /* permute_cost */
841 1, /* reduc_i8_cost */
842 1, /* reduc_i16_cost */
843 1, /* reduc_i32_cost */
844 1, /* reduc_i64_cost */
845 1, /* reduc_f16_cost */
846 1, /* reduc_f32_cost */
847 1, /* reduc_f64_cost */
848 1, /* store_elt_extra_cost */
849 1, /* vec_to_scalar_cost */
850 1, /* scalar_to_vec_cost */
851 1, /* align_load_cost */
852 1, /* unalign_load_cost */
853 1, /* unalign_store_cost */
854 1 /* store_cost */
857 /* QDF24XX costs for vector insn classes. */
858 static const struct cpu_vector_cost qdf24xx_vector_cost =
860 1, /* scalar_int_stmt_cost */
861 1, /* scalar_fp_stmt_cost */
862 1, /* scalar_load_cost */
863 1, /* scalar_store_cost */
864 3, /* cond_taken_branch_cost */
865 1, /* cond_not_taken_branch_cost */
866 &qdf24xx_advsimd_vector_cost, /* advsimd */
867 nullptr, /* sve */
868 nullptr /* issue_info */
872 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
874 4, /* int_stmt_cost */
875 1, /* fp_stmt_cost */
876 0, /* ld2_st2_permute_cost */
877 0, /* ld3_st3_permute_cost */
878 0, /* ld4_st4_permute_cost */
879 4, /* permute_cost */
880 2, /* reduc_i8_cost */
881 2, /* reduc_i16_cost */
882 2, /* reduc_i32_cost */
883 2, /* reduc_i64_cost */
884 2, /* reduc_f16_cost */
885 2, /* reduc_f32_cost */
886 2, /* reduc_f64_cost */
887 2, /* store_elt_extra_cost */
888 2, /* vec_to_scalar_cost */
889 2, /* scalar_to_vec_cost */
890 3, /* align_load_cost */
891 5, /* unalign_load_cost */
892 5, /* unalign_store_cost */
893 1 /* store_cost */
896 /* ThunderX costs for vector insn classes. */
897 static const struct cpu_vector_cost thunderx_vector_cost =
899 1, /* scalar_int_stmt_cost */
900 1, /* scalar_fp_stmt_cost */
901 3, /* scalar_load_cost */
902 1, /* scalar_store_cost */
903 3, /* cond_taken_branch_cost */
904 3, /* cond_not_taken_branch_cost */
905 &thunderx_advsimd_vector_cost, /* advsimd */
906 nullptr, /* sve */
907 nullptr /* issue_info */
910 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
912 2, /* int_stmt_cost */
913 2, /* fp_stmt_cost */
914 0, /* ld2_st2_permute_cost */
915 0, /* ld3_st3_permute_cost */
916 0, /* ld4_st4_permute_cost */
917 2, /* permute_cost */
918 3, /* reduc_i8_cost */
919 3, /* reduc_i16_cost */
920 3, /* reduc_i32_cost */
921 3, /* reduc_i64_cost */
922 3, /* reduc_f16_cost */
923 3, /* reduc_f32_cost */
924 3, /* reduc_f64_cost */
925 3, /* store_elt_extra_cost */
926 3, /* vec_to_scalar_cost */
927 2, /* scalar_to_vec_cost */
928 5, /* align_load_cost */
929 5, /* unalign_load_cost */
930 1, /* unalign_store_cost */
931 1 /* store_cost */
934 static const struct cpu_vector_cost tsv110_vector_cost =
936 1, /* scalar_int_stmt_cost */
937 1, /* scalar_fp_stmt_cost */
938 5, /* scalar_load_cost */
939 1, /* scalar_store_cost */
940 1, /* cond_taken_branch_cost */
941 1, /* cond_not_taken_branch_cost */
942 &tsv110_advsimd_vector_cost, /* advsimd */
943 nullptr, /* sve */
944 nullptr /* issue_info */
947 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
949 2, /* int_stmt_cost */
950 2, /* fp_stmt_cost */
951 0, /* ld2_st2_permute_cost */
952 0, /* ld3_st3_permute_cost */
953 0, /* ld4_st4_permute_cost */
954 3, /* permute_cost */
955 8, /* reduc_i8_cost */
956 8, /* reduc_i16_cost */
957 8, /* reduc_i32_cost */
958 8, /* reduc_i64_cost */
959 8, /* reduc_f16_cost */
960 8, /* reduc_f32_cost */
961 8, /* reduc_f64_cost */
962 8, /* store_elt_extra_cost */
963 8, /* vec_to_scalar_cost */
964 8, /* scalar_to_vec_cost */
965 4, /* align_load_cost */
966 4, /* unalign_load_cost */
967 1, /* unalign_store_cost */
968 1 /* store_cost */
971 /* Cortex-A57 costs for vector insn classes. */
972 static const struct cpu_vector_cost cortexa57_vector_cost =
974 1, /* scalar_int_stmt_cost */
975 1, /* scalar_fp_stmt_cost */
976 4, /* scalar_load_cost */
977 1, /* scalar_store_cost */
978 1, /* cond_taken_branch_cost */
979 1, /* cond_not_taken_branch_cost */
980 &cortexa57_advsimd_vector_cost, /* advsimd */
981 nullptr, /* sve */
982 nullptr /* issue_info */
985 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
987 3, /* int_stmt_cost */
988 3, /* fp_stmt_cost */
989 0, /* ld2_st2_permute_cost */
990 0, /* ld3_st3_permute_cost */
991 0, /* ld4_st4_permute_cost */
992 3, /* permute_cost */
993 3, /* reduc_i8_cost */
994 3, /* reduc_i16_cost */
995 3, /* reduc_i32_cost */
996 3, /* reduc_i64_cost */
997 3, /* reduc_f16_cost */
998 3, /* reduc_f32_cost */
999 3, /* reduc_f64_cost */
1000 3, /* store_elt_extra_cost */
1001 3, /* vec_to_scalar_cost */
1002 3, /* scalar_to_vec_cost */
1003 5, /* align_load_cost */
1004 5, /* unalign_load_cost */
1005 1, /* unalign_store_cost */
1006 1 /* store_cost */
1009 static const struct cpu_vector_cost exynosm1_vector_cost =
1011 1, /* scalar_int_stmt_cost */
1012 1, /* scalar_fp_stmt_cost */
1013 5, /* scalar_load_cost */
1014 1, /* scalar_store_cost */
1015 1, /* cond_taken_branch_cost */
1016 1, /* cond_not_taken_branch_cost */
1017 &exynosm1_advsimd_vector_cost, /* advsimd */
1018 nullptr, /* sve */
1019 nullptr /* issue_info */
1022 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1024 2, /* int_stmt_cost */
1025 2, /* fp_stmt_cost */
1026 0, /* ld2_st2_permute_cost */
1027 0, /* ld3_st3_permute_cost */
1028 0, /* ld4_st4_permute_cost */
1029 2, /* permute_cost */
1030 4, /* reduc_i8_cost */
1031 4, /* reduc_i16_cost */
1032 4, /* reduc_i32_cost */
1033 4, /* reduc_i64_cost */
1034 4, /* reduc_f16_cost */
1035 4, /* reduc_f32_cost */
1036 4, /* reduc_f64_cost */
1037 4, /* store_elt_extra_cost */
1038 4, /* vec_to_scalar_cost */
1039 4, /* scalar_to_vec_cost */
1040 10, /* align_load_cost */
1041 10, /* unalign_load_cost */
1042 2, /* unalign_store_cost */
1043 2 /* store_cost */
1046 /* X-Gene 1 costs for vector insn classes. */
1047 static const struct cpu_vector_cost xgene1_vector_cost =
1049 1, /* scalar_int_stmt_cost */
1050 1, /* scalar_fp_stmt_cost */
1051 5, /* scalar_load_cost */
1052 1, /* scalar_store_cost */
1053 2, /* cond_taken_branch_cost */
1054 1, /* cond_not_taken_branch_cost */
1055 &xgene1_advsimd_vector_cost, /* advsimd */
1056 nullptr, /* sve */
1057 nullptr /* issue_info */
1060 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1062 4, /* int_stmt_cost */
1063 5, /* fp_stmt_cost */
1064 0, /* ld2_st2_permute_cost */
1065 0, /* ld3_st3_permute_cost */
1066 0, /* ld4_st4_permute_cost */
1067 10, /* permute_cost */
1068 6, /* reduc_i8_cost */
1069 6, /* reduc_i16_cost */
1070 6, /* reduc_i32_cost */
1071 6, /* reduc_i64_cost */
1072 6, /* reduc_f16_cost */
1073 6, /* reduc_f32_cost */
1074 6, /* reduc_f64_cost */
1075 6, /* store_elt_extra_cost */
1076 6, /* vec_to_scalar_cost */
1077 5, /* scalar_to_vec_cost */
1078 4, /* align_load_cost */
1079 4, /* unalign_load_cost */
1080 1, /* unalign_store_cost */
1081 1 /* store_cost */
1084 /* ThunderX2 T99 (formerly Vulcan) costs for vector insn classes. */
1085 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1087 1, /* scalar_int_stmt_cost */
1088 6, /* scalar_fp_stmt_cost */
1089 4, /* scalar_load_cost */
1090 1, /* scalar_store_cost */
1091 2, /* cond_taken_branch_cost */
1092 1, /* cond_not_taken_branch_cost */
1093 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1094 nullptr, /* sve */
1095 nullptr /* issue_info */
1098 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1100 5, /* int_stmt_cost */
1101 5, /* fp_stmt_cost */
1102 0, /* ld2_st2_permute_cost */
1103 0, /* ld3_st3_permute_cost */
1104 0, /* ld4_st4_permute_cost */
1105 10, /* permute_cost */
1106 5, /* reduc_i8_cost */
1107 5, /* reduc_i16_cost */
1108 5, /* reduc_i32_cost */
1109 5, /* reduc_i64_cost */
1110 5, /* reduc_f16_cost */
1111 5, /* reduc_f32_cost */
1112 5, /* reduc_f64_cost */
1113 5, /* store_elt_extra_cost */
1114 5, /* vec_to_scalar_cost */
1115 5, /* scalar_to_vec_cost */
1116 4, /* align_load_cost */
1117 4, /* unalign_load_cost */
1118 4, /* unalign_store_cost */
1119 4 /* store_cost */
1122 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1124 1, /* scalar_int_stmt_cost */
1125 5, /* scalar_fp_stmt_cost */
1126 4, /* scalar_load_cost */
1127 1, /* scalar_store_cost */
1128 2, /* cond_taken_branch_cost */
1129 1, /* cond_not_taken_branch_cost */
1130 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1131 nullptr, /* sve */
1132 nullptr /* issue_info */
1135 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1137 3, /* int_stmt_cost */
1138 3, /* fp_stmt_cost */
1139 0, /* ld2_st2_permute_cost */
1140 0, /* ld3_st3_permute_cost */
1141 0, /* ld4_st4_permute_cost */
1142 2, /* permute_cost */
1143 12, /* reduc_i8_cost */
1144 9, /* reduc_i16_cost */
1145 6, /* reduc_i32_cost */
1146 5, /* reduc_i64_cost */
1147 9, /* reduc_f16_cost */
1148 6, /* reduc_f32_cost */
1149 5, /* reduc_f64_cost */
1150 8, /* store_elt_extra_cost */
1151 6, /* vec_to_scalar_cost */
1152 7, /* scalar_to_vec_cost */
1153 5, /* align_load_cost */
1154 5, /* unalign_load_cost */
1155 2, /* unalign_store_cost */
1156 2 /* store_cost */
1159 /* Ampere-1 costs for vector insn classes. */
1160 static const struct cpu_vector_cost ampere1_vector_cost =
1162 1, /* scalar_int_stmt_cost */
1163 1, /* scalar_fp_stmt_cost */
1164 4, /* scalar_load_cost */
1165 1, /* scalar_store_cost */
1166 1, /* cond_taken_branch_cost */
1167 1, /* cond_not_taken_branch_cost */
1168 &ampere1_advsimd_vector_cost, /* advsimd */
1169 nullptr, /* sve */
1170 nullptr /* issue_info */
1173 /* Generic costs for branch instructions. */
1174 static const struct cpu_branch_cost generic_branch_cost =
1176 1, /* Predictable. */
1177 3 /* Unpredictable. */
1180 /* Generic approximation modes. */
1181 static const cpu_approx_modes generic_approx_modes =
1183 AARCH64_APPROX_NONE, /* division */
1184 AARCH64_APPROX_NONE, /* sqrt */
1185 AARCH64_APPROX_NONE /* recip_sqrt */
1188 /* Approximation modes for Exynos M1. */
1189 static const cpu_approx_modes exynosm1_approx_modes =
1191 AARCH64_APPROX_NONE, /* division */
1192 AARCH64_APPROX_ALL, /* sqrt */
1193 AARCH64_APPROX_ALL /* recip_sqrt */
1196 /* Approximation modes for X-Gene 1. */
1197 static const cpu_approx_modes xgene1_approx_modes =
1199 AARCH64_APPROX_NONE, /* division */
1200 AARCH64_APPROX_NONE, /* sqrt */
1201 AARCH64_APPROX_ALL /* recip_sqrt */
1204 /* Generic prefetch settings (which disable prefetch). */
1205 static const cpu_prefetch_tune generic_prefetch_tune =
1207 0, /* num_slots */
1208 -1, /* l1_cache_size */
1209 -1, /* l1_cache_line_size */
1210 -1, /* l2_cache_size */
1211 true, /* prefetch_dynamic_strides */
1212 -1, /* minimum_stride */
1213 -1 /* default_opt_level */
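/* In these cpu_prefetch_tune tables a value of -1 appears to mean "no
   CPU-specific value; leave the corresponding --param at its default",
   which is why the generic table above overrides nothing.  */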
1216 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1218 0, /* num_slots */
1219 -1, /* l1_cache_size */
1220 64, /* l1_cache_line_size */
1221 -1, /* l2_cache_size */
1222 true, /* prefetch_dynamic_strides */
1223 -1, /* minimum_stride */
1224 -1 /* default_opt_level */
1227 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1229 4, /* num_slots */
1230 32, /* l1_cache_size */
1231 64, /* l1_cache_line_size */
1232 512, /* l2_cache_size */
1233 false, /* prefetch_dynamic_strides */
1234 2048, /* minimum_stride */
1235 3 /* default_opt_level */
1238 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1240 8, /* num_slots */
1241 32, /* l1_cache_size */
1242 128, /* l1_cache_line_size */
1243 16*1024, /* l2_cache_size */
1244 true, /* prefetch_dynamic_strides */
1245 -1, /* minimum_stride */
1246 3 /* default_opt_level */
1249 static const cpu_prefetch_tune thunderx_prefetch_tune =
1251 8, /* num_slots */
1252 32, /* l1_cache_size */
1253 128, /* l1_cache_line_size */
1254 -1, /* l2_cache_size */
1255 true, /* prefetch_dynamic_strides */
1256 -1, /* minimum_stride */
1257 -1 /* default_opt_level */
1260 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1262 8, /* num_slots */
1263 32, /* l1_cache_size */
1264 64, /* l1_cache_line_size */
1265 256, /* l2_cache_size */
1266 true, /* prefetch_dynamic_strides */
1267 -1, /* minimum_stride */
1268 -1 /* default_opt_level */
1271 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1273 8, /* num_slots */
1274 32, /* l1_cache_size */
1275 64, /* l1_cache_line_size */
1276 256, /* l2_cache_size */
1277 true, /* prefetch_dynamic_strides */
1278 -1, /* minimum_stride */
1279 -1 /* default_opt_level */
1282 static const cpu_prefetch_tune tsv110_prefetch_tune =
1284 0, /* num_slots */
1285 64, /* l1_cache_size */
1286 64, /* l1_cache_line_size */
1287 512, /* l2_cache_size */
1288 true, /* prefetch_dynamic_strides */
1289 -1, /* minimum_stride */
1290 -1 /* default_opt_level */
1293 static const cpu_prefetch_tune xgene1_prefetch_tune =
1295 8, /* num_slots */
1296 32, /* l1_cache_size */
1297 64, /* l1_cache_line_size */
1298 256, /* l2_cache_size */
1299 true, /* prefetch_dynamic_strides */
1300 -1, /* minimum_stride */
1301 -1 /* default_opt_level */
1304 static const cpu_prefetch_tune a64fx_prefetch_tune =
1306 8, /* num_slots */
1307 64, /* l1_cache_size */
1308 256, /* l1_cache_line_size */
1309 32768, /* l2_cache_size */
1310 true, /* prefetch_dynamic_strides */
1311 -1, /* minimum_stride */
1312 -1 /* default_opt_level */
1315 static const cpu_prefetch_tune ampere1_prefetch_tune =
1317 0, /* num_slots */
1318 64, /* l1_cache_size */
1319 64, /* l1_cache_line_size */
1320 2048, /* l2_cache_size */
1321 true, /* prefetch_dynamic_strides */
1322 -1, /* minimum_stride */
1323 -1 /* default_opt_level */
1326 static const struct tune_params generic_tunings =
1328 &cortexa57_extra_costs,
1329 &generic_addrcost_table,
1330 &generic_regmove_cost,
1331 &generic_vector_cost,
1332 &generic_branch_cost,
1333 &generic_approx_modes,
1334 SVE_NOT_IMPLEMENTED, /* sve_width */
1335 { 4, /* load_int. */
1336 4, /* store_int. */
1337 4, /* load_fp. */
1338 4, /* store_fp. */
1339 4, /* load_pred. */
1340 4 /* store_pred. */
1341 }, /* memmov_cost. */
1342 2, /* issue_rate */
1343 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1344 "16:12", /* function_align. */
1345 "4", /* jump_align. */
1346 "8", /* loop_align. */
1347 2, /* int_reassoc_width. */
1348 4, /* fp_reassoc_width. */
1349 1, /* fma_reassoc_width. */
1350 1, /* vec_reassoc_width. */
1351 2, /* min_div_recip_mul_sf. */
1352 2, /* min_div_recip_mul_df. */
1353 0, /* max_case_values. */
1354 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1355 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1356 Neoverse V1. It does not have a noticeable effect on A64FX and should
1357 have at most a very minor effect on SVE2 cores. */
1358 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1359 &generic_prefetch_tune
1362 static const struct tune_params cortexa35_tunings =
1364 &cortexa53_extra_costs,
1365 &generic_addrcost_table,
1366 &cortexa53_regmove_cost,
1367 &generic_vector_cost,
1368 &generic_branch_cost,
1369 &generic_approx_modes,
1370 SVE_NOT_IMPLEMENTED, /* sve_width */
1371 { 4, /* load_int. */
1372 4, /* store_int. */
1373 4, /* load_fp. */
1374 4, /* store_fp. */
1375 4, /* load_pred. */
1376 4 /* store_pred. */
1377 }, /* memmov_cost. */
1378 1, /* issue_rate */
1379 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1380 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1381 "16", /* function_align. */
1382 "4", /* jump_align. */
1383 "8", /* loop_align. */
1384 2, /* int_reassoc_width. */
1385 4, /* fp_reassoc_width. */
1386 1, /* fma_reassoc_width. */
1387 1, /* vec_reassoc_width. */
1388 2, /* min_div_recip_mul_sf. */
1389 2, /* min_div_recip_mul_df. */
1390 0, /* max_case_values. */
1391 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1392 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1393 &generic_prefetch_tune
1396 static const struct tune_params cortexa53_tunings =
1398 &cortexa53_extra_costs,
1399 &generic_addrcost_table,
1400 &cortexa53_regmove_cost,
1401 &generic_vector_cost,
1402 &generic_branch_cost,
1403 &generic_approx_modes,
1404 SVE_NOT_IMPLEMENTED, /* sve_width */
1405 { 4, /* load_int. */
1406 4, /* store_int. */
1407 4, /* load_fp. */
1408 4, /* store_fp. */
1409 4, /* load_pred. */
1410 4 /* store_pred. */
1411 }, /* memmov_cost. */
1412 2, /* issue_rate */
1413 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1414 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1415 "16", /* function_align. */
1416 "4", /* jump_align. */
1417 "8", /* loop_align. */
1418 2, /* int_reassoc_width. */
1419 4, /* fp_reassoc_width. */
1420 1, /* fma_reassoc_width. */
1421 1, /* vec_reassoc_width. */
1422 2, /* min_div_recip_mul_sf. */
1423 2, /* min_div_recip_mul_df. */
1424 0, /* max_case_values. */
1425 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1426 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1427 &generic_prefetch_tune
1430 static const struct tune_params cortexa57_tunings =
1432 &cortexa57_extra_costs,
1433 &generic_addrcost_table,
1434 &cortexa57_regmove_cost,
1435 &cortexa57_vector_cost,
1436 &generic_branch_cost,
1437 &generic_approx_modes,
1438 SVE_NOT_IMPLEMENTED, /* sve_width */
1439 { 4, /* load_int. */
1440 4, /* store_int. */
1441 4, /* load_fp. */
1442 4, /* store_fp. */
1443 4, /* load_pred. */
1444 4 /* store_pred. */
1445 }, /* memmov_cost. */
1446 3, /* issue_rate */
1447 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1448 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1449 "16", /* function_align. */
1450 "4", /* jump_align. */
1451 "8", /* loop_align. */
1452 2, /* int_reassoc_width. */
1453 4, /* fp_reassoc_width. */
1454 1, /* fma_reassoc_width. */
1455 1, /* vec_reassoc_width. */
1456 2, /* min_div_recip_mul_sf. */
1457 2, /* min_div_recip_mul_df. */
1458 0, /* max_case_values. */
1459 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1460 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1461 &generic_prefetch_tune
1464 static const struct tune_params cortexa72_tunings =
1466 &cortexa57_extra_costs,
1467 &generic_addrcost_table,
1468 &cortexa57_regmove_cost,
1469 &cortexa57_vector_cost,
1470 &generic_branch_cost,
1471 &generic_approx_modes,
1472 SVE_NOT_IMPLEMENTED, /* sve_width */
1473 { 4, /* load_int. */
1474 4, /* store_int. */
1475 4, /* load_fp. */
1476 4, /* store_fp. */
1477 4, /* load_pred. */
1478 4 /* store_pred. */
1479 }, /* memmov_cost. */
1480 3, /* issue_rate */
1481 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1482 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1483 "16", /* function_align. */
1484 "4", /* jump_align. */
1485 "8", /* loop_align. */
1486 2, /* int_reassoc_width. */
1487 4, /* fp_reassoc_width. */
1488 1, /* fma_reassoc_width. */
1489 1, /* vec_reassoc_width. */
1490 2, /* min_div_recip_mul_sf. */
1491 2, /* min_div_recip_mul_df. */
1492 0, /* max_case_values. */
1493 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1494 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1495 &generic_prefetch_tune
1498 static const struct tune_params cortexa73_tunings =
1500 &cortexa57_extra_costs,
1501 &generic_addrcost_table,
1502 &cortexa57_regmove_cost,
1503 &cortexa57_vector_cost,
1504 &generic_branch_cost,
1505 &generic_approx_modes,
1506 SVE_NOT_IMPLEMENTED, /* sve_width */
1507 { 4, /* load_int. */
1508 4, /* store_int. */
1509 4, /* load_fp. */
1510 4, /* store_fp. */
1511 4, /* load_pred. */
1512 4 /* store_pred. */
1513 }, /* memmov_cost. */
1514 2, /* issue_rate. */
1515 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1516 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1517 "16", /* function_align. */
1518 "4", /* jump_align. */
1519 "8", /* loop_align. */
1520 2, /* int_reassoc_width. */
1521 4, /* fp_reassoc_width. */
1522 1, /* fma_reassoc_width. */
1523 1, /* vec_reassoc_width. */
1524 2, /* min_div_recip_mul_sf. */
1525 2, /* min_div_recip_mul_df. */
1526 0, /* max_case_values. */
1527 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1528 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1529 &generic_prefetch_tune
1534 static const struct tune_params exynosm1_tunings =
1536 &exynosm1_extra_costs,
1537 &exynosm1_addrcost_table,
1538 &exynosm1_regmove_cost,
1539 &exynosm1_vector_cost,
1540 &generic_branch_cost,
1541 &exynosm1_approx_modes,
1542 SVE_NOT_IMPLEMENTED, /* sve_width */
1543 { 4, /* load_int. */
1544 4, /* store_int. */
1545 4, /* load_fp. */
1546 4, /* store_fp. */
1547 4, /* load_pred. */
1548 4 /* store_pred. */
1549 }, /* memmov_cost. */
1550 3, /* issue_rate */
1551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1552 "4", /* function_align. */
1553 "4", /* jump_align. */
1554 "4", /* loop_align. */
1555 2, /* int_reassoc_width. */
1556 4, /* fp_reassoc_width. */
1557 1, /* fma_reassoc_width. */
1558 1, /* vec_reassoc_width. */
1559 2, /* min_div_recip_mul_sf. */
1560 2, /* min_div_recip_mul_df. */
1561 48, /* max_case_values. */
1562 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1563 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1564 &exynosm1_prefetch_tune
1567 static const struct tune_params thunderxt88_tunings =
1569 &thunderx_extra_costs,
1570 &generic_addrcost_table,
1571 &thunderx_regmove_cost,
1572 &thunderx_vector_cost,
1573 &generic_branch_cost,
1574 &generic_approx_modes,
1575 SVE_NOT_IMPLEMENTED, /* sve_width */
1576 { 6, /* load_int. */
1577 6, /* store_int. */
1578 6, /* load_fp. */
1579 6, /* store_fp. */
1580 6, /* load_pred. */
1581 6 /* store_pred. */
1582 }, /* memmov_cost. */
1583 2, /* issue_rate */
1584 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1585 "8", /* function_align. */
1586 "8", /* jump_align. */
1587 "8", /* loop_align. */
1588 2, /* int_reassoc_width. */
1589 4, /* fp_reassoc_width. */
1590 1, /* fma_reassoc_width. */
1591 1, /* vec_reassoc_width. */
1592 2, /* min_div_recip_mul_sf. */
1593 2, /* min_div_recip_mul_df. */
1594 0, /* max_case_values. */
1595 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1596 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1597 &thunderxt88_prefetch_tune
1600 static const struct tune_params thunderx_tunings =
1602 &thunderx_extra_costs,
1603 &generic_addrcost_table,
1604 &thunderx_regmove_cost,
1605 &thunderx_vector_cost,
1606 &generic_branch_cost,
1607 &generic_approx_modes,
1608 SVE_NOT_IMPLEMENTED, /* sve_width */
1609 { 6, /* load_int. */
1610 6, /* store_int. */
1611 6, /* load_fp. */
1612 6, /* store_fp. */
1613 6, /* load_pred. */
1614 6 /* store_pred. */
1615 }, /* memmov_cost. */
1616 2, /* issue_rate */
1617 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1618 "8", /* function_align. */
1619 "8", /* jump_align. */
1620 "8", /* loop_align. */
1621 2, /* int_reassoc_width. */
1622 4, /* fp_reassoc_width. */
1623 1, /* fma_reassoc_width. */
1624 1, /* vec_reassoc_width. */
1625 2, /* min_div_recip_mul_sf. */
1626 2, /* min_div_recip_mul_df. */
1627 0, /* max_case_values. */
1628 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1629 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1630 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1631 &thunderx_prefetch_tune
1634 static const struct tune_params tsv110_tunings =
1636 &tsv110_extra_costs,
1637 &tsv110_addrcost_table,
1638 &tsv110_regmove_cost,
1639 &tsv110_vector_cost,
1640 &generic_branch_cost,
1641 &generic_approx_modes,
1642 SVE_NOT_IMPLEMENTED, /* sve_width */
1643 { 4, /* load_int. */
1644 4, /* store_int. */
1645 4, /* load_fp. */
1646 4, /* store_fp. */
1647 4, /* load_pred. */
1648 4 /* store_pred. */
1649 }, /* memmov_cost. */
1650 4, /* issue_rate */
1651 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1652 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1653 "16", /* function_align. */
1654 "4", /* jump_align. */
1655 "8", /* loop_align. */
1656 2, /* int_reassoc_width. */
1657 4, /* fp_reassoc_width. */
1658 1, /* fma_reassoc_width. */
1659 1, /* vec_reassoc_width. */
1660 2, /* min_div_recip_mul_sf. */
1661 2, /* min_div_recip_mul_df. */
1662 0, /* max_case_values. */
1663 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1664 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1665 &tsv110_prefetch_tune
1668 static const struct tune_params xgene1_tunings =
1670 &xgene1_extra_costs,
1671 &xgene1_addrcost_table,
1672 &xgene1_regmove_cost,
1673 &xgene1_vector_cost,
1674 &generic_branch_cost,
1675 &xgene1_approx_modes,
1676 SVE_NOT_IMPLEMENTED, /* sve_width */
1677 { 6, /* load_int. */
1678 6, /* store_int. */
1679 6, /* load_fp. */
1680 6, /* store_fp. */
1681 6, /* load_pred. */
1682 6 /* store_pred. */
1683 }, /* memmov_cost. */
1684 4, /* issue_rate */
1685 AARCH64_FUSE_NOTHING, /* fusible_ops */
1686 "16", /* function_align. */
1687 "16", /* jump_align. */
1688 "16", /* loop_align. */
1689 2, /* int_reassoc_width. */
1690 4, /* fp_reassoc_width. */
1691 1, /* fma_reassoc_width. */
1692 1, /* vec_reassoc_width. */
1693 2, /* min_div_recip_mul_sf. */
1694 2, /* min_div_recip_mul_df. */
1695 17, /* max_case_values. */
1696 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1697 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1698 &xgene1_prefetch_tune
1701 static const struct tune_params emag_tunings =
1703 &xgene1_extra_costs,
1704 &xgene1_addrcost_table,
1705 &xgene1_regmove_cost,
1706 &xgene1_vector_cost,
1707 &generic_branch_cost,
1708 &xgene1_approx_modes,
1709 SVE_NOT_IMPLEMENTED,
1710 { 6, /* load_int. */
1711 6, /* store_int. */
1712 6, /* load_fp. */
1713 6, /* store_fp. */
1714 6, /* load_pred. */
1715 6 /* store_pred. */
1716 }, /* memmov_cost. */
1717 4, /* issue_rate */
1718 AARCH64_FUSE_NOTHING, /* fusible_ops */
1719 "16", /* function_align. */
1720 "16", /* jump_align. */
1721 "16", /* loop_align. */
1722 2, /* int_reassoc_width. */
1723 4, /* fp_reassoc_width. */
1724 1, /* fma_reassoc_width. */
1725 1, /* vec_reassoc_width. */
1726 2, /* min_div_recip_mul_sf. */
1727 2, /* min_div_recip_mul_df. */
1728 17, /* max_case_values. */
1729 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1730 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1731 &xgene1_prefetch_tune
1734 static const struct tune_params qdf24xx_tunings =
1736 &qdf24xx_extra_costs,
1737 &qdf24xx_addrcost_table,
1738 &qdf24xx_regmove_cost,
1739 &qdf24xx_vector_cost,
1740 &generic_branch_cost,
1741 &generic_approx_modes,
1742 SVE_NOT_IMPLEMENTED, /* sve_width */
1743 { 4, /* load_int. */
1744 4, /* store_int. */
1745 4, /* load_fp. */
1746 4, /* store_fp. */
1747 4, /* load_pred. */
1748 4 /* store_pred. */
1749 }, /* memmov_cost. */
1750 4, /* issue_rate */
1751 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1752 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1753 "16", /* function_align. */
1754 "8", /* jump_align. */
1755 "16", /* loop_align. */
1756 2, /* int_reassoc_width. */
1757 4, /* fp_reassoc_width. */
1758 1, /* fma_reassoc_width. */
1759 1, /* vec_reassoc_width. */
1760 2, /* min_div_recip_mul_sf. */
1761 2, /* min_div_recip_mul_df. */
1762 0, /* max_case_values. */
1763 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1764 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1765 &qdf24xx_prefetch_tune
1768 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1769 for now. */
1770 static const struct tune_params saphira_tunings =
1772 &generic_extra_costs,
1773 &generic_addrcost_table,
1774 &generic_regmove_cost,
1775 &generic_vector_cost,
1776 &generic_branch_cost,
1777 &generic_approx_modes,
1778 SVE_NOT_IMPLEMENTED, /* sve_width */
1779 { 4, /* load_int. */
1780 4, /* store_int. */
1781 4, /* load_fp. */
1782 4, /* store_fp. */
1783 4, /* load_pred. */
1784 4 /* store_pred. */
1785 }, /* memmov_cost. */
1786 4, /* issue_rate */
1787 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1788 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1789 "16", /* function_align. */
1790 "8", /* jump_align. */
1791 "16", /* loop_align. */
1792 2, /* int_reassoc_width. */
1793 4, /* fp_reassoc_width. */
1794 1, /* fma_reassoc_width. */
1795 1, /* vec_reassoc_width. */
1796 2, /* min_div_recip_mul_sf. */
1797 2, /* min_div_recip_mul_df. */
1798 0, /* max_case_values. */
1799 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1800 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1801 &generic_prefetch_tune
1804 static const struct tune_params thunderx2t99_tunings =
1806 &thunderx2t99_extra_costs,
1807 &thunderx2t99_addrcost_table,
1808 &thunderx2t99_regmove_cost,
1809 &thunderx2t99_vector_cost,
1810 &generic_branch_cost,
1811 &generic_approx_modes,
1812 SVE_NOT_IMPLEMENTED, /* sve_width */
1813 { 4, /* load_int. */
1814 4, /* store_int. */
1815 4, /* load_fp. */
1816 4, /* store_fp. */
1817 4, /* load_pred. */
1818 4 /* store_pred. */
1819 }, /* memmov_cost. */
1820 4, /* issue_rate. */
1821 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1822 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1823 "16", /* function_align. */
1824 "8", /* jump_align. */
1825 "16", /* loop_align. */
1826 3, /* int_reassoc_width. */
1827 2, /* fp_reassoc_width. */
1828 1, /* fma_reassoc_width. */
1829 2, /* vec_reassoc_width. */
1830 2, /* min_div_recip_mul_sf. */
1831 2, /* min_div_recip_mul_df. */
1832 0, /* max_case_values. */
1833 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1834 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1835 &thunderx2t99_prefetch_tune
1838 static const struct tune_params thunderx3t110_tunings =
1840 &thunderx3t110_extra_costs,
1841 &thunderx3t110_addrcost_table,
1842 &thunderx3t110_regmove_cost,
1843 &thunderx3t110_vector_cost,
1844 &generic_branch_cost,
1845 &generic_approx_modes,
1846 SVE_NOT_IMPLEMENTED, /* sve_width */
1847 { 4, /* load_int. */
1848 4, /* store_int. */
1849 4, /* load_fp. */
1850 4, /* store_fp. */
1851 4, /* load_pred. */
1852 4 /* store_pred. */
1853 }, /* memmov_cost. */
1854 6, /* issue_rate. */
1855 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1856 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1857 "16", /* function_align. */
1858 "8", /* jump_align. */
1859 "16", /* loop_align. */
1860 3, /* int_reassoc_width. */
1861 2, /* fp_reassoc_width. */
1862 1, /* fma_reassoc_width. */
1863 2, /* vec_reassoc_width. */
1864 2, /* min_div_recip_mul_sf. */
1865 2, /* min_div_recip_mul_df. */
1866 0, /* max_case_values. */
1867 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1868 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1869 &thunderx3t110_prefetch_tune
1872 static const struct tune_params neoversen1_tunings =
1874 &cortexa76_extra_costs,
1875 &generic_addrcost_table,
1876 &generic_regmove_cost,
1877 &cortexa57_vector_cost,
1878 &generic_branch_cost,
1879 &generic_approx_modes,
1880 SVE_NOT_IMPLEMENTED, /* sve_width */
1881 { 4, /* load_int. */
1882 2, /* store_int. */
1883 5, /* load_fp. */
1884 2, /* store_fp. */
1885 4, /* load_pred. */
1886 4 /* store_pred. */
1887 }, /* memmov_cost. */
1888 3, /* issue_rate */
1889 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1890 "32:16", /* function_align. */
1891 "4", /* jump_align. */
1892 "32:16", /* loop_align. */
1893 2, /* int_reassoc_width. */
1894 4, /* fp_reassoc_width. */
1895 1, /* fma_reassoc_width. */
1896 2, /* vec_reassoc_width. */
1897 2, /* min_div_recip_mul_sf. */
1898 2, /* min_div_recip_mul_df. */
1899 0, /* max_case_values. */
1900 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1901 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1902 &generic_prefetch_tune
1905 static const struct tune_params ampere1_tunings =
1907 &ampere1_extra_costs,
1908 &generic_addrcost_table,
1909 &generic_regmove_cost,
1910 &ampere1_vector_cost,
1911 &generic_branch_cost,
1912 &generic_approx_modes,
1913 SVE_NOT_IMPLEMENTED, /* sve_width */
1914 { 4, /* load_int. */
1915 4, /* store_int. */
1916 4, /* load_fp. */
1917 4, /* store_fp. */
1918 4, /* load_pred. */
1919 4 /* store_pred. */
1920 }, /* memmov_cost. */
1921 4, /* issue_rate */
1922 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1923 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1924 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1925 AARCH64_FUSE_CMP_BRANCH),
1926 /* fusible_ops */
1927 "32", /* function_align. */
1928 "4", /* jump_align. */
1929 "32:16", /* loop_align. */
1930 2, /* int_reassoc_width. */
1931 4, /* fp_reassoc_width. */
1932 1, /* fma_reassoc_width. */
1933 2, /* vec_reassoc_width. */
1934 2, /* min_div_recip_mul_sf. */
1935 2, /* min_div_recip_mul_df. */
1936 0, /* max_case_values. */
1937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1939 &ampere1_prefetch_tune
1942 static const struct tune_params ampere1a_tunings =
1944 &ampere1a_extra_costs,
1945 &generic_addrcost_table,
1946 &generic_regmove_cost,
1947 &ampere1_vector_cost,
1948 &generic_branch_cost,
1949 &generic_approx_modes,
1950 SVE_NOT_IMPLEMENTED, /* sve_width */
1951 { 4, /* load_int. */
1952 4, /* store_int. */
1953 4, /* load_fp. */
1954 4, /* store_fp. */
1955 4, /* load_pred. */
1956 4 /* store_pred. */
1957 }, /* memmov_cost. */
1958 4, /* issue_rate */
1959 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1960 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1961 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1962 AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1963 AARCH64_FUSE_ADDSUB_2REG_CONST1),
1964 /* fusible_ops */
1965 "32", /* function_align. */
1966 "4", /* jump_align. */
1967 "32:16", /* loop_align. */
1968 2, /* int_reassoc_width. */
1969 4, /* fp_reassoc_width. */
1970 1, /* fma_reassoc_width. */
1971 2, /* vec_reassoc_width. */
1972 2, /* min_div_recip_mul_sf. */
1973 2, /* min_div_recip_mul_df. */
1974 0, /* max_case_values. */
1975 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1976 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1977 &ampere1_prefetch_tune
1980 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1982 2, /* int_stmt_cost */
1983 2, /* fp_stmt_cost */
1984 4, /* ld2_st2_permute_cost */
1985 4, /* ld3_st3_permute_cost */
1986 5, /* ld4_st4_permute_cost */
1987 3, /* permute_cost */
1988 4, /* reduc_i8_cost */
1989 4, /* reduc_i16_cost */
1990 2, /* reduc_i32_cost */
1991 2, /* reduc_i64_cost */
1992 6, /* reduc_f16_cost */
1993 3, /* reduc_f32_cost */
1994 2, /* reduc_f64_cost */
1995 2, /* store_elt_extra_cost */
1996 /* This value is just inherited from the Cortex-A57 table. */
1997 8, /* vec_to_scalar_cost */
1998 /* This depends very much on what the scalar value is and
1999 where it comes from. E.g. some constants take two dependent
2000 instructions or a load, while others might be moved from a GPR.
2001 4 seems to be a reasonable compromise in practice. */
2002 4, /* scalar_to_vec_cost */
2003 4, /* align_load_cost */
2004 4, /* unalign_load_cost */
2005 /* Although stores have a latency of 2 and compete for the
2006 vector pipes, in practice it's better not to model that. */
2007 1, /* unalign_store_cost */
2008 1 /* store_cost */
2011 static const sve_vec_cost neoversev1_sve_vector_cost =
2014 2, /* int_stmt_cost */
2015 2, /* fp_stmt_cost */
2016 4, /* ld2_st2_permute_cost */
2017 7, /* ld3_st3_permute_cost */
2018 8, /* ld4_st4_permute_cost */
2019 3, /* permute_cost */
2020 /* Theoretically, a reduction involving 31 scalar ADDs could
2021 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
2022 completes in 14 cycles, so give it a cost of 31 + 5. */
2023 36, /* reduc_i8_cost */
2024 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
2025 22, /* reduc_i16_cost */
2026 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
2027 14, /* reduc_i32_cost */
2028 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
2029 11, /* reduc_i64_cost */
2030 /* Theoretically, a reduction involving 15 scalar FADDs could
2031 complete in ~9 cycles and would have a cost of 30. FADDV
2032 completes in 13 cycles, so give it a cost of 30 + 4. */
2033 34, /* reduc_f16_cost */
2034 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
2035 19, /* reduc_f32_cost */
2036 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
2037 11, /* reduc_f64_cost */
2038 2, /* store_elt_extra_cost */
2039 /* This value is just inherited from the Cortex-A57 table. */
2040 8, /* vec_to_scalar_cost */
2041 /* See the comment above the Advanced SIMD versions. */
2042 4, /* scalar_to_vec_cost */
2043 4, /* align_load_cost */
2044 4, /* unalign_load_cost */
2045 /* Although stores have a latency of 2 and compete for the
2046 vector pipes, in practice it's better not to model that. */
2047 1, /* unalign_store_cost */
2048 1 /* store_cost */
2050 3, /* clast_cost */
2051 19, /* fadda_f16_cost */
2052 11, /* fadda_f32_cost */
2053 8, /* fadda_f64_cost */
2054 32, /* gather_load_x32_cost */
2055 16, /* gather_load_x64_cost */
2056 3 /* scatter_store_elt_cost */
2059 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2061 3, /* loads_stores_per_cycle */
2062 2, /* stores_per_cycle */
2063 4, /* general_ops_per_cycle */
2064 0, /* fp_simd_load_general_ops */
2065 1 /* fp_simd_store_general_ops */
2068 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2071 3, /* loads_stores_per_cycle */
2072 2, /* stores_per_cycle */
2073 4, /* general_ops_per_cycle */
2074 0, /* fp_simd_load_general_ops */
2075 1 /* fp_simd_store_general_ops */
2077 2, /* ld2_st2_general_ops */
2078 2, /* ld3_st3_general_ops */
2079 3 /* ld4_st4_general_ops */
2082 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2086 2, /* loads_per_cycle */
2087 2, /* stores_per_cycle */
2088 2, /* general_ops_per_cycle */
2089 0, /* fp_simd_load_general_ops */
2090 1 /* fp_simd_store_general_ops */
2092 2, /* ld2_st2_general_ops */
2093 2, /* ld3_st3_general_ops */
2094 3 /* ld4_st4_general_ops */
2096 1, /* pred_ops_per_cycle */
2097 2, /* while_pred_ops */
2098 2, /* int_cmp_pred_ops */
2099 1, /* fp_cmp_pred_ops */
2100 1, /* gather_scatter_pair_general_ops */
2101 1 /* gather_scatter_pair_pred_ops */
2104 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2106 &neoversev1_scalar_issue_info,
2107 &neoversev1_advsimd_issue_info,
2108 &neoversev1_sve_issue_info
2111 /* Neoverse V1 costs for vector insn classes. */
2112 static const struct cpu_vector_cost neoversev1_vector_cost =
2114 1, /* scalar_int_stmt_cost */
2115 2, /* scalar_fp_stmt_cost */
2116 4, /* scalar_load_cost */
2117 1, /* scalar_store_cost */
2118 1, /* cond_taken_branch_cost */
2119 1, /* cond_not_taken_branch_cost */
2120 &neoversev1_advsimd_vector_cost, /* advsimd */
2121 &neoversev1_sve_vector_cost, /* sve */
2122 &neoversev1_vec_issue_info /* issue_info */
2125 static const struct tune_params neoversev1_tunings =
2127 &cortexa76_extra_costs,
2128 &neoversev1_addrcost_table,
2129 &neoversev1_regmove_cost,
2130 &neoversev1_vector_cost,
2131 &generic_branch_cost,
2132 &generic_approx_modes,
2133 SVE_256, /* sve_width */
2134 { 4, /* load_int. */
2135 2, /* store_int. */
2136 6, /* load_fp. */
2137 2, /* store_fp. */
2138 6, /* load_pred. */
2139 1 /* store_pred. */
2140 }, /* memmov_cost. */
2141 3, /* issue_rate */
2142 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2143 "32:16", /* function_align. */
2144 "4", /* jump_align. */
2145 "32:16", /* loop_align. */
2146 2, /* int_reassoc_width. */
2147 4, /* fp_reassoc_width. */
2148 4, /* fma_reassoc_width. */
2149 2, /* vec_reassoc_width. */
2150 2, /* min_div_recip_mul_sf. */
2151 2, /* min_div_recip_mul_df. */
2152 0, /* max_case_values. */
2153 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2154 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2155 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2156 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2157 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
2158 &generic_prefetch_tune
2161 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2164 2, /* int_stmt_cost */
2165 2, /* fp_stmt_cost */
2166 4, /* ld2_st2_permute_cost */
2167 5, /* ld3_st3_permute_cost */
2168 5, /* ld4_st4_permute_cost */
2169 3, /* permute_cost */
2170 /* Theoretically, a reduction involving 15 scalar ADDs could
2171 complete in ~5 cycles and would have a cost of 15. Assume that
2172 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2173 21, /* reduc_i8_cost */
2174 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2175 13, /* reduc_i16_cost */
2176 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2177 9, /* reduc_i32_cost */
2178 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2179 8, /* reduc_i64_cost */
2180 /* Theoretically, a reduction involving 7 scalar FADDs could
2181 complete in ~6 cycles and would have a cost of 14. Assume that
2182 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2183 16, /* reduc_f16_cost */
2184 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2185 8, /* reduc_f32_cost */
2186 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2187 4, /* reduc_f64_cost */
2188 2, /* store_elt_extra_cost */
2189 /* This value is just inherited from the Cortex-A57 table. */
2190 8, /* vec_to_scalar_cost */
2191 /* This depends very much on what the scalar value is and
2192 where it comes from. E.g. some constants take two dependent
2193 instructions or a load, while others might be moved from a GPR.
2194 4 seems to be a reasonable compromise in practice. */
2195 4, /* scalar_to_vec_cost */
2196 4, /* align_load_cost */
2197 4, /* unalign_load_cost */
2198 /* Although stores generally have a latency of 2 and compete for the
2199 vector pipes, in practice it's better not to model that. */
2200 1, /* unalign_store_cost */
2201 1 /* store_cost */
2203 3, /* clast_cost */
2204 10, /* fadda_f16_cost */
2205 6, /* fadda_f32_cost */
2206 4, /* fadda_f64_cost */
2207 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2208 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2209 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2210 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2211 (cost 2) to that, to avoid the difference being lost in rounding.
2213 There is no easy comparison between a strided Advanced SIMD x32 load
2214 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2215 operation more than a 64-bit gather. */
2216 14, /* gather_load_x32_cost */
2217 12, /* gather_load_x64_cost */
2218 3 /* scatter_store_elt_cost */
2221 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2225 3, /* loads_per_cycle */
2226 2, /* stores_per_cycle */
2227 4, /* general_ops_per_cycle */
2228 0, /* fp_simd_load_general_ops */
2229 1 /* fp_simd_store_general_ops */
2231 2, /* ld2_st2_general_ops */
2232 2, /* ld3_st3_general_ops */
2233 3 /* ld4_st4_general_ops */
2235 2, /* pred_ops_per_cycle */
2236 2, /* while_pred_ops */
2237 2, /* int_cmp_pred_ops */
2238 1, /* fp_cmp_pred_ops */
2239 1, /* gather_scatter_pair_general_ops */
2240 1 /* gather_scatter_pair_pred_ops */
2243 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2245 &neoversev1_scalar_issue_info,
2246 &neoversev1_advsimd_issue_info,
2247 &neoverse512tvb_sve_issue_info
2250 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2252 1, /* scalar_int_stmt_cost */
2253 2, /* scalar_fp_stmt_cost */
2254 4, /* scalar_load_cost */
2255 1, /* scalar_store_cost */
2256 1, /* cond_taken_branch_cost */
2257 1, /* cond_not_taken_branch_cost */
2258 &neoversev1_advsimd_vector_cost, /* advsimd */
2259 &neoverse512tvb_sve_vector_cost, /* sve */
2260 &neoverse512tvb_vec_issue_info /* issue_info */
2263 static const struct tune_params neoverse512tvb_tunings =
2265 &cortexa76_extra_costs,
2266 &neoversev1_addrcost_table,
2267 &neoversev1_regmove_cost,
2268 &neoverse512tvb_vector_cost,
2269 &generic_branch_cost,
2270 &generic_approx_modes,
2271 SVE_128 | SVE_256, /* sve_width */
2272 { 4, /* load_int. */
2273 2, /* store_int. */
2274 6, /* load_fp. */
2275 2, /* store_fp. */
2276 6, /* load_pred. */
2277 1 /* store_pred. */
2278 }, /* memmov_cost. */
2279 3, /* issue_rate */
2280 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2281 "32:16", /* function_align. */
2282 "4", /* jump_align. */
2283 "32:16", /* loop_align. */
2284 2, /* int_reassoc_width. */
2285 4, /* fp_reassoc_width. */
2286 4, /* fma_reassoc_width. */
2287 2, /* vec_reassoc_width. */
2288 2, /* min_div_recip_mul_sf. */
2289 2, /* min_div_recip_mul_df. */
2290 0, /* max_case_values. */
2291 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2292 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2293 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2294 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2295 &generic_prefetch_tune
2298 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2300 2, /* int_stmt_cost */
2301 2, /* fp_stmt_cost */
2302 2, /* ld2_st2_permute_cost */
2303 2, /* ld3_st3_permute_cost */
2304 3, /* ld4_st4_permute_cost */
2305 3, /* permute_cost */
2306 4, /* reduc_i8_cost */
2307 4, /* reduc_i16_cost */
2308 2, /* reduc_i32_cost */
2309 2, /* reduc_i64_cost */
2310 6, /* reduc_f16_cost */
2311 4, /* reduc_f32_cost */
2312 2, /* reduc_f64_cost */
2313 2, /* store_elt_extra_cost */
2314 /* This value is just inherited from the Cortex-A57 table. */
2315 8, /* vec_to_scalar_cost */
2316 /* This depends very much on what the scalar value is and
2317 where it comes from. E.g. some constants take two dependent
2318 instructions or a load, while others might be moved from a GPR.
2319 4 seems to be a reasonable compromise in practice. */
2320 4, /* scalar_to_vec_cost */
2321 4, /* align_load_cost */
2322 4, /* unalign_load_cost */
2323 /* Although stores have a latency of 2 and compete for the
2324 vector pipes, in practice it's better not to model that. */
2325 1, /* unalign_store_cost */
2326 1 /* store_cost */
2329 static const sve_vec_cost neoversen2_sve_vector_cost =
2332 2, /* int_stmt_cost */
2333 2, /* fp_stmt_cost */
2334 3, /* ld2_st2_permute_cost */
2335 4, /* ld3_st3_permute_cost */
2336 4, /* ld4_st4_permute_cost */
2337 3, /* permute_cost */
2338 /* Theoretically, a reduction involving 15 scalar ADDs could
2339 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2340 completes in 11 cycles, so give it a cost of 15 + 6. */
2341 21, /* reduc_i8_cost */
2342 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2343 13, /* reduc_i16_cost */
2344 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2345 9, /* reduc_i32_cost */
2346 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2347 2, /* reduc_i64_cost */
2348 /* Theoretically, a reduction involving 7 scalar FADDs could
2349 complete in ~8 cycles and would have a cost of 14. FADDV
2350 completes in 6 cycles, so give it a cost of 14 - 2. */
2351 12, /* reduc_f16_cost */
2352 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2353 6, /* reduc_f32_cost */
2354 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2355 2, /* reduc_f64_cost */
2356 2, /* store_elt_extra_cost */
2357 /* This value is just inherited from the Cortex-A57 table. */
2358 8, /* vec_to_scalar_cost */
2359 /* See the comment above the Advanced SIMD versions. */
2360 4, /* scalar_to_vec_cost */
2361 4, /* align_load_cost */
2362 4, /* unalign_load_cost */
2363 /* Although stores have a latency of 2 and compete for the
2364 vector pipes, in practice it's better not to model that. */
2365 1, /* unalign_store_cost */
2366 1 /* store_cost */
2368 3, /* clast_cost */
2369 10, /* fadda_f16_cost */
2370 6, /* fadda_f32_cost */
2371 4, /* fadda_f64_cost */
2372 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2373 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2374 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2375 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2376 (cost 2) to that, to avoid the difference being lost in rounding.
2378 There is no easy comparison between a strided Advanced SIMD x32 load
2379 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2380 operation more than a 64-bit gather. */
2381 14, /* gather_load_x32_cost */
2382 12, /* gather_load_x64_cost */
2383 3 /* scatter_store_elt_cost */
2386 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2388 3, /* loads_stores_per_cycle */
2389 2, /* stores_per_cycle */
2390 4, /* general_ops_per_cycle */
2391 0, /* fp_simd_load_general_ops */
2392 1 /* fp_simd_store_general_ops */
2395 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2398 3, /* loads_stores_per_cycle */
2399 2, /* stores_per_cycle */
2400 2, /* general_ops_per_cycle */
2401 0, /* fp_simd_load_general_ops */
2402 1 /* fp_simd_store_general_ops */
2404 2, /* ld2_st2_general_ops */
2405 2, /* ld3_st3_general_ops */
2406 3 /* ld4_st4_general_ops */
2409 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2413 3, /* loads_per_cycle */
2414 2, /* stores_per_cycle */
2415 2, /* general_ops_per_cycle */
2416 0, /* fp_simd_load_general_ops */
2417 1 /* fp_simd_store_general_ops */
2419 2, /* ld2_st2_general_ops */
2420 3, /* ld3_st3_general_ops */
2421 3 /* ld4_st4_general_ops */
2423 2, /* pred_ops_per_cycle */
2424 2, /* while_pred_ops */
2425 2, /* int_cmp_pred_ops */
2426 1, /* fp_cmp_pred_ops */
2427 1, /* gather_scatter_pair_general_ops */
2428 1 /* gather_scatter_pair_pred_ops */
2431 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2433 &neoversen2_scalar_issue_info,
2434 &neoversen2_advsimd_issue_info,
2435 &neoversen2_sve_issue_info
2438 /* Neoverse N2 costs for vector insn classes. */
2439 static const struct cpu_vector_cost neoversen2_vector_cost =
2441 1, /* scalar_int_stmt_cost */
2442 2, /* scalar_fp_stmt_cost */
2443 4, /* scalar_load_cost */
2444 1, /* scalar_store_cost */
2445 1, /* cond_taken_branch_cost */
2446 1, /* cond_not_taken_branch_cost */
2447 &neoversen2_advsimd_vector_cost, /* advsimd */
2448 &neoversen2_sve_vector_cost, /* sve */
2449 &neoversen2_vec_issue_info /* issue_info */
2452 static const struct tune_params neoversen2_tunings =
2454 &cortexa76_extra_costs,
2455 &neoversen2_addrcost_table,
2456 &neoversen2_regmove_cost,
2457 &neoversen2_vector_cost,
2458 &generic_branch_cost,
2459 &generic_approx_modes,
2460 SVE_128, /* sve_width */
2461 { 4, /* load_int. */
2462 1, /* store_int. */
2463 6, /* load_fp. */
2464 2, /* store_fp. */
2465 6, /* load_pred. */
2466 1 /* store_pred. */
2467 }, /* memmov_cost. */
2468 3, /* issue_rate */
2469 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2470 "32:16", /* function_align. */
2471 "4", /* jump_align. */
2472 "32:16", /* loop_align. */
2473 2, /* int_reassoc_width. */
2474 4, /* fp_reassoc_width. */
2475 1, /* fma_reassoc_width. */
2476 2, /* vec_reassoc_width. */
2477 2, /* min_div_recip_mul_sf. */
2478 2, /* min_div_recip_mul_df. */
2479 0, /* max_case_values. */
2480 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2481 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2482 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2483 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2484 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2485 &generic_prefetch_tune
2488 static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
2490 2, /* int_stmt_cost */
2491 2, /* fp_stmt_cost */
2492 2, /* ld2_st2_permute_cost */
2493 2, /* ld3_st3_permute_cost */
2494 3, /* ld4_st4_permute_cost */
2495 3, /* permute_cost */
2496 4, /* reduc_i8_cost */
2497 4, /* reduc_i16_cost */
2498 2, /* reduc_i32_cost */
2499 2, /* reduc_i64_cost */
2500 6, /* reduc_f16_cost */
2501 3, /* reduc_f32_cost */
2502 2, /* reduc_f64_cost */
2503 2, /* store_elt_extra_cost */
2504 /* This value is just inherited from the Cortex-A57 table. */
2505 8, /* vec_to_scalar_cost */
2506 /* This depends very much on what the scalar value is and
2507 where it comes from. E.g. some constants take two dependent
2508 instructions or a load, while others might be moved from a GPR.
2509 4 seems to be a reasonable compromise in practice. */
2510 4, /* scalar_to_vec_cost */
2511 4, /* align_load_cost */
2512 4, /* unalign_load_cost */
2513 /* Although stores have a latency of 2 and compete for the
2514 vector pipes, in practice it's better not to model that. */
2515 1, /* unalign_store_cost */
2516 1 /* store_cost */
2519 static const sve_vec_cost neoversev2_sve_vector_cost =
2522 2, /* int_stmt_cost */
2523 2, /* fp_stmt_cost */
2524 3, /* ld2_st2_permute_cost */
2525 3, /* ld3_st3_permute_cost */
2526 4, /* ld4_st4_permute_cost */
2527 3, /* permute_cost */
2528 /* Theoretically, a reduction involving 15 scalar ADDs could
2529 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2530 completes in 11 cycles, so give it a cost of 15 + 8. */
2531 21, /* reduc_i8_cost */
2532 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2533 14, /* reduc_i16_cost */
2534 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2535 7, /* reduc_i32_cost */
2536 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2537 2, /* reduc_i64_cost */
2538 /* Theoretically, a reduction involving 7 scalar FADDs could
2539 complete in ~6 cycles and would have a cost of 14. FADDV
2540 completes in 8 cycles, so give it a cost of 14 + 2. */
2541 16, /* reduc_f16_cost */
2542 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2543 8, /* reduc_f32_cost */
2544 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2545 4, /* reduc_f64_cost */
2546 2, /* store_elt_extra_cost */
2547 /* This value is just inherited from the Cortex-A57 table. */
2548 8, /* vec_to_scalar_cost */
2549 /* See the comment above the Advanced SIMD versions. */
2550 4, /* scalar_to_vec_cost */
2551 4, /* align_load_cost */
2552 4, /* unalign_load_cost */
2553 /* Although stores have a latency of 2 and compete for the
2554 vector pipes, in practice it's better not to model that. */
2555 1, /* unalign_store_cost */
2556 1 /* store_cost */
2558 3, /* clast_cost */
2559 10, /* fadda_f16_cost */
2560 6, /* fadda_f32_cost */
2561 4, /* fadda_f64_cost */
2562 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2563 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2564 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2565 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2566 (cost 2) to that, to avoid the difference being lost in rounding.
2568 There is no easy comparison between a strided Advanced SIMD x32 load
2569 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2570 operation more than a 64-bit gather. */
2571 14, /* gather_load_x32_cost */
2572 12, /* gather_load_x64_cost */
2573 3 /* scatter_store_elt_cost */
2576 static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
2578 3, /* loads_stores_per_cycle */
2579 2, /* stores_per_cycle */
2580 6, /* general_ops_per_cycle */
2581 0, /* fp_simd_load_general_ops */
2582 1 /* fp_simd_store_general_ops */
2585 static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
2588 3, /* loads_stores_per_cycle */
2589 2, /* stores_per_cycle */
2590 4, /* general_ops_per_cycle */
2591 0, /* fp_simd_load_general_ops */
2592 1 /* fp_simd_store_general_ops */
2594 2, /* ld2_st2_general_ops */
2595 2, /* ld3_st3_general_ops */
2596 3 /* ld4_st4_general_ops */
2599 static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
2603 3, /* loads_per_cycle */
2604 2, /* stores_per_cycle */
2605 4, /* general_ops_per_cycle */
2606 0, /* fp_simd_load_general_ops */
2607 1 /* fp_simd_store_general_ops */
2609 2, /* ld2_st2_general_ops */
2610 3, /* ld3_st3_general_ops */
2611 3 /* ld4_st4_general_ops */
2613 2, /* pred_ops_per_cycle */
2614 2, /* while_pred_ops */
2615 2, /* int_cmp_pred_ops */
2616 1, /* fp_cmp_pred_ops */
2617 1, /* gather_scatter_pair_general_ops */
2618 1 /* gather_scatter_pair_pred_ops */
2621 static const aarch64_vec_issue_info neoversev2_vec_issue_info =
2623 &neoversev2_scalar_issue_info,
2624 &neoversev2_advsimd_issue_info,
2625 &neoversev2_sve_issue_info
2628 /* Neoverse V2 (Demeter) costs for vector insn classes. */
2629 static const struct cpu_vector_cost neoversev2_vector_cost =
2631 1, /* scalar_int_stmt_cost */
2632 2, /* scalar_fp_stmt_cost */
2633 4, /* scalar_load_cost */
2634 1, /* scalar_store_cost */
2635 1, /* cond_taken_branch_cost */
2636 1, /* cond_not_taken_branch_cost */
2637 &neoversev2_advsimd_vector_cost, /* advsimd */
2638 &neoversev2_sve_vector_cost, /* sve */
2639 &neoversev2_vec_issue_info /* issue_info */
2642 static const struct tune_params neoversev2_tunings =
2644 &cortexa76_extra_costs,
2645 &neoversev2_addrcost_table,
2646 &neoversev2_regmove_cost,
2647 &neoversev2_vector_cost,
2648 &generic_branch_cost,
2649 &generic_approx_modes,
2650 SVE_128, /* sve_width */
2651 { 4, /* load_int. */
2652 2, /* store_int. */
2653 6, /* load_fp. */
2654 1, /* store_fp. */
2655 6, /* load_pred. */
2656 2 /* store_pred. */
2657 }, /* memmov_cost. */
2658 5, /* issue_rate */
2659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2660 "32:16", /* function_align. */
2661 "4", /* jump_align. */
2662 "32:16", /* loop_align. */
2663 3, /* int_reassoc_width. */
2664 6, /* fp_reassoc_width. */
2665 4, /* fma_reassoc_width. */
2666 3, /* vec_reassoc_width. */
2667 2, /* min_div_recip_mul_sf. */
2668 2, /* min_div_recip_mul_df. */
2669 0, /* max_case_values. */
2670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2671 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2672 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2673 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2674 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2675 &generic_prefetch_tune
2678 static const struct tune_params a64fx_tunings =
2680 &a64fx_extra_costs,
2681 &a64fx_addrcost_table,
2682 &a64fx_regmove_cost,
2683 &a64fx_vector_cost,
2684 &generic_branch_cost,
2685 &generic_approx_modes,
2686 SVE_512, /* sve_width */
2687 { 4, /* load_int. */
2688 4, /* store_int. */
2689 4, /* load_fp. */
2690 4, /* store_fp. */
2691 4, /* load_pred. */
2692 4 /* store_pred. */
2693 }, /* memmov_cost. */
2694 7, /* issue_rate */
2695 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2696 "32", /* function_align. */
2697 "16", /* jump_align. */
2698 "32", /* loop_align. */
2699 4, /* int_reassoc_width. */
2700 2, /* fp_reassoc_width. */
2701 1, /* fma_reassoc_width. */
2702 2, /* vec_reassoc_width. */
2703 2, /* min_div_recip_mul_sf. */
2704 2, /* min_div_recip_mul_df. */
2705 0, /* max_case_values. */
2706 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2707 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2708 &a64fx_prefetch_tune
2711 /* Support for fine-grained override of the tuning structures. */
2712 struct aarch64_tuning_override_function
2714 const char* name;
2715 void (*parse_override)(const char*, struct tune_params*);
2718 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2719 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2720 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2722 static const struct aarch64_tuning_override_function
2723 aarch64_tuning_override_functions[] =
2725 { "fuse", aarch64_parse_fuse_string },
2726 { "tune", aarch64_parse_tune_string },
2727 { "sve_width", aarch64_parse_sve_width_string },
2728 { NULL, NULL }
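/* These entries back the -moverride option string: each name=value pair in
   the string is looked up in this table and handed to its parser, so an
   override such as "sve_width=256" would be routed to
   aarch64_parse_sve_width_string.  (Illustrative example only; see the
   option documentation for the full syntax.)  */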
2731 /* A processor implementing AArch64. */
2732 struct processor
2734 const char *name;
2735 aarch64_processor ident;
2736 aarch64_processor sched_core;
2737 aarch64_arch arch;
2738 aarch64_feature_flags flags;
2739 const tune_params *tune;
2742 /* Architectures implementing AArch64. */
2743 static CONSTEXPR const processor all_architectures[] =
2745 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
2746 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
2747 feature_deps::ARCH_IDENT ().enable, NULL},
2748 #include "aarch64-arches.def"
2749 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2752 /* Processor cores implementing AArch64. */
2753 static const struct processor all_cores[] =
2755 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
2756 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2757 feature_deps::cpu_##IDENT, &COSTS##_tunings},
2758 #include "aarch64-cores.def"
2759 {"generic", generic, cortexa53, AARCH64_ARCH_V8A,
2760 feature_deps::V8A ().enable, &generic_tunings},
2761 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2764 /* The current tuning set. */
2765 struct tune_params aarch64_tune_params = generic_tunings;
2767 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2769 static tree
2770 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2771 int, bool *no_add_attrs)
2773 /* Since we set fn_type_req to true, the caller should have checked
2774 this for us. */
2775 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2776 switch ((arm_pcs) fntype_abi (*node).id ())
2778 case ARM_PCS_AAPCS64:
2779 case ARM_PCS_SIMD:
2780 return NULL_TREE;
2782 case ARM_PCS_SVE:
2783 error ("the %qE attribute cannot be applied to an SVE function type",
2784 name);
2785 *no_add_attrs = true;
2786 return NULL_TREE;
2788 case ARM_PCS_TLSDESC:
2789 case ARM_PCS_UNKNOWN:
2790 break;
2792 gcc_unreachable ();
2795 /* Table of machine attributes. */
2796 static const struct attribute_spec aarch64_attribute_table[] =
2798 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2799 affects_type_identity, handler, exclude } */
2800 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2801 handle_aarch64_vector_pcs_attribute, NULL },
2802 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2803 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2804 NULL },
2805 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
2806 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
2807 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
2808 { NULL, 0, 0, false, false, false, false, NULL, NULL }
2811 /* An ISA extension in the co-processor and main instruction set space. */
2812 struct aarch64_option_extension
2814 const char *const name;
2815 const unsigned long flags_on;
2816 const unsigned long flags_off;
2819 typedef enum aarch64_cond_code
2821 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2822 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2823 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2825 aarch64_cc;
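/* The inverse of condition code X.  Flipping the low bit works because the
   aarch64_cond_code enumeration lists each condition next to its inverse
   (EQ/NE, CS/CC, ...); AL and NV have no meaningful inverse.  */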
2827 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
2829 struct aarch64_branch_protect_type
2831 /* The type's name that the user passes to the branch-protection option
2832 string. */
2833 const char* name;
2834 /* Function to handle the protection type and set global variables.
2835 The first argument is the string token corresponding to this type and the
2836 second argument is the next token in the option string.
2837 Return values:
2838 * AARCH64_PARSE_OK: Handling was successful.
2839 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
2840 should print an error.
2841 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
2842 own error. */
2843 enum aarch64_parse_opt_result (*handler)(char*, char*);
2844 /* A list of types that can follow this type in the option string. */
2845 const aarch64_branch_protect_type* subtypes;
2846 unsigned int num_subtypes;
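/* Handler for the "none" branch-protection type: disable return-address
   signing and BTI, and reject any trailing token in REST.  */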
2849 static enum aarch64_parse_opt_result
2850 aarch64_handle_no_branch_protection (char* str, char* rest)
2852 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2853 aarch64_enable_bti = 0;
2854 if (rest)
2856 error ("unexpected %<%s%> after %<%s%>", rest, str);
2857 return AARCH64_PARSE_INVALID_FEATURE;
2859 return AARCH64_PARSE_OK;
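/* Handler for the "standard" branch-protection type: sign the return
   addresses of non-leaf functions with the A key and enable BTI, rejecting
   any trailing token in REST.  */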
2862 static enum aarch64_parse_opt_result
2863 aarch64_handle_standard_branch_protection (char* str, char* rest)
2865 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2866 aarch64_ra_sign_key = AARCH64_KEY_A;
2867 aarch64_enable_bti = 1;
2868 if (rest)
2870 error ("unexpected %<%s%> after %<%s%>", rest, str);
2871 return AARCH64_PARSE_INVALID_FEATURE;
2873 return AARCH64_PARSE_OK;
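/* Handler for the "pac-ret" branch-protection type: sign the return
   addresses of non-leaf functions with the A key.  */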
2876 static enum aarch64_parse_opt_result
2877 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2878 char* rest ATTRIBUTE_UNUSED)
2880 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2881 aarch64_ra_sign_key = AARCH64_KEY_A;
2882 return AARCH64_PARSE_OK;
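/* Handler for the "leaf" subtype of "pac-ret": extend return-address
   signing to all functions, including leaf functions.  */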
2885 static enum aarch64_parse_opt_result
2886 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2887 char* rest ATTRIBUTE_UNUSED)
2889 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2890 return AARCH64_PARSE_OK;
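/* Handler for the "b-key" subtype of "pac-ret": sign return addresses with
   the B key instead of the A key.  */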
2893 static enum aarch64_parse_opt_result
2894 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2895 char* rest ATTRIBUTE_UNUSED)
2897 aarch64_ra_sign_key = AARCH64_KEY_B;
2898 return AARCH64_PARSE_OK;
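/* Handler for the "bti" branch-protection type: enable Branch Target
   Identification.  */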
2901 static enum aarch64_parse_opt_result
2902 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2903 char* rest ATTRIBUTE_UNUSED)
2905 aarch64_enable_bti = 1;
2906 return AARCH64_PARSE_OK;
2909 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2910 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2911 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2912 { NULL, NULL, NULL, 0 }
2915 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2916 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2917 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2918 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2919 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2920 { "bti", aarch64_handle_bti_protection, NULL, 0 },
2921 { NULL, NULL, NULL, 0 }
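/* For example, a branch-protection string such as "pac-ret+leaf" is
   dispatched first to aarch64_handle_pac_ret_protection and then to the
   "leaf" handler in aarch64_pac_ret_subtypes.  (Illustrative only; the
   option documentation describes the full grammar.)  */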
2924 /* The condition codes of the processor, and the inverse function. */
2925 static const char * const aarch64_condition_codes[] =
2927 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2928 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2931 /* The preferred condition codes for SVE conditions. */
2932 static const char *const aarch64_sve_condition_codes[] =
2934 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2935 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2938 /* Return the assembly token for svpattern value VALUE. */
2940 static const char *
2941 svpattern_token (enum aarch64_svpattern pattern)
2943 switch (pattern)
2945 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2946 AARCH64_FOR_SVPATTERN (CASE)
2947 #undef CASE
2948 case AARCH64_NUM_SVPATTERNS:
2949 break;
2951 gcc_unreachable ();
2954 /* Return the location of a piece that is known to be passed or returned
2955 in registers. FIRST_ZR is the first unused vector argument register
2956 and FIRST_PR is the first unused predicate argument register. */
2958 rtx
2959 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2960 unsigned int first_pr) const
2962 gcc_assert (VECTOR_MODE_P (mode)
2963 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2964 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2966 if (num_zr > 0 && num_pr == 0)
2967 return gen_rtx_REG (mode, first_zr);
2969 if (num_zr == 0 && num_pr == 1)
2970 return gen_rtx_REG (mode, first_pr);
2972 gcc_unreachable ();
2975 /* Return the total number of vector registers required by the PST. */
2977 unsigned int
2978 pure_scalable_type_info::num_zr () const
2980 unsigned int res = 0;
2981 for (unsigned int i = 0; i < pieces.length (); ++i)
2982 res += pieces[i].num_zr;
2983 return res;
2986 /* Return the total number of predicate registers required by the PST. */
2988 unsigned int
2989 pure_scalable_type_info::num_pr () const
2991 unsigned int res = 0;
2992 for (unsigned int i = 0; i < pieces.length (); ++i)
2993 res += pieces[i].num_pr;
2994 return res;
2997 /* Return the location of a PST that is known to be passed or returned
2998 in registers. FIRST_ZR is the first unused vector argument register
2999 and FIRST_PR is the first unused predicate argument register. */
3001 rtx
3002 pure_scalable_type_info::get_rtx (machine_mode mode,
3003 unsigned int first_zr,
3004 unsigned int first_pr) const
3006 /* Try to return a single REG if possible. This leads to better
3007 code generation; it isn't required for correctness. */
3008 if (mode == pieces[0].mode)
3010 gcc_assert (pieces.length () == 1);
3011 return pieces[0].get_rtx (first_zr, first_pr);
3014 /* Build up a PARALLEL that contains the individual pieces. */
3015 rtvec rtxes = rtvec_alloc (pieces.length ());
3016 for (unsigned int i = 0; i < pieces.length (); ++i)
3018 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
3019 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
3020 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
3021 first_zr += pieces[i].num_zr;
3022 first_pr += pieces[i].num_pr;
3024 return gen_rtx_PARALLEL (mode, rtxes);
3027 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
3028 in the AAPCS64. */
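/* As an informal illustration of these rules, a structure such as
     struct pst { svfloat32_t v; svbool_t p; };
   is a Pure Scalable Type (one vector piece plus one predicate piece),
   whereas adding a scalar member such as an int would stop it from
   being one.  */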
3030 pure_scalable_type_info::analysis_result
3031 pure_scalable_type_info::analyze (const_tree type)
3033 /* Prevent accidental reuse. */
3034 gcc_assert (pieces.is_empty ());
3036 /* No code will be generated for erroneous types, so we won't establish
3037 an ABI mapping. */
3038 if (type == error_mark_node)
3039 return NO_ABI_IDENTITY;
3041 /* Zero-sized types disappear in the language->ABI mapping. */
3042 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3043 return NO_ABI_IDENTITY;
3045 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
3046 piece p = {};
3047 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
3049 machine_mode mode = TYPE_MODE_RAW (type);
3050 gcc_assert (VECTOR_MODE_P (mode)
3051 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
3053 p.mode = p.orig_mode = mode;
3054 add_piece (p);
3055 return IS_PST;
3058 /* Check for user-defined PSTs. */
3059 if (TREE_CODE (type) == ARRAY_TYPE)
3060 return analyze_array (type);
3061 if (TREE_CODE (type) == RECORD_TYPE)
3062 return analyze_record (type);
3064 return ISNT_PST;
3067 /* Analyze a type that is known not to be passed or returned in memory.
3068 Return true if it has an ABI identity and is a Pure Scalable Type. */
3070 bool
3071 pure_scalable_type_info::analyze_registers (const_tree type)
3073 analysis_result result = analyze (type);
3074 gcc_assert (result != DOESNT_MATTER);
3075 return result == IS_PST;
3078 /* Subroutine of analyze for handling ARRAY_TYPEs. */
3080 pure_scalable_type_info::analysis_result
3081 pure_scalable_type_info::analyze_array (const_tree type)
3083 /* Analyze the element type. */
3084 pure_scalable_type_info element_info;
3085 analysis_result result = element_info.analyze (TREE_TYPE (type));
3086 if (result != IS_PST)
3087 return result;
3089 /* An array of unknown, flexible or variable length will be passed and
3090 returned by reference whatever we do. */
3091 tree nelts_minus_one = array_type_nelts (type);
3092 if (!tree_fits_uhwi_p (nelts_minus_one))
3093 return DOESNT_MATTER;
3095 /* Likewise if the array is constant-sized but too big to be interesting.
3096 The double checks against MAX_PIECES are to protect against overflow. */
3097 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
3098 if (count > MAX_PIECES)
3099 return DOESNT_MATTER;
3100 count += 1;
3101 if (count * element_info.pieces.length () > MAX_PIECES)
3102 return DOESNT_MATTER;
3104 /* The above checks should have weeded out elements of unknown size. */
3105 poly_uint64 element_bytes;
3106 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3107 gcc_unreachable ();
3109 /* Build up the list of individual vectors and predicates. */
3110 gcc_assert (!element_info.pieces.is_empty ());
3111 for (unsigned int i = 0; i < count; ++i)
3112 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3114 piece p = element_info.pieces[j];
3115 p.offset += i * element_bytes;
3116 add_piece (p);
3118 return IS_PST;
3121 /* Subroutine of analyze for handling RECORD_TYPEs. */
3123 pure_scalable_type_info::analysis_result
3124 pure_scalable_type_info::analyze_record (const_tree type)
3126 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3128 if (TREE_CODE (field) != FIELD_DECL)
3129 continue;
3131 /* Zero-sized fields disappear in the language->ABI mapping. */
3132 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3133 continue;
3135 /* All fields with an ABI identity must be PSTs for the record as
3136 a whole to be a PST. If any individual field is too big to be
3137 interesting then the record is too. */
3138 pure_scalable_type_info field_info;
3139 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3140 if (subresult == NO_ABI_IDENTITY)
3141 continue;
3142 if (subresult != IS_PST)
3143 return subresult;
3145 /* Since all previous fields are PSTs, we ought to be able to track
3146 the field offset using poly_ints. */
3147 tree bitpos = bit_position (field);
3148 gcc_assert (poly_int_tree_p (bitpos));
3150 /* For the same reason, it shouldn't be possible to create a PST field
3151 whose offset isn't byte-aligned. */
3152 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3153 BITS_PER_UNIT);
3155 /* Punt if the record is too big to be interesting. */
3156 poly_uint64 bytepos;
3157 if (!wide_bytepos.to_uhwi (&bytepos)
3158 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3159 return DOESNT_MATTER;
3161 /* Add the individual vectors and predicates in the field to the
3162 record's list. */
3163 gcc_assert (!field_info.pieces.is_empty ());
3164 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3166 piece p = field_info.pieces[i];
3167 p.offset += bytepos;
3168 add_piece (p);
3171 /* Empty structures disappear in the language->ABI mapping. */
3172 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3175 /* Add P to the list of pieces in the type. */
3177 void
3178 pure_scalable_type_info::add_piece (const piece &p)
3180 /* Try to fold the new piece into the previous one to form a
3181 single-mode PST. For example, if we see three consecutive vectors
3182 of the same mode, we can represent them using the corresponding
3183 3-tuple mode.
3185 This is purely an optimization. */
3186 if (!pieces.is_empty ())
3188 piece &prev = pieces.last ();
3189 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3190 unsigned int nelems1, nelems2;
3191 if (prev.orig_mode == p.orig_mode
3192 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3193 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3194 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3195 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3196 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3197 && targetm.array_mode (p.orig_mode,
3198 nelems1 + nelems2).exists (&prev.mode))
3200 prev.num_zr += p.num_zr;
3201 prev.num_pr += p.num_pr;
3202 return;
3205 pieces.quick_push (p);
3208 /* Return true if at least one possible value of type TYPE includes at
3209 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3211 This is a relatively expensive test for some types, so it should
3212 generally be made as late as possible. */
3214 static bool
3215 aarch64_some_values_include_pst_objects_p (const_tree type)
3217 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3218 return false;
3220 if (aarch64_sve::builtin_type_p (type))
3221 return true;
3223 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3224 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3226 if (RECORD_OR_UNION_TYPE_P (type))
3227 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3228 if (TREE_CODE (field) == FIELD_DECL
3229 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3230 return true;
3232 return false;
3235 /* Return the descriptor of the SIMD ABI. */
3237 static const predefined_function_abi &
3238 aarch64_simd_abi (void)
3240 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3241 if (!simd_abi.initialized_p ())
3243 HARD_REG_SET full_reg_clobbers
3244 = default_function_abi.full_reg_clobbers ();
3245 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3246 if (FP_SIMD_SAVED_REGNUM_P (regno))
3247 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3248 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3250 return simd_abi;
3253 /* Return the descriptor of the SVE PCS. */
3255 static const predefined_function_abi &
3256 aarch64_sve_abi (void)
3258 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3259 if (!sve_abi.initialized_p ())
3261 HARD_REG_SET full_reg_clobbers
3262 = default_function_abi.full_reg_clobbers ();
3263 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3264 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3265 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3266 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3267 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3269 return sve_abi;
3272 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3273 wraps, otherwise return X itself. */
3275 static rtx
3276 strip_salt (rtx x)
3278 rtx search = x;
3279 if (GET_CODE (search) == CONST)
3280 search = XEXP (search, 0);
3281 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3282 x = XVECEXP (search, 0, 0);
3283 return x;
3286 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3287 expression. */
3289 static rtx
3290 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3292 return strip_salt (strip_offset (addr, offset));
3295 /* Generate code to enable conditional branches in functions over 1 MiB. */
3296 const char *
3297 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3298 const char * branch_format)
3300 rtx_code_label * tmp_label = gen_label_rtx ();
3301 char label_buf[256];
3302 char buffer[128];
3303 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3304 CODE_LABEL_NUMBER (tmp_label));
3305 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3306 rtx dest_label = operands[pos_label];
3307 operands[pos_label] = tmp_label;
3309 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3310 output_asm_insn (buffer, operands);
3312 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3313 operands[pos_label] = dest_label;
3314 output_asm_insn (buffer, operands);
3315 return "";
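/* Report an error for an attempt to use floating-point or vector types of
   mode MODE when FP/Advanced SIMD is unavailable, attributing the problem
   to -mgeneral-regs-only or to the +nofp feature modifier as
   appropriate.  */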
3318 void
3319 aarch64_err_no_fpadvsimd (machine_mode mode)
3321 if (TARGET_GENERAL_REGS_ONLY)
3322 if (FLOAT_MODE_P (mode))
3323 error ("%qs is incompatible with the use of floating-point types",
3324 "-mgeneral-regs-only");
3325 else
3326 error ("%qs is incompatible with the use of vector types",
3327 "-mgeneral-regs-only");
3328 else
3329 if (FLOAT_MODE_P (mode))
3330 error ("%qs feature modifier is incompatible with the use of"
3331 " floating-point types", "+nofp");
3332 else
3333 error ("%qs feature modifier is incompatible with the use of"
3334 " vector types", "+nofp");
3337 /* Report when we try to do something that requires SVE when SVE is disabled.
3338 This is an error of last resort and isn't very high-quality. It usually
3339 involves attempts to measure the vector length in some way. */
3340 static void
3341 aarch64_report_sve_required (void)
3343 static bool reported_p = false;
3345 /* Avoid reporting a slew of messages for a single oversight. */
3346 if (reported_p)
3347 return;
3349 error ("this operation requires the SVE ISA extension");
3350 inform (input_location, "you can enable SVE using the command-line"
3351 " option %<-march%>, or by using the %<target%>"
3352 " attribute or pragma");
3353 reported_p = true;
3356 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3357 registers. */
3358 inline bool
3359 pr_or_ffr_regnum_p (unsigned int regno)
3361 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3364 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3365 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3366 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3367 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3368 and GENERAL_REGS is lower than the memory cost (in this case the best class
3369 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
3370 cost results in bad allocations with many redundant int<->FP moves which
3371 are expensive on various cores.
3372 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3373 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3374 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3375 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
3376 The result of this is that it is no longer inefficient to have a higher
3377 memory move cost than the register move cost.  */
3380 static reg_class_t
3381 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3382 reg_class_t best_class)
3384 machine_mode mode;
3386 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3387 || !reg_class_subset_p (FP_REGS, allocno_class))
3388 return allocno_class;
3390 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3391 || !reg_class_subset_p (FP_REGS, best_class))
3392 return best_class;
3394 mode = PSEUDO_REGNO_MODE (regno);
3395 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
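/* Return the minimum number of divisions by the same divisor that must be
   seen before a reciprocal-multiply sequence becomes worthwhile, taken from
   the current tuning and keyed on the element size of MODE.  */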
3398 static unsigned int
3399 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3401 if (GET_MODE_UNIT_SIZE (mode) == 4)
3402 return aarch64_tune_params.min_div_recip_mul_sf;
3403 return aarch64_tune_params.min_div_recip_mul_df;
3406 /* Return the reassociation width of treeop OPC with mode MODE. */
3407 static int
3408 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3410 if (VECTOR_MODE_P (mode))
3411 return aarch64_tune_params.vec_reassoc_width;
3412 if (INTEGRAL_MODE_P (mode))
3413 return aarch64_tune_params.int_reassoc_width;
3414 /* Reassociation reduces the number of FMAs which may result in worse
3415 performance. Use a per-CPU setting for FMA reassociation which allows
3416 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
3417 CPUs with many FP pipes to enable reassociation.
3418 Since the reassociation pass doesn't understand FMA at all, assume
3419 that any FP addition might turn into FMA. */
3420 if (FLOAT_MODE_P (mode))
3421 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
3422 : aarch64_tune_params.fp_reassoc_width;
3423 return 1;
3426 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
3427 unsigned
3428 aarch64_debugger_regno (unsigned regno)
3430 if (GP_REGNUM_P (regno))
3431 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3432 else if (regno == SP_REGNUM)
3433 return AARCH64_DWARF_SP;
3434 else if (FP_REGNUM_P (regno))
3435 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3436 else if (PR_REGNUM_P (regno))
3437 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3438 else if (regno == VG_REGNUM)
3439 return AARCH64_DWARF_VG;
3441 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3442 equivalent DWARF register. */
3443 return DWARF_FRAME_REGISTERS;
3446 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3447 integer, otherwise return X unmodified. */
3448 static rtx
3449 aarch64_bit_representation (rtx x)
3451 if (CONST_DOUBLE_P (x))
3452 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3453 return x;
3456 /* Return an estimate for the number of quadwords in an SVE vector. This is
3457 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3458 static unsigned int
3459 aarch64_estimated_sve_vq ()
3461 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3464 /* Return true if MODE is an SVE predicate mode. */
3465 static bool
3466 aarch64_sve_pred_mode_p (machine_mode mode)
3468 return (TARGET_SVE
3469 && (mode == VNx16BImode
3470 || mode == VNx8BImode
3471 || mode == VNx4BImode
3472 || mode == VNx2BImode));
3475 /* Three mutually-exclusive flags describing a vector or predicate type. */
3476 const unsigned int VEC_ADVSIMD = 1;
3477 const unsigned int VEC_SVE_DATA = 2;
3478 const unsigned int VEC_SVE_PRED = 4;
3479 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3480 a structure of 2, 3 or 4 vectors. */
3481 const unsigned int VEC_STRUCT = 8;
3482 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3483 vector has fewer significant bytes than a full SVE vector. */
3484 const unsigned int VEC_PARTIAL = 16;
3485 /* Useful combinations of the above. */
3486 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3487 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3489 /* Return a set of flags describing the vector properties of mode MODE.
3490 Ignore modes that are not supported by the current target. */
3491 static unsigned int
3492 aarch64_classify_vector_mode (machine_mode mode)
3494 if (aarch64_sve_pred_mode_p (mode))
3495 return VEC_SVE_PRED;
3497 /* Make the decision based on the mode's enum value rather than its
3498 properties, so that we keep the correct classification regardless
3499 of -msve-vector-bits. */
3500 switch (mode)
3502 /* Partial SVE QI vectors. */
3503 case E_VNx2QImode:
3504 case E_VNx4QImode:
3505 case E_VNx8QImode:
3506 /* Partial SVE HI vectors. */
3507 case E_VNx2HImode:
3508 case E_VNx4HImode:
3509 /* Partial SVE SI vector. */
3510 case E_VNx2SImode:
3511 /* Partial SVE HF vectors. */
3512 case E_VNx2HFmode:
3513 case E_VNx4HFmode:
3514 /* Partial SVE BF vectors. */
3515 case E_VNx2BFmode:
3516 case E_VNx4BFmode:
3517 /* Partial SVE SF vector. */
3518 case E_VNx2SFmode:
3519 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3521 case E_VNx16QImode:
3522 case E_VNx8HImode:
3523 case E_VNx4SImode:
3524 case E_VNx2DImode:
3525 case E_VNx8BFmode:
3526 case E_VNx8HFmode:
3527 case E_VNx4SFmode:
3528 case E_VNx2DFmode:
3529 return TARGET_SVE ? VEC_SVE_DATA : 0;
3531 /* x2 SVE vectors. */
3532 case E_VNx32QImode:
3533 case E_VNx16HImode:
3534 case E_VNx8SImode:
3535 case E_VNx4DImode:
3536 case E_VNx16BFmode:
3537 case E_VNx16HFmode:
3538 case E_VNx8SFmode:
3539 case E_VNx4DFmode:
3540 /* x3 SVE vectors. */
3541 case E_VNx48QImode:
3542 case E_VNx24HImode:
3543 case E_VNx12SImode:
3544 case E_VNx6DImode:
3545 case E_VNx24BFmode:
3546 case E_VNx24HFmode:
3547 case E_VNx12SFmode:
3548 case E_VNx6DFmode:
3549 /* x4 SVE vectors. */
3550 case E_VNx64QImode:
3551 case E_VNx32HImode:
3552 case E_VNx16SImode:
3553 case E_VNx8DImode:
3554 case E_VNx32BFmode:
3555 case E_VNx32HFmode:
3556 case E_VNx16SFmode:
3557 case E_VNx8DFmode:
3558 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3560 case E_OImode:
3561 case E_CImode:
3562 case E_XImode:
3563 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3565 /* Structures of 64-bit Advanced SIMD vectors. */
3566 case E_V2x8QImode:
3567 case E_V2x4HImode:
3568 case E_V2x2SImode:
3569 case E_V2x1DImode:
3570 case E_V2x4BFmode:
3571 case E_V2x4HFmode:
3572 case E_V2x2SFmode:
3573 case E_V2x1DFmode:
3574 case E_V3x8QImode:
3575 case E_V3x4HImode:
3576 case E_V3x2SImode:
3577 case E_V3x1DImode:
3578 case E_V3x4BFmode:
3579 case E_V3x4HFmode:
3580 case E_V3x2SFmode:
3581 case E_V3x1DFmode:
3582 case E_V4x8QImode:
3583 case E_V4x4HImode:
3584 case E_V4x2SImode:
3585 case E_V4x1DImode:
3586 case E_V4x4BFmode:
3587 case E_V4x4HFmode:
3588 case E_V4x2SFmode:
3589 case E_V4x1DFmode:
3590 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3592 /* Structures of 128-bit Advanced SIMD vectors. */
3593 case E_V2x16QImode:
3594 case E_V2x8HImode:
3595 case E_V2x4SImode:
3596 case E_V2x2DImode:
3597 case E_V2x8BFmode:
3598 case E_V2x8HFmode:
3599 case E_V2x4SFmode:
3600 case E_V2x2DFmode:
3601 case E_V3x16QImode:
3602 case E_V3x8HImode:
3603 case E_V3x4SImode:
3604 case E_V3x2DImode:
3605 case E_V3x8BFmode:
3606 case E_V3x8HFmode:
3607 case E_V3x4SFmode:
3608 case E_V3x2DFmode:
3609 case E_V4x16QImode:
3610 case E_V4x8HImode:
3611 case E_V4x4SImode:
3612 case E_V4x2DImode:
3613 case E_V4x8BFmode:
3614 case E_V4x8HFmode:
3615 case E_V4x4SFmode:
3616 case E_V4x2DFmode:
3617 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3619 /* 64-bit Advanced SIMD vectors. */
3620 case E_V8QImode:
3621 case E_V4HImode:
3622 case E_V2SImode:
3623 case E_V1DImode:
3624 case E_V4HFmode:
3625 case E_V4BFmode:
3626 case E_V2SFmode:
3627 case E_V1DFmode:
3628 /* 128-bit Advanced SIMD vectors. */
3629 case E_V16QImode:
3630 case E_V8HImode:
3631 case E_V4SImode:
3632 case E_V2DImode:
3633 case E_V8HFmode:
3634 case E_V8BFmode:
3635 case E_V4SFmode:
3636 case E_V2DFmode:
3637 return TARGET_FLOAT ? VEC_ADVSIMD : 0;
3639 default:
3640 return 0;
3644 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3645 bool
3646 aarch64_advsimd_struct_mode_p (machine_mode mode)
3648 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3649 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3652 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3653 static bool
3654 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3656 return (aarch64_classify_vector_mode (mode)
3657 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3660 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3661 static bool
3662 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3664 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3667 /* Return true if MODE is any of the data vector modes, including
3668 structure modes. */
3669 static bool
3670 aarch64_vector_data_mode_p (machine_mode mode)
3672 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3675 /* Return true if MODE is any form of SVE mode, including predicates,
3676 vectors and structures. */
3677 bool
3678 aarch64_sve_mode_p (machine_mode mode)
3680 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3683 /* Return true if MODE is an SVE data vector mode; either a single vector
3684 or a structure of vectors. */
3685 static bool
3686 aarch64_sve_data_mode_p (machine_mode mode)
3688 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3691 /* Return the number of defined bytes in one constituent vector of
3692 SVE mode MODE, which has vector flags VEC_FLAGS. */
3693 static poly_int64
3694 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3696 if (vec_flags & VEC_PARTIAL)
3697 /* A single partial vector. */
3698 return GET_MODE_SIZE (mode);
3700 if (vec_flags & VEC_SVE_DATA)
3701 /* A single vector or a tuple. */
3702 return BYTES_PER_SVE_VECTOR;
3704 /* A single predicate. */
3705 gcc_assert (vec_flags & VEC_SVE_PRED);
3706 return BYTES_PER_SVE_PRED;
3709 /* If MODE holds an array of vectors, return the number of vectors
3710 in the array, otherwise return 1. */
3712 static unsigned int
3713 aarch64_ldn_stn_vectors (machine_mode mode)
3715 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3716 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3717 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3718 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3719 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3720 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3721 return exact_div (GET_MODE_SIZE (mode),
3722 BYTES_PER_SVE_VECTOR).to_constant ();
3723 return 1;
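/* For example, V3x4SImode is a structure of three 128-bit vectors, so
   GET_MODE_SIZE is 48 bytes and the function returns 48 / 16 = 3, the
   register count of a Q-register LD3/ST3.  V4x8QImode is a structure of
   four 64-bit vectors, giving 32 / 8 = 4, while a single V4SImode is not
   a structure mode and yields 1.  */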
3726 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3727 corresponding vector structure mode. */
3728 static opt_machine_mode
3729 aarch64_advsimd_vector_array_mode (machine_mode mode,
3730 unsigned HOST_WIDE_INT nelems)
3732 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3733 if (known_eq (GET_MODE_SIZE (mode), 8))
3734 flags |= VEC_PARTIAL;
3736 machine_mode struct_mode;
3737 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3738 if (aarch64_classify_vector_mode (struct_mode) == flags
3739 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3740 && known_eq (GET_MODE_NUNITS (struct_mode),
3741 GET_MODE_NUNITS (mode) * nelems))
3742 return struct_mode;
3743 return opt_machine_mode ();
3746 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3748 opt_machine_mode
3749 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3751 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3752 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3753 machine_mode mode;
3754 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3755 if (inner_mode == GET_MODE_INNER (mode)
3756 && known_eq (nunits, GET_MODE_NUNITS (mode))
3757 && aarch64_sve_data_mode_p (mode))
3758 return mode;
3759 return opt_machine_mode ();
3762 /* Implement target hook TARGET_ARRAY_MODE. */
3763 static opt_machine_mode
3764 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3766 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3767 && IN_RANGE (nelems, 2, 4))
3768 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3769 GET_MODE_NUNITS (mode) * nelems);
3770 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3771 && IN_RANGE (nelems, 2, 4))
3772 return aarch64_advsimd_vector_array_mode (mode, nelems);
3774 return opt_machine_mode ();
3777 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3778 static bool
3779 aarch64_array_mode_supported_p (machine_mode mode,
3780 unsigned HOST_WIDE_INT nelems)
3782 if (TARGET_SIMD
3783 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3784 || AARCH64_VALID_SIMD_DREG_MODE (mode))
3785 && (nelems >= 2 && nelems <= 4))
3786 return true;
3788 return false;
3791 /* MODE is some form of SVE vector mode. For data modes, return the number
3792 of vector register bits that each element of MODE occupies, such as 64
3793 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3794 in a 64-bit container). For predicate modes, return the number of
3795 data bits controlled by each significant predicate bit. */
3797 static unsigned int
3798 aarch64_sve_container_bits (machine_mode mode)
3800 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3801 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3802 ? BITS_PER_SVE_VECTOR
3803 : GET_MODE_BITSIZE (mode));
3804 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3807 /* Return the SVE predicate mode to use for elements that have
3808 ELEM_NBYTES bytes, if such a mode exists. */
3810 opt_machine_mode
3811 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3813 if (TARGET_SVE)
3815 if (elem_nbytes == 1)
3816 return VNx16BImode;
3817 if (elem_nbytes == 2)
3818 return VNx8BImode;
3819 if (elem_nbytes == 4)
3820 return VNx4BImode;
3821 if (elem_nbytes == 8)
3822 return VNx2BImode;
3824 return opt_machine_mode ();
3827 /* Return the SVE predicate mode that should be used to control
3828 SVE mode MODE. */
3830 machine_mode
3831 aarch64_sve_pred_mode (machine_mode mode)
3833 unsigned int bits = aarch64_sve_container_bits (mode);
3834 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3837 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3839 static opt_machine_mode
3840 aarch64_get_mask_mode (machine_mode mode)
3842 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3843 if (vec_flags & VEC_SVE_DATA)
3844 return aarch64_sve_pred_mode (mode);
3846 return default_get_mask_mode (mode);
3849 /* Return the integer element mode associated with SVE mode MODE. */
3851 static scalar_int_mode
3852 aarch64_sve_element_int_mode (machine_mode mode)
3854 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3855 ? BITS_PER_SVE_VECTOR
3856 : GET_MODE_BITSIZE (mode));
3857 unsigned int elt_bits = vector_element_size (vector_bits,
3858 GET_MODE_NUNITS (mode));
3859 return int_mode_for_size (elt_bits, 0).require ();
3862 /* Return an integer element mode that contains exactly
3863 aarch64_sve_container_bits (MODE) bits. This is wider than
3864 aarch64_sve_element_int_mode if MODE is a partial vector,
3865 otherwise it's the same. */
3867 static scalar_int_mode
3868 aarch64_sve_container_int_mode (machine_mode mode)
3870 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3873 /* Return the integer vector mode associated with SVE mode MODE.
3874 Unlike related_int_vector_mode, this can handle the case in which
3875 MODE is a predicate (and thus has a different total size). */
3877 machine_mode
3878 aarch64_sve_int_mode (machine_mode mode)
3880 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3881 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3884 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3886 static opt_machine_mode
3887 aarch64_vectorize_related_mode (machine_mode vector_mode,
3888 scalar_mode element_mode,
3889 poly_uint64 nunits)
3891 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3893 /* If we're operating on SVE vectors, try to return an SVE mode. */
3894 poly_uint64 sve_nunits;
3895 if ((vec_flags & VEC_SVE_DATA)
3896 && multiple_p (BYTES_PER_SVE_VECTOR,
3897 GET_MODE_SIZE (element_mode), &sve_nunits))
3899 machine_mode sve_mode;
3900 if (maybe_ne (nunits, 0U))
3902 /* Try to find a full or partial SVE mode with exactly
3903 NUNITS units. */
3904 if (multiple_p (sve_nunits, nunits)
3905 && aarch64_sve_data_mode (element_mode,
3906 nunits).exists (&sve_mode))
3907 return sve_mode;
3909 else
3911 /* Take the preferred number of units from the number of bytes
3912 that fit in VECTOR_MODE. We always start by "autodetecting"
3913 a full vector mode with preferred_simd_mode, so vectors
3914 chosen here will also be full vector modes. Then
3915 autovectorize_vector_modes tries smaller starting modes
3916 and thus smaller preferred numbers of units. */
3917 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3918 if (aarch64_sve_data_mode (element_mode,
3919 sve_nunits).exists (&sve_mode))
3920 return sve_mode;
3924 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3925 if (TARGET_SIMD
3926 && (vec_flags & VEC_ADVSIMD)
3927 && known_eq (nunits, 0U)
3928 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3929 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3930 * GET_MODE_NUNITS (vector_mode), 128U))
3932 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3933 if (VECTOR_MODE_P (res))
3934 return res;
3937 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3940 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3941 prefer to use the first arithmetic operand as the else value if
3942 the else value doesn't matter, since that exactly matches the SVE
3943 destructive merging form. For ternary operations we could either
3944 pick the first operand and use FMAD-like instructions or the last
3945 operand and use FMLA-like instructions; the latter seems more
3946 natural. */
3948 static tree
3949 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3951 return nops == 3 ? ops[2] : ops[0];
3954 /* Implement TARGET_HARD_REGNO_NREGS. */
3956 static unsigned int
3957 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3959 /* ??? Logically we should only need to provide a value when
3960 HARD_REGNO_MODE_OK says that the combination is valid,
3961 but at the moment we need to handle all modes. Just ignore
3962 any runtime parts for registers that can't store them. */
3963 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3964 switch (aarch64_regno_regclass (regno))
3966 case FP_REGS:
3967 case FP_LO_REGS:
3968 case FP_LO8_REGS:
3970 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3971 if (vec_flags & VEC_SVE_DATA)
3972 return exact_div (GET_MODE_SIZE (mode),
3973 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3974 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3975 return GET_MODE_SIZE (mode).to_constant () / 8;
3976 return CEIL (lowest_size, UNITS_PER_VREG);
3978 case PR_REGS:
3979 case PR_LO_REGS:
3980 case PR_HI_REGS:
3981 case FFR_REGS:
3982 case PR_AND_FFR_REGS:
3983 return 1;
3984 default:
3985 return CEIL (lowest_size, UNITS_PER_WORD);
3987 gcc_unreachable ();
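/* Illustrative examples: in an FP register, VNx2DImode occupies a single
   register (its size is exactly one SVE vector), V4x2DImode occupies
   CEIL (64, UNITS_PER_VREG) = 4 registers and the partial structure mode
   V4x8QImode occupies 32 / 8 = 4 registers.  In a GP register, TImode
   needs CEIL (16, UNITS_PER_WORD) = 2 registers.  */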
3990 /* Implement TARGET_HARD_REGNO_MODE_OK. */
3992 static bool
3993 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3995 if (mode == V8DImode)
3996 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
3997 && multiple_p (regno - R0_REGNUM, 2);
3999 if (GET_MODE_CLASS (mode) == MODE_CC)
4000 return regno == CC_REGNUM;
4002 if (regno == VG_REGNUM)
4003 /* This must have the same size as _Unwind_Word. */
4004 return mode == DImode;
4006 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4007 if (vec_flags & VEC_SVE_PRED)
4008 return pr_or_ffr_regnum_p (regno);
4010 if (pr_or_ffr_regnum_p (regno))
4011 return false;
4013 if (regno == SP_REGNUM)
4014 /* The purpose of comparing with ptr_mode is to support the
4015 global register variable associated with the stack pointer
4016 register via the syntax of asm ("wsp") in ILP32. */
4017 return mode == Pmode || mode == ptr_mode;
4019 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
4020 return mode == Pmode;
4022 if (GP_REGNUM_P (regno))
4024 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
4025 return false;
4026 if (known_le (GET_MODE_SIZE (mode), 8))
4027 return true;
4028 if (known_le (GET_MODE_SIZE (mode), 16))
4029 return (regno & 1) == 0;
4031 else if (FP_REGNUM_P (regno))
4033 if (vec_flags & VEC_STRUCT)
4034 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
4035 else
4036 return !VECTOR_MODE_P (mode) || vec_flags != 0;
4039 return false;
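/* For example, TImode is allowed in an even-numbered GP register but not
   in an odd one, V8DImode is restricted to even registers in the range
   x0-x23, SVE predicate modes are only valid in predicate or FFR
   registers, and SVE or structure modes are rejected for GP registers
   altogether.  */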
4042 /* Return true if a function with type FNTYPE returns its value in
4043 SVE vector or predicate registers. */
4045 static bool
4046 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
4048 tree return_type = TREE_TYPE (fntype);
4050 pure_scalable_type_info pst_info;
4051 switch (pst_info.analyze (return_type))
4053 case pure_scalable_type_info::IS_PST:
4054 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
4055 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
4057 case pure_scalable_type_info::DOESNT_MATTER:
4058 gcc_assert (aarch64_return_in_memory_1 (return_type));
4059 return false;
4061 case pure_scalable_type_info::NO_ABI_IDENTITY:
4062 case pure_scalable_type_info::ISNT_PST:
4063 return false;
4065 gcc_unreachable ();
4068 /* Return true if a function with type FNTYPE takes arguments in
4069 SVE vector or predicate registers. */
4071 static bool
4072 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
4074 CUMULATIVE_ARGS args_so_far_v;
4075 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
4076 NULL_TREE, 0, true);
4077 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4079 for (tree chain = TYPE_ARG_TYPES (fntype);
4080 chain && chain != void_list_node;
4081 chain = TREE_CHAIN (chain))
4083 tree arg_type = TREE_VALUE (chain);
4084 if (arg_type == error_mark_node)
4085 return false;
4087 function_arg_info arg (arg_type, /*named=*/true);
4088 apply_pass_by_reference_rules (&args_so_far_v, arg);
4089 pure_scalable_type_info pst_info;
4090 if (pst_info.analyze_registers (arg.type))
4092 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4093 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4094 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4095 return true;
4098 targetm.calls.function_arg_advance (args_so_far, arg);
4100 return false;
4103 /* Implement TARGET_FNTYPE_ABI. */
4105 static const predefined_function_abi &
4106 aarch64_fntype_abi (const_tree fntype)
4108 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4109 return aarch64_simd_abi ();
4111 if (aarch64_returns_value_in_sve_regs_p (fntype)
4112 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4113 return aarch64_sve_abi ();
4115 return default_function_abi;
4118 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4120 static bool
4121 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4123 return (aarch64_sve::builtin_type_p (type1)
4124 == aarch64_sve::builtin_type_p (type2));
4127 /* Return true if we should emit CFI for register REGNO. */
4129 static bool
4130 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4132 return (GP_REGNUM_P (regno)
4133 || !default_function_abi.clobbers_full_reg_p (regno));
4136 /* Return the mode we should use to save and restore register REGNO. */
4138 static machine_mode
4139 aarch64_reg_save_mode (unsigned int regno)
4141 if (GP_REGNUM_P (regno))
4142 return DImode;
4144 if (FP_REGNUM_P (regno))
4145 switch (crtl->abi->id ())
4147 case ARM_PCS_AAPCS64:
4148 /* Only the low 64 bits are saved by the base PCS. */
4149 return DFmode;
4151 case ARM_PCS_SIMD:
4152 /* The vector PCS saves the low 128 bits (which is the full
4153 register on non-SVE targets). */
4154 return TFmode;
4156 case ARM_PCS_SVE:
4157 /* Use vectors of DImode for registers that need frame
4158 information, so that the first 64 bytes of the save slot
4159 are always the equivalent of what storing D<n> would give. */
4160 if (aarch64_emit_cfi_for_reg_p (regno))
4161 return VNx2DImode;
4163 /* Use vectors of bytes otherwise, so that the layout is
4164 endian-agnostic, and so that we can use LDR and STR for
4165 big-endian targets. */
4166 return VNx16QImode;
4168 case ARM_PCS_TLSDESC:
4169 case ARM_PCS_UNKNOWN:
4170 break;
4173 if (PR_REGNUM_P (regno))
4174 /* Save the full predicate register. */
4175 return VNx16BImode;
4177 gcc_unreachable ();
4180 /* Implement TARGET_INSN_CALLEE_ABI. */
4182 const predefined_function_abi &
4183 aarch64_insn_callee_abi (const rtx_insn *insn)
4185 rtx pat = PATTERN (insn);
4186 gcc_assert (GET_CODE (pat) == PARALLEL);
4187 rtx unspec = XVECEXP (pat, 0, 1);
4188 gcc_assert (GET_CODE (unspec) == UNSPEC
4189 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4190 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4193 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4194 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4195 clobbers the top 64 bits when restoring the bottom 64 bits. */
4197 static bool
4198 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4199 unsigned int regno,
4200 machine_mode mode)
4202 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4204 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4205 unsigned int nregs = hard_regno_nregs (regno, mode);
4206 if (nregs > 1)
4207 per_register_size = exact_div (per_register_size, nregs);
4208 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4209 return maybe_gt (per_register_size, 16);
4210 return maybe_gt (per_register_size, 8);
4212 return false;
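/* For example, a V4SImode value in v8 is partially clobbered by a call
   under the base PCS, since only the low 64 bits of the register are
   preserved.  Under the vector PCS the low 128 bits are preserved, so
   the same value survives, but an SVE vector that might be wider than
   128 bits is still treated as partially clobbered.  */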
4215 /* Implement REGMODE_NATURAL_SIZE. */
4216 poly_uint64
4217 aarch64_regmode_natural_size (machine_mode mode)
4219 /* The natural size for SVE data modes is one SVE data vector,
4220 and similarly for predicates. We can't independently modify
4221 anything smaller than that. */
4222 /* ??? For now, only do this for variable-width SVE registers.
4223 Doing it for constant-sized registers breaks lower-subreg.cc. */
4224 /* ??? And once that's fixed, we should probably have similar
4225 code for Advanced SIMD. */
4226 if (!aarch64_sve_vg.is_constant ())
4228 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4229 if (vec_flags & VEC_SVE_PRED)
4230 return BYTES_PER_SVE_PRED;
4231 if (vec_flags & VEC_SVE_DATA)
4232 return BYTES_PER_SVE_VECTOR;
4234 return UNITS_PER_WORD;
4237 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
4238 machine_mode
4239 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4240 machine_mode mode)
4242 /* The predicate mode determines which bits are significant and
4243 which are "don't care". Decreasing the number of lanes would
4244 lose data while increasing the number of lanes would make bits
4245 unnecessarily significant. */
4246 if (PR_REGNUM_P (regno))
4247 return mode;
4248 if (known_ge (GET_MODE_SIZE (mode), 4))
4249 return mode;
4250 else
4251 return SImode;
4254 /* Return true if I's bits are consecutive ones from the MSB. */
4255 bool
4256 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4258 return exact_log2 (-i) != HOST_WIDE_INT_M1;
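/* Worked example: for I == 0xffff000000000000, -I is 0x0001000000000000,
   a power of two, so exact_log2 returns 48 and the function returns true.
   For I == 0xffff000000000001, -I is 0x0000ffffffffffff, which is not a
   power of two, so the function returns false.  */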
4261 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4262 that strcpy from constants will be faster. */
4264 static HOST_WIDE_INT
4265 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4267 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4268 return MAX (align, BITS_PER_WORD);
4269 return align;
4272 /* Return true if calls to DECL should be treated as
4273 long-calls (i.e. called via a register). */
4274 static bool
4275 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4277 return false;
4280 /* Return true if calls to symbol-ref SYM should be treated as
4281 long-calls (i.e. called via a register). */
4282 bool
4283 aarch64_is_long_call_p (rtx sym)
4285 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4288 /* Return true if calls to symbol-ref SYM should not go through
4289 plt stubs. */
4291 bool
4292 aarch64_is_noplt_call_p (rtx sym)
4294 const_tree decl = SYMBOL_REF_DECL (sym);
4296 if (flag_pic
4297 && decl
4298 && (!flag_plt
4299 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4300 && !targetm.binds_local_p (decl))
4301 return true;
4303 return false;
4306 /* Emit an insn that's a simple single-set. Both the operands must be
4307 known to be valid. */
4308 inline static rtx_insn *
4309 emit_set_insn (rtx x, rtx y)
4311 return emit_insn (gen_rtx_SET (x, y));
4314 /* X and Y are two things to compare using CODE. Emit the compare insn and
4315 return the rtx for register 0 in the proper mode. */
4317 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4319 machine_mode cmp_mode = GET_MODE (x);
4320 machine_mode cc_mode;
4321 rtx cc_reg;
4323 if (cmp_mode == TImode)
4325 gcc_assert (code == NE);
4327 cc_mode = CCmode;
4328 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4330 rtx x_lo = operand_subword (x, 0, 0, TImode);
4331 rtx y_lo = operand_subword (y, 0, 0, TImode);
4332 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4334 rtx x_hi = operand_subword (x, 1, 0, TImode);
4335 rtx y_hi = operand_subword (y, 1, 0, TImode);
4336 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4337 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4338 GEN_INT (AARCH64_EQ)));
4340 else
4342 cc_mode = SELECT_CC_MODE (code, x, y);
4343 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4344 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4346 return cc_reg;
4349 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4351 static rtx
4352 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4353 machine_mode y_mode)
4355 if (y_mode == E_QImode || y_mode == E_HImode)
4357 if (CONST_INT_P (y))
4359 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4360 y_mode = SImode;
4362 else
4364 rtx t, cc_reg;
4365 machine_mode cc_mode;
4367 t = gen_rtx_ZERO_EXTEND (SImode, y);
4368 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4369 cc_mode = CC_SWPmode;
4370 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4371 emit_set_insn (cc_reg, t);
4372 return cc_reg;
4376 if (!aarch64_plus_operand (y, y_mode))
4377 y = force_reg (y_mode, y);
4379 return aarch64_gen_compare_reg (code, x, y);
4382 /* Consider the operation:
4384 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4386 where:
4388 - CODE is [SU]MAX or [SU]MIN
4389 - OPERANDS[2] and OPERANDS[3] are constant integers
4390 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4391 - all operands have mode MODE
4393 Decide whether it is possible to implement the operation using:
4395 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4397 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4399 followed by:
4401 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4403 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4404 If GENERATE_P is true, also update OPERANDS as follows:
4406 OPERANDS[4] = -OPERANDS[3]
4407 OPERANDS[5] = the rtl condition representing <cond>
4408 OPERANDS[6] = <tmp>
4409 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
4410 bool
4411 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4413 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4414 rtx dst = operands[0];
4415 rtx maxmin_op = operands[2];
4416 rtx add_op = operands[3];
4417 machine_mode mode = GET_MODE (dst);
4419 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4420 == (x >= y ? x : y) - z
4421 == (x > y ? x : y) - z
4422 == (x > y - 1 ? x : y) - z
4424 min (x, y) - z == (x <= y - 1 ? x : y) - z
4425 == (x <= y ? x : y) - z
4426 == (x < y ? x : y) - z
4427 == (x < y + 1 ? x : y) - z
4429 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4430 which x is compared with z. Set DIFF to y - z. Thus the supported
4431 combinations are as follows, with DIFF being the value after the ":":
4433 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4434 == x >= y ? x - y : 0 [z == y]
4435 == x > y ? x - y : 0 [z == y]
4436 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4438 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4439 == x <= y ? x - y : 0 [z == y]
4440 == x < y ? x - y : 0 [z == y]
4441 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4442 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4443 auto add_val = rtx_mode_t (add_op, mode);
4444 auto sub_val = wi::neg (add_val);
4445 auto diff = wi::sub (maxmin_val, sub_val);
4446 if (!(diff == 0
4447 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4448 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4449 return false;
4451 if (!generate_p)
4452 return true;
4454 rtx_code cmp;
4455 switch (code)
4457 case SMAX:
4458 cmp = diff == 1 ? GT : GE;
4459 break;
4460 case UMAX:
4461 cmp = diff == 1 ? GTU : GEU;
4462 break;
4463 case SMIN:
4464 cmp = diff == -1 ? LT : LE;
4465 break;
4466 case UMIN:
4467 cmp = diff == -1 ? LTU : LEU;
4468 break;
4469 default:
4470 gcc_unreachable ();
4472 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4474 operands[4] = immed_wide_int_const (sub_val, mode);
4475 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4476 if (can_create_pseudo_p ())
4477 operands[6] = gen_reg_rtx (mode);
4478 else
4479 operands[6] = dst;
4480 operands[7] = immed_wide_int_const (diff, mode);
4482 return true;
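/* Worked example: smax (x, 5) - 5 has DIFF == 0 and is implemented as

     subs  tmp, x, #5
     csel  dst, tmp, wzr, ge

   i.e. x >= 5 ? x - 5 : 0.  For smax (x, 5) - 6, DIFF == -1 and the
   CSEL becomes a CSINV, giving x >= 6 ? x - 6 : -1.  */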
4486 /* Build the SYMBOL_REF for __tls_get_addr. */
4488 static GTY(()) rtx tls_get_addr_libfunc;
4491 aarch64_tls_get_addr (void)
4493 if (!tls_get_addr_libfunc)
4494 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4495 return tls_get_addr_libfunc;
4498 /* Return the TLS model to use for ADDR. */
4500 static enum tls_model
4501 tls_symbolic_operand_type (rtx addr)
4503 enum tls_model tls_kind = TLS_MODEL_NONE;
4504 poly_int64 offset;
4505 addr = strip_offset_and_salt (addr, &offset);
4506 if (SYMBOL_REF_P (addr))
4507 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4509 return tls_kind;
4512 /* We allow lo_sum's in our legitimate addresses so that combine
4513 can take care of combining addresses where necessary, but for
4514 generation purposes we generate the address
4515 as follows:
4516 RTL Absolute
4517 tmp = hi (symbol_ref); adrp x1, foo
4518 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
4521 PIC TLS
4522 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4523 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4524 bl __tls_get_addr
4527 Load TLS symbol, depending on TLS mechanism and TLS access model.
4529 Global Dynamic - Traditional TLS:
4530 adrp tmp, :tlsgd:imm
4531 add dest, tmp, #:tlsgd_lo12:imm
4532 bl __tls_get_addr
4534 Global Dynamic - TLS Descriptors:
4535 adrp dest, :tlsdesc:imm
4536 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4537 add dest, dest, #:tlsdesc_lo12:imm
4538 blr tmp
4539 mrs tp, tpidr_el0
4540 add dest, dest, tp
4542 Initial Exec:
4543 mrs tp, tpidr_el0
4544 adrp tmp, :gottprel:imm
4545 ldr dest, [tmp, #:gottprel_lo12:imm]
4546 add dest, dest, tp
4548 Local Exec:
4549 mrs tp, tpidr_el0
4550 add t0, tp, #:tprel_hi12:imm, lsl #12
4551 add t0, t0, #:tprel_lo12_nc:imm
4554 static void
4555 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4556 enum aarch64_symbol_type type)
4558 switch (type)
4560 case SYMBOL_SMALL_ABSOLUTE:
4562 /* In ILP32, the mode of dest can be either SImode or DImode. */
4563 rtx tmp_reg = dest;
4564 machine_mode mode = GET_MODE (dest);
4566 gcc_assert (mode == Pmode || mode == ptr_mode);
4568 if (can_create_pseudo_p ())
4569 tmp_reg = gen_reg_rtx (mode);
4571 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4572 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4573 return;
4576 case SYMBOL_TINY_ABSOLUTE:
4577 emit_insn (gen_rtx_SET (dest, imm));
4578 return;
4580 case SYMBOL_SMALL_GOT_28K:
4582 machine_mode mode = GET_MODE (dest);
4583 rtx gp_rtx = pic_offset_table_rtx;
4584 rtx insn;
4585 rtx mem;
4587 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4588 here before rtl expand. Tree IVOPT will generate rtl pattern to
4589 decide rtx costs, in which case pic_offset_table_rtx is not
4590 initialized. For that case no need to generate the first adrp
4591 instruction as the final cost for global variable access is
4592 one instruction. */
4593 if (gp_rtx != NULL)
4595 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
4596 use the page base as the GOT base, the first page may be wasted;
4597 in the worst case there is only 28K of space for the GOT).
4599 The instruction sequence generated for accessing a global variable is:
4602 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4604 Only one instruction is needed, but we must initialize
4605 pic_offset_table_rtx properly. We generate an initialization insn for
4606 every global access, and let CSE remove all the redundant copies.
4608 The final instruction sequence will look like the following
4609 for multiple global variable accesses.
4611 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4613 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4614 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4615 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4616 ... */
4618 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4619 crtl->uses_pic_offset_table = 1;
4620 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4622 if (mode != GET_MODE (gp_rtx))
4623 gp_rtx = gen_lowpart (mode, gp_rtx);
4627 if (mode == ptr_mode)
4629 if (mode == DImode)
4630 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4631 else
4632 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4634 mem = XVECEXP (SET_SRC (insn), 0, 0);
4636 else
4638 gcc_assert (mode == Pmode);
4640 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4641 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4644 /* The operand is expected to be MEM. Whenever the related insn
4645 pattern changed, above code which calculate mem should be
4646 updated. */
4647 gcc_assert (MEM_P (mem));
4648 MEM_READONLY_P (mem) = 1;
4649 MEM_NOTRAP_P (mem) = 1;
4650 emit_insn (insn);
4651 return;
4654 case SYMBOL_SMALL_GOT_4G:
4655 emit_insn (gen_rtx_SET (dest, imm));
4656 return;
4658 case SYMBOL_SMALL_TLSGD:
4660 rtx_insn *insns;
4661 /* The return type of __tls_get_addr is the C pointer type
4662 so use ptr_mode. */
4663 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4664 rtx tmp_reg = dest;
4666 if (GET_MODE (dest) != ptr_mode)
4667 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4669 start_sequence ();
4670 if (ptr_mode == SImode)
4671 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4672 else
4673 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4674 insns = get_insns ();
4675 end_sequence ();
4677 RTL_CONST_CALL_P (insns) = 1;
4678 emit_libcall_block (insns, tmp_reg, result, imm);
4679 /* Convert back to the mode of the dest adding a zero_extend
4680 from SImode (ptr_mode) to DImode (Pmode). */
4681 if (dest != tmp_reg)
4682 convert_move (dest, tmp_reg, true);
4683 return;
4686 case SYMBOL_SMALL_TLSDESC:
4688 machine_mode mode = GET_MODE (dest);
4689 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4690 rtx tp;
4692 gcc_assert (mode == Pmode || mode == ptr_mode);
4694 /* In ILP32, the got entry is always of SImode size. Unlike
4695 small GOT, the dest is fixed at reg 0. */
4696 if (TARGET_ILP32)
4697 emit_insn (gen_tlsdesc_small_si (imm));
4698 else
4699 emit_insn (gen_tlsdesc_small_di (imm));
4700 tp = aarch64_load_tp (NULL);
4702 if (mode != Pmode)
4703 tp = gen_lowpart (mode, tp);
4705 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4706 if (REG_P (dest))
4707 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4708 return;
4711 case SYMBOL_SMALL_TLSIE:
4713 /* In ILP32, the mode of dest can be either SImode or DImode,
4714 while the got entry is always of SImode size. The mode of
4715 dest depends on how dest is used: if dest is assigned to a
4716 pointer (e.g. in the memory), it has SImode; it may have
4717 DImode if dest is dereferenced to access the memory.
4718 This is why we have to handle three different tlsie_small
4719 patterns here (two patterns for ILP32). */
4720 machine_mode mode = GET_MODE (dest);
4721 rtx tmp_reg = gen_reg_rtx (mode);
4722 rtx tp = aarch64_load_tp (NULL);
4724 if (mode == ptr_mode)
4726 if (mode == DImode)
4727 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4728 else
4730 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4731 tp = gen_lowpart (mode, tp);
4734 else
4736 gcc_assert (mode == Pmode);
4737 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4740 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4741 if (REG_P (dest))
4742 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4743 return;
4746 case SYMBOL_TLSLE12:
4747 case SYMBOL_TLSLE24:
4748 case SYMBOL_TLSLE32:
4749 case SYMBOL_TLSLE48:
4751 machine_mode mode = GET_MODE (dest);
4752 rtx tp = aarch64_load_tp (NULL);
4754 if (mode != Pmode)
4755 tp = gen_lowpart (mode, tp);
4757 switch (type)
4759 case SYMBOL_TLSLE12:
4760 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4761 (dest, tp, imm));
4762 break;
4763 case SYMBOL_TLSLE24:
4764 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4765 (dest, tp, imm));
4766 break;
4767 case SYMBOL_TLSLE32:
4768 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4769 (dest, imm));
4770 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4771 (dest, dest, tp));
4772 break;
4773 case SYMBOL_TLSLE48:
4774 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4775 (dest, imm));
4776 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4777 (dest, dest, tp));
4778 break;
4779 default:
4780 gcc_unreachable ();
4783 if (REG_P (dest))
4784 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4785 return;
4788 case SYMBOL_TINY_GOT:
4790 rtx insn;
4791 machine_mode mode = GET_MODE (dest);
4793 if (mode == ptr_mode)
4794 insn = gen_ldr_got_tiny (mode, dest, imm);
4795 else
4797 gcc_assert (mode == Pmode);
4798 insn = gen_ldr_got_tiny_sidi (dest, imm);
4801 emit_insn (insn);
4802 return;
4805 case SYMBOL_TINY_TLSIE:
4807 machine_mode mode = GET_MODE (dest);
4808 rtx tp = aarch64_load_tp (NULL);
4810 if (mode == ptr_mode)
4812 if (mode == DImode)
4813 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4814 else
4816 tp = gen_lowpart (mode, tp);
4817 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4820 else
4822 gcc_assert (mode == Pmode);
4823 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4826 if (REG_P (dest))
4827 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4828 return;
4831 default:
4832 gcc_unreachable ();
4836 /* Emit a move from SRC to DEST. Assume that the move expanders can
4837 handle all moves if !can_create_pseudo_p (). The distinction is
4838 important because, unlike emit_move_insn, the move expanders know
4839 how to force Pmode objects into the constant pool even when the
4840 constant pool address is not itself legitimate. */
4841 static rtx
4842 aarch64_emit_move (rtx dest, rtx src)
4844 return (can_create_pseudo_p ()
4845 ? emit_move_insn (dest, src)
4846 : emit_move_insn_1 (dest, src));
4849 /* Apply UNOPTAB to OP and store the result in DEST. */
4851 static void
4852 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4854 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4855 if (dest != tmp)
4856 emit_move_insn (dest, tmp);
4859 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4861 static void
4862 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4864 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4865 OPTAB_DIRECT);
4866 if (dest != tmp)
4867 emit_move_insn (dest, tmp);
4870 /* Split a 128-bit move operation into two 64-bit move operations,
4871 taking care to handle partial overlap of register to register
4872 copies. Special cases are needed when moving between GP regs and
4873 FP regs. SRC can be a register, constant or memory; DST a register
4874 or memory. If either operand is memory it must not have any side
4875 effects. */
4876 void
4877 aarch64_split_128bit_move (rtx dst, rtx src)
4879 rtx dst_lo, dst_hi;
4880 rtx src_lo, src_hi;
4882 machine_mode mode = GET_MODE (dst);
4884 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
4885 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4886 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4888 if (REG_P (dst) && REG_P (src))
4890 int src_regno = REGNO (src);
4891 int dst_regno = REGNO (dst);
4893 /* Handle FP <-> GP regs. */
4894 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4896 src_lo = gen_lowpart (word_mode, src);
4897 src_hi = gen_highpart (word_mode, src);
4899 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4900 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4901 return;
4903 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4905 dst_lo = gen_lowpart (word_mode, dst);
4906 dst_hi = gen_highpart (word_mode, dst);
4908 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4909 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4910 return;
4914 dst_lo = gen_lowpart (word_mode, dst);
4915 dst_hi = gen_highpart (word_mode, dst);
4916 src_lo = gen_lowpart (word_mode, src);
4917 src_hi = gen_highpart_mode (word_mode, mode, src);
4919 /* At most one pairing may overlap. */
4920 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4922 aarch64_emit_move (dst_hi, src_hi);
4923 aarch64_emit_move (dst_lo, src_lo);
4925 else
4927 aarch64_emit_move (dst_lo, src_lo);
4928 aarch64_emit_move (dst_hi, src_hi);
4932 /* Return true if we should split a move from 128-bit value SRC
4933 to 128-bit register DEST. */
4935 bool
4936 aarch64_split_128bit_move_p (rtx dst, rtx src)
4938 if (FP_REGNUM_P (REGNO (dst)))
4939 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4940 /* All moves to GPRs need to be split. */
4941 return true;
4944 /* Split a complex SIMD move. */
4946 void
4947 aarch64_split_simd_move (rtx dst, rtx src)
4949 machine_mode src_mode = GET_MODE (src);
4950 machine_mode dst_mode = GET_MODE (dst);
4952 gcc_assert (VECTOR_MODE_P (dst_mode));
4954 if (REG_P (dst) && REG_P (src))
4956 gcc_assert (VECTOR_MODE_P (src_mode));
4957 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
4961 bool
4962 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4963 machine_mode ymode, rtx y)
4965 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4966 gcc_assert (r != NULL);
4967 return rtx_equal_p (x, r);
4970 /* Return TARGET if it is nonnull and a register of mode MODE.
4971 Otherwise, return a fresh register of mode MODE if we can,
4972 or TARGET reinterpreted as MODE if we can't. */
4974 static rtx
4975 aarch64_target_reg (rtx target, machine_mode mode)
4977 if (target && REG_P (target) && GET_MODE (target) == mode)
4978 return target;
4979 if (!can_create_pseudo_p ())
4981 gcc_assert (target);
4982 return gen_lowpart (mode, target);
4984 return gen_reg_rtx (mode);
4987 /* Return a register that contains the constant in BUILDER, given that
4988 the constant is a legitimate move operand. Use TARGET as the register
4989 if it is nonnull and convenient. */
4991 static rtx
4992 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4994 rtx src = builder.build ();
4995 target = aarch64_target_reg (target, GET_MODE (src));
4996 emit_insn (gen_rtx_SET (target, src));
4997 return target;
5000 static rtx
5001 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
5003 if (can_create_pseudo_p ())
5004 return force_reg (mode, value);
5005 else
5007 gcc_assert (x);
5008 aarch64_emit_move (x, value);
5009 return x;
5013 /* Return true if predicate value X is a constant in which every element
5014 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
5015 value, i.e. as a predicate in which all bits are significant. */
5017 static bool
5018 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
5020 if (!CONST_VECTOR_P (x))
5021 return false;
5023 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
5024 GET_MODE_NUNITS (GET_MODE (x)));
5025 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
5026 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
5027 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
5029 unsigned int nelts = const_vector_encoded_nelts (x);
5030 for (unsigned int i = 0; i < nelts; ++i)
5032 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
5033 if (!CONST_INT_P (elt))
5034 return false;
5036 builder.quick_push (elt);
5037 for (unsigned int j = 1; j < factor; ++j)
5038 builder.quick_push (const0_rtx);
5040 builder.finalize ();
5041 return true;
5044 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
5045 widest predicate element size it can have (that is, the largest size
5046 for which each element would still be 0 or 1). */
5048 unsigned int
5049 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
5051 /* Start with the most optimistic assumption: that we only need
5052 one bit per pattern. This is what we will use if only the first
5053 bit in each pattern is ever set. */
5054 unsigned int mask = GET_MODE_SIZE (DImode);
5055 mask |= builder.npatterns ();
5057 /* Look for set bits. */
5058 unsigned int nelts = builder.encoded_nelts ();
5059 for (unsigned int i = 1; i < nelts; ++i)
5060 if (INTVAL (builder.elt (i)) != 0)
5062 if (i & 1)
5063 return 1;
5064 mask |= i;
5066 return mask & -mask;
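/* Worked example: for the repeating constant { 1, 0, 0, 0, 0, 0, 0, 0 }
   (npatterns == 8, one element per pattern) no bit at a nonzero index is
   set, so the result is 8 & -8 == 8 and the constant can be used as a .D
   predicate.  If the element at index 4 were also set, the mask would
   become 12 and the result 4, i.e. a .S predicate at widest.  */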
5069 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5070 return that predicate mode, otherwise return opt_machine_mode (). */
5072 opt_machine_mode
5073 aarch64_ptrue_all_mode (rtx x)
5075 gcc_assert (GET_MODE (x) == VNx16BImode);
5076 if (!CONST_VECTOR_P (x)
5077 || !CONST_VECTOR_DUPLICATE_P (x)
5078 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5079 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5080 return opt_machine_mode ();
5082 unsigned int nelts = const_vector_encoded_nelts (x);
5083 for (unsigned int i = 1; i < nelts; ++i)
5084 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5085 return opt_machine_mode ();
5087 return aarch64_sve_pred_mode (nelts);
5090 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5091 that the constant would have with predicate element size ELT_SIZE
5092 (ignoring the upper bits in each element) and return:
5094 * -1 if all bits are set
5095 * N if the predicate has N leading set bits followed by all clear bits
5096 * 0 if the predicate does not have any of these forms. */
5099 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5100 unsigned int elt_size)
5102 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5103 followed by set bits. */
5104 if (builder.nelts_per_pattern () == 3)
5105 return 0;
5107 /* Skip over leading set bits. */
5108 unsigned int nelts = builder.encoded_nelts ();
5109 unsigned int i = 0;
5110 for (; i < nelts; i += elt_size)
5111 if (INTVAL (builder.elt (i)) == 0)
5112 break;
5113 unsigned int vl = i / elt_size;
5115 /* Check for the all-true case. */
5116 if (i == nelts)
5117 return -1;
5119 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5120 repeating pattern of set bits followed by clear bits. */
5121 if (builder.nelts_per_pattern () != 2)
5122 return 0;
5124 /* We have a "foreground" value and a duplicated "background" value.
5125 If the background might repeat and the last set bit belongs to it,
5126 we might have set bits followed by clear bits followed by set bits. */
5127 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5128 return 0;
5130 /* Make sure that the rest are all clear. */
5131 for (; i < nelts; i += elt_size)
5132 if (INTVAL (builder.elt (i)) != 0)
5133 return 0;
5135 return vl;
5138 /* See if there is an svpattern that encodes an SVE predicate of mode
5139 PRED_MODE in which the first VL bits are set and the rest are clear.
5140 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5141 A VL of -1 indicates an all-true vector. */
5143 aarch64_svpattern
5144 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5146 if (vl < 0)
5147 return AARCH64_SV_ALL;
5149 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5150 return AARCH64_NUM_SVPATTERNS;
5152 if (vl >= 1 && vl <= 8)
5153 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5155 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5156 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5158 int max_vl;
5159 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5161 if (vl == (max_vl / 3) * 3)
5162 return AARCH64_SV_MUL3;
5163 /* These would only trigger for non-power-of-2 lengths. */
5164 if (vl == (max_vl & -4))
5165 return AARCH64_SV_MUL4;
5166 if (vl == (1 << floor_log2 (max_vl)))
5167 return AARCH64_SV_POW2;
5168 if (vl == max_vl)
5169 return AARCH64_SV_ALL;
5171 return AARCH64_NUM_SVPATTERNS;
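/* For example, a VL of 3 maps to AARCH64_SV_VL3, a VL of 64 maps to
   AARCH64_SV_VL64 and a VL of -1 maps to AARCH64_SV_ALL, whereas a VL
   that might exceed the number of elements in PRED_MODE has no
   corresponding pattern and yields AARCH64_NUM_SVPATTERNS.  */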
5174 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5175 bits has the lowest bit set and the upper bits clear. This is the
5176 VNx16BImode equivalent of a PTRUE for controlling elements of
5177 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5178 all bits are significant, even the upper zeros. */
5181 aarch64_ptrue_all (unsigned int elt_size)
5183 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5184 builder.quick_push (const1_rtx);
5185 for (unsigned int i = 1; i < elt_size; ++i)
5186 builder.quick_push (const0_rtx);
5187 return builder.build ();
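/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BImode
   pattern { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the canonical all-true
   predicate for .S (4-byte) elements with every upper bit explicitly
   zero.  */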
5190 /* Return an all-true predicate register of mode MODE. */
5193 aarch64_ptrue_reg (machine_mode mode)
5195 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5196 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5197 return gen_lowpart (mode, reg);
5200 /* Return an all-false predicate register of mode MODE. */
5203 aarch64_pfalse_reg (machine_mode mode)
5205 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5206 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5207 return gen_lowpart (mode, reg);
5210 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5211 for it. PRED2[0] is the predicate for the instruction whose result
5212 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5213 for it. Return true if we can prove that the two predicates are
5214 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5215 with PRED1[0] without changing behavior. */
5217 bool
5218 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5220 machine_mode mode = GET_MODE (pred1[0]);
5221 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5222 && mode == GET_MODE (pred2[0])
5223 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5224 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5226 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5227 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5228 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5229 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5230 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5233 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
5234 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5235 Use TARGET as the target register if nonnull and convenient. */
5237 static rtx
5238 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5239 machine_mode data_mode, rtx op1, rtx op2)
5241 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5242 expand_operand ops[5];
5243 create_output_operand (&ops[0], target, pred_mode);
5244 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5245 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5246 create_input_operand (&ops[3], op1, data_mode);
5247 create_input_operand (&ops[4], op2, data_mode);
5248 expand_insn (icode, 5, ops);
5249 return ops[0].value;
5252 /* Use a comparison to convert integer vector SRC into MODE, which is
5253 the corresponding SVE predicate mode. Use TARGET for the result
5254 if it's nonnull and convenient. */
5257 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5259 machine_mode src_mode = GET_MODE (src);
5260 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5261 src, CONST0_RTX (src_mode));
5264 /* Return the assembly token for svprfop value PRFOP. */
5266 static const char *
5267 svprfop_token (enum aarch64_svprfop prfop)
5269 switch (prfop)
5271 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5272 AARCH64_FOR_SVPRFOP (CASE)
5273 #undef CASE
5274 case AARCH64_NUM_SVPRFOPS:
5275 break;
5277 gcc_unreachable ();
5280 /* Return the assembly string for an SVE prefetch operation with
5281 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5282 and that SUFFIX is the format for the remaining operands. */
5284 char *
5285 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5286 const char *suffix)
5288 static char buffer[128];
5289 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5290 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5291 mnemonic, svprfop_token (prfop), suffix);
5292 gcc_assert (written < sizeof (buffer));
5293 return buffer;
5296 /* Check whether we can calculate the number of elements in PATTERN
5297 at compile time, given that there are NELTS_PER_VQ elements per
5298 128-bit block. Return the value if so, otherwise return -1. */
5300 HOST_WIDE_INT
5301 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5303 unsigned int vl, const_vg;
5304 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5305 vl = 1 + (pattern - AARCH64_SV_VL1);
5306 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5307 vl = 16 << (pattern - AARCH64_SV_VL16);
5308 else if (aarch64_sve_vg.is_constant (&const_vg))
5310 /* There are two vector granules per quadword. */
5311 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5312 switch (pattern)
5314 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5315 case AARCH64_SV_MUL4: return nelts & -4;
5316 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5317 case AARCH64_SV_ALL: return nelts;
5318 default: gcc_unreachable ();
5321 else
5322 return -1;
5324 /* There are two vector granules per quadword. */
5325 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5326 if (known_le (vl, nelts_all))
5327 return vl;
5329 /* Requesting more elements than are available results in a PFALSE. */
5330 if (known_gt (vl, nelts_all))
5331 return 0;
5333 return -1;
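/* Worked examples: for CNTD (NELTS_PER_VQ == 2), pattern VL2 always folds
   to 2, since even the minimum 128-bit vector provides two doublewords,
   whereas VL4 folds to -1 when the vector length is not known at compile
   time.  With -msve-vector-bits=256 (const_vg == 4), CNTW folds to 8 for
   pattern ALL and to 6 for pattern MUL3.  */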
5336 /* Return true if we can move VALUE into a register using a single
5337 CNT[BHWD] instruction. */
5339 static bool
5340 aarch64_sve_cnt_immediate_p (poly_int64 value)
5342 HOST_WIDE_INT factor = value.coeffs[0];
5343 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5344 return (value.coeffs[1] == factor
5345 && IN_RANGE (factor, 2, 16 * 16)
5346 && (factor & 1) == 0
5347 && factor <= 16 * (factor & -factor));
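/* For example, the poly_int64 value (2, 2), i.e. the number of
   doublewords in a vector, is matched by CNTD, and (48, 48) by CNTB
   with a multiplier (48 == 16 * 3).  (34, 34) is rejected because
   34 == 2 * 17 would need a multiplier of 17, and any value whose two
   coefficients differ is rejected outright.  */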
5350 /* Likewise for rtx X. */
5352 bool
5353 aarch64_sve_cnt_immediate_p (rtx x)
5355 poly_int64 value;
5356 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
5359 /* Return the asm string for an instruction with a CNT-like vector size
5360 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5361 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5362 first part of the operands template (the part that comes before the
5363 vector size itself). PATTERN is the pattern to use. FACTOR is the
5364 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5365 in each quadword. If it is zero, we can use any element size. */
5367 static char *
5368 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5369 aarch64_svpattern pattern,
5370 unsigned int factor,
5371 unsigned int nelts_per_vq)
5373 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5375 if (nelts_per_vq == 0)
5376 /* There is some overlap in the ranges of the four CNT instructions.
5377 Here we always use the smallest possible element size, so that the
5378 multiplier is 1 wherever possible. */
5379 nelts_per_vq = factor & -factor;
5380 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5381 gcc_assert (IN_RANGE (shift, 1, 4));
5382 char suffix = "dwhb"[shift - 1];
5384 factor >>= shift;
5385 unsigned int written;
5386 if (pattern == AARCH64_SV_ALL && factor == 1)
5387 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5388 prefix, suffix, operands);
5389 else if (factor == 1)
5390 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5391 prefix, suffix, operands, svpattern_token (pattern));
5392 else
5393 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5394 prefix, suffix, operands, svpattern_token (pattern),
5395 factor);
5396 gcc_assert (written < sizeof (buffer));
5397 return buffer;
5400 /* Return the asm string for an instruction with a CNT-like vector size
5401 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5402 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5403 first part of the operands template (the part that comes before the
5404 vector size itself). X is the value of the vector size operand,
5405 as a polynomial integer rtx; we need to convert this into an "all"
5406 pattern with a multiplier. */
5408 char *
5409 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5410 rtx x)
5412 poly_int64 value = rtx_to_poly_int64 (x);
5413 gcc_assert (aarch64_sve_cnt_immediate_p (value));
5414 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5415 value.coeffs[1], 0);
5418 /* Return the asm string for an instruction with a CNT-like vector size
5419 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5420 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5421 first part of the operands template (the part that comes before the
5422 vector size itself). CNT_PAT[0..2] are the operands of the
5423 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5425 char *
5426 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5427 const char *operands, rtx *cnt_pat)
5429 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5430 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5431 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5432 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5433 factor, nelts_per_vq);
5436 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5438 bool
5439 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5441 poly_int64 value;
5442 return (poly_int_rtx_p (x, &value)
5443 && (aarch64_sve_cnt_immediate_p (value)
5444 || aarch64_sve_cnt_immediate_p (-value)));
5447 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5448 operand 0. */
5450 char *
5451 aarch64_output_sve_scalar_inc_dec (rtx offset)
5453 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5454 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5455 if (offset_value.coeffs[1] > 0)
5456 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5457 offset_value.coeffs[1], 0);
5458 else
5459 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5460 -offset_value.coeffs[1], 0);
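/* For example, an OFFSET equal to one vector length (coefficients 16, 16)
   produces "incb\t%x0", while an offset of minus two vector lengths
   produces "decb\t%x0, all, mul #2".  */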
5463 /* Return true if we can add VALUE to a register using a single ADDVL
5464 or ADDPL instruction. */
5466 static bool
5467 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5469 HOST_WIDE_INT factor = value.coeffs[0];
5470 if (factor == 0 || value.coeffs[1] != factor)
5471 return false;
5472 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5473 and a value of 16 is one vector width. */
5474 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5475 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5478 /* Likewise for rtx X. */
5480 bool
5481 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5483 poly_int64 value;
5484 return (poly_int_rtx_p (x, &value)
5485 && aarch64_sve_addvl_addpl_immediate_p (value));
5488 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5489 to operand 1 and storing the result in operand 0. */
5491 char *
5492 aarch64_output_sve_addvl_addpl (rtx offset)
5494 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5495 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5496 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5498 int factor = offset_value.coeffs[1];
5499 if ((factor & 15) == 0)
5500 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5501 else
5502 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5503 return buffer;
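/* For example, an OFFSET of one vector length (coefficients 16, 16)
   produces "addvl\t%x0, %x1, #1", while an offset of minus one predicate
   length (coefficients -2, -2) produces "addpl\t%x0, %x1, #-1".  */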
5506 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5507 instruction. If it is, store the number of elements in each vector
5508 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5509 factor in *FACTOR_OUT (if nonnull). */
5511 bool
5512 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5513 unsigned int *nelts_per_vq_out)
5515 rtx elt;
5516 poly_int64 value;
5518 if (!const_vec_duplicate_p (x, &elt)
5519 || !poly_int_rtx_p (elt, &value))
5520 return false;
5522 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5523 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5524 /* There's no vector INCB. */
5525 return false;
5527 HOST_WIDE_INT factor = value.coeffs[0];
5528 if (value.coeffs[1] != factor)
5529 return false;
5531 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5532 if ((factor % nelts_per_vq) != 0
5533 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5534 return false;
5536 if (factor_out)
5537 *factor_out = factor;
5538 if (nelts_per_vq_out)
5539 *nelts_per_vq_out = nelts_per_vq;
5540 return true;
5543 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5544 instruction. */
5546 bool
5547 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5549 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5552 /* Return the asm template for an SVE vector INC or DEC instruction.
5553 OPERANDS gives the operands before the vector count and X is the
5554 value of the vector count operand itself. */
5556 char *
5557 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5559 int factor;
5560 unsigned int nelts_per_vq;
5561 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5562 gcc_unreachable ();
5563 if (factor < 0)
5564 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5565 -factor, nelts_per_vq);
5566 else
5567 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5568 factor, nelts_per_vq);
5571 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5573 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5575 0x0000000100000001ull,
5576 0x0001000100010001ull,
5577 0x0101010101010101ull,
5578 0x1111111111111111ull,
5579 0x5555555555555555ull,
5584 /* Return true if 64-bit VAL is a valid bitmask immediate. */
5585 static bool
5586 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
5588 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
5589 int bits;
5591 /* Check for a single sequence of one bits and return quickly if so.
5592 The special cases of all ones and all zeroes return false. */
5593 tmp = val + (val & -val);
5595 if (tmp == (tmp & -tmp))
5596 return (val + 1) > 1;
5598 /* Invert if the immediate doesn't start with a zero bit - this means we
5599 only need to search for sequences of one bits. */
5600 if (val & 1)
5601 val = ~val;
5603 /* Find the first set bit and set tmp to val with the first sequence of one
5604 bits removed. Return success if there is a single sequence of ones. */
5605 first_one = val & -val;
5606 tmp = val & (val + first_one);
5608 if (tmp == 0)
5609 return true;
5611 /* Find the next set bit and compute the difference in bit position. */
5612 next_one = tmp & -tmp;
5613 bits = clz_hwi (first_one) - clz_hwi (next_one);
5614 mask = val ^ tmp;
5616 /* Check the bit position difference is a power of 2, and that the first
5617 sequence of one bits fits within 'bits' bits. */
5618 if ((mask >> bits) != 0 || bits != (bits & -bits))
5619 return false;
5621 /* Check the sequence of one bits is repeated 64/bits times. */
5622 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
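/* Illustrative sketch (not part of aarch64.cc): a slow reference definition
   of an AArch64 bitmask (logical) immediate, useful for cross-checking the
   fast test above.  A value is valid iff it is a repeated 2/4/8/16/32/64-bit
   element whose element is a rotated run of 1 to size-1 set bits, which
   excludes 0 and ~0.  Assumes only standard C++.  */
#include <cstdint>

static bool
bitmask_imm_reference (uint64_t val)
{
  for (unsigned size = 2; size <= 64; size *= 2)
    {
      uint64_t size_mask = size == 64 ? ~0ull : (1ull << size) - 1;
      for (unsigned ones = 1; ones < size; ones++)
        for (unsigned rot = 0; rot < size; rot++)
          {
            uint64_t elt = (1ull << ones) - 1;          /* run of ONES bits */
            if (rot)                                    /* rotate within SIZE */
              elt = ((elt << rot) | (elt >> (size - rot))) & size_mask;
            uint64_t rep = 0;                           /* replicate to 64 bits */
            for (unsigned i = 0; i < 64; i += size)
              rep |= elt << i;
            if (rep == val)
              return true;
          }
    }
  return false;
}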
5626 /* Return true if VAL is a valid bitmask immediate for MODE. */
5627 bool
5628 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5630 if (mode == DImode)
5631 return aarch64_bitmask_imm (val);
5633 if (mode == SImode)
5634 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
5636 /* Replicate small immediates to fit 64 bits. */
5637 int size = GET_MODE_UNIT_PRECISION (mode);
5638 val &= (HOST_WIDE_INT_1U << size) - 1;
5639 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
5641 return aarch64_bitmask_imm (val);
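/* Illustrative check (not part of aarch64.cc) of the table indexing used
   above: for a 32-bit int, __builtin_clz maps element sizes 32/16/8/4/2 to
   indices 0..4, selecting the multiplier that replicates a SIZE-bit value
   across all 64 bits.  Assumes GCC/Clang for __builtin_clz.  */
#include <cassert>
#include <cstdint>

static void
check_bitmask_imm_mul_indexing ()
{
  static const uint64_t mul[] = {
    0x0000000100000001ull,     /* size 32: clz (32) - 26 == 0 */
    0x0001000100010001ull,     /* size 16: clz (16) - 26 == 1 */
    0x0101010101010101ull,     /* size  8: clz (8)  - 26 == 2 */
    0x1111111111111111ull,     /* size  4: clz (4)  - 26 == 3 */
    0x5555555555555555ull,     /* size  2: clz (2)  - 26 == 4 */
  };
  /* Replicating the 8-bit value 0x3c fills every byte.  */
  assert (0x3cull * mul[__builtin_clz (8) - 26] == 0x3c3c3c3c3c3c3c3cull);
  /* Replicating the 16-bit value 0x00ff fills every halfword.  */
  assert (0xffull * mul[__builtin_clz (16) - 26] == 0x00ff00ff00ff00ffull);
}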
5645 /* Return true if the immediate VAL can be a bitmask immediate
5646 by changing the given MASK bits in VAL to zeroes, ones or bits
5647 from the other half of VAL. Return the new immediate in VAL2. */
5648 static inline bool
5649 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
5650 unsigned HOST_WIDE_INT &val2,
5651 unsigned HOST_WIDE_INT mask)
5653 val2 = val & ~mask;
5654 if (val2 != val && aarch64_bitmask_imm (val2))
5655 return true;
5656 val2 = val | mask;
5657 if (val2 != val && aarch64_bitmask_imm (val2))
5658 return true;
5659 val = val & ~mask;
5660 val2 = val | (((val >> 32) | (val << 32)) & mask);
5661 if (val2 != val && aarch64_bitmask_imm (val2))
5662 return true;
5663 val2 = val | (((val >> 16) | (val << 48)) & mask);
5664 if (val2 != val && aarch64_bitmask_imm (val2))
5665 return true;
5666 return false;
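/* Illustrative example (not part of aarch64.cc): a constant for which the
   helper above finds a two-instruction sequence.  0x0ffffff000001234 is not
   directly encodable, but clearing its low 16-bit chunk leaves the bitmask
   immediate 0x0ffffff000000000 (24 consecutive ones), so the value can be
   built as a bitmask MOV followed by one MOVK.  Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static void
check_bitmask_example ()
{
  uint64_t val = 0x0ffffff000001234ull;
  uint64_t val2 = val & ~0xffffull;              /* candidate for the bitmask MOV */
  assert (val2 == 0x0ffffff000000000ull);
  uint64_t rebuilt = val2 | (val & 0xffffull);   /* effect of MOVK #0x1234 */
  assert (rebuilt == val);
}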
5670 /* Return true if VAL is a valid MOVZ immediate. */
5671 static inline bool
5672 aarch64_is_movz (unsigned HOST_WIDE_INT val)
5674 return (val >> (ctz_hwi (val) & 48)) < 65536;
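/* Illustrative sketch (not part of aarch64.cc): an equivalent, more explicit
   form of the test above - VAL is a MOVZ immediate iff it is a single 16-bit
   field placed at bit 0, 16, 32 or 48.  Assumes only standard C++.  */
#include <cstdint>

static bool
movz_reference (uint64_t val)
{
  for (unsigned shift = 0; shift < 64; shift += 16)
    if ((val & ~(0xffffull << shift)) == 0)
      return true;
  return false;
}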
5678 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
5679 bool
5680 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
5682 return aarch64_is_movz (val) || aarch64_is_movz (~val)
5683 || aarch64_bitmask_imm (val);
5687 /* Return true if VAL is an immediate that can be created by a single
5688 MOV instruction. */
5689 bool
5690 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5692 gcc_assert (mode == SImode || mode == DImode);
5694 if (val < 65536)
5695 return true;
5697 unsigned HOST_WIDE_INT mask =
5698 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
5700 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
5701 return true;
5703 val = (val & mask) | ((val << 32) & ~mask);
5704 return aarch64_bitmask_imm (val);
5708 static int
5709 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5710 machine_mode mode)
5712 int i;
5713 unsigned HOST_WIDE_INT val, val2, mask;
5714 int one_match, zero_match;
5715 int num_insns;
5717 gcc_assert (mode == SImode || mode == DImode);
5719 val = INTVAL (imm);
5721 if (aarch64_move_imm (val, mode))
5723 if (generate)
5724 emit_insn (gen_rtx_SET (dest, imm));
5725 return 1;
5728 if ((val >> 32) == 0 || mode == SImode)
5730 if (generate)
5732 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5733 if (mode == SImode)
5734 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5735 GEN_INT ((val >> 16) & 0xffff)));
5736 else
5737 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5738 GEN_INT ((val >> 16) & 0xffff)));
5740 return 2;
5743 /* Remaining cases are all for DImode. */
5745 mask = 0xffff;
5746 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5747 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5748 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5749 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5751 /* Try a bitmask immediate and a movk to generate the immediate
5752 in 2 instructions. */
5754 if (zero_match < 2 && one_match < 2)
5756 for (i = 0; i < 64; i += 16)
5758 if (aarch64_check_bitmask (val, val2, mask << i))
5759 break;
5761 val2 = val & ~(mask << i);
5762 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
5763 break;
5766 if (i != 64)
5768 if (generate)
5770 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5771 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5772 GEN_INT ((val >> i) & 0xffff)));
5774 return 2;
5778 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
5779 if (zero_match + one_match == 0)
5781 for (i = 0; i < 48; i += 16)
5782 for (int j = i + 16; j < 64; j += 16)
5783 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
5785 if (generate)
5787 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5788 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5789 GEN_INT ((val >> i) & 0xffff)));
5790 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
5791 GEN_INT ((val >> j) & 0xffff)));
5793 return 3;
5797 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5798 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5799 otherwise skip zero bits. */
5801 num_insns = 1;
5802 mask = 0xffff;
5803 val2 = one_match > zero_match ? ~val : val;
5804 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5806 if (generate)
5807 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5808 ? (val | ~(mask << i))
5809 : (val & (mask << i)))));
5810 for (i += 16; i < 64; i += 16)
5812 if ((val2 & (mask << i)) == 0)
5813 continue;
5814 if (generate)
5815 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5816 GEN_INT ((val >> i) & 0xffff)));
5817 num_insns ++;
5820 return num_insns;
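/* Illustrative sketch (not part of aarch64.cc): the instruction count of the
   final MOVZ/MOVN + MOVK fallback above, computed standalone for a DImode
   value.  Constants that hit the earlier single-MOV or bitmask+MOVK paths may
   need fewer instructions, so this is only an upper bound on the function's
   return value.  Assumes only standard C++.  */
#include <cstdint>

static int
movz_movk_insn_count (uint64_t val)
{
  int zero_chunks = 0, ffff_chunks = 0;
  for (int i = 0; i < 64; i += 16)
    {
      uint16_t chunk = (uint16_t) (val >> i);
      zero_chunks += (chunk == 0);
      ffff_chunks += (chunk == 0xffff);
    }
  /* One MOVZ (or MOVN) covers the first significant chunk plus every chunk
     that is all zeros (or all ones); each remaining chunk costs one MOVK.  */
  int skipped = zero_chunks > ffff_chunks ? zero_chunks : ffff_chunks;
  return skipped == 4 ? 1 : 4 - skipped;
}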
5823 /* Return whether imm is a 128-bit immediate which is simple enough to
5824 expand inline. */
5825 bool
5826 aarch64_mov128_immediate (rtx imm)
5828 if (CONST_INT_P (imm))
5829 return true;
5831 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5833 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5834 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5836 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5837 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5841 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5842 a left shift of 0 or 12 bits. */
5843 bool
5844 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
5846 return val < 4096 || (val & 0xfff000) == val;
5849 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5850 that can be created with a left shift of 0 or 12. */
5851 static HOST_WIDE_INT
5852 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
5854 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5855 handle correctly. */
5856 gcc_assert (val < 0x1000000);
5858 if (val < 4096)
5859 return val;
5861 return val & 0xfff000;
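/* Illustrative example (not part of aarch64.cc): what the clamp above
   returns.  The result is always encodable as a 12-bit immediate shifted by
   0 or 12, and the remainder is itself a valid 12-bit immediate.  Assumes
   only standard C++.  */
#include <cassert>
#include <cstdint>

static void
clamp_to_uimm12_shift_example ()
{
  uint64_t val = 0x123456;                 /* below the 24-bit limit */
  uint64_t clamped = val < 4096 ? val : (val & 0xfff000);
  assert (clamped == 0x123000);
  assert (val - clamped == 0x456 && val - clamped < 4096);
}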
5865 /* Test whether:
5867 X = (X & AND_VAL) | IOR_VAL;
5869 can be implemented using:
5871 MOVK X, #(IOR_VAL >> shift), LSL #shift
5873 Return the shift if so, otherwise return -1. */
5874 int
5875 aarch64_movk_shift (const wide_int_ref &and_val,
5876 const wide_int_ref &ior_val)
5878 unsigned int precision = and_val.get_precision ();
5879 unsigned HOST_WIDE_INT mask = 0xffff;
5880 for (unsigned int shift = 0; shift < precision; shift += 16)
5882 if (and_val == ~mask && (ior_val & mask) == ior_val)
5883 return shift;
5884 mask <<= 16;
5886 return -1;
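/* Illustrative sketch (not part of aarch64.cc): the same test on plain
   64-bit masks instead of wide_int_refs.  A single MOVK at shift S
   implements X = (X & AND_VAL) | IOR_VAL exactly when AND_VAL clears one
   aligned 16-bit field and IOR_VAL lies entirely within that field, e.g.
   and_val 0xffffffff0000ffff with ior_val 0x12340000 gives shift 16
   ("movk x0, #0x1234, lsl #16").  Assumes only standard C++.  */
#include <cstdint>

static int
movk_shift_u64 (uint64_t and_val, uint64_t ior_val)
{
  for (unsigned shift = 0; shift < 64; shift += 16)
    {
      uint64_t field = 0xffffull << shift;
      if (and_val == ~field && (ior_val & field) == ior_val)
        return (int) shift;
    }
  return -1;
}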
5889 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5890 Assumed precondition: VAL_IN is not zero. */
5892 unsigned HOST_WIDE_INT
5893 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5895 int lowest_bit_set = ctz_hwi (val_in);
5896 int highest_bit_set = floor_log2 (val_in);
5897 gcc_assert (val_in != 0);
5899 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5900 (HOST_WIDE_INT_1U << lowest_bit_set));
5903 /* Create constant where bits outside of lowest bit set to highest bit set
5904 are set to 1. */
5906 unsigned HOST_WIDE_INT
5907 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5909 return val_in | ~aarch64_and_split_imm1 (val_in);
5912 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5914 bool
5915 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5917 scalar_int_mode int_mode;
5918 if (!is_a <scalar_int_mode> (mode, &int_mode))
5919 return false;
5921 if (aarch64_bitmask_imm (val_in, int_mode))
5922 return false;
5924 if (aarch64_move_imm (val_in, int_mode))
5925 return false;
5927 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5929 return aarch64_bitmask_imm (imm2, int_mode);
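/* Illustrative example (not part of aarch64.cc): a value accepted by the
   predicate above.  0x0ff000000000fff0 is neither a bitmask immediate nor a
   MOV immediate, but IMM1 (the contiguous cover of its set bits) and
   IMM2 = VAL | ~IMM1 are both bitmask immediates, so an AND with VAL can be
   split into two ANDs.  Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static void
and_split_example ()
{
  uint64_t val = 0x0ff000000000fff0ull;
  uint64_t imm1 = 0x0ffffffffffffff0ull;   /* lowest..highest set bit, all ones */
  uint64_t imm2 = val | ~imm1;             /* ones outside that range */
  assert (imm2 == 0xfff000000000ffffull);  /* ~imm2 is one run of 36 ones */
  assert ((imm1 & imm2) == val);           /* AND imm1 then AND imm2 == AND val */
}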
5932 /* Return the number of temporary registers that aarch64_add_offset_1
5933 would need to add OFFSET to a register. */
5935 static unsigned int
5936 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5938 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5941 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5942 a non-polynomial OFFSET. MODE is the mode of the addition.
5943 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5944 be set and CFA adjustments added to the generated instructions.
5946 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5947 temporary if register allocation is already complete. This temporary
5948 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5949 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5950 the immediate again.
5952 Since this function may be used to adjust the stack pointer, we must
5953 ensure that it cannot cause transient stack deallocation (for example
5954 by first incrementing SP and then decrementing when adjusting by a
5955 large immediate). */
5957 static void
5958 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5959 rtx src, HOST_WIDE_INT offset, rtx temp1,
5960 bool frame_related_p, bool emit_move_imm)
5962 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5963 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5965 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5966 rtx_insn *insn;
5968 if (!moffset)
5970 if (!rtx_equal_p (dest, src))
5972 insn = emit_insn (gen_rtx_SET (dest, src));
5973 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5975 return;
5978 /* Single instruction adjustment. */
5979 if (aarch64_uimm12_shift (moffset))
5981 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5982 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5983 return;
5986 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
5987 and either:
5989 a) the offset cannot be loaded by a 16-bit move or
5990 b) there is no spare register into which we can move it. */
5991 if (moffset < 0x1000000
5992 && ((!temp1 && !can_create_pseudo_p ())
5993 || !aarch64_move_imm (moffset, mode)))
5995 HOST_WIDE_INT low_off = moffset & 0xfff;
5997 low_off = offset < 0 ? -low_off : low_off;
5998 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
5999 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6000 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
6001 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6002 return;
6005 /* Emit a move immediate if required and an addition/subtraction. */
6006 if (emit_move_imm)
6008 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
6009 temp1 = aarch64_force_temporary (mode, temp1,
6010 gen_int_mode (moffset, mode));
6012 insn = emit_insn (offset < 0
6013 ? gen_sub3_insn (dest, src, temp1)
6014 : gen_add3_insn (dest, src, temp1));
6015 if (frame_related_p)
6017 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6018 rtx adj = plus_constant (mode, src, offset);
6019 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
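/* Illustrative example (not part of aarch64.cc): the sub-24-bit two-step
   adjustment used above.  Both steps move in the same direction, which is
   what keeps stack-pointer adjustments free of transient deallocation, and
   both magnitudes are valid 12-bit immediates (unshifted and shifted).
   Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static void
add_offset_split_example ()
{
  int64_t offset = -0x123456;                     /* |offset| < 0x1000000 */
  uint64_t moffset = (uint64_t) (-offset);
  int64_t low_off = (int64_t) (moffset & 0xfff);
  low_off = offset < 0 ? -low_off : low_off;      /* first step: -0x456 */
  int64_t rest = offset - low_off;                /* second step: -0x123000 */
  assert (low_off == -0x456 && rest == -0x123000);
  assert ((uint64_t) (-low_off) < 4096);
  assert (((uint64_t) (-rest) & 0xfff000) == (uint64_t) (-rest));
}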
6023 /* Return the number of temporary registers that aarch64_add_offset
6024 would need to move OFFSET into a register or add OFFSET to a register;
6025 ADD_P is true if we want the latter rather than the former. */
6027 static unsigned int
6028 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
6030 /* This follows the same structure as aarch64_add_offset. */
6031 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
6032 return 0;
6034 unsigned int count = 0;
6035 HOST_WIDE_INT factor = offset.coeffs[1];
6036 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6037 poly_int64 poly_offset (factor, factor);
6038 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6039 /* Need one register for the ADDVL/ADDPL result. */
6040 count += 1;
6041 else if (factor != 0)
6043 factor = abs (factor);
6044 if (factor > 16 * (factor & -factor))
6045 /* Need one register for the CNT result and one for the multiplication
6046 factor. If necessary, the second temporary can be reused for the
6047 constant part of the offset. */
6048 return 2;
6049 /* Need one register for the CNT result (which might then
6050 be shifted). */
6051 count += 1;
6053 return count + aarch64_add_offset_1_temporaries (constant);
6056 /* If X can be represented as a poly_int64, return the number
6057 of temporaries that are required to add it to a register.
6058 Return -1 otherwise. */
6060 int
6061 aarch64_add_offset_temporaries (rtx x)
6063 poly_int64 offset;
6064 if (!poly_int_rtx_p (x, &offset))
6065 return -1;
6066 return aarch64_offset_temporaries (true, offset);
6069 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
6070 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
6071 be set and CFA adjustments added to the generated instructions.
6073 TEMP1, if nonnull, is a register of mode MODE that can be used as a
6074 temporary if register allocation is already complete. This temporary
6075 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
6076 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
6077 false to avoid emitting the immediate again.
6079 TEMP2, if nonnull, is a second temporary register that doesn't
6080 overlap either DEST or REG.
6082 Since this function may be used to adjust the stack pointer, we must
6083 ensure that it cannot cause transient stack deallocation (for example
6084 by first incrementing SP and then decrementing when adjusting by a
6085 large immediate). */
6087 static void
6088 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6089 poly_int64 offset, rtx temp1, rtx temp2,
6090 bool frame_related_p, bool emit_move_imm = true)
6092 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
6093 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
6094 gcc_assert (temp1 == NULL_RTX
6095 || !frame_related_p
6096 || !reg_overlap_mentioned_p (temp1, dest));
6097 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
6099 /* Try using ADDVL or ADDPL to add the whole value. */
6100 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
6102 rtx offset_rtx = gen_int_mode (offset, mode);
6103 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6104 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6105 return;
6108 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
6109 SVE vector register, over and above the minimum size of 128 bits.
6110 This is equivalent to half the value returned by CNTD with a
6111 vector shape of ALL. */
6112 HOST_WIDE_INT factor = offset.coeffs[1];
6113 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6115 /* Try using ADDVL or ADDPL to add the VG-based part. */
6116 poly_int64 poly_offset (factor, factor);
6117 if (src != const0_rtx
6118 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6120 rtx offset_rtx = gen_int_mode (poly_offset, mode);
6121 if (frame_related_p)
6123 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6124 RTX_FRAME_RELATED_P (insn) = true;
6125 src = dest;
6127 else
6129 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
6130 src = aarch64_force_temporary (mode, temp1, addr);
6131 temp1 = temp2;
6132 temp2 = NULL_RTX;
6135 /* Otherwise use a CNT-based sequence. */
6136 else if (factor != 0)
6138 /* Use a subtraction if we have a negative factor. */
6139 rtx_code code = PLUS;
6140 if (factor < 0)
6142 factor = -factor;
6143 code = MINUS;
6146 /* Calculate CNTD * FACTOR / 2. First try to fold the division
6147 into the multiplication. */
6148 rtx val;
6149 int shift = 0;
6150 if (factor & 1)
6151 /* Use a right shift by 1. */
6152 shift = -1;
6153 else
6154 factor /= 2;
6155 HOST_WIDE_INT low_bit = factor & -factor;
6156 if (factor <= 16 * low_bit)
6158 if (factor > 16 * 8)
6160 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
6161 the value with the minimum multiplier and shift it into
6162 position. */
6163 int extra_shift = exact_log2 (low_bit);
6164 shift += extra_shift;
6165 factor >>= extra_shift;
6167 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
6169 else
6171 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
6172 directly, since that should increase the chances of being
6173 able to use a shift and add sequence. If LOW_BIT itself
6174 is out of range, just use CNTD. */
6175 if (low_bit <= 16 * 8)
6176 factor /= low_bit;
6177 else
6178 low_bit = 1;
6180 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
6181 val = aarch64_force_temporary (mode, temp1, val);
6183 if (can_create_pseudo_p ())
6185 rtx coeff1 = gen_int_mode (factor, mode);
6186 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
6188 else
6190 /* Go back to using a negative multiplication factor if we have
6191 no register from which to subtract. */
6192 if (code == MINUS && src == const0_rtx)
6194 factor = -factor;
6195 code = PLUS;
6197 rtx coeff1 = gen_int_mode (factor, mode);
6198 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
6199 val = gen_rtx_MULT (mode, val, coeff1);
6203 if (shift > 0)
6205 /* Multiply by 1 << SHIFT. */
6206 val = aarch64_force_temporary (mode, temp1, val);
6207 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
6209 else if (shift == -1)
6211 /* Divide by 2. */
6212 val = aarch64_force_temporary (mode, temp1, val);
6213 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
6216 /* Calculate SRC +/- CNTD * FACTOR / 2. */
6217 if (src != const0_rtx)
6219 val = aarch64_force_temporary (mode, temp1, val);
6220 val = gen_rtx_fmt_ee (code, mode, src, val);
6222 else if (code == MINUS)
6224 val = aarch64_force_temporary (mode, temp1, val);
6225 val = gen_rtx_NEG (mode, val);
6228 if (constant == 0 || frame_related_p)
6230 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
6231 if (frame_related_p)
6233 RTX_FRAME_RELATED_P (insn) = true;
6234 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6235 gen_rtx_SET (dest, plus_constant (Pmode, src,
6236 poly_offset)));
6238 src = dest;
6239 if (constant == 0)
6240 return;
6242 else
6244 src = aarch64_force_temporary (mode, temp1, val);
6245 temp1 = temp2;
6246 temp2 = NULL_RTX;
6249 emit_move_imm = true;
6252 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
6253 frame_related_p, emit_move_imm);
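/* Illustrative example (not part of aarch64.cc), assuming the usual GCC
   convention that an SVE poly_int64 (C0, C1) evaluates to C0 + C1 * (VQ - 1)
   for a vector length of VQ quadwords: rewriting the offset as
   (C0 - C1) + C1 * (CNTD / 2) is what makes the CNT-based sequence above
   work, since CNTD returns 2 * VQ.  Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static void
poly_offset_example ()
{
  int64_t c0 = 80, c1 = 64;                /* offset (80, 64) in bytes */
  for (int64_t vq = 1; vq <= 16; ++vq)     /* 128-bit to 2048-bit vectors */
    {
      int64_t runtime_offset = c0 + c1 * (vq - 1);
      int64_t constant = c0 - c1;          /* handled by ADD immediates */
      int64_t cntd = 2 * vq;               /* what "CNTD Xn, ALL" returns */
      assert (constant + c1 * cntd / 2 == runtime_offset);
    }
}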
6256 /* Like aarch64_add_offset, but the offset is given as an rtx rather
6257 than a poly_int64. */
6259 void
6260 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6261 rtx offset_rtx, rtx temp1, rtx temp2)
6263 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
6264 temp1, temp2, false);
6267 /* Add DELTA to the stack pointer, marking the instructions frame-related.
6268 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
6269 if TEMP1 already contains abs (DELTA). */
6271 static inline void
6272 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
6274 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
6275 temp1, temp2, true, emit_move_imm);
6278 /* Subtract DELTA from the stack pointer, marking the instructions
6279 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
6280 if nonnull. */
6282 static inline void
6283 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
6284 bool emit_move_imm = true)
6286 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
6287 temp1, temp2, frame_related_p, emit_move_imm);
6290 /* Set DEST to (vec_series BASE STEP). */
6292 static void
6293 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6295 machine_mode mode = GET_MODE (dest);
6296 scalar_mode inner = GET_MODE_INNER (mode);
6298 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6299 if (!aarch64_sve_index_immediate_p (base))
6300 base = force_reg (inner, base);
6301 if (!aarch64_sve_index_immediate_p (step))
6302 step = force_reg (inner, step);
6304 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
6307 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6308 register of mode MODE. Use TARGET for the result if it's nonnull
6309 and convenient.
6311 The two vector modes must have the same element mode. The behavior
6312 is to duplicate architectural lane N of SRC into architectural lanes
6313 N + I * STEP of the result. On big-endian targets, architectural
6314 lane 0 of an Advanced SIMD vector is the last element of the vector
6315 in memory layout, so for big-endian targets this operation has the
6316 effect of reversing SRC before duplicating it. Callers need to
6317 account for this. */
6319 rtx
6320 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6322 machine_mode src_mode = GET_MODE (src);
6323 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6324 insn_code icode = (BYTES_BIG_ENDIAN
6325 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6326 : code_for_aarch64_vec_duplicate_vq_le (mode));
6328 unsigned int i = 0;
6329 expand_operand ops[3];
6330 create_output_operand (&ops[i++], target, mode);
6331 create_output_operand (&ops[i++], src, src_mode);
6332 if (BYTES_BIG_ENDIAN)
6334 /* Create a PARALLEL describing the reversal of SRC. */
6335 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6336 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6337 nelts_per_vq - 1, -1);
6338 create_fixed_operand (&ops[i++], sel);
6340 expand_insn (icode, i, ops);
6341 return ops[0].value;
6344 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6345 the memory image into DEST. Return true on success. */
6347 static bool
6348 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6350 src = force_const_mem (GET_MODE (src), src);
6351 if (!src)
6352 return false;
6354 /* Make sure that the address is legitimate. */
6355 if (!aarch64_sve_ld1rq_operand_p (src))
6357 rtx addr = force_reg (Pmode, XEXP (src, 0));
6358 src = replace_equiv_address (src, addr);
6361 machine_mode mode = GET_MODE (dest);
6362 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6363 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6364 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6365 return true;
6368 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6369 by N "background" values. Try to move it into TARGET using:
6371 PTRUE PRED.<T>, VL<N>
6372 MOV TRUE.<T>, #<foreground>
6373 MOV FALSE.<T>, #<background>
6374 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6376 The PTRUE is always a single instruction but the MOVs might need a
6377 longer sequence. If the background value is zero (as it often is),
6378 the sequence can sometimes collapse to a PTRUE followed by a
6379 zero-predicated move.
6381 Return the target on success, otherwise return null. */
6383 static rtx
6384 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6386 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6388 /* Make sure that the PTRUE is valid. */
6389 machine_mode mode = GET_MODE (src);
6390 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6391 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6392 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6393 == AARCH64_NUM_SVPATTERNS)
6394 return NULL_RTX;
6396 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6397 rtx_vector_builder true_builder (mode, npatterns, 1);
6398 rtx_vector_builder false_builder (mode, npatterns, 1);
6399 for (unsigned int i = 0; i < npatterns; ++i)
6401 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6402 pred_builder.quick_push (CONST1_RTX (BImode));
6404 for (unsigned int i = 0; i < npatterns; ++i)
6406 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6407 pred_builder.quick_push (CONST0_RTX (BImode));
6409 expand_operand ops[4];
6410 create_output_operand (&ops[0], target, mode);
6411 create_input_operand (&ops[1], true_builder.build (), mode);
6412 create_input_operand (&ops[2], false_builder.build (), mode);
6413 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6414 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6415 return target;
6418 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6419 SVE data mode and isn't a legitimate constant. Use TARGET for the
6420 result if convenient.
6422 The returned register can have whatever mode seems most natural
6423 given the contents of SRC. */
6425 static rtx
6426 aarch64_expand_sve_const_vector (rtx target, rtx src)
6428 machine_mode mode = GET_MODE (src);
6429 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6430 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6431 scalar_mode elt_mode = GET_MODE_INNER (mode);
6432 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6433 unsigned int container_bits = aarch64_sve_container_bits (mode);
6434 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6436 if (nelts_per_pattern == 1
6437 && encoded_bits <= 128
6438 && container_bits != elt_bits)
6440 /* We have a partial vector mode and a constant whose full-vector
6441 equivalent would occupy a repeating 128-bit sequence. Build that
6442 full-vector equivalent instead, so that we have the option of
6443 using LD1RQ and Advanced SIMD operations. */
6444 unsigned int repeat = container_bits / elt_bits;
6445 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6446 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6447 for (unsigned int i = 0; i < npatterns; ++i)
6448 for (unsigned int j = 0; j < repeat; ++j)
6449 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6450 target = aarch64_target_reg (target, full_mode);
6451 return aarch64_expand_sve_const_vector (target, builder.build ());
6454 if (nelts_per_pattern == 1 && encoded_bits == 128)
6456 /* The constant is a duplicated quadword but can't be narrowed
6457 beyond a quadword. Get the memory image of the first quadword
6458 as a 128-bit vector and try using LD1RQ to load it from memory.
6460 The effect for both endiannesses is to load memory lane N into
6461 architectural lanes N + I * STEP of the result. On big-endian
6462 targets, the layout of the 128-bit vector in an Advanced SIMD
6463 register would be different from its layout in an SVE register,
6464 but this 128-bit vector is a memory value only. */
6465 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6466 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6467 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6468 return target;
6471 if (nelts_per_pattern == 1 && encoded_bits < 128)
6473 /* The vector is a repeating sequence of 64 bits or fewer.
6474 See if we can load them using an Advanced SIMD move and then
6475 duplicate it to fill a vector. This is better than using a GPR
6476 move because it keeps everything in the same register file. */
6477 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6478 rtx_vector_builder builder (vq_mode, npatterns, 1);
6479 for (unsigned int i = 0; i < npatterns; ++i)
6481 /* We want memory lane N to go into architectural lane N,
6482 so reverse for big-endian targets. The DUP .Q pattern
6483 has a compensating reverse built-in. */
6484 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6485 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6487 rtx vq_src = builder.build ();
6488 if (aarch64_simd_valid_immediate (vq_src, NULL))
6490 vq_src = force_reg (vq_mode, vq_src);
6491 return aarch64_expand_sve_dupq (target, mode, vq_src);
6494 /* Get an integer representation of the repeating part of Advanced
6495 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6496 which for big-endian targets is lane-swapped wrt a normal
6497 Advanced SIMD vector. This means that for both endiannesses,
6498 memory lane N of SVE vector SRC corresponds to architectural
6499 lane N of a register holding VQ_SRC. This in turn means that
6500 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6501 as a single 128-bit value) and thus that memory lane 0 of SRC is
6502 in the lsb of the integer. Duplicating the integer therefore
6503 ensures that memory lane N of SRC goes into architectural lane
6504 N + I * INDEX of the SVE register. */
6505 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6506 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6507 if (elt_value)
6509 /* Pretend that we had a vector of INT_MODE to start with. */
6510 elt_mode = int_mode;
6511 mode = aarch64_full_sve_mode (int_mode).require ();
6513 /* If the integer can be moved into a general register by a
6514 single instruction, do that and duplicate the result. */
6515 if (CONST_INT_P (elt_value)
6516 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
6518 elt_value = force_reg (elt_mode, elt_value);
6519 return expand_vector_broadcast (mode, elt_value);
6522 else if (npatterns == 1)
6523 /* We're duplicating a single value, but can't do better than
6524 force it to memory and load from there. This handles things
6525 like symbolic constants. */
6526 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6528 if (elt_value)
6530 /* Load the element from memory if we can, otherwise move it into
6531 a register and use a DUP. */
6532 rtx op = force_const_mem (elt_mode, elt_value);
6533 if (!op)
6534 op = force_reg (elt_mode, elt_value);
6535 return expand_vector_broadcast (mode, op);
6539 /* Try using INDEX. */
6540 rtx base, step;
6541 if (const_vec_series_p (src, &base, &step))
6543 aarch64_expand_vec_series (target, base, step);
6544 return target;
6547 /* From here on, it's better to force the whole constant to memory
6548 if we can. */
6549 if (GET_MODE_NUNITS (mode).is_constant ())
6550 return NULL_RTX;
6552 if (nelts_per_pattern == 2)
6553 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6554 return res;
6556 /* Expand each pattern individually. */
6557 gcc_assert (npatterns > 1);
6558 rtx_vector_builder builder;
6559 auto_vec<rtx, 16> vectors (npatterns);
6560 for (unsigned int i = 0; i < npatterns; ++i)
6562 builder.new_vector (mode, 1, nelts_per_pattern);
6563 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6564 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6565 vectors.quick_push (force_reg (mode, builder.build ()));
6568 /* Use permutes to interleave the separate vectors. */
6569 while (npatterns > 1)
6571 npatterns /= 2;
6572 for (unsigned int i = 0; i < npatterns; ++i)
6574 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6575 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6576 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6577 vectors[i] = tmp;
6580 gcc_assert (vectors[0] == target);
6581 return target;
6584 /* Use WHILE to set a predicate register of mode MODE in which the first
6585 VL bits are set and the rest are clear. Use TARGET for the register
6586 if it's nonnull and convenient. */
6588 static rtx
6589 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6590 unsigned int vl)
6592 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6593 target = aarch64_target_reg (target, mode);
6594 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6595 target, const0_rtx, limit));
6596 return target;
6599 static rtx
6600 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6602 /* BUILDER is a constant predicate in which the index of every set bit
6603 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6604 by inverting every element at a multiple of ELT_SIZE and EORing the
6605 result with an ELT_SIZE PTRUE.
6607 Return a register that contains the constant on success, otherwise
6608 return null. Use TARGET as the register if it is nonnull and
6609 convenient. */
6611 static rtx
6612 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6613 unsigned int elt_size)
6615 /* Invert every element at a multiple of ELT_SIZE, keeping the
6616 other bits zero. */
6617 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6618 builder.nelts_per_pattern ());
6619 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6620 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6621 inv_builder.quick_push (const1_rtx);
6622 else
6623 inv_builder.quick_push (const0_rtx);
6624 inv_builder.finalize ();
6626 /* See if we can load the constant cheaply. */
6627 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6628 if (!inv)
6629 return NULL_RTX;
6631 /* EOR the result with an ELT_SIZE PTRUE. */
6632 rtx mask = aarch64_ptrue_all (elt_size);
6633 mask = force_reg (VNx16BImode, mask);
6634 inv = gen_lowpart (VNx16BImode, inv);
6635 target = aarch64_target_reg (target, VNx16BImode);
6636 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6637 return target;
6640 /* BUILDER is a constant predicate in which the index of every set bit
6641 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6642 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6643 register on success, otherwise return null. Use TARGET as the register
6644 if nonnull and convenient. */
6646 static rtx
6647 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6648 unsigned int elt_size,
6649 unsigned int permute_size)
6651 /* We're going to split the constant into two new constants A and B,
6652 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6653 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6655 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6656 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6658 where _ indicates elements that will be discarded by the permute.
6660 First calculate the ELT_SIZEs for A and B. */
6661 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6662 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6663 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6664 if (INTVAL (builder.elt (i)) != 0)
6666 if (i & permute_size)
6667 b_elt_size |= i - permute_size;
6668 else
6669 a_elt_size |= i;
6671 a_elt_size &= -a_elt_size;
6672 b_elt_size &= -b_elt_size;
6674 /* Now construct the vectors themselves. */
6675 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6676 builder.nelts_per_pattern ());
6677 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6678 builder.nelts_per_pattern ());
6679 unsigned int nelts = builder.encoded_nelts ();
6680 for (unsigned int i = 0; i < nelts; ++i)
6681 if (i & (elt_size - 1))
6683 a_builder.quick_push (const0_rtx);
6684 b_builder.quick_push (const0_rtx);
6686 else if ((i & permute_size) == 0)
6688 /* The A and B elements are significant. */
6689 a_builder.quick_push (builder.elt (i));
6690 b_builder.quick_push (builder.elt (i + permute_size));
6692 else
6694 /* The A and B elements are going to be discarded, so pick whatever
6695 is likely to give a nice constant. We are targeting element
6696 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6697 with the aim of each being a sequence of ones followed by
6698 a sequence of zeros. So:
6700 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6701 duplicate the last X_ELT_SIZE element, to extend the
6702 current sequence of ones or zeros.
6704 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6705 zero, so that the constant really does have X_ELT_SIZE and
6706 not a smaller size. */
6707 if (a_elt_size > permute_size)
6708 a_builder.quick_push (const0_rtx);
6709 else
6710 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6711 if (b_elt_size > permute_size)
6712 b_builder.quick_push (const0_rtx);
6713 else
6714 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6716 a_builder.finalize ();
6717 b_builder.finalize ();
6719 /* Try loading A into a register. */
6720 rtx_insn *last = get_last_insn ();
6721 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6722 if (!a)
6723 return NULL_RTX;
6725 /* Try loading B into a register. */
6726 rtx b = a;
6727 if (a_builder != b_builder)
6729 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6730 if (!b)
6732 delete_insns_since (last);
6733 return NULL_RTX;
6737 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6738 operands but permutes them as though they had mode MODE. */
6739 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6740 target = aarch64_target_reg (target, GET_MODE (a));
6741 rtx type_reg = CONST0_RTX (mode);
6742 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6743 return target;
6746 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6747 constant in BUILDER into an SVE predicate register. Return the register
6748 on success, otherwise return null. Use TARGET for the register if
6749 nonnull and convenient.
6751 ALLOW_RECURSE_P is true if we can use methods that would call this
6752 function recursively. */
6754 static rtx
6755 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6756 bool allow_recurse_p)
6758 if (builder.encoded_nelts () == 1)
6759 /* A PFALSE or a PTRUE .B ALL. */
6760 return aarch64_emit_set_immediate (target, builder);
6762 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6763 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6765 /* If we can load the constant using PTRUE, use it as-is. */
6766 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6767 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6768 return aarch64_emit_set_immediate (target, builder);
6770 /* Otherwise use WHILE to set the first VL bits. */
6771 return aarch64_sve_move_pred_via_while (target, mode, vl);
6774 if (!allow_recurse_p)
6775 return NULL_RTX;
6777 /* Try inverting the vector in element size ELT_SIZE and then EORing
6778 the result with an ELT_SIZE PTRUE. */
6779 if (INTVAL (builder.elt (0)) == 0)
6780 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6781 elt_size))
6782 return res;
6784 /* Try using TRN1 to permute two simpler constants. */
6785 for (unsigned int i = elt_size; i <= 8; i *= 2)
6786 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6787 elt_size, i))
6788 return res;
6790 return NULL_RTX;
6793 /* Return an SVE predicate register that contains the VNx16BImode
6794 constant in BUILDER, without going through the move expanders.
6796 The returned register can have whatever mode seems most natural
6797 given the contents of BUILDER. Use TARGET for the result if
6798 convenient. */
6800 static rtx
6801 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6803 /* Try loading the constant using pure predicate operations. */
6804 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6805 return res;
6807 /* Try forcing the constant to memory. */
6808 if (builder.full_nelts ().is_constant ())
6809 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6811 target = aarch64_target_reg (target, VNx16BImode);
6812 emit_move_insn (target, mem);
6813 return target;
6816 /* The last resort is to load the constant as an integer and then
6817 compare it against zero. Use -1 for set bits in order to increase
6818 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6819 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6820 builder.nelts_per_pattern ());
6821 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6822 int_builder.quick_push (INTVAL (builder.elt (i))
6823 ? constm1_rtx : const0_rtx);
6824 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6825 int_builder.build ());
6828 /* Set DEST to immediate IMM. */
6830 void
6831 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6833 machine_mode mode = GET_MODE (dest);
6835 /* Check on what type of symbol it is. */
6836 scalar_int_mode int_mode;
6837 if ((SYMBOL_REF_P (imm)
6838 || LABEL_REF_P (imm)
6839 || GET_CODE (imm) == CONST
6840 || GET_CODE (imm) == CONST_POLY_INT)
6841 && is_a <scalar_int_mode> (mode, &int_mode))
6843 rtx mem;
6844 poly_int64 offset;
6845 HOST_WIDE_INT const_offset;
6846 enum aarch64_symbol_type sty;
6848 /* If we have (const (plus symbol offset)), separate out the offset
6849 before we start classifying the symbol. */
6850 rtx base = strip_offset (imm, &offset);
6852 /* We must always add an offset involving VL separately, rather than
6853 folding it into the relocation. */
6854 if (!offset.is_constant (&const_offset))
6856 if (!TARGET_SVE)
6858 aarch64_report_sve_required ();
6859 return;
6861 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6862 emit_insn (gen_rtx_SET (dest, imm));
6863 else
6865 /* Do arithmetic on 32-bit values if the result is smaller
6866 than that. */
6867 if (partial_subreg_p (int_mode, SImode))
6869 /* It is invalid to do symbol calculations in modes
6870 narrower than SImode. */
6871 gcc_assert (base == const0_rtx);
6872 dest = gen_lowpart (SImode, dest);
6873 int_mode = SImode;
6875 if (base != const0_rtx)
6877 base = aarch64_force_temporary (int_mode, dest, base);
6878 aarch64_add_offset (int_mode, dest, base, offset,
6879 NULL_RTX, NULL_RTX, false);
6881 else
6882 aarch64_add_offset (int_mode, dest, base, offset,
6883 dest, NULL_RTX, false);
6885 return;
6888 sty = aarch64_classify_symbol (base, const_offset);
6889 switch (sty)
6891 case SYMBOL_FORCE_TO_MEM:
6892 if (int_mode != ptr_mode)
6893 imm = convert_memory_address (ptr_mode, imm);
6895 if (const_offset != 0
6896 && targetm.cannot_force_const_mem (ptr_mode, imm))
6898 gcc_assert (can_create_pseudo_p ());
6899 base = aarch64_force_temporary (int_mode, dest, base);
6900 aarch64_add_offset (int_mode, dest, base, const_offset,
6901 NULL_RTX, NULL_RTX, false);
6902 return;
6905 mem = force_const_mem (ptr_mode, imm);
6906 gcc_assert (mem);
6908 /* If we aren't generating PC relative literals, then
6909 we need to expand the literal pool access carefully.
6910 This is something that needs to be done in a number
6911 of places, so could well live as a separate function. */
6912 if (!aarch64_pcrelative_literal_loads)
6914 gcc_assert (can_create_pseudo_p ());
6915 base = gen_reg_rtx (ptr_mode);
6916 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6917 if (ptr_mode != Pmode)
6918 base = convert_memory_address (Pmode, base);
6919 mem = gen_rtx_MEM (ptr_mode, base);
6922 if (int_mode != ptr_mode)
6923 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6925 emit_insn (gen_rtx_SET (dest, mem));
6927 return;
6929 case SYMBOL_SMALL_TLSGD:
6930 case SYMBOL_SMALL_TLSDESC:
6931 case SYMBOL_SMALL_TLSIE:
6932 case SYMBOL_SMALL_GOT_28K:
6933 case SYMBOL_SMALL_GOT_4G:
6934 case SYMBOL_TINY_GOT:
6935 case SYMBOL_TINY_TLSIE:
6936 if (const_offset != 0)
6938 gcc_assert (can_create_pseudo_p ());
6939 base = aarch64_force_temporary (int_mode, dest, base);
6940 aarch64_add_offset (int_mode, dest, base, const_offset,
6941 NULL_RTX, NULL_RTX, false);
6942 return;
6944 /* FALLTHRU */
6946 case SYMBOL_SMALL_ABSOLUTE:
6947 case SYMBOL_TINY_ABSOLUTE:
6948 case SYMBOL_TLSLE12:
6949 case SYMBOL_TLSLE24:
6950 case SYMBOL_TLSLE32:
6951 case SYMBOL_TLSLE48:
6952 aarch64_load_symref_appropriately (dest, imm, sty);
6953 return;
6955 default:
6956 gcc_unreachable ();
6960 if (!CONST_INT_P (imm))
6962 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6964 /* Only the low bit of each .H, .S and .D element is defined,
6965 so we can set the upper bits to whatever we like. If the
6966 predicate is all-true in MODE, prefer to set all the undefined
6967 bits as well, so that we can share a single .B predicate for
6968 all modes. */
6969 if (imm == CONSTM1_RTX (mode))
6970 imm = CONSTM1_RTX (VNx16BImode);
6972 /* All methods for constructing predicate modes wider than VNx16BI
6973 will set the upper bits of each element to zero. Expose this
6974 by moving such constants as a VNx16BI, so that all bits are
6975 significant and so that constants for different modes can be
6976 shared. The wider constant will still be available as a
6977 REG_EQUAL note. */
6978 rtx_vector_builder builder;
6979 if (aarch64_get_sve_pred_bits (builder, imm))
6981 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6982 if (dest != res)
6983 emit_move_insn (dest, gen_lowpart (mode, res));
6984 return;
6988 if (GET_CODE (imm) == HIGH
6989 || aarch64_simd_valid_immediate (imm, NULL))
6991 emit_insn (gen_rtx_SET (dest, imm));
6992 return;
6995 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6996 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6998 if (dest != res)
6999 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
7000 return;
7003 rtx mem = force_const_mem (mode, imm);
7004 gcc_assert (mem);
7005 emit_move_insn (dest, mem);
7006 return;
7009 aarch64_internal_mov_immediate (dest, imm, true, mode);
7012 /* Return the MEM rtx that provides the canary value that should be used
7013 for stack-smashing protection. MODE is the mode of the memory.
7014 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
7015 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
7016 indicates whether the caller is performing a SET or a TEST operation. */
7018 rtx
7019 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
7020 aarch64_salt_type salt_type)
7022 rtx addr;
7023 if (aarch64_stack_protector_guard == SSP_GLOBAL)
7025 gcc_assert (MEM_P (decl_rtl));
7026 addr = XEXP (decl_rtl, 0);
7027 poly_int64 offset;
7028 rtx base = strip_offset_and_salt (addr, &offset);
7029 if (!SYMBOL_REF_P (base))
7030 return decl_rtl;
7032 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
7033 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
7034 addr = gen_rtx_CONST (Pmode, addr);
7035 addr = plus_constant (Pmode, addr, offset);
7037 else
7039 /* Calculate the address from the system register. */
7040 rtx salt = GEN_INT (salt_type);
7041 addr = gen_reg_rtx (mode);
7042 if (mode == DImode)
7043 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
7044 else
7046 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
7047 addr = convert_memory_address (Pmode, addr);
7049 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
7051 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
7054 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
7055 that is known to contain PTRUE. */
7057 void
7058 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
7060 expand_operand ops[3];
7061 machine_mode mode = GET_MODE (dest);
7062 create_output_operand (&ops[0], dest, mode);
7063 create_input_operand (&ops[1], pred, GET_MODE (pred));
7064 create_input_operand (&ops[2], src, mode);
7065 temporary_volatile_ok v (true);
7066 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
7069 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
7070 operand is in memory. In this case we need to use the predicated LD1
7071 and ST1 instead of LDR and STR, both for correctness on big-endian
7072 targets and because LD1 and ST1 support a wider range of addressing modes.
7073 PRED_MODE is the mode of the predicate.
7075 See the comment at the head of aarch64-sve.md for details about the
7076 big-endian handling. */
7078 void
7079 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
7081 machine_mode mode = GET_MODE (dest);
7082 rtx ptrue = aarch64_ptrue_reg (pred_mode);
7083 if (!register_operand (src, mode)
7084 && !register_operand (dest, mode))
7086 rtx tmp = gen_reg_rtx (mode);
7087 if (MEM_P (src))
7088 aarch64_emit_sve_pred_move (tmp, ptrue, src);
7089 else
7090 emit_move_insn (tmp, src);
7091 src = tmp;
7093 aarch64_emit_sve_pred_move (dest, ptrue, src);
7096 /* Called only on big-endian targets. See whether an SVE vector move
7097 from SRC to DEST is effectively a REV[BHW] instruction, because at
7098 least one operand is a subreg of an SVE vector that has wider or
7099 narrower elements. Return true and emit the instruction if so.
7101 For example:
7103 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
7105 represents a VIEW_CONVERT between the following vectors, viewed
7106 in memory order:
7108 R2: { [0].high, [0].low, [1].high, [1].low, ... }
7109 R1: { [0], [1], [2], [3], ... }
7111 The high part of lane X in R2 should therefore correspond to lane X*2
7112 of R1, but the register representations are:
7114 msb lsb
7115 R2: ...... [1].high [1].low [0].high [0].low
7116 R1: ...... [3] [2] [1] [0]
7118 where the low part of lane X in R2 corresponds to lane X*2 in R1.
7119 We therefore need a reverse operation to swap the high and low values
7120 around.
7122 This is purely an optimization. Without it we would spill the
7123 subreg operand to the stack in one mode and reload it in the
7124 other mode, which has the same effect as the REV. */
7126 bool
7127 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
7129 gcc_assert (BYTES_BIG_ENDIAN);
7131 /* Do not try to optimize subregs that LRA has created for matched
7132 reloads. These subregs only exist as a temporary measure to make
7133 the RTL well-formed, but they are exempt from the usual
7134 TARGET_CAN_CHANGE_MODE_CLASS rules.
7136 For example, if we have:
7138 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
7140 and the constraints require R1 and R2 to be in the same register,
7141 LRA may need to create RTL such as:
7143 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
7144 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
7145 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
7147 which forces both the input and output of the original instruction
7148 to use the same hard register. But for this to work, the normal
7149 rules have to be suppressed on the subreg input, otherwise LRA
7150 would need to reload that input too, meaning that the process
7151 would never terminate. To compensate for this, the normal rules
7152 are also suppressed for the subreg output of the first move.
7153 Ignoring the special case and handling the first move normally
7154 would therefore generate wrong code: we would reverse the elements
7155 for the first subreg but not reverse them back for the second subreg. */
7156 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
7157 dest = SUBREG_REG (dest);
7158 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
7159 src = SUBREG_REG (src);
7161 /* The optimization handles two single SVE REGs with different element
7162 sizes. */
7163 if (!REG_P (dest)
7164 || !REG_P (src)
7165 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
7166 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
7167 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
7168 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
7169 return false;
7171 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
7172 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
7173 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
7174 UNSPEC_REV_SUBREG);
7175 emit_insn (gen_rtx_SET (dest, unspec));
7176 return true;
7179 /* Return a copy of X with mode MODE, without changing its other
7180 attributes. Unlike gen_lowpart, this doesn't care whether the
7181 mode change is valid. */
7183 rtx
7184 aarch64_replace_reg_mode (rtx x, machine_mode mode)
7186 if (GET_MODE (x) == mode)
7187 return x;
7189 x = shallow_copy_rtx (x);
7190 set_mode_and_regno (x, mode, REGNO (x));
7191 return x;
7194 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
7195 stored in wider integer containers. */
7197 static unsigned int
7198 aarch64_sve_rev_unspec (machine_mode mode)
7200 switch (GET_MODE_UNIT_SIZE (mode))
7202 case 1: return UNSPEC_REVB;
7203 case 2: return UNSPEC_REVH;
7204 case 4: return UNSPEC_REVW;
7206 gcc_unreachable ();
7209 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
7210 operands. */
7212 void
7213 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
7215 /* Decide which REV operation we need. The mode with wider elements
7216 determines the mode of the operands and the mode with the narrower
7217 elements determines the reverse width. */
7218 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
7219 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
7220 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
7221 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
7222 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
7224 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
7225 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
7227 /* Get the operands in the appropriate modes and emit the instruction. */
7228 ptrue = gen_lowpart (pred_mode, ptrue);
7229 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
7230 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
7231 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
7232 dest, ptrue, src));
7235 static bool
7236 aarch64_function_ok_for_sibcall (tree, tree exp)
7238 if (crtl->abi->id () != expr_callee_abi (exp).id ())
7239 return false;
7241 return true;
7244 /* Subroutine of aarch64_pass_by_reference for arguments that are not
7245 passed in SVE registers. */
7247 static bool
7248 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
7249 const function_arg_info &arg)
7251 HOST_WIDE_INT size;
7252 machine_mode dummymode;
7253 int nregs;
7255 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
7256 if (arg.mode == BLKmode && arg.type)
7257 size = int_size_in_bytes (arg.type);
7258 else
7259 /* No frontends can create types with variable-sized modes, so we
7260 shouldn't be asked to pass or return them. */
7261 size = GET_MODE_SIZE (arg.mode).to_constant ();
7263 /* Aggregates are passed by reference based on their size. */
7264 if (arg.aggregate_type_p ())
7265 size = int_size_in_bytes (arg.type);
7267 /* Variable sized arguments are always returned by reference. */
7268 if (size < 0)
7269 return true;
7271 /* Can this be a candidate to be passed in fp/simd register(s)? */
7272 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
7273 &dummymode, &nregs, NULL,
7274 !pcum || pcum->silent_p))
7275 return false;
7277 /* Arguments which are variable sized or larger than 2 registers are
7278 passed by reference unless they are a homogeneous floating-point
7279 aggregate. */
7280 return size > 2 * UNITS_PER_WORD;
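/* Illustrative sketch (not part of aarch64.cc): the size rule above for an
   argument that is not an fp/simd candidate, with UNITS_PER_WORD taken to be
   8 as on a 64-bit target.  Anything larger than two registers, or of
   variable size, goes by reference.  Assumes only standard C++.  */
#include <cassert>
#include <cstdint>

static bool
passed_by_reference_size_rule (int64_t size_in_bytes)
{
  const int64_t units_per_word = 8;
  if (size_in_bytes < 0)                   /* variable-sized type */
    return true;
  return size_in_bytes > 2 * units_per_word;
}

static void
pass_by_reference_examples ()
{
  assert (!passed_by_reference_size_rule (16));   /* two doublewords: in regs */
  assert (passed_by_reference_size_rule (24));    /* three doublewords: by ref */
}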
7283 /* Implement TARGET_PASS_BY_REFERENCE. */
7285 static bool
7286 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7287 const function_arg_info &arg)
7289 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7291 if (!arg.type)
7292 return aarch64_pass_by_reference_1 (pcum, arg);
7294 pure_scalable_type_info pst_info;
7295 switch (pst_info.analyze (arg.type))
7297 case pure_scalable_type_info::IS_PST:
7298 if (pcum && !pcum->silent_p && !TARGET_SVE)
7299 /* We can't gracefully recover at this point, so make this a
7300 fatal error. */
7301 fatal_error (input_location, "arguments of type %qT require"
7302 " the SVE ISA extension", arg.type);
7304 /* Variadic SVE types are passed by reference. Normal non-variadic
7305 arguments are too if we've run out of registers. */
7306 return (!arg.named
7307 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7308 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7310 case pure_scalable_type_info::DOESNT_MATTER:
7311 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7312 return true;
7314 case pure_scalable_type_info::NO_ABI_IDENTITY:
7315 case pure_scalable_type_info::ISNT_PST:
7316 return aarch64_pass_by_reference_1 (pcum, arg);
7318 gcc_unreachable ();
7321 /* Return TRUE if VALTYPE is padded to its least significant bits. */
7322 static bool
7323 aarch64_return_in_msb (const_tree valtype)
7325 machine_mode dummy_mode;
7326 int dummy_int;
7328 /* Never happens in little-endian mode. */
7329 if (!BYTES_BIG_ENDIAN)
7330 return false;
7332 /* Only composite types smaller than or equal to 16 bytes can
7333 be potentially returned in registers. */
7334 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7335 || int_size_in_bytes (valtype) <= 0
7336 || int_size_in_bytes (valtype) > 16)
7337 return false;
7339 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7340 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7341 is always passed/returned in the least significant bits of fp/simd
7342 register(s). */
7343 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7344 &dummy_mode, &dummy_int, NULL,
7345 false))
7346 return false;
7348 /* Likewise pure scalable types for SVE vector and predicate registers. */
7349 pure_scalable_type_info pst_info;
7350 if (pst_info.analyze_registers (valtype))
7351 return false;
7353 return true;
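/* Illustrative note (not additional ABI text): on a big-endian target a
   6-byte composite that ends up being returned in x0 is therefore placed
   in the most significant 48 bits of the register, with the padding in
   the low bits.  An HFA such as

     struct hfa { float a, b; };

   is excluded by the check above and keeps each member in the least
   significant bits of s0 and s1.  */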
7356 /* Implement TARGET_FUNCTION_VALUE.
7357 Define how to find the value returned by a function. */
7359 static rtx
7360 aarch64_function_value (const_tree type, const_tree func,
7361 bool outgoing ATTRIBUTE_UNUSED)
7363 machine_mode mode;
7364 int unsignedp;
7366 mode = TYPE_MODE (type);
7367 if (INTEGRAL_TYPE_P (type))
7368 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7370 pure_scalable_type_info pst_info;
7371 if (type && pst_info.analyze_registers (type))
7372 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7374 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7375 are returned in memory, not by value. */
7376 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7377 bool sve_p = (vec_flags & VEC_ANY_SVE);
7379 if (aarch64_return_in_msb (type))
7381 HOST_WIDE_INT size = int_size_in_bytes (type);
7383 if (size % UNITS_PER_WORD != 0)
7385 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7386 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7390 int count;
7391 machine_mode ag_mode;
7392 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7393 NULL, false))
7395 gcc_assert (!sve_p);
7396 if (!aarch64_composite_type_p (type, mode))
7398 gcc_assert (count == 1 && mode == ag_mode);
7399 return gen_rtx_REG (mode, V0_REGNUM);
7401 else if (aarch64_advsimd_full_struct_mode_p (mode)
7402 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7403 return gen_rtx_REG (mode, V0_REGNUM);
7404 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7405 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7406 return gen_rtx_REG (mode, V0_REGNUM);
7407 else
7409 int i;
7410 rtx par;
7412 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7413 for (i = 0; i < count; i++)
7415 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7416 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7417 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7418 XVECEXP (par, 0, i) = tmp;
7420 return par;
7423 else
7425 if (sve_p)
7427 /* Vector types can acquire a partial SVE mode using things like
7428 __attribute__((vector_size(N))), and this is potentially useful.
7429 However, the choice of mode doesn't affect the type's ABI
7430 identity, so we should treat the types as though they had
7431 the associated integer mode, just like they did before SVE
7432 was introduced.
7434 We know that the vector must be 128 bits or smaller,
7435 otherwise we'd have returned it in memory instead. */
7436 gcc_assert (type
7437 && (aarch64_some_values_include_pst_objects_p (type)
7438 || (vec_flags & VEC_PARTIAL)));
7440 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7441 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7442 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7443 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7445 return gen_rtx_REG (mode, R0_REGNUM);
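/* A hedged example of the HFA path above: for

     struct hfa4 { float a, b, c, d; };

   aarch64_vfp_is_call_or_return_candidate reports count == 4 with
   ag_mode == SFmode, so the return value is described as a
   (parallel [...]) containing one SFmode register for each 4-byte offset,
   i.e. s0, s1, s2 and s3.  This merely restates what the code above
   constructs; it adds no new ABI rules.  */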
7449 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7450 Return true if REGNO is the number of a hard register in which the values
7451 of called function may come back. */
7453 static bool
7454 aarch64_function_value_regno_p (const unsigned int regno)
7456 /* A maximum of 16 bytes can be returned in the general registers. Examples
7457 of 16-byte return values are: 128-bit integers and 16-byte small
7458 structures (excluding homogeneous floating-point aggregates). */
7459 if (regno == R0_REGNUM || regno == R1_REGNUM)
7460 return true;
7462 /* Up to four fp/simd registers can return a function value, e.g. a
7463 homogeneous floating-point aggregate having four members. */
7464 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7465 return TARGET_FLOAT;
7467 return false;
7470 /* Subroutine for aarch64_return_in_memory for types that are not returned
7471 in SVE registers. */
7473 static bool
7474 aarch64_return_in_memory_1 (const_tree type)
7476 HOST_WIDE_INT size;
7477 machine_mode ag_mode;
7478 int count;
7480 if (!AGGREGATE_TYPE_P (type)
7481 && TREE_CODE (type) != COMPLEX_TYPE
7482 && TREE_CODE (type) != VECTOR_TYPE)
7483 /* Simple scalar types are always returned in registers. */
7484 return false;
7486 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7487 &ag_mode, &count, NULL, false))
7488 return false;
7490 /* Types larger than 2 registers are returned in memory. */
7491 size = int_size_in_bytes (type);
7492 return (size < 0 || size > 2 * UNITS_PER_WORD);
7495 /* Implement TARGET_RETURN_IN_MEMORY.
7497 If the type T of the result of a function is such that
7498 void func (T arg)
7499 would require that arg be passed as a value in a register (or set of
7500 registers) according to the parameter passing rules, then the result
7501 is returned in the same registers as would be used for such an
7502 argument. */
7504 static bool
7505 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7507 pure_scalable_type_info pst_info;
7508 switch (pst_info.analyze (type))
7510 case pure_scalable_type_info::IS_PST:
7511 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7512 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7514 case pure_scalable_type_info::DOESNT_MATTER:
7515 gcc_assert (aarch64_return_in_memory_1 (type));
7516 return true;
7518 case pure_scalable_type_info::NO_ABI_IDENTITY:
7519 case pure_scalable_type_info::ISNT_PST:
7520 return aarch64_return_in_memory_1 (type);
7522 gcc_unreachable ();
7525 static bool
7526 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7527 const_tree type, int *nregs)
7529 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7530 return aarch64_vfp_is_call_or_return_candidate (mode, type,
7531 &pcum->aapcs_vfp_rmode,
7532 nregs, NULL, pcum->silent_p);
7535 /* Given MODE and TYPE of a function argument, return the alignment in
7536 bits. The idea is to suppress any stronger alignment requested by
7537 the user and opt for the natural alignment (specified in AAPCS64 \S
7538 4.1). ABI_BREAK is set to the old alignment if the alignment was
7539 incorrectly calculated in versions of GCC prior to GCC-9, and to
7540 zero otherwise. This is a helper function for local use only. */
7542 static unsigned int
7543 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7544 unsigned int *abi_break)
7546 *abi_break = 0;
7547 if (!type)
7548 return GET_MODE_ALIGNMENT (mode);
7550 if (integer_zerop (TYPE_SIZE (type)))
7551 return 0;
7553 gcc_assert (TYPE_MODE (type) == mode);
7555 if (!AGGREGATE_TYPE_P (type))
7556 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
7558 if (TREE_CODE (type) == ARRAY_TYPE)
7559 return TYPE_ALIGN (TREE_TYPE (type));
7561 unsigned int alignment = 0;
7562 unsigned int bitfield_alignment = 0;
7563 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7564 if (TREE_CODE (field) == FIELD_DECL)
7566 /* Note that we explicitly consider zero-sized fields here,
7567 even though they don't map to AAPCS64 machine types.
7568 For example, in:
7570 struct __attribute__((aligned(8))) empty {};
7572 struct s {
7573 [[no_unique_address]] empty e;
7574 int x;
7577 "s" contains only one Fundamental Data Type (the int field)
7578 but gains 8-byte alignment and size thanks to "e". */
7579 alignment = std::max (alignment, DECL_ALIGN (field));
7580 if (DECL_BIT_FIELD_TYPE (field))
7581 bitfield_alignment
7582 = std::max (bitfield_alignment,
7583 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7586 if (bitfield_alignment > alignment)
7588 *abi_break = alignment;
7589 return bitfield_alignment;
7592 return alignment;
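/* A hedged illustration of the bit-field handling above (the exact
   front-end layout rules are not restated here): if a structure contains
   a bit-field declared with a 64-bit type while the fields' DECL_ALIGNs
   only reach 16 bits, then bitfield_alignment (64) exceeds alignment (16),
   so the function returns 64 and records 16 in *ABI_BREAK.  Callers such
   as aarch64_layout_arg use that value to emit the -Wpsabi note about the
   GCC 9.1 parameter-passing change.  */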
7595 /* Layout a function argument according to the AAPCS64 rules. The rule
7596 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7597 mode that was originally given to us by the target hook, whereas the
7598 mode in ARG might be the result of replacing partial SVE modes with
7599 the equivalent integer mode. */
7601 static void
7602 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7604 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7605 tree type = arg.type;
7606 machine_mode mode = arg.mode;
7607 int ncrn, nvrn, nregs;
7608 bool allocate_ncrn, allocate_nvrn;
7609 HOST_WIDE_INT size;
7610 unsigned int abi_break;
7612 /* We need to do this once per argument. */
7613 if (pcum->aapcs_arg_processed)
7614 return;
7616 pcum->aapcs_arg_processed = true;
7618 pure_scalable_type_info pst_info;
7619 if (type && pst_info.analyze_registers (type))
7621 /* The PCS says that it is invalid to pass an SVE value to an
7622 unprototyped function. There is no ABI-defined location we
7623 can return in this case, so we have no real choice but to raise
7624 an error immediately, even though this is only a query function. */
7625 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7627 gcc_assert (!pcum->silent_p);
7628 error ("SVE type %qT cannot be passed to an unprototyped function",
7629 arg.type);
7630 /* Avoid repeating the message, and avoid tripping the assert
7631 below. */
7632 pcum->pcs_variant = ARM_PCS_SVE;
7635 /* We would have converted the argument into pass-by-reference
7636 form if it didn't fit in registers. */
7637 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7638 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7639 gcc_assert (arg.named
7640 && pcum->pcs_variant == ARM_PCS_SVE
7641 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7642 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7643 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7644 P0_REGNUM + pcum->aapcs_nprn);
7645 return;
7648 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7649 are passed by reference, not by value. */
7650 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7651 bool sve_p = (vec_flags & VEC_ANY_SVE);
7652 if (sve_p)
7653 /* Vector types can acquire a partial SVE mode using things like
7654 __attribute__((vector_size(N))), and this is potentially useful.
7655 However, the choice of mode doesn't affect the type's ABI
7656 identity, so we should treat the types as though they had
7657 the associated integer mode, just like they did before SVE
7658 was introduced.
7660 We know that the vector must be 128 bits or smaller,
7661 otherwise we'd have passed it in memory instead. */
7662 gcc_assert (type
7663 && (aarch64_some_values_include_pst_objects_p (type)
7664 || (vec_flags & VEC_PARTIAL)));
7666 /* Size in bytes, rounded up to the next multiple of 8 bytes. */
7667 if (type)
7668 size = int_size_in_bytes (type);
7669 else
7670 /* No frontends can create types with variable-sized modes, so we
7671 shouldn't be asked to pass or return them. */
7672 size = GET_MODE_SIZE (mode).to_constant ();
7673 size = ROUND_UP (size, UNITS_PER_WORD);
7675 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7676 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7677 mode,
7678 type,
7679 &nregs);
7680 gcc_assert (!sve_p || !allocate_nvrn);
7682 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
7683 reliable. The following code thus handles passing by SIMD/FP registers first. */
7685 nvrn = pcum->aapcs_nvrn;
7687 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7688 and homogeneous short-vector aggregates (HVA). */
7689 if (allocate_nvrn)
7691 if (!pcum->silent_p && !TARGET_FLOAT)
7692 aarch64_err_no_fpadvsimd (mode);
7694 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7696 pcum->aapcs_nextnvrn = nvrn + nregs;
7697 if (!aarch64_composite_type_p (type, mode))
7699 gcc_assert (nregs == 1);
7700 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7702 else if (aarch64_advsimd_full_struct_mode_p (mode)
7703 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7704 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7705 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7706 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7707 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7708 else
7710 rtx par;
7711 int i;
7712 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7713 for (i = 0; i < nregs; i++)
7715 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7716 V0_REGNUM + nvrn + i);
7717 rtx offset = gen_int_mode
7718 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7719 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7720 XVECEXP (par, 0, i) = tmp;
7722 pcum->aapcs_reg = par;
7724 return;
7726 else
7728 /* C.3 NSRN is set to 8. */
7729 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7730 goto on_stack;
7734 ncrn = pcum->aapcs_ncrn;
7735 nregs = size / UNITS_PER_WORD;
7737 /* C6 - C9, though the sign and zero extension semantics are
7738 handled elsewhere. This is the case where the argument fits
7739 entirely in general registers. */
7740 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7742 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7744 /* C.8 if the argument has an alignment of 16 then the NGRN is
7745 rounded up to the next even number. */
7746 if (nregs == 2
7747 && ncrn % 2
7748 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7749 comparison is there because for > 16 * BITS_PER_UNIT
7750 alignment nregs should be > 2 and therefore it should be
7751 passed by reference rather than value. */
7752 && (aarch64_function_arg_alignment (mode, type, &abi_break)
7753 == 16 * BITS_PER_UNIT))
7755 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
7756 inform (input_location, "parameter passing for argument of type "
7757 "%qT changed in GCC 9.1", type);
7758 ++ncrn;
7759 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7762 /* If an argument with an SVE mode needs to be shifted up to the
7763 high part of the register, treat it as though it had an integer mode.
7764 Using the normal (parallel [...]) would suppress the shifting. */
7765 if (sve_p
7766 && BYTES_BIG_ENDIAN
7767 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7768 && aarch64_pad_reg_upward (mode, type, false))
7770 mode = int_mode_for_mode (mode).require ();
7771 sve_p = false;
7774 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7775 A reg is still generated for it, but the caller should be smart
7776 enough not to use it. */
7777 if (nregs == 0
7778 || (nregs == 1 && !sve_p)
7779 || GET_MODE_CLASS (mode) == MODE_INT)
7780 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7781 else
7783 rtx par;
7784 int i;
7786 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7787 for (i = 0; i < nregs; i++)
7789 scalar_int_mode reg_mode = word_mode;
7790 if (nregs == 1)
7791 reg_mode = int_mode_for_mode (mode).require ();
7792 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7793 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7794 GEN_INT (i * UNITS_PER_WORD));
7795 XVECEXP (par, 0, i) = tmp;
7797 pcum->aapcs_reg = par;
7800 pcum->aapcs_nextncrn = ncrn + nregs;
7801 return;
7804 /* C.11 */
7805 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7807 /* The argument is passed on the stack; record the number of words needed
7808 for this argument and align the total size if necessary. */
7809 on_stack:
7810 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7812 if (aarch64_function_arg_alignment (mode, type, &abi_break)
7813 == 16 * BITS_PER_UNIT)
7815 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7816 if (pcum->aapcs_stack_size != new_size)
7818 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
7819 inform (input_location, "parameter passing for argument of type "
7820 "%qT changed in GCC 9.1", type);
7821 pcum->aapcs_stack_size = new_size;
7824 return;
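/* A small illustration of rule C.8 above, with register numbers given
   purely as an example: for

     void f (int a, __int128 b);

   "a" occupies w0 and leaves NGRN == 1, but "b" has 16-byte alignment and
   needs two registers, so the NGRN is rounded up to the next even number
   and "b" is passed in x2/x3 rather than x1/x2.  */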
7827 /* Implement TARGET_FUNCTION_ARG. */
7829 static rtx
7830 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7832 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7833 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7834 || pcum->pcs_variant == ARM_PCS_SIMD
7835 || pcum->pcs_variant == ARM_PCS_SVE);
7837 if (arg.end_marker_p ())
7838 return gen_int_mode (pcum->pcs_variant, DImode);
7840 aarch64_layout_arg (pcum_v, arg);
7841 return pcum->aapcs_reg;
7844 void
7845 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7846 const_tree fntype,
7847 rtx libname ATTRIBUTE_UNUSED,
7848 const_tree fndecl ATTRIBUTE_UNUSED,
7849 unsigned n_named ATTRIBUTE_UNUSED,
7850 bool silent_p)
7852 pcum->aapcs_ncrn = 0;
7853 pcum->aapcs_nvrn = 0;
7854 pcum->aapcs_nprn = 0;
7855 pcum->aapcs_nextncrn = 0;
7856 pcum->aapcs_nextnvrn = 0;
7857 pcum->aapcs_nextnprn = 0;
7858 if (fntype)
7859 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7860 else
7861 pcum->pcs_variant = ARM_PCS_AAPCS64;
7862 pcum->aapcs_reg = NULL_RTX;
7863 pcum->aapcs_arg_processed = false;
7864 pcum->aapcs_stack_words = 0;
7865 pcum->aapcs_stack_size = 0;
7866 pcum->silent_p = silent_p;
7868 if (!silent_p
7869 && !TARGET_FLOAT
7870 && fntype && fntype != error_mark_node)
7872 const_tree type = TREE_TYPE (fntype);
7873 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7874 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7875 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7876 &mode, &nregs, NULL, false))
7877 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7880 if (!silent_p
7881 && !TARGET_SVE
7882 && pcum->pcs_variant == ARM_PCS_SVE)
7884 /* We can't gracefully recover at this point, so make this a
7885 fatal error. */
7886 if (fndecl)
7887 fatal_error (input_location, "%qE requires the SVE ISA extension",
7888 fndecl);
7889 else
7890 fatal_error (input_location, "calls to functions of type %qT require"
7891 " the SVE ISA extension", fntype);
7895 static void
7896 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7897 const function_arg_info &arg)
7899 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7900 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7901 || pcum->pcs_variant == ARM_PCS_SIMD
7902 || pcum->pcs_variant == ARM_PCS_SVE)
7904 aarch64_layout_arg (pcum_v, arg);
7905 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7906 != (pcum->aapcs_stack_words != 0));
7907 pcum->aapcs_arg_processed = false;
7908 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7909 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7910 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7912 pcum->aapcs_stack_words = 0;
7913 pcum->aapcs_reg = NULL_RTX;
7917 bool
7918 aarch64_function_arg_regno_p (unsigned regno)
7920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
7924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7925 PARM_BOUNDARY bits of alignment, but will be given anything up
7926 to STACK_BOUNDARY bits if the type requires it. This makes sure
7927 that both before and after the layout of each argument, the Next
7928 Stacked Argument Address (NSAA) will have a minimum alignment of
7929 8 bytes. */
7931 static unsigned int
7932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7934 unsigned int abi_break;
7935 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7936 &abi_break);
7937 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7938 if (abi_break && warn_psabi)
7940 abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
7941 if (alignment != abi_break)
7942 inform (input_location, "parameter passing for argument of type "
7943 "%qT changed in GCC 9.1", type);
7946 return alignment;
7949 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7951 static fixed_size_mode
7952 aarch64_get_reg_raw_mode (int regno)
7954 if (TARGET_SVE && FP_REGNUM_P (regno))
7955 /* Don't use the SVE part of the register for __builtin_apply and
7956 __builtin_return. The SVE registers aren't used by the normal PCS,
7957 so using them there would be a waste of time. The PCS extensions
7958 for SVE types are fundamentally incompatible with the
7959 __builtin_return/__builtin_apply interface. */
7960 return as_a <fixed_size_mode> (V16QImode);
7961 return default_get_reg_raw_mode (regno);
7964 /* Implement TARGET_FUNCTION_ARG_PADDING.
7966 Small aggregate types are placed in the lowest memory address.
7968 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7970 static pad_direction
7971 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7973 /* On little-endian targets, the least significant byte of every stack
7974 argument is passed at the lowest byte address of the stack slot. */
7975 if (!BYTES_BIG_ENDIAN)
7976 return PAD_UPWARD;
7978 /* Otherwise, integral, floating-point and pointer types are padded downward:
7979 the least significant byte of a stack argument is passed at the highest
7980 byte address of the stack slot. */
7981 if (type
7982 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7983 || POINTER_TYPE_P (type))
7984 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7985 return PAD_DOWNWARD;
7987 /* Everything else padded upward, i.e. data in first byte of stack slot. */
7988 return PAD_UPWARD;
7991 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7993 It specifies padding for the last (possibly the only) element
7994 of a block move between registers and memory. Assuming the
7995 block is in memory, padding upward means that the last element
7996 is padded after its most significant byte, while with downward
7997 padding the last element is padded on its least significant
7998 byte side.
8000 Small aggregates and small complex types are always padded
8001 upwards.
8003 We don't need to worry about homogeneous floating-point or
8004 short-vector aggregates; their move is not affected by the
8005 padding direction determined here. Regardless of endianness,
8006 each element of such an aggregate is put in the least
8007 significant bits of a fp/simd register.
8009 Return !BYTES_BIG_ENDIAN if the least significant byte of the
8010 register has useful data, and return the opposite if the most
8011 significant byte does. */
8013 bool
8014 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
8015 bool first ATTRIBUTE_UNUSED)
8018 /* Aside from pure scalable types, small composite types are always
8019 padded upward. */
8020 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
8022 HOST_WIDE_INT size;
8023 if (type)
8024 size = int_size_in_bytes (type);
8025 else
8026 /* No frontends can create types with variable-sized modes, so we
8027 shouldn't be asked to pass or return them. */
8028 size = GET_MODE_SIZE (mode).to_constant ();
8029 if (size < 2 * UNITS_PER_WORD)
8031 pure_scalable_type_info pst_info;
8032 if (pst_info.analyze_registers (type))
8033 return false;
8034 return true;
8038 /* Otherwise, use the default padding. */
8039 return !BYTES_BIG_ENDIAN;
8042 static scalar_int_mode
8043 aarch64_libgcc_cmp_return_mode (void)
8045 return SImode;
8048 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
8050 /* We use the 12-bit shifted immediate arithmetic instructions, so values
8051 must be a multiple of (1 << 12), i.e. 4096. */
8052 #define ARITH_FACTOR 4096
8054 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
8055 #error Cannot use simple address calculation for stack probing
8056 #endif
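/* For illustration: "sub sp, sp, #16" uses the unshifted 12-bit immediate
   form, while multiples of 4096 such as "sub sp, sp, #4096" are encoded
   with the same 12-bit immediate field and an LSL #12, which is why the
   probe offsets below are kept to multiples of ARITH_FACTOR.  */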
8058 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
8059 inclusive. These are offsets from the current stack pointer. */
8061 static void
8062 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
8064 HOST_WIDE_INT size;
8065 if (!poly_size.is_constant (&size))
8067 sorry ("stack probes for SVE frames");
8068 return;
8071 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
8073 /* See the same assertion on PROBE_INTERVAL above. */
8074 gcc_assert ((first % ARITH_FACTOR) == 0);
8076 /* See if we have a constant small number of probes to generate. If so,
8077 that's the easy case. */
8078 if (size <= PROBE_INTERVAL)
8080 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
8082 emit_set_insn (reg1,
8083 plus_constant (Pmode,
8084 stack_pointer_rtx, -(first + base)));
8085 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
8088 /* The run-time loop is made up of 8 insns in the generic case while the
8089 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
8090 else if (size <= 4 * PROBE_INTERVAL)
8092 HOST_WIDE_INT i, rem;
8094 emit_set_insn (reg1,
8095 plus_constant (Pmode,
8096 stack_pointer_rtx,
8097 -(first + PROBE_INTERVAL)));
8098 emit_stack_probe (reg1);
8100 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
8101 it exceeds SIZE. If only two probes are needed, this will not
8102 generate any code. Then probe at FIRST + SIZE. */
8103 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
8105 emit_set_insn (reg1,
8106 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
8107 emit_stack_probe (reg1);
8110 rem = size - (i - PROBE_INTERVAL);
8111 if (rem > 256)
8113 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8115 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
8116 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
8118 else
8119 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
8122 /* Otherwise, do the same as above, but in a loop. Note that we must be
8123 extra careful with variables wrapping around because we might be at
8124 the very top (or the very bottom) of the address space and we have
8125 to be able to handle this case properly; in particular, we use an
8126 equality test for the loop condition. */
8127 else
8129 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
8131 /* Step 1: round SIZE to the previous multiple of the interval. */
8133 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
8136 /* Step 2: compute initial and final value of the loop counter. */
8138 /* TEST_ADDR = SP + FIRST. */
8139 emit_set_insn (reg1,
8140 plus_constant (Pmode, stack_pointer_rtx, -first));
8142 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
8143 HOST_WIDE_INT adjustment = - (first + rounded_size);
8144 if (! aarch64_uimm12_shift (adjustment))
8146 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
8147 true, Pmode);
8148 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
8150 else
8151 emit_set_insn (reg2,
8152 plus_constant (Pmode, stack_pointer_rtx, adjustment));
8154 /* Step 3: the loop
8158 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
8159 probe at TEST_ADDR
8161 while (TEST_ADDR != LAST_ADDR)
8163 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
8164 until it is equal to ROUNDED_SIZE. */
8166 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
8169 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
8170 that SIZE is equal to ROUNDED_SIZE. */
8172 if (size != rounded_size)
8174 HOST_WIDE_INT rem = size - rounded_size;
8176 if (rem > 256)
8178 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8180 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
8181 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
8183 else
8184 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
8188 /* Make sure nothing is scheduled before we are done. */
8189 emit_insn (gen_blockage ());
8192 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
8193 absolute addresses. */
8195 const char *
8196 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
8198 static int labelno = 0;
8199 char loop_lab[32];
8200 rtx xops[2];
8202 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
8204 /* Loop. */
8205 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
8207 HOST_WIDE_INT stack_clash_probe_interval
8208 = 1 << param_stack_clash_protection_guard_size;
8210 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
8211 xops[0] = reg1;
8212 HOST_WIDE_INT interval;
8213 if (flag_stack_clash_protection)
8214 interval = stack_clash_probe_interval;
8215 else
8216 interval = PROBE_INTERVAL;
8218 gcc_assert (aarch64_uimm12_shift (interval));
8219 xops[1] = GEN_INT (interval);
8221 output_asm_insn ("sub\t%0, %0, %1", xops);
8223 /* If doing stack clash protection then we probe up by the ABI-specified
8224 amount. We do this because we're dropping full pages at a time in the
8225 loop. But if we're doing non-stack-clash probing, probe at SP + 0. */
8226 if (flag_stack_clash_protection)
8227 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
8228 else
8229 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
8231 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
8232 by this amount for each iteration. */
8233 output_asm_insn ("str\txzr, [%0, %1]", xops);
8235 /* Test if TEST_ADDR == LAST_ADDR. */
8236 xops[1] = reg2;
8237 output_asm_insn ("cmp\t%0, %1", xops);
8239 /* Branch. */
8240 fputs ("\tb.ne\t", asm_out_file);
8241 assemble_name_raw (asm_out_file, loop_lab);
8242 fputc ('\n', asm_out_file);
8244 return "";
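/* A hedged sketch of the output for the non-stack-clash case, assuming
   the default 4096-byte probe interval; the probe registers (shown here
   as x9 and x10) and the exact label spelling are illustrative only:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0  */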
8247 /* Emit the probe loop for doing stack clash probes and stack adjustments for
8248 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
8249 of GUARD_SIZE. When a probe is emitted it is done at most
8250 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
8251 at most MIN_PROBE_THRESHOLD. By the end of this function
8252 BASE = BASE - ADJUSTMENT. */
8254 const char *
8255 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
8256 rtx min_probe_threshold, rtx guard_size)
8258 /* This function is not allowed to use any instruction generation function
8259 like gen_ and friends. If you do you'll likely ICE during CFG validation,
8260 so instead emit the code you want using output_asm_insn. */
8261 gcc_assert (flag_stack_clash_protection);
8262 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
8263 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
8265 /* The minimum required allocation before the residual requires probing. */
8266 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
8268 /* Clamp the value down to the nearest value that can be used with a cmp. */
8269 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
8270 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
8272 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
8273 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
8275 static int labelno = 0;
8276 char loop_start_lab[32];
8277 char loop_end_lab[32];
8278 rtx xops[2];
8280 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
8281 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
8283 /* Emit loop start label. */
8284 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
8286 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8287 xops[0] = adjustment;
8288 xops[1] = probe_offset_value_rtx;
8289 output_asm_insn ("cmp\t%0, %1", xops);
8291 /* Branch to end if not enough adjustment to probe. */
8292 fputs ("\tb.lt\t", asm_out_file);
8293 assemble_name_raw (asm_out_file, loop_end_lab);
8294 fputc ('\n', asm_out_file);
8296 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8297 xops[0] = base;
8298 xops[1] = probe_offset_value_rtx;
8299 output_asm_insn ("sub\t%0, %0, %1", xops);
8301 /* Probe at BASE. */
8302 xops[1] = const0_rtx;
8303 output_asm_insn ("str\txzr, [%0, %1]", xops);
8305 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8306 xops[0] = adjustment;
8307 xops[1] = probe_offset_value_rtx;
8308 output_asm_insn ("sub\t%0, %0, %1", xops);
8310 /* Branch to start if still more bytes to allocate. */
8311 fputs ("\tb\t", asm_out_file);
8312 assemble_name_raw (asm_out_file, loop_start_lab);
8313 fputc ('\n', asm_out_file);
8316 /* No probe needed; leave the loop. */
8316 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8318 /* BASE = BASE - ADJUSTMENT. */
8319 xops[0] = base;
8320 xops[1] = adjustment;
8321 output_asm_insn ("sub\t%0, %0, %1", xops);
8322 return "";
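/* A hedged sketch of the emitted sequence, with illustrative register
   names (x10 for BASE, x11 for ADJUSTMENT) and RESIDUAL standing for the
   clamped residual_probe_guard value:

	.SVLPSPL0:
	cmp	x11, RESIDUAL
	b.lt	.SVLPEND0
	sub	x10, x10, RESIDUAL
	str	xzr, [x10, 0]
	sub	x11, x11, RESIDUAL
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x10, x10, x11  */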
8325 /* Determine whether a frame chain needs to be generated. */
8326 static bool
8327 aarch64_needs_frame_chain (void)
8329 /* Force a frame chain for EH returns so the return address is at FP+8. */
8330 if (frame_pointer_needed || crtl->calls_eh_return)
8331 return true;
8333 /* A leaf function cannot have calls or write LR. */
8334 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8336 /* Don't use a frame chain in leaf functions if leaf frame pointers
8337 are disabled. */
8338 if (flag_omit_leaf_frame_pointer && is_leaf)
8339 return false;
8341 return aarch64_use_frame_pointer;
8344 /* Mark the registers that need to be saved by the callee and calculate
8345 the size of the callee-saved registers area and frame record (both FP
8346 and LR may be omitted). */
8347 static void
8348 aarch64_layout_frame (void)
8350 poly_int64 offset = 0;
8351 int regno, last_fp_reg = INVALID_REGNUM;
8352 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8353 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8354 bool frame_related_fp_reg_p = false;
8355 aarch64_frame &frame = cfun->machine->frame;
8357 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8359 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8360 the mid-end is doing. */
8361 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8363 #define SLOT_NOT_REQUIRED (-2)
8364 #define SLOT_REQUIRED (-1)
8366 frame.wb_push_candidate1 = INVALID_REGNUM;
8367 frame.wb_push_candidate2 = INVALID_REGNUM;
8368 frame.spare_pred_reg = INVALID_REGNUM;
8370 /* First mark all the registers that really need to be saved... */
8371 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8372 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8374 /* ... that includes the eh data registers (if needed)... */
8375 if (crtl->calls_eh_return)
8376 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8377 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8379 /* ... and any callee saved register that dataflow says is live. */
8380 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8381 if (df_regs_ever_live_p (regno)
8382 && !fixed_regs[regno]
8383 && (regno == R30_REGNUM
8384 || !crtl->abi->clobbers_full_reg_p (regno)))
8385 frame.reg_offset[regno] = SLOT_REQUIRED;
8387 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8388 if (df_regs_ever_live_p (regno)
8389 && !fixed_regs[regno]
8390 && !crtl->abi->clobbers_full_reg_p (regno))
8392 frame.reg_offset[regno] = SLOT_REQUIRED;
8393 last_fp_reg = regno;
8394 if (aarch64_emit_cfi_for_reg_p (regno))
8395 frame_related_fp_reg_p = true;
8398 /* Big-endian SVE frames need a spare predicate register in order
8399 to save Z8-Z15. Decide which register they should use. Prefer
8400 an unused argument register if possible, so that we don't force P4
8401 to be saved unnecessarily. */
8402 if (frame_related_fp_reg_p
8403 && crtl->abi->id () == ARM_PCS_SVE
8404 && BYTES_BIG_ENDIAN)
8406 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8407 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8408 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8409 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8410 break;
8411 gcc_assert (regno <= P7_REGNUM);
8412 frame.spare_pred_reg = regno;
8413 df_set_regs_ever_live (regno, true);
8416 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8417 if (df_regs_ever_live_p (regno)
8418 && !fixed_regs[regno]
8419 && !crtl->abi->clobbers_full_reg_p (regno))
8420 frame.reg_offset[regno] = SLOT_REQUIRED;
8422 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8423 LR counts as an implicit probe which allows us to maintain the invariant
8424 described in the comment at expand_prologue. */
8425 gcc_assert (crtl->is_leaf
8426 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8428 /* Now assign stack slots for the registers. Start with the predicate
8429 registers, since predicate LDR and STR have a relatively small
8430 offset range. These saves happen below the hard frame pointer. */
8431 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8432 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8434 frame.reg_offset[regno] = offset;
8435 offset += BYTES_PER_SVE_PRED;
8438 if (maybe_ne (offset, 0))
8440 /* If we have any vector registers to save above the predicate registers,
8441 the offset of the vector register save slots needs to be a multiple
8442 of the vector size. This lets us use the immediate forms of LDR/STR
8443 (or LD1/ST1 for big-endian).
8445 A vector register is 8 times the size of a predicate register,
8446 and we need to save a maximum of 12 predicate registers, so the
8447 first vector register will be at either #1, MUL VL or #2, MUL VL.
8449 If we don't have any vector registers to save, and we know how
8450 big the predicate save area is, we can just round it up to the
8451 next 16-byte boundary. */
8452 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8453 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8454 else
8456 if (known_le (offset, vector_save_size))
8457 offset = vector_save_size;
8458 else if (known_le (offset, vector_save_size * 2))
8459 offset = vector_save_size * 2;
8460 else
8461 gcc_unreachable ();
8465 /* If we need to save any SVE vector registers, add them next. */
8466 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8467 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8468 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8470 frame.reg_offset[regno] = offset;
8471 offset += vector_save_size;
8474 /* OFFSET is now the offset of the hard frame pointer from the bottom
8475 of the callee save area. */
8476 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8477 frame.below_hard_fp_saved_regs_size = offset;
8478 if (frame.emit_frame_chain)
8480 /* FP and LR are placed in the linkage record. */
8481 frame.reg_offset[R29_REGNUM] = offset;
8482 frame.wb_push_candidate1 = R29_REGNUM;
8483 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
8484 frame.wb_push_candidate2 = R30_REGNUM;
8485 offset += 2 * UNITS_PER_WORD;
8488 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8489 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8491 frame.reg_offset[regno] = offset;
8492 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8493 frame.wb_push_candidate1 = regno;
8494 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8495 frame.wb_push_candidate2 = regno;
8496 offset += UNITS_PER_WORD;
8499 poly_int64 max_int_offset = offset;
8500 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8501 bool has_align_gap = maybe_ne (offset, max_int_offset);
8503 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8504 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8506 /* If there is an alignment gap between integer and fp callee-saves,
8507 allocate the last fp register to it if possible. */
8508 if (regno == last_fp_reg
8509 && has_align_gap
8510 && known_eq (vector_save_size, 8)
8511 && multiple_p (offset, 16))
8513 frame.reg_offset[regno] = max_int_offset;
8514 break;
8517 frame.reg_offset[regno] = offset;
8518 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8519 frame.wb_push_candidate1 = regno;
8520 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8521 && frame.wb_push_candidate1 >= V0_REGNUM)
8522 frame.wb_push_candidate2 = regno;
8523 offset += vector_save_size;
8526 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8528 frame.saved_regs_size = offset;
8530 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
8532 poly_int64 above_outgoing_args
8533 = aligned_upper_bound (varargs_and_saved_regs_size
8534 + get_frame_size (),
8535 STACK_BOUNDARY / BITS_PER_UNIT);
8537 frame.hard_fp_offset
8538 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8540 /* Both these values are already aligned. */
8541 gcc_assert (multiple_p (crtl->outgoing_args_size,
8542 STACK_BOUNDARY / BITS_PER_UNIT));
8543 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
8545 frame.locals_offset = frame.saved_varargs_size;
8547 frame.initial_adjust = 0;
8548 frame.final_adjust = 0;
8549 frame.callee_adjust = 0;
8550 frame.sve_callee_adjust = 0;
8551 frame.callee_offset = 0;
8553 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8554 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8556 /* The shadow call stack is only used for functions that push LR onto
8557 the stack and that do not specify the "no_sanitize" attribute with
8558 the argument "shadow-call-stack". */
8559 frame.is_scs_enabled
8560 = (!crtl->calls_eh_return
8561 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8562 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8564 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8565 restore x30, and we don't need to pop x30 again in the traditional
8566 way. Pop candidates record the registers that need to be popped
8567 eventually. */
8568 if (frame.is_scs_enabled)
8570 if (frame.wb_pop_candidate2 == R30_REGNUM)
8571 frame.wb_pop_candidate2 = INVALID_REGNUM;
8572 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8573 frame.wb_pop_candidate1 = INVALID_REGNUM;
8576 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8577 256 to ensure that the offset meets the requirements of emit_move_insn.
8578 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8579 max_push_offset to 0, because no registers are popped at this time,
8580 so callee_adjust cannot be adjusted. */
8581 HOST_WIDE_INT max_push_offset = 0;
8582 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8583 max_push_offset = 512;
8584 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8585 max_push_offset = 256;
8587 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
8588 HOST_WIDE_INT const_saved_regs_size;
8589 if (frame.frame_size.is_constant (&const_size)
8590 && const_size < max_push_offset
8591 && known_eq (frame.hard_fp_offset, const_size))
8593 /* Simple, small frame with no outgoing arguments:
8595 stp reg1, reg2, [sp, -frame_size]!
8596 stp reg3, reg4, [sp, 16] */
8597 frame.callee_adjust = const_size;
8599 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
8600 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8601 && const_outgoing_args_size + const_saved_regs_size < 512
8602 /* We could handle this case even with outgoing args, provided
8603 that the number of args left us with valid offsets for all
8604 predicate and vector save slots. It's such a rare case that
8605 it hardly seems worth the effort though. */
8606 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
8607 && !(cfun->calls_alloca
8608 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8609 && const_fp_offset < max_push_offset))
8611 /* Frame with small outgoing arguments:
8613 sub sp, sp, frame_size
8614 stp reg1, reg2, [sp, outgoing_args_size]
8615 stp reg3, reg4, [sp, outgoing_args_size + 16] */
8616 frame.initial_adjust = frame.frame_size;
8617 frame.callee_offset = const_outgoing_args_size;
8619 else if (saves_below_hard_fp_p
8620 && known_eq (frame.saved_regs_size,
8621 frame.below_hard_fp_saved_regs_size))
8623 /* Frame in which all saves are SVE saves:
8625 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8626 save SVE registers relative to SP
8627 sub sp, sp, outgoing_args_size */
8628 frame.initial_adjust = (frame.hard_fp_offset
8629 + frame.below_hard_fp_saved_regs_size);
8630 frame.final_adjust = crtl->outgoing_args_size;
8632 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
8633 && const_fp_offset < max_push_offset)
8635 /* Frame with large outgoing arguments or SVE saves, but with
8636 a small local area:
8638 stp reg1, reg2, [sp, -hard_fp_offset]!
8639 stp reg3, reg4, [sp, 16]
8640 [sub sp, sp, below_hard_fp_saved_regs_size]
8641 [save SVE registers relative to SP]
8642 sub sp, sp, outgoing_args_size */
8643 frame.callee_adjust = const_fp_offset;
8644 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8645 frame.final_adjust = crtl->outgoing_args_size;
8647 else
8649 /* Frame with large local area and outgoing arguments or SVE saves,
8650 using frame pointer:
8652 sub sp, sp, hard_fp_offset
8653 stp x29, x30, [sp, 0]
8654 add x29, sp, 0
8655 stp reg3, reg4, [sp, 16]
8656 [sub sp, sp, below_hard_fp_saved_regs_size]
8657 [save SVE registers relative to SP]
8658 sub sp, sp, outgoing_args_size */
8659 frame.initial_adjust = frame.hard_fp_offset;
8660 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8661 frame.final_adjust = crtl->outgoing_args_size;
8664 /* Make sure the individual adjustments add up to the full frame size. */
8665 gcc_assert (known_eq (frame.initial_adjust
8666 + frame.callee_adjust
8667 + frame.sve_callee_adjust
8668 + frame.final_adjust, frame.frame_size));
8670 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8672 /* We've decided not to associate any register saves with the initial
8673 stack allocation. */
8674 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8675 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8678 frame.laid_out = true;
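/* A worked example of the first case above, under simplifying assumptions
   (frame chain needed, only x29/x30 saved, no locals, no outgoing args):
   saved_regs_size and frame_size are both 16, so callee_adjust == 16 and
   the allocation part of the prologue collapses to

     stp x29, x30, [sp, -16]!

   followed by establishing x29, with initial_adjust, sve_callee_adjust
   and final_adjust all zero.  */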
8681 /* Return true if the register REGNO is saved on entry to
8682 the current function. */
8684 static bool
8685 aarch64_register_saved_on_entry (int regno)
8687 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8690 /* Return the next register at or above REGNO, up to LIMIT, that the
8691 callee needs to save. */
8693 static unsigned
8694 aarch64_next_callee_save (unsigned regno, unsigned limit)
8696 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8697 regno ++;
8698 return regno;
8701 /* Push the register number REGNO of mode MODE to the stack with write-back
8702 adjusting the stack by ADJUSTMENT. */
8704 static void
8705 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8706 HOST_WIDE_INT adjustment)
8708 rtx base_rtx = stack_pointer_rtx;
8709 rtx insn, reg, mem;
8711 reg = gen_rtx_REG (mode, regno);
8712 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8713 plus_constant (Pmode, base_rtx, -adjustment));
8714 mem = gen_frame_mem (mode, mem);
8716 insn = emit_move_insn (mem, reg);
8717 RTX_FRAME_RELATED_P (insn) = 1;
8720 /* Generate and return an instruction to store the pair of registers
8721 REG and REG2 of mode MODE to location BASE with write-back adjusting
8722 the stack location BASE by ADJUSTMENT. */
8724 static rtx
8725 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8726 HOST_WIDE_INT adjustment)
8728 switch (mode)
8730 case E_DImode:
8731 return gen_storewb_pairdi_di (base, base, reg, reg2,
8732 GEN_INT (-adjustment),
8733 GEN_INT (UNITS_PER_WORD - adjustment));
8734 case E_DFmode:
8735 return gen_storewb_pairdf_di (base, base, reg, reg2,
8736 GEN_INT (-adjustment),
8737 GEN_INT (UNITS_PER_WORD - adjustment));
8738 case E_TFmode:
8739 return gen_storewb_pairtf_di (base, base, reg, reg2,
8740 GEN_INT (-adjustment),
8741 GEN_INT (UNITS_PER_VREG - adjustment));
8742 default:
8743 gcc_unreachable ();
8747 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8748 stack pointer by ADJUSTMENT. */
8750 static void
8751 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8753 rtx_insn *insn;
8754 machine_mode mode = aarch64_reg_save_mode (regno1);
8756 if (regno2 == INVALID_REGNUM)
8757 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8759 rtx reg1 = gen_rtx_REG (mode, regno1);
8760 rtx reg2 = gen_rtx_REG (mode, regno2);
8762 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8763 reg2, adjustment));
8764 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8765 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8766 RTX_FRAME_RELATED_P (insn) = 1;
8769 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8770 adjusting it by ADJUSTMENT afterwards. */
8772 static rtx
8773 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8774 HOST_WIDE_INT adjustment)
8776 switch (mode)
8778 case E_DImode:
8779 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8780 GEN_INT (UNITS_PER_WORD));
8781 case E_DFmode:
8782 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8783 GEN_INT (UNITS_PER_WORD));
8784 case E_TFmode:
8785 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8786 GEN_INT (UNITS_PER_VREG));
8787 default:
8788 gcc_unreachable ();
8792 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8793 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8794 into CFI_OPS. */
8796 static void
8797 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8798 rtx *cfi_ops)
8800 machine_mode mode = aarch64_reg_save_mode (regno1);
8801 rtx reg1 = gen_rtx_REG (mode, regno1);
8803 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8805 if (regno2 == INVALID_REGNUM)
8807 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8808 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8809 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8811 else
8813 rtx reg2 = gen_rtx_REG (mode, regno2);
8814 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8815 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8816 reg2, adjustment));
8820 /* Generate and return a store pair instruction of mode MODE to store
8821 register REG1 to MEM1 and register REG2 to MEM2. */
8823 static rtx
8824 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8825 rtx reg2)
8827 switch (mode)
8829 case E_DImode:
8830 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8832 case E_DFmode:
8833 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8835 case E_TFmode:
8836 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8838 case E_V4SImode:
8839 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8841 case E_V16QImode:
8842 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8844 default:
8845 gcc_unreachable ();
8849 /* Generate and return a load pair instruction of mode MODE to load register
8850 REG1 from MEM1 and register REG2 from MEM2. */
8852 static rtx
8853 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8854 rtx mem2)
8856 switch (mode)
8858 case E_DImode:
8859 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8861 case E_DFmode:
8862 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8864 case E_TFmode:
8865 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8867 case E_V4SImode:
8868 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8870 default:
8871 gcc_unreachable ();
8875 /* Return TRUE if return address signing should be enabled for the current
8876 function, otherwise return FALSE. */
8878 bool
8879 aarch64_return_address_signing_enabled (void)
8881 /* This function should only be called after the frame is laid out. */
8882 gcc_assert (cfun->machine->frame.laid_out);
8884 /* Turn return address signing off in any function that uses
8885 __builtin_eh_return. The address passed to __builtin_eh_return
8886 is not signed so either it has to be signed (with original sp)
8887 or the code path that uses it has to avoid authenticating it.
8888 Currently eh return introduces a return-to-anywhere gadget, no
8889 matter what we do here, since it uses ret with a user-provided
8890 address. An ideal fix for that is to use an indirect branch, which
8891 can be protected with BTI j (to some extent). */
8892 if (crtl->calls_eh_return)
8893 return false;
8895 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8896 if its LR is pushed onto stack. */
8897 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
8898 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
8899 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8902 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8903 bool
8904 aarch64_bti_enabled (void)
8906 return (aarch64_enable_bti == 1);
8909 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8910 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8911 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8913 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8914 or LD1D address
8916 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
8917 if the variable isn't already nonnull
8919 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8920 Handle this case using a temporary base register that is suitable for
8921 all offsets in that range. Use ANCHOR_REG as this base register if it
8922 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8924 static inline void
8925 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8926 rtx &anchor_reg, poly_int64 &offset,
8927 rtx &ptrue)
8929 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8931 /* This is the maximum valid offset of the anchor from the base.
8932 Lower values would be valid too. */
8933 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8934 if (!anchor_reg)
8936 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8937 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8938 gen_int_mode (anchor_offset, Pmode)));
8940 base_rtx = anchor_reg;
8941 offset -= anchor_offset;
8943 if (!ptrue)
8945 int pred_reg = cfun->machine->frame.spare_pred_reg;
8946 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8947 CONSTM1_RTX (VNx16BImode));
8948 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
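/* An illustrative instance of the rebasing above: an offset of
   12 * GET_MODE_SIZE (MODE) is outside the [-8, 7] * VL immediate range
   of ST1D/LD1D, so the code creates an anchor at
   BASE_RTX + 16 * GET_MODE_SIZE (MODE) and addresses the slot as
   anchor - 4 * VL, which is in range.  */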
8952 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8953 is saved at BASE + OFFSET. */
8955 static void
8956 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8957 rtx base, poly_int64 offset)
8959 rtx mem = gen_frame_mem (GET_MODE (reg),
8960 plus_constant (Pmode, base, offset));
8961 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8964 /* Emit code to save the callee-saved registers from register number START
8965 to LIMIT to the stack at the location starting at offset START_OFFSET,
8966 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
8967 is true if the hard frame pointer has been set up. */
8969 static void
8970 aarch64_save_callee_saves (poly_int64 start_offset,
8971 unsigned start, unsigned limit, bool skip_wb,
8972 bool hard_fp_valid_p)
8974 rtx_insn *insn;
8975 unsigned regno;
8976 unsigned regno2;
8977 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8979 for (regno = aarch64_next_callee_save (start, limit);
8980 regno <= limit;
8981 regno = aarch64_next_callee_save (regno + 1, limit))
8983 rtx reg, mem;
8984 poly_int64 offset;
8985 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8987 if (skip_wb
8988 && (regno == cfun->machine->frame.wb_push_candidate1
8989 || regno == cfun->machine->frame.wb_push_candidate2))
8990 continue;
8992 if (cfun->machine->reg_is_wrapped_separately[regno])
8993 continue;
8995 machine_mode mode = aarch64_reg_save_mode (regno);
8996 reg = gen_rtx_REG (mode, regno);
8997 offset = start_offset + cfun->machine->frame.reg_offset[regno];
8998 rtx base_rtx = stack_pointer_rtx;
8999 poly_int64 sp_offset = offset;
9001 HOST_WIDE_INT const_offset;
9002 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9003 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9004 offset, ptrue);
9005 else if (GP_REGNUM_P (regno)
9006 && (!offset.is_constant (&const_offset) || const_offset >= 512))
9008 gcc_assert (known_eq (start_offset, 0));
9009 poly_int64 fp_offset
9010 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9011 if (hard_fp_valid_p)
9012 base_rtx = hard_frame_pointer_rtx;
9013 else
9015 if (!anchor_reg)
9017 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9018 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9019 gen_int_mode (fp_offset, Pmode)));
9021 base_rtx = anchor_reg;
9023 offset -= fp_offset;
9025 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9026 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
9028 if (!aarch64_sve_mode_p (mode)
9029 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9030 && !cfun->machine->reg_is_wrapped_separately[regno2]
9031 && known_eq (GET_MODE_SIZE (mode),
9032 cfun->machine->frame.reg_offset[regno2]
9033 - cfun->machine->frame.reg_offset[regno]))
9035 rtx reg2 = gen_rtx_REG (mode, regno2);
9036 rtx mem2;
9038 offset += GET_MODE_SIZE (mode);
9039 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9040 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
9041 reg2));
9043 /* The first part of a frame-related parallel insn is
9044 always assumed to be relevant to the frame
9045 calculations; subsequent parts are only
9046 frame-related if explicitly marked. */
9047 if (aarch64_emit_cfi_for_reg_p (regno2))
9049 if (need_cfa_note_p)
9050 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
9051 sp_offset + GET_MODE_SIZE (mode));
9052 else
9053 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
9056 regno = regno2;
9058 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9060 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
9061 need_cfa_note_p = true;
9063 else if (aarch64_sve_mode_p (mode))
9064 insn = emit_insn (gen_rtx_SET (mem, reg));
9065 else
9066 insn = emit_move_insn (mem, reg);
9068 RTX_FRAME_RELATED_P (insn) = frame_related_p;
9069 if (frame_related_p && need_cfa_note_p)
9070 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
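
/* Illustrative sketch (not part of the original source): the condition used
   above for folding two saves into a single store pair, written out for
   plain integer offsets.  Registers REGNO and REGNO2 can share an STP when
   the second slot sits exactly one register size above the first.  The
   helper name and parameters are hypothetical.  */
static inline int
example_can_use_store_pair (long long offset1, long long offset2,
			    long long reg_size)
{
  /* STP stores the second register at the address immediately following
     the first, so the two slots must be adjacent and in order.  */
  return offset2 - offset1 == reg_size;
}
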
9074 /* Emit code to restore the callee registers from register number START
9075 up to and including LIMIT. Restore from the stack offset START_OFFSET,
9076 skipping any write-back candidates if SKIP_WB is true. Write the
9077 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
9079 static void
9080 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
9081 unsigned limit, bool skip_wb, rtx *cfi_ops)
9083 unsigned regno;
9084 unsigned regno2;
9085 poly_int64 offset;
9086 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9088 for (regno = aarch64_next_callee_save (start, limit);
9089 regno <= limit;
9090 regno = aarch64_next_callee_save (regno + 1, limit))
9092 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9093 if (cfun->machine->reg_is_wrapped_separately[regno])
9094 continue;
9096 rtx reg, mem;
9098 if (skip_wb
9099 && (regno == cfun->machine->frame.wb_pop_candidate1
9100 || regno == cfun->machine->frame.wb_pop_candidate2))
9101 continue;
9103 machine_mode mode = aarch64_reg_save_mode (regno);
9104 reg = gen_rtx_REG (mode, regno);
9105 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9106 rtx base_rtx = stack_pointer_rtx;
9107 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9108 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9109 offset, ptrue);
9110 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9112 if (!aarch64_sve_mode_p (mode)
9113 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9114 && !cfun->machine->reg_is_wrapped_separately[regno2]
9115 && known_eq (GET_MODE_SIZE (mode),
9116 cfun->machine->frame.reg_offset[regno2]
9117 - cfun->machine->frame.reg_offset[regno]))
9119 rtx reg2 = gen_rtx_REG (mode, regno2);
9120 rtx mem2;
9122 offset += GET_MODE_SIZE (mode);
9123 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9124 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9126 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9127 regno = regno2;
9129 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9130 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9131 else if (aarch64_sve_mode_p (mode))
9132 emit_insn (gen_rtx_SET (reg, mem));
9133 else
9134 emit_move_insn (reg, mem);
9135 if (frame_related_p)
9136 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
9140 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9141 of MODE. */
9143 static inline bool
9144 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9146 HOST_WIDE_INT multiple;
9147 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9148 && IN_RANGE (multiple, -8, 7));
9151 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9152 of MODE. */
9154 static inline bool
9155 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9157 HOST_WIDE_INT multiple;
9158 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9159 && IN_RANGE (multiple, -32, 31));
9162 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9163 of MODE. */
9165 static inline bool
9166 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9168 HOST_WIDE_INT multiple;
9169 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9170 && IN_RANGE (multiple, 0, 63));
9173 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9174 of MODE. */
9176 bool
9177 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9179 HOST_WIDE_INT multiple;
9180 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9181 && IN_RANGE (multiple, -64, 63));
9184 /* Return true if OFFSET is a signed 9-bit value. */
9186 bool
9187 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9188 poly_int64 offset)
9190 HOST_WIDE_INT const_offset;
9191 return (offset.is_constant (&const_offset)
9192 && IN_RANGE (const_offset, -256, 255));
9195 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9196 of MODE. */
9198 static inline bool
9199 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9201 HOST_WIDE_INT multiple;
9202 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9203 && IN_RANGE (multiple, -256, 255));
9206 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9207 of MODE. */
9209 static inline bool
9210 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9212 HOST_WIDE_INT multiple;
9213 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9214 && IN_RANGE (multiple, 0, 4095));
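
/* Illustrative sketch (not part of the original source): the offset
   predicates above all follow the same shape, differing only in the
   immediate width and signedness.  A generic version for constant offsets
   might look like this; the name and parameters are hypothetical.  */
static inline int
example_offset_scaled_in_range (long long offset, long long mode_size,
				long long min_multiple, long long max_multiple)
{
  /* The offset must be an exact multiple of the access size ...  */
  if (mode_size == 0 || offset % mode_size != 0)
    return 0;
  long long multiple = offset / mode_size;
  /* ... and the scaled value must fit the immediate field, e.g.
     [-64, 63] for the 7-bit signed LDP/STP form or [0, 4095] for the
     12-bit unsigned LDR/STR form.  */
  return multiple >= min_multiple && multiple <= max_multiple;
}
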
9217 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9219 static sbitmap
9220 aarch64_get_separate_components (void)
9222 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9223 bitmap_clear (components);
9225 /* The registers we need saved to the frame. */
9226 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9227 if (aarch64_register_saved_on_entry (regno))
9229 /* Punt on saves and restores that use ST1D and LD1D. We could
9230 try to be smarter, but it would involve making sure that the
9231 spare predicate register itself is safe to use at the save
9232 and restore points. Also, when a frame pointer is being used,
9233 the slots are often out of reach of ST1D and LD1D anyway. */
9234 machine_mode mode = aarch64_reg_save_mode (regno);
9235 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9236 continue;
9238 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9240 /* If the register is saved in the first SVE save slot, we use
9241 it as a stack probe for -fstack-clash-protection. */
9242 if (flag_stack_clash_protection
9243 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
9244 && known_eq (offset, 0))
9245 continue;
9247 /* Get the offset relative to the register we'll use. */
9248 if (frame_pointer_needed)
9249 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9250 else
9251 offset += crtl->outgoing_args_size;
9253 /* Check that we can access the stack slot of the register with one
9254 direct load with no adjustments needed. */
9255 if (aarch64_sve_mode_p (mode)
9256 ? offset_9bit_signed_scaled_p (mode, offset)
9257 : offset_12bit_unsigned_scaled_p (mode, offset))
9258 bitmap_set_bit (components, regno);
9261 /* Don't mess with the hard frame pointer. */
9262 if (frame_pointer_needed)
9263 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9265 /* If the spare predicate register used by big-endian SVE code
9266 is call-preserved, it must be saved in the main prologue
9267 before any saves that use it. */
9268 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
9269 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
9271 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9272 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9273 /* If registers have been chosen to be stored/restored with
9274 writeback, don't interfere with them to avoid having to output explicit
9275 stack adjustment instructions. */
9276 if (reg2 != INVALID_REGNUM)
9277 bitmap_clear_bit (components, reg2);
9278 if (reg1 != INVALID_REGNUM)
9279 bitmap_clear_bit (components, reg1);
9281 bitmap_clear_bit (components, LR_REGNUM);
9282 bitmap_clear_bit (components, SP_REGNUM);
9284 return components;
9287 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9289 static sbitmap
9290 aarch64_components_for_bb (basic_block bb)
9292 bitmap in = DF_LIVE_IN (bb);
9293 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9294 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9296 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9297 bitmap_clear (components);
9299 /* Clobbered registers don't generate values in any meaningful sense,
9300 since nothing after the clobber can rely on their value. And we can't
9301 say that partially-clobbered registers are unconditionally killed,
9302 because whether they're killed or not depends on the mode of the
9303 value they're holding. Thus partially call-clobbered registers
9304 appear in neither the kill set nor the gen set.
9306 Check manually for any calls that clobber more of a register than the
9307 current function can. */
9308 function_abi_aggregator callee_abis;
9309 rtx_insn *insn;
9310 FOR_BB_INSNS (bb, insn)
9311 if (CALL_P (insn))
9312 callee_abis.note_callee_abi (insn_callee_abi (insn));
9313 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9315 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9316 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9317 if (!fixed_regs[regno]
9318 && !crtl->abi->clobbers_full_reg_p (regno)
9319 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9320 || bitmap_bit_p (in, regno)
9321 || bitmap_bit_p (gen, regno)
9322 || bitmap_bit_p (kill, regno)))
9324 bitmap_set_bit (components, regno);
9326 /* If there is a callee-save at an adjacent offset, add it too
9327 to increase the use of LDP/STP. */
9328 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9329 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9331 if (regno2 <= LAST_SAVED_REGNUM)
9333 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9334 if (regno < regno2
9335 ? known_eq (offset + 8, offset2)
9336 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9337 bitmap_set_bit (components, regno2);
9341 return components;
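
/* Illustrative sketch (not part of the original source): the pairing
   heuristic above, for constant offsets.  A register whose slot is 16-byte
   aligned looks upwards for a partner at offset + 8; otherwise it looks
   downwards for a 16-byte aligned partner at offset - 8.  Either way the
   two slots form one naturally aligned 16-byte LDP/STP region.  The helper
   name is hypothetical.  */
static inline int
example_ldp_partner_slot_p (long long offset, long long offset2)
{
  if (offset % 16 == 0)
    return offset2 == offset + 8;
  else
    return offset2 % 16 == 0 && offset2 + 8 == offset;
}
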
9344 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9345 Nothing to do for aarch64. */
9347 static void
9348 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9352 /* Return the next set bit in BMP from START onwards. Return the total number
9353 of bits in BMP if no set bit is found at or after START. */
9355 static unsigned int
9356 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9358 unsigned int nbits = SBITMAP_SIZE (bmp);
9359 if (start == nbits)
9360 return start;
9362 gcc_assert (start < nbits);
9363 for (unsigned int i = start; i < nbits; i++)
9364 if (bitmap_bit_p (bmp, i))
9365 return i;
9367 return nbits;
9370 /* Do the work for aarch64_emit_prologue_components and
9371 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9372 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9373 for these components or the epilogue sequence. That is, it determines
9374 whether we should emit stores or loads and what kind of CFA notes to attach
9375 to the insns. Otherwise the logic for the two sequences is very
9376 similar. */
9378 static void
9379 aarch64_process_components (sbitmap components, bool prologue_p)
9381 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9382 ? HARD_FRAME_POINTER_REGNUM
9383 : STACK_POINTER_REGNUM);
9385 unsigned last_regno = SBITMAP_SIZE (components);
9386 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9387 rtx_insn *insn = NULL;
9389 while (regno != last_regno)
9391 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9392 machine_mode mode = aarch64_reg_save_mode (regno);
9394 rtx reg = gen_rtx_REG (mode, regno);
9395 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9396 if (frame_pointer_needed)
9397 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9398 else
9399 offset += crtl->outgoing_args_size;
9401 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9402 rtx mem = gen_frame_mem (mode, addr);
9404 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9405 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9406 /* No more registers to handle after REGNO.
9407 Emit a single save/restore and exit. */
9408 if (regno2 == last_regno)
9410 insn = emit_insn (set);
9411 if (frame_related_p)
9413 RTX_FRAME_RELATED_P (insn) = 1;
9414 if (prologue_p)
9415 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9416 else
9417 add_reg_note (insn, REG_CFA_RESTORE, reg);
9419 break;
9422 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9423 /* The next register is not of the same class or its offset is not
9424 mergeable with the current one into a pair. */
9425 if (aarch64_sve_mode_p (mode)
9426 || !satisfies_constraint_Ump (mem)
9427 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9428 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9429 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9430 GET_MODE_SIZE (mode)))
9432 insn = emit_insn (set);
9433 if (frame_related_p)
9435 RTX_FRAME_RELATED_P (insn) = 1;
9436 if (prologue_p)
9437 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9438 else
9439 add_reg_note (insn, REG_CFA_RESTORE, reg);
9442 regno = regno2;
9443 continue;
9446 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9448 /* REGNO2 can be saved/restored in a pair with REGNO. */
9449 rtx reg2 = gen_rtx_REG (mode, regno2);
9450 if (frame_pointer_needed)
9451 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9452 else
9453 offset2 += crtl->outgoing_args_size;
9454 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9455 rtx mem2 = gen_frame_mem (mode, addr2);
9456 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9457 : gen_rtx_SET (reg2, mem2);
9459 if (prologue_p)
9460 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9461 else
9462 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9464 if (frame_related_p || frame_related2_p)
9466 RTX_FRAME_RELATED_P (insn) = 1;
9467 if (prologue_p)
9469 if (frame_related_p)
9470 add_reg_note (insn, REG_CFA_OFFSET, set);
9471 if (frame_related2_p)
9472 add_reg_note (insn, REG_CFA_OFFSET, set2);
9474 else
9476 if (frame_related_p)
9477 add_reg_note (insn, REG_CFA_RESTORE, reg);
9478 if (frame_related2_p)
9479 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9483 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9487 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9489 static void
9490 aarch64_emit_prologue_components (sbitmap components)
9492 aarch64_process_components (components, true);
9495 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9497 static void
9498 aarch64_emit_epilogue_components (sbitmap components)
9500 aarch64_process_components (components, false);
9503 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9505 static void
9506 aarch64_set_handled_components (sbitmap components)
9508 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9509 if (bitmap_bit_p (components, regno))
9510 cfun->machine->reg_is_wrapped_separately[regno] = true;
9513 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9514 determine the probe offset for alloca. */
9516 static HOST_WIDE_INT
9517 aarch64_stack_clash_protection_alloca_probe_range (void)
9519 return STACK_CLASH_CALLER_GUARD;
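
/* Illustrative sketch (not part of the original source): how the ABI buffer
   interacts with the probing thresholds used below, assuming the default
   64KB guard and the 1KB STACK_CLASH_CALLER_GUARD.  The outgoing-argument
   (final) adjustment only needs probing once it exceeds the 1KB buffer;
   any earlier adjustment only needs probing once it exceeds the guard minus
   that buffer.  The names are hypothetical.  */
static inline long long
example_min_probe_threshold (long long guard_size, long long caller_guard,
			     int final_adjustment_p)
{
  return final_adjustment_p ? caller_guard : guard_size - caller_guard;
}
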
9523 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9524 registers. If POLY_SIZE is not large enough to require a probe this function
9525 will only adjust the stack. When allocating the stack space
9526 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9527 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9528 arguments. If we are, then we ensure that any allocation larger than the ABI
9529 defined buffer needs a probe so that the invariant of having a 1KB buffer is
9530 maintained.
9532 We emit barriers after each stack adjustment to prevent optimizations from
9533 breaking the invariant that we never drop the stack more than a page. This
9534 invariant is needed to make it easier to correctly handle asynchronous
9535 events, e.g. if we were to allow the stack to be dropped by more than a page
9536 and then have multiple probes queued up, and we take a signal somewhere in
9537 between, then the signal handler doesn't know the state of the stack and can
9538 make no assumptions about which pages have been probed. */
9540 static void
9541 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9542 poly_int64 poly_size,
9543 bool frame_related_p,
9544 bool final_adjustment_p)
9546 HOST_WIDE_INT guard_size
9547 = 1 << param_stack_clash_protection_guard_size;
9548 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9549 HOST_WIDE_INT min_probe_threshold
9550 = (final_adjustment_p
9551 ? guard_used_by_caller
9552 : guard_size - guard_used_by_caller);
9553 /* When doing the final adjustment for the outgoing arguments, take into
9554 account any unprobed space there is above the current SP. There are
9555 two cases:
9557 - When saving SVE registers below the hard frame pointer, we force
9558 the lowest save to take place in the prologue before doing the final
9559 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9560 This acts as a probe at SP, so there is no unprobed space.
9562 - When there are no SVE register saves, we use the store of the link
9563 register as a probe. We can't assume that LR was saved at position 0
9564 though, so treat any space below it as unprobed. */
9565 if (final_adjustment_p
9566 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9568 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9569 if (known_ge (lr_offset, 0))
9570 min_probe_threshold -= lr_offset.to_constant ();
9571 else
9572 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9575 poly_int64 frame_size = cfun->machine->frame.frame_size;
9577 /* We should always have a positive probe threshold. */
9578 gcc_assert (min_probe_threshold > 0);
9580 if (flag_stack_clash_protection && !final_adjustment_p)
9582 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9583 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9584 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9586 if (known_eq (frame_size, 0))
9588 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9590 else if (known_lt (initial_adjust + sve_callee_adjust,
9591 guard_size - guard_used_by_caller)
9592 && known_lt (final_adjust, guard_used_by_caller))
9594 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9598 /* If SIZE is not large enough to require probing, just adjust the stack and
9599 exit. */
9600 if (known_lt (poly_size, min_probe_threshold)
9601 || !flag_stack_clash_protection)
9603 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9604 return;
9607 HOST_WIDE_INT size;
9608 /* Handle the SVE non-constant case first. */
9609 if (!poly_size.is_constant (&size))
9611 if (dump_file)
9613 fprintf (dump_file, "Stack clash SVE prologue: ");
9614 print_dec (poly_size, dump_file);
9615 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9618 /* First calculate the number of bytes we're actually spilling. */
9619 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9620 poly_size, temp1, temp2, false, true);
9622 rtx_insn *insn = get_last_insn ();
9624 if (frame_related_p)
9626 /* This is done to provide unwinding information for the stack
9627 adjustments we're about to do, however to prevent the optimizers
9628 from removing the R11 move and leaving the CFA note (which would be
9629 very wrong) we tie the old and new stack pointer together.
9630 The tie will expand to nothing but the optimizers will not touch
9631 the instruction. */
9632 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9633 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9634 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9636 /* We want the CFA independent of the stack pointer for the
9637 duration of the loop. */
9638 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9639 RTX_FRAME_RELATED_P (insn) = 1;
9642 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9643 rtx guard_const = gen_int_mode (guard_size, Pmode);
9645 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9646 stack_pointer_rtx, temp1,
9647 probe_const, guard_const));
9649 /* Now reset the CFA register if needed. */
9650 if (frame_related_p)
9652 add_reg_note (insn, REG_CFA_DEF_CFA,
9653 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9654 gen_int_mode (poly_size, Pmode)));
9655 RTX_FRAME_RELATED_P (insn) = 1;
9658 return;
9661 if (dump_file)
9662 fprintf (dump_file,
9663 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9664 " bytes, probing will be required.\n", size);
9666 /* Round size to the nearest multiple of guard_size, and calculate the
9667 residual as the difference between the original size and the rounded
9668 size. */
9669 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9670 HOST_WIDE_INT residual = size - rounded_size;
9672 /* We can handle a small number of allocations/probes inline. Otherwise
9673 punt to a loop. */
9674 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9676 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9678 aarch64_sub_sp (NULL, temp2, guard_size, true);
9679 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9680 guard_used_by_caller));
9681 emit_insn (gen_blockage ());
9683 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9685 else
9687 /* Compute the ending address. */
9688 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9689 temp1, NULL, false, true);
9690 rtx_insn *insn = get_last_insn ();
9692 /* For the initial allocation, we don't have a frame pointer
9693 set up, so we always need CFI notes. If we're doing the
9694 final allocation, then we may have a frame pointer, in which
9695 case it is the CFA, otherwise we need CFI notes.
9697 We can determine which allocation we are doing by looking at
9698 the value of FRAME_RELATED_P since the final allocations are not
9699 frame related. */
9700 if (frame_related_p)
9702 /* We want the CFA independent of the stack pointer for the
9703 duration of the loop. */
9704 add_reg_note (insn, REG_CFA_DEF_CFA,
9705 plus_constant (Pmode, temp1, rounded_size));
9706 RTX_FRAME_RELATED_P (insn) = 1;
9709 /* This allocates and probes the stack. Note that this re-uses some of
9710 the existing Ada stack protection code. However, we are guaranteed not
9711 to enter the non-loop or residual branches of that code.
9713 The non-loop part won't be entered because if our allocation amount
9714 doesn't require a loop, the case above would handle it.
9716 The residual amount won't be entered because TEMP1 is a multiple of
9717 the allocation size. The residual will always be 0. As such, the only
9718 part we are actually using from that code is the loop setup. The
9719 actual probing is done in aarch64_output_probe_stack_range. */
9720 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9721 stack_pointer_rtx, temp1));
9723 /* Now reset the CFA register if needed. */
9724 if (frame_related_p)
9726 add_reg_note (insn, REG_CFA_DEF_CFA,
9727 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9728 RTX_FRAME_RELATED_P (insn) = 1;
9731 emit_insn (gen_blockage ());
9732 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9735 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9736 be probed. This maintains the requirement that each page is probed at
9737 least once. For initial probing we probe only if the allocation is
9738 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9739 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9740 GUARD_SIZE. This ensures that for any allocation large enough to
9741 trigger a probe here we'll emit at least one, and that any allocation
9742 too small for this code to emit anything will already have had its page
9743 probed by the saving of FP/LR, either by this function or by any callees. If
9744 we don't have any callees then we won't have more stack adjustments and so
9745 are still safe. */
9746 if (residual)
9748 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9749 /* If we're doing final adjustments, and we've done any full page
9750 allocations then any residual needs to be probed. */
9751 if (final_adjustment_p && rounded_size != 0)
9752 min_probe_threshold = 0;
9753 /* If doing a small final adjustment, we always probe at offset 0.
9754 This is done to avoid issues when LR is not at position 0 or when
9755 the final adjustment is smaller than the probing offset. */
9756 else if (final_adjustment_p && rounded_size == 0)
9757 residual_probe_offset = 0;
9759 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9760 if (residual >= min_probe_threshold)
9762 if (dump_file)
9763 fprintf (dump_file,
9764 "Stack clash AArch64 prologue residuals: "
9765 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9766 "\n", residual);
9768 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9769 residual_probe_offset));
9770 emit_insn (gen_blockage ());
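
/* Illustrative sketch (not part of the original source): the shape of the
   constant-size probing plan above.  Given a requested SIZE and the guard
   size, the allocation is split into ROUNDED_SIZE worth of full guard-sized
   pages, each of which is allocated and probed, plus a residual that is
   probed only when it reaches the relevant threshold.  All names are
   hypothetical and the emission details are elided.  */
static inline void
example_probe_plan (long long size, long long guard_size,
		    long long min_probe_threshold,
		    long long *rounded_size_out, long long *residual_out,
		    int *residual_needs_probe_out)
{
  long long rounded_size = size - size % guard_size;
  long long residual = size - rounded_size;
  *rounded_size_out = rounded_size;
  *residual_out = residual;
  /* Each full page is probed as it is allocated; the residual is probed
     only when it is at least the minimum probing threshold.  */
  *residual_needs_probe_out = residual >= min_probe_threshold;
}
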
9775 /* Return 1 if the register is used by the epilogue. We need to say the
9776 return register is used, but only after epilogue generation is complete.
9777 Note that in the case of sibcalls, the values "used by the epilogue" are
9778 considered live at the start of the called function.
9780 For SIMD functions we need to return 1 for FP registers that are saved and
9781 restored by a function but are not zero in call_used_regs. If we do not do
9782 this, optimizations may remove the restore of the register. */
9784 int
9785 aarch64_epilogue_uses (int regno)
9787 if (epilogue_completed)
9789 if (regno == LR_REGNUM)
9790 return 1;
9792 return 0;
9795 /* AArch64 stack frames generated by this compiler look like:
9797 +-------------------------------+
9799 | incoming stack arguments |
9801 +-------------------------------+
9802 | | <-- incoming stack pointer (aligned)
9803 | callee-allocated save area |
9804 | for register varargs |
9806 +-------------------------------+
9807 | local variables | <-- frame_pointer_rtx
9809 +-------------------------------+
9810 | padding | \
9811 +-------------------------------+ |
9812 | callee-saved registers | | frame.saved_regs_size
9813 +-------------------------------+ |
9814 | LR' | |
9815 +-------------------------------+ |
9816 | FP' | |
9817 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9818 | SVE vector registers | | \
9819 +-------------------------------+ | | below_hard_fp_saved_regs_size
9820 | SVE predicate registers | / /
9821 +-------------------------------+
9822 | dynamic allocation |
9823 +-------------------------------+
9824 | padding |
9825 +-------------------------------+
9826 | outgoing stack arguments | <-- arg_pointer
9828 +-------------------------------+
9829 | | <-- stack_pointer_rtx (aligned)
9831 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9832 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9833 unchanged.
9835 By default for stack-clash we assume the guard is at least 64KB, but this
9836 value is configurable to either 4KB or 64KB. We also force the guard size to
9837 be the same as the probing interval and both values are kept in sync.
9839 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9840 on the guard size) of stack space without probing.
9842 When probing is needed, we emit a probe at the start of the prologue
9843 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9845 We have to track how much space has been allocated and the only stores
9846 to the stack we track as implicit probes are the FP/LR stores.
9848 For outgoing arguments we probe if the size is larger than 1KB, such that
9849 the ABI specified buffer is maintained for the next callee.
9851 The following registers are reserved during frame layout and should not be
9852 used for any other purpose:
9854 - r11: Used by stack clash protection when SVE is enabled, and also
9855 as an anchor register when saving and restoring registers
9856 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9857 - r14 and r15: Used for speculation tracking.
9858 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9859 - r30(LR), r29(FP): Used by standard frame layout.
9861 These registers must be avoided in frame layout related code unless the
9862 explicit intention is to interact with one of the features listed above. */
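
/* Illustrative sketch (not part of the original source): how a debugger or
   unwinder walks the frame chain implied by the record in the diagram
   above.  The saved FP' sits at the address the hard frame pointer holds,
   with the saved LR' in the following 8-byte slot.  The types and names
   here are hypothetical.  */
struct example_frame_record
{
  unsigned long long saved_fp;	/* Caller's frame record (FP').  */
  unsigned long long saved_lr;	/* Return address (LR').  */
};

static inline unsigned long long
example_walk_one_frame (unsigned long long fp, unsigned long long *lr_out)
{
  const struct example_frame_record *rec
    = (const struct example_frame_record *) fp;
  *lr_out = rec->saved_lr;	/* Return address of this frame.  */
  return rec->saved_fp;		/* Frame record of the caller.  */
}
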
9864 /* Generate the prologue instructions for entry into a function.
9865 Establish the stack frame by decreasing the stack pointer with a
9866 properly calculated size and, if necessary, create a frame record
9867 filled with the values of LR and previous frame pointer. The
9868 current FP is also set up if it is in use. */
9870 void
9871 aarch64_expand_prologue (void)
9873 poly_int64 frame_size = cfun->machine->frame.frame_size;
9874 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9875 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9876 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9877 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9878 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9879 poly_int64 below_hard_fp_saved_regs_size
9880 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9881 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9882 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9883 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
9884 rtx_insn *insn;
9886 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
9888 /* Fold the SVE allocation into the initial allocation.
9889 We don't do this in aarch64_layout_frame to avoid pessimizing
9890 the epilogue code. */
9891 initial_adjust += sve_callee_adjust;
9892 sve_callee_adjust = 0;
9895 /* Sign return address for functions. */
9896 if (aarch64_return_address_signing_enabled ())
9898 switch (aarch64_ra_sign_key)
9900 case AARCH64_KEY_A:
9901 insn = emit_insn (gen_paciasp ());
9902 break;
9903 case AARCH64_KEY_B:
9904 insn = emit_insn (gen_pacibsp ());
9905 break;
9906 default:
9907 gcc_unreachable ();
9909 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9910 RTX_FRAME_RELATED_P (insn) = 1;
9913 /* Push return address to shadow call stack. */
9914 if (cfun->machine->frame.is_scs_enabled)
9915 emit_insn (gen_scs_push ());
9917 if (flag_stack_usage_info)
9918 current_function_static_stack_size = constant_lower_bound (frame_size);
9920 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9922 if (crtl->is_leaf && !cfun->calls_alloca)
9924 if (maybe_gt (frame_size, PROBE_INTERVAL)
9925 && maybe_gt (frame_size, get_stack_check_protect ()))
9926 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9927 (frame_size
9928 - get_stack_check_protect ()));
9930 else if (maybe_gt (frame_size, 0))
9931 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9934 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9935 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9937 /* In theory we should never have both an initial adjustment
9938 and a callee save adjustment. Verify that is the case since the
9939 code below does not handle it for -fstack-clash-protection. */
9940 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9942 /* Will only probe if the initial adjustment is larger than the guard
9943 less the amount of the guard reserved for use by the caller's
9944 outgoing args. */
9945 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9946 true, false);
9948 if (callee_adjust != 0)
9949 aarch64_push_regs (reg1, reg2, callee_adjust);
9951 /* The offset of the frame chain record (if any) from the current SP. */
9952 poly_int64 chain_offset = (initial_adjust + callee_adjust
9953 - cfun->machine->frame.hard_fp_offset);
9954 gcc_assert (known_ge (chain_offset, 0));
9956 /* The offset of the bottom of the save area from the current SP. */
9957 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
9959 if (emit_frame_chain)
9961 if (callee_adjust == 0)
9963 reg1 = R29_REGNUM;
9964 reg2 = R30_REGNUM;
9965 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
9966 false, false);
9968 else
9969 gcc_assert (known_eq (chain_offset, 0));
9970 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9971 stack_pointer_rtx, chain_offset,
9972 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
9973 if (frame_pointer_needed && !frame_size.is_constant ())
9975 /* Variable-sized frames need to describe the save slot
9976 address using DW_CFA_expression rather than DW_CFA_offset.
9977 This means that, without taking further action, the
9978 locations of the registers that we've already saved would
9979 remain based on the stack pointer even after we redefine
9980 the CFA based on the frame pointer. We therefore need new
9981 DW_CFA_expressions to re-express the save slots with addresses
9982 based on the frame pointer. */
9983 rtx_insn *insn = get_last_insn ();
9984 gcc_assert (RTX_FRAME_RELATED_P (insn));
9986 /* Add an explicit CFA definition if this was previously
9987 implicit. */
9988 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9990 rtx src = plus_constant (Pmode, stack_pointer_rtx,
9991 callee_offset);
9992 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9993 gen_rtx_SET (hard_frame_pointer_rtx, src));
9996 /* Change the save slot expressions for the registers that
9997 we've already saved. */
9998 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9999 hard_frame_pointer_rtx, UNITS_PER_WORD);
10000 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
10001 hard_frame_pointer_rtx, 0);
10003 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
10006 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
10007 callee_adjust != 0 || emit_frame_chain,
10008 emit_frame_chain);
10009 if (maybe_ne (sve_callee_adjust, 0))
10011 gcc_assert (!flag_stack_clash_protection
10012 || known_eq (initial_adjust, 0));
10013 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
10014 sve_callee_adjust,
10015 !frame_pointer_needed, false);
10016 saved_regs_offset += sve_callee_adjust;
10018 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
10019 false, emit_frame_chain);
10020 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
10021 callee_adjust != 0 || emit_frame_chain,
10022 emit_frame_chain);
10024 /* We may need to probe the final adjustment if it is larger than the guard
10025 that is assumed by the callee. */
10026 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
10027 !frame_pointer_needed, true);
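
/* Illustrative sketch (not part of the original source): the prologue above
   lowers the stack in up to four steps; taken together they are expected to
   account for the whole frame.  This standalone check mirrors that
   expectation for constant-sized frames; the names are hypothetical.  */
static inline int
example_adjustments_cover_frame (long long initial_adjust,
				 long long callee_adjust,
				 long long sve_callee_adjust,
				 long long final_adjust,
				 long long frame_size)
{
  return initial_adjust + callee_adjust + sve_callee_adjust + final_adjust
	 == frame_size;
}
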
10030 /* Return TRUE if we can use a simple_return insn.
10032 This function checks whether the callee saved stack is empty, which
10033 means no restore actions are needed. The pro_and_epilogue pass will use
10034 this to check whether the shrink-wrapping optimization is feasible. */
10036 bool
10037 aarch64_use_return_insn_p (void)
10039 if (!reload_completed)
10040 return false;
10042 if (crtl->profile)
10043 return false;
10045 return known_eq (cfun->machine->frame.frame_size, 0);
10048 /* Generate the epilogue instructions for returning from a function.
10049 This is almost exactly the reverse of the prolog sequence, except
10050 that we need to insert barriers to avoid scheduling loads that read
10051 from a deallocated stack, and we optimize the unwind records by
10052 emitting them all together if possible. */
10053 void
10054 aarch64_expand_epilogue (bool for_sibcall)
10056 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
10057 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
10058 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
10059 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
10060 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
10061 poly_int64 below_hard_fp_saved_regs_size
10062 = cfun->machine->frame.below_hard_fp_saved_regs_size;
10063 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
10064 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
10065 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
10066 ? R29_REGNUM : R30_REGNUM);
10067 rtx cfi_ops = NULL;
10068 rtx_insn *insn;
10069 /* A stack clash protection prologue may not have left EP0_REGNUM or
10070 EP1_REGNUM in a usable state. The same is true for allocations
10071 with an SVE component, since we then need both temporary registers
10072 for each allocation. For stack clash we are in a usable state if
10073 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10074 HOST_WIDE_INT guard_size
10075 = 1 << param_stack_clash_protection_guard_size;
10076 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10078 /* We can re-use the registers when:
10080 (a) the deallocation amount is the same as the corresponding
10081 allocation amount (which is false if we combine the initial
10082 and SVE callee save allocations in the prologue); and
10084 (b) the allocation amount doesn't need a probe (which is false
10085 if the amount is guard_size - guard_used_by_caller or greater).
10087 In such situations the register should remain live with the correct
10088 value. */
10089 bool can_inherit_p = (initial_adjust.is_constant ()
10090 && final_adjust.is_constant ()
10091 && (!flag_stack_clash_protection
10092 || (known_lt (initial_adjust,
10093 guard_size - guard_used_by_caller)
10094 && known_eq (sve_callee_adjust, 0))));
10096 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
10097 bool need_barrier_p
10098 = maybe_ne (get_frame_size ()
10099 + cfun->machine->frame.saved_varargs_size, 0);
10101 /* Emit a barrier to prevent loads from a deallocated stack. */
10102 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10103 || cfun->calls_alloca
10104 || crtl->calls_eh_return)
10106 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10107 need_barrier_p = false;
10110 /* Restore the stack pointer from the frame pointer if it may not
10111 be the same as the stack pointer. */
10112 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10113 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10114 if (frame_pointer_needed
10115 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
10116 /* If writeback is used when restoring callee-saves, the CFA
10117 is restored on the instruction doing the writeback. */
10118 aarch64_add_offset (Pmode, stack_pointer_rtx,
10119 hard_frame_pointer_rtx,
10120 -callee_offset - below_hard_fp_saved_regs_size,
10121 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
10122 else
10123 /* The case where we need to re-use the register here is very rare, so
10124 avoid the complicated condition and just always emit a move if the
10125 immediate doesn't fit. */
10126 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
10128 /* Restore the vector registers before the predicate registers,
10129 so that we can use P4 as a temporary for big-endian SVE frames. */
10130 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
10131 callee_adjust != 0, &cfi_ops);
10132 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
10133 false, &cfi_ops);
10134 if (maybe_ne (sve_callee_adjust, 0))
10135 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
10137 /* When shadow call stack is enabled, the scs_pop in the epilogue will
10138 restore x30, so we don't need to restore x30 again in the traditional
10139 way. */
10140 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
10141 R0_REGNUM, last_gpr,
10142 callee_adjust != 0, &cfi_ops);
10144 if (need_barrier_p)
10145 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10147 if (callee_adjust != 0)
10148 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10150 /* If we have no register restore information, the CFA must have been
10151 defined in terms of the stack pointer since the end of the prologue. */
10152 gcc_assert (cfi_ops || !frame_pointer_needed);
10154 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10156 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10157 insn = get_last_insn ();
10158 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10159 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10160 RTX_FRAME_RELATED_P (insn) = 1;
10161 cfi_ops = NULL;
10164 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
10165 restrict the emit_move optimization to leaf functions. */
10166 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
10167 (!can_inherit_p || !crtl->is_leaf
10168 || df_regs_ever_live_p (EP0_REGNUM)));
10170 if (cfi_ops)
10172 /* Emit delayed restores and reset the CFA to be SP. */
10173 insn = get_last_insn ();
10174 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10175 REG_NOTES (insn) = cfi_ops;
10176 RTX_FRAME_RELATED_P (insn) = 1;
10179 /* Pop return address from shadow call stack. */
10180 if (cfun->machine->frame.is_scs_enabled)
10182 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10183 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10185 insn = emit_insn (gen_scs_pop ());
10186 add_reg_note (insn, REG_CFA_RESTORE, reg);
10187 RTX_FRAME_RELATED_P (insn) = 1;
10190 /* We prefer to emit the combined return/authenticate instruction RETAA,
10191 however there are two cases in which we must instead emit an explicit
10192 authentication instruction.
10194 1) Sibcalls don't return in a normal way, so if we're about to call one
10195 we must authenticate.
10197 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10198 generating code for !TARGET_ARMV8_3 we can't use it and must
10199 explicitly authenticate.
10201 if (aarch64_return_address_signing_enabled ()
10202 && (for_sibcall || !TARGET_ARMV8_3))
10204 switch (aarch64_ra_sign_key)
10206 case AARCH64_KEY_A:
10207 insn = emit_insn (gen_autiasp ());
10208 break;
10209 case AARCH64_KEY_B:
10210 insn = emit_insn (gen_autibsp ());
10211 break;
10212 default:
10213 gcc_unreachable ();
10215 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10216 RTX_FRAME_RELATED_P (insn) = 1;
10219 /* Stack adjustment for exception handler. */
10220 if (crtl->calls_eh_return && !for_sibcall)
10222 /* We need to unwind the stack by the offset computed by
10223 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
10224 to be SP; letting the CFA move during this adjustment
10225 is just as correct as retaining the CFA from the body
10226 of the function. Therefore, do nothing special. */
10227 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
10230 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10231 if (!for_sibcall)
10232 emit_jump_insn (ret_rtx);
10235 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
10236 normally or return to a previous frame after unwinding.
10238 An EH return uses a single shared return sequence. The epilogue is
10239 exactly like a normal epilogue except that it has an extra input
10240 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10241 that must be applied after the frame has been destroyed. An extra label
10242 is inserted before the epilogue which initializes this register to zero,
10243 and this is the entry point for a normal return.
10245 An actual EH return updates the return address, initializes the stack
10246 adjustment and jumps directly into the epilogue (bypassing the zeroing
10247 of the adjustment). Since the return address is typically saved on the
10248 stack when a function makes a call, the saved LR must be updated outside
10249 the epilogue.
10251 This poses problems as the store is generated well before the epilogue,
10252 so the offset of LR is not known yet. Also optimizations will remove the
10253 store as it appears dead, even after the epilogue is generated (as the
10254 base or offset for loading LR is different in many cases).
10256 To avoid these problems this implementation forces the frame pointer
10257 in eh_return functions so that the location of LR is fixed and known early.
10258 It also marks the store volatile, so no optimization is permitted to
10259 remove the store. */
10260 rtx
10261 aarch64_eh_return_handler_rtx (void)
10263 rtx tmp = gen_frame_mem (Pmode,
10264 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
10266 /* Mark the store volatile, so no optimization is permitted to remove it. */
10267 MEM_VOLATILE_P (tmp) = true;
10268 return tmp;
10271 /* Output code to add DELTA to the first argument, and then jump
10272 to FUNCTION. Used for C++ multiple inheritance. */
10273 static void
10274 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10275 HOST_WIDE_INT delta,
10276 HOST_WIDE_INT vcall_offset,
10277 tree function)
10279 /* The this pointer is always in x0. Note that this differs from
10280 Arm, where the this pointer may be bumped to r1 if r0 is required
10281 to return a pointer to an aggregate. On AArch64 a result value
10282 pointer will be in x8. */
10283 int this_regno = R0_REGNUM;
10284 rtx this_rtx, temp0, temp1, addr, funexp;
10285 rtx_insn *insn;
10286 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10288 if (aarch64_bti_enabled ())
10289 emit_insn (gen_bti_c());
10291 reload_completed = 1;
10292 emit_note (NOTE_INSN_PROLOGUE_END);
10294 this_rtx = gen_rtx_REG (Pmode, this_regno);
10295 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10296 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10298 if (vcall_offset == 0)
10299 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10300 else
10302 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10304 addr = this_rtx;
10305 if (delta != 0)
10307 if (delta >= -256 && delta < 256)
10308 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10309 plus_constant (Pmode, this_rtx, delta));
10310 else
10311 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10312 temp1, temp0, false);
10315 if (Pmode == ptr_mode)
10316 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10317 else
10318 aarch64_emit_move (temp0,
10319 gen_rtx_ZERO_EXTEND (Pmode,
10320 gen_rtx_MEM (ptr_mode, addr)));
10322 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10323 addr = plus_constant (Pmode, temp0, vcall_offset);
10324 else
10326 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10327 Pmode);
10328 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10331 if (Pmode == ptr_mode)
10332 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10333 else
10334 aarch64_emit_move (temp1,
10335 gen_rtx_SIGN_EXTEND (Pmode,
10336 gen_rtx_MEM (ptr_mode, addr)));
10338 emit_insn (gen_add2_insn (this_rtx, temp1));
10341 /* Generate a tail call to the target function. */
10342 if (!TREE_USED (function))
10344 assemble_external (function);
10345 TREE_USED (function) = 1;
10347 funexp = XEXP (DECL_RTL (function), 0);
10348 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10349 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10350 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10351 SIBLING_CALL_P (insn) = 1;
10353 insn = get_insns ();
10354 shorten_branches (insn);
10356 assemble_start_function (thunk, fnname);
10357 final_start_function (insn, file, 1);
10358 final (insn, file, 1);
10359 final_end_function ();
10360 assemble_end_function (thunk, fnname);
10362 /* Stop pretending to be a post-reload pass. */
10363 reload_completed = 0;
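
/* Illustrative sketch (not part of the original source): the pointer
   arithmetic the emitted thunk performs before tail-calling FUNCTION,
   written as plain C.  DELTA is added to the incoming this pointer; if
   VCALL_OFFSET is nonzero, a further adjustment is loaded from the
   object's vtable.  The names are hypothetical.  */
static inline void *
example_thunk_adjust_this (void *this_ptr, long long delta,
			   long long vcall_offset)
{
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      /* The vtable pointer is stored at offset 0 of the adjusted object;
	 the extra adjustment lives at VCALL_OFFSET within the vtable.  */
      char *vtable = *(char **) p;
      p += *(long long *) (vtable + vcall_offset);
    }
  return p;
}
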
10366 static bool
10367 aarch64_tls_referenced_p (rtx x)
10369 if (!TARGET_HAVE_TLS)
10370 return false;
10371 subrtx_iterator::array_type array;
10372 FOR_EACH_SUBRTX (iter, array, x, ALL)
10374 const_rtx x = *iter;
10375 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10376 return true;
10377 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10378 TLS offsets, not real symbol references. */
10379 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10380 iter.skip_subrtxes ();
10382 return false;
10386 static bool
10387 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10389 if (GET_CODE (x) == HIGH)
10390 return true;
10392 /* There's no way to calculate VL-based values using relocations. */
10393 subrtx_iterator::array_type array;
10394 FOR_EACH_SUBRTX (iter, array, x, ALL)
10395 if (GET_CODE (*iter) == CONST_POLY_INT)
10396 return true;
10398 poly_int64 offset;
10399 rtx base = strip_offset_and_salt (x, &offset);
10400 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10402 /* We checked for POLY_INT_CST offsets above. */
10403 if (aarch64_classify_symbol (base, offset.to_constant ())
10404 != SYMBOL_FORCE_TO_MEM)
10405 return true;
10406 else
10407 /* Avoid generating a 64-bit relocation in ILP32; leave
10408 to aarch64_expand_mov_immediate to handle it properly. */
10409 return mode != ptr_mode;
10412 return aarch64_tls_referenced_p (x);
10415 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10416 The expansion for a table switch is quite expensive due to the number
10417 of instructions, the table lookup and the hard-to-predict indirect jump.
10418 When optimizing for speed with -O3 enabled, use the per-core tuning if
10419 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10420 performance. When optimizing for size, use 8 for smallest codesize. */
10422 static unsigned int
10423 aarch64_case_values_threshold (void)
10425 /* Use the specified limit for the number of cases before using jump
10426 tables at higher optimization levels. */
10427 if (optimize > 2
10428 && aarch64_tune_params.max_case_values != 0)
10429 return aarch64_tune_params.max_case_values;
10430 else
10431 return optimize_size ? 8 : 11;
10434 /* Return true if register REGNO is a valid index register.
10435 STRICT_P is true if REG_OK_STRICT is in effect. */
10437 bool
10438 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10440 if (!HARD_REGISTER_NUM_P (regno))
10442 if (!strict_p)
10443 return true;
10445 if (!reg_renumber)
10446 return false;
10448 regno = reg_renumber[regno];
10450 return GP_REGNUM_P (regno);
10453 /* Return true if register REGNO is a valid base register.
10454 STRICT_P is true if REG_OK_STRICT is in effect. */
10456 bool
10457 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10459 if (!HARD_REGISTER_NUM_P (regno))
10461 if (!strict_p)
10462 return true;
10464 if (!reg_renumber)
10465 return false;
10467 regno = reg_renumber[regno];
10470 /* The fake registers will be eliminated to either the stack or
10471 hard frame pointer, both of which are usually valid base registers.
10472 Reload deals with the cases where the eliminated form isn't valid. */
10473 return (GP_REGNUM_P (regno)
10474 || regno == SP_REGNUM
10475 || regno == FRAME_POINTER_REGNUM
10476 || regno == ARG_POINTER_REGNUM);
10479 /* Return true if X is a valid base register.
10480 STRICT_P is true if REG_OK_STRICT is in effect. */
10482 static bool
10483 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10485 if (!strict_p
10486 && SUBREG_P (x)
10487 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10488 x = SUBREG_REG (x);
10490 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10493 /* Return true if address offset is a valid index. If it is, fill in INFO
10494 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10496 static bool
10497 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10498 machine_mode mode, bool strict_p)
10500 enum aarch64_address_type type;
10501 rtx index;
10502 int shift;
10504 /* (reg:P) */
10505 if ((REG_P (x) || SUBREG_P (x))
10506 && GET_MODE (x) == Pmode)
10508 type = ADDRESS_REG_REG;
10509 index = x;
10510 shift = 0;
10512 /* (sign_extend:DI (reg:SI)) */
10513 else if ((GET_CODE (x) == SIGN_EXTEND
10514 || GET_CODE (x) == ZERO_EXTEND)
10515 && GET_MODE (x) == DImode
10516 && GET_MODE (XEXP (x, 0)) == SImode)
10518 type = (GET_CODE (x) == SIGN_EXTEND)
10519 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10520 index = XEXP (x, 0);
10521 shift = 0;
10523 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10524 else if (GET_CODE (x) == MULT
10525 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10526 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10527 && GET_MODE (XEXP (x, 0)) == DImode
10528 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10529 && CONST_INT_P (XEXP (x, 1)))
10531 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10532 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10533 index = XEXP (XEXP (x, 0), 0);
10534 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10536 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10537 else if (GET_CODE (x) == ASHIFT
10538 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10539 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10540 && GET_MODE (XEXP (x, 0)) == DImode
10541 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10542 && CONST_INT_P (XEXP (x, 1)))
10544 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10545 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10546 index = XEXP (XEXP (x, 0), 0);
10547 shift = INTVAL (XEXP (x, 1));
10549 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10550 (const_int 0xffffffff<<shift)) */
10551 else if (GET_CODE (x) == AND
10552 && GET_MODE (x) == DImode
10553 && GET_CODE (XEXP (x, 0)) == MULT
10554 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10555 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10556 && CONST_INT_P (XEXP (x, 1)))
10558 type = ADDRESS_REG_UXTW;
10559 index = XEXP (XEXP (x, 0), 0);
10560 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10561 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10562 shift = -1;
10564 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10565 (const_int 0xffffffff<<shift)) */
10566 else if (GET_CODE (x) == AND
10567 && GET_MODE (x) == DImode
10568 && GET_CODE (XEXP (x, 0)) == ASHIFT
10569 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10570 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10571 && CONST_INT_P (XEXP (x, 1)))
10573 type = ADDRESS_REG_UXTW;
10574 index = XEXP (XEXP (x, 0), 0);
10575 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10576 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10577 shift = -1;
10579 /* (mult:P (reg:P) (const_int scale)) */
10580 else if (GET_CODE (x) == MULT
10581 && GET_MODE (x) == Pmode
10582 && GET_MODE (XEXP (x, 0)) == Pmode
10583 && CONST_INT_P (XEXP (x, 1)))
10585 type = ADDRESS_REG_REG;
10586 index = XEXP (x, 0);
10587 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10589 /* (ashift:P (reg:P) (const_int shift)) */
10590 else if (GET_CODE (x) == ASHIFT
10591 && GET_MODE (x) == Pmode
10592 && GET_MODE (XEXP (x, 0)) == Pmode
10593 && CONST_INT_P (XEXP (x, 1)))
10595 type = ADDRESS_REG_REG;
10596 index = XEXP (x, 0);
10597 shift = INTVAL (XEXP (x, 1));
10599 else
10600 return false;
10602 if (!strict_p
10603 && SUBREG_P (index)
10604 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10605 index = SUBREG_REG (index);
10607 if (aarch64_sve_data_mode_p (mode))
10609 if (type != ADDRESS_REG_REG
10610 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10611 return false;
10613 else
10615 if (shift != 0
10616 && !(IN_RANGE (shift, 1, 3)
10617 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10618 return false;
10621 if (REG_P (index)
10622 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10624 info->type = type;
10625 info->offset = index;
10626 info->shift = shift;
10627 return true;
10630 return false;
10633 /* Return true if MODE is one of the modes for which we
10634 support LDP/STP operations. */
10636 static bool
10637 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10639 return mode == SImode || mode == DImode
10640 || mode == SFmode || mode == DFmode
10641 || mode == SDmode || mode == DDmode
10642 || (aarch64_vector_mode_supported_p (mode)
10643 && (known_eq (GET_MODE_SIZE (mode), 8)
10644 || (known_eq (GET_MODE_SIZE (mode), 16)
10645 && (aarch64_tune_params.extra_tuning_flags
10646 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10649 /* Return true if REGNO is a virtual pointer register, or an eliminable
10650 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10651 include stack_pointer or hard_frame_pointer. */
10652 static bool
10653 virt_or_elim_regno_p (unsigned regno)
10655 return ((regno >= FIRST_VIRTUAL_REGISTER
10656 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10657 || regno == FRAME_POINTER_REGNUM
10658 || regno == ARG_POINTER_REGNUM);
10661 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10662 If it is, fill in INFO appropriately. STRICT_P is true if
10663 REG_OK_STRICT is in effect. */
10665 bool
10666 aarch64_classify_address (struct aarch64_address_info *info,
10667 rtx x, machine_mode mode, bool strict_p,
10668 aarch64_addr_query_type type)
10670 enum rtx_code code = GET_CODE (x);
10671 rtx op0, op1;
10672 poly_int64 offset;
10674 HOST_WIDE_INT const_size;
10676 /* Whether a vector mode is partial doesn't affect address legitimacy.
10677 Partial vectors like VNx8QImode allow the same indexed addressing
10678 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10679 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10680 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10681 vec_flags &= ~VEC_PARTIAL;
10683 /* On BE, we use load/store pair for all large int mode load/stores.
10684 TI/TF/TDmode may also use a load/store pair. */
10685 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10686 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10687 || type == ADDR_QUERY_LDP_STP_N
10688 || mode == TImode
10689 || mode == TFmode
10690 || mode == TDmode
10691 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10692 && advsimd_struct_p));
10693 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10694 corresponds to the actual size of the memory being loaded/stored, and
10695 the addressing mode is validated using a mode of half that size. */
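/* For example (illustrative): an LDP/STP of two X registers is queried with
   a 16-byte mode, which is checked below as a DFmode access so that the
   offset is validated against the range of one 8-byte element of the pair.  */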
10696 if (type == ADDR_QUERY_LDP_STP_N)
10698 if (known_eq (GET_MODE_SIZE (mode), 16))
10699 mode = DFmode;
10700 else if (known_eq (GET_MODE_SIZE (mode), 8))
10701 mode = SFmode;
10702 else
10703 return false;
10706 bool allow_reg_index_p = (!load_store_pair_p
10707 && ((vec_flags == 0
10708 && known_lt (GET_MODE_SIZE (mode), 16))
10709 || vec_flags == VEC_ADVSIMD
10710 || vec_flags & VEC_SVE_DATA));
10712 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10713 The latter is not valid for SVE predicates, and that's rejected through
10714 allow_reg_index_p above. */
10715 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10716 && (code != REG && code != PLUS))
10717 return false;
10719 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10720 REG addressing. */
10721 if (advsimd_struct_p
10722 && TARGET_SIMD
10723 && !BYTES_BIG_ENDIAN
10724 && (code != POST_INC && code != REG))
10725 return false;
10727 gcc_checking_assert (GET_MODE (x) == VOIDmode
10728 || SCALAR_INT_MODE_P (GET_MODE (x)));
10730 switch (code)
10732 case REG:
10733 case SUBREG:
10734 info->type = ADDRESS_REG_IMM;
10735 info->base = x;
10736 info->offset = const0_rtx;
10737 info->const_offset = 0;
10738 return aarch64_base_register_rtx_p (x, strict_p);
10740 case PLUS:
10741 op0 = XEXP (x, 0);
10742 op1 = XEXP (x, 1);
10744 if (! strict_p
10745 && REG_P (op0)
10746 && virt_or_elim_regno_p (REGNO (op0))
10747 && poly_int_rtx_p (op1, &offset))
10749 info->type = ADDRESS_REG_IMM;
10750 info->base = op0;
10751 info->offset = op1;
10752 info->const_offset = offset;
10754 return true;
10757 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10758 && aarch64_base_register_rtx_p (op0, strict_p)
10759 && poly_int_rtx_p (op1, &offset))
10761 info->type = ADDRESS_REG_IMM;
10762 info->base = op0;
10763 info->offset = op1;
10764 info->const_offset = offset;
10766 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10767 registers and individual Q registers. The available
10768 address modes are:
10769 X,X: 7-bit signed scaled offset
10770 Q: 9-bit signed offset
10771 We conservatively require an offset representable in either mode.
10772 When performing the check for pairs of X registers, i.e. LDP/STP,
10773 pass down DImode since that is the natural size of the LDP/STP
10774 instruction memory accesses. */
10775 if (mode == TImode || mode == TFmode || mode == TDmode)
10776 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10777 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10778 || offset_12bit_unsigned_scaled_p (mode, offset)));
10780 if (mode == V8DImode)
10781 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10782 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10784 /* A 7-bit offset check because OImode will emit an ldp/stp
10785 instruction (only !TARGET_SIMD or big endian will get here).
10786 For ldp/stp instructions, the offset is scaled by the size of a
10787 single element of the pair. */
10788 if (aarch64_advsimd_partial_struct_mode_p (mode)
10789 && known_eq (GET_MODE_SIZE (mode), 16))
10790 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10791 if (aarch64_advsimd_full_struct_mode_p (mode)
10792 && known_eq (GET_MODE_SIZE (mode), 32))
10793 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10795 /* Three 9/12-bit offset checks because CImode will emit three
10796 ldr/str instructions (only !TARGET_SIMD or big endian will
10797 get here). */
10798 if (aarch64_advsimd_partial_struct_mode_p (mode)
10799 && known_eq (GET_MODE_SIZE (mode), 24))
10800 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10801 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10802 offset + 16)
10803 || offset_12bit_unsigned_scaled_p (DImode,
10804 offset + 16)));
10805 if (aarch64_advsimd_full_struct_mode_p (mode)
10806 && known_eq (GET_MODE_SIZE (mode), 48))
10807 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10808 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10809 offset + 32)
10810 || offset_12bit_unsigned_scaled_p (TImode,
10811 offset + 32)));
10813 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10814 instructions (only big endian will get here). */
10815 if (aarch64_advsimd_partial_struct_mode_p (mode)
10816 && known_eq (GET_MODE_SIZE (mode), 32))
10817 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10818 && aarch64_offset_7bit_signed_scaled_p (DImode,
10819 offset + 16));
10820 if (aarch64_advsimd_full_struct_mode_p (mode)
10821 && known_eq (GET_MODE_SIZE (mode), 64))
10822 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10823 && aarch64_offset_7bit_signed_scaled_p (TImode,
10824 offset + 32));
10826 /* Make "m" use the LD1 offset range for SVE data modes, so
10827 that pre-RTL optimizers like ivopts work to that range
10828 instead of the wider LDR/STR range. */
10829 if (vec_flags == VEC_SVE_DATA)
10830 return (type == ADDR_QUERY_M
10831 ? offset_4bit_signed_scaled_p (mode, offset)
10832 : offset_9bit_signed_scaled_p (mode, offset));
10834 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10836 poly_int64 end_offset = (offset
10837 + GET_MODE_SIZE (mode)
10838 - BYTES_PER_SVE_VECTOR);
10839 return (type == ADDR_QUERY_M
10840 ? offset_4bit_signed_scaled_p (mode, offset)
10841 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10842 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10843 end_offset)));
10846 if (vec_flags == VEC_SVE_PRED)
10847 return offset_9bit_signed_scaled_p (mode, offset);
10849 if (load_store_pair_p)
10850 return ((known_eq (GET_MODE_SIZE (mode), 4)
10851 || known_eq (GET_MODE_SIZE (mode), 8)
10852 || known_eq (GET_MODE_SIZE (mode), 16))
10853 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10854 else
10855 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10856 || offset_12bit_unsigned_scaled_p (mode, offset));
10859 if (allow_reg_index_p)
10861 /* Look for base + (scaled/extended) index register. */
10862 if (aarch64_base_register_rtx_p (op0, strict_p)
10863 && aarch64_classify_index (info, op1, mode, strict_p))
10865 info->base = op0;
10866 return true;
10868 if (aarch64_base_register_rtx_p (op1, strict_p)
10869 && aarch64_classify_index (info, op0, mode, strict_p))
10871 info->base = op1;
10872 return true;
10876 return false;
10878 case POST_INC:
10879 case POST_DEC:
10880 case PRE_INC:
10881 case PRE_DEC:
10882 info->type = ADDRESS_REG_WB;
10883 info->base = XEXP (x, 0);
10884 info->offset = NULL_RTX;
10885 return aarch64_base_register_rtx_p (info->base, strict_p);
10887 case POST_MODIFY:
10888 case PRE_MODIFY:
10889 info->type = ADDRESS_REG_WB;
10890 info->base = XEXP (x, 0);
10891 if (GET_CODE (XEXP (x, 1)) == PLUS
10892 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10893 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10894 && aarch64_base_register_rtx_p (info->base, strict_p))
10896 info->offset = XEXP (XEXP (x, 1), 1);
10897 info->const_offset = offset;
10899 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10900 registers and individual Q registers. The available
10901 address modes are:
10902 X,X: 7-bit signed scaled offset
10903 Q: 9-bit signed offset
10904 We conservatively require an offset representable in either mode. */
10906 if (mode == TImode || mode == TFmode || mode == TDmode)
10907 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10908 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10910 if (load_store_pair_p)
10911 return ((known_eq (GET_MODE_SIZE (mode), 4)
10912 || known_eq (GET_MODE_SIZE (mode), 8)
10913 || known_eq (GET_MODE_SIZE (mode), 16))
10914 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10915 else
10916 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10918 return false;
10920 case CONST:
10921 case SYMBOL_REF:
10922 case LABEL_REF:
10923 /* load literal: pc-relative constant pool entry. Only supported
10924 for SI mode or larger. */
10925 info->type = ADDRESS_SYMBOLIC;
10927 if (!load_store_pair_p
10928 && GET_MODE_SIZE (mode).is_constant (&const_size)
10929 && const_size >= 4)
10931 poly_int64 offset;
10932 rtx sym = strip_offset_and_salt (x, &offset);
10933 return ((LABEL_REF_P (sym)
10934 || (SYMBOL_REF_P (sym)
10935 && CONSTANT_POOL_ADDRESS_P (sym)
10936 && aarch64_pcrelative_literal_loads)));
10938 return false;
10940 case LO_SUM:
10941 info->type = ADDRESS_LO_SUM;
10942 info->base = XEXP (x, 0);
10943 info->offset = XEXP (x, 1);
10944 if (allow_reg_index_p
10945 && aarch64_base_register_rtx_p (info->base, strict_p))
10947 poly_int64 offset;
10948 HOST_WIDE_INT const_offset;
10949 rtx sym = strip_offset_and_salt (info->offset, &offset);
10950 if (SYMBOL_REF_P (sym)
10951 && offset.is_constant (&const_offset)
10952 && (aarch64_classify_symbol (sym, const_offset)
10953 == SYMBOL_SMALL_ABSOLUTE))
10955 /* The symbol and offset must be aligned to the access size. */
10956 unsigned int align;
10958 if (CONSTANT_POOL_ADDRESS_P (sym))
10959 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10960 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10962 tree exp = SYMBOL_REF_DECL (sym);
10963 align = TYPE_ALIGN (TREE_TYPE (exp));
10964 align = aarch64_constant_alignment (exp, align);
10966 else if (SYMBOL_REF_DECL (sym))
10967 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10968 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10969 && SYMBOL_REF_BLOCK (sym) != NULL)
10970 align = SYMBOL_REF_BLOCK (sym)->alignment;
10971 else
10972 align = BITS_PER_UNIT;
10974 poly_int64 ref_size = GET_MODE_SIZE (mode);
10975 if (known_eq (ref_size, 0))
10976 ref_size = GET_MODE_SIZE (DImode);
10978 return (multiple_p (const_offset, ref_size)
10979 && multiple_p (align / BITS_PER_UNIT, ref_size));
10982 return false;
10984 default:
10985 return false;
10989 /* Return true if the address X is valid for a PRFM instruction.
10990 STRICT_P is true if we should do strict checking with
10991 aarch64_classify_address. */
10993 bool
10994 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10996 struct aarch64_address_info addr;
10998 /* PRFM accepts the same addresses as DImode... */
10999 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11000 if (!res)
11001 return false;
11003 /* ... except writeback forms. */
11004 return addr.type != ADDRESS_REG_WB;
11007 bool
11008 aarch64_symbolic_address_p (rtx x)
11010 poly_int64 offset;
11011 x = strip_offset_and_salt (x, &offset);
11012 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11015 /* Classify the base of symbolic expression X. */
11017 enum aarch64_symbol_type
11018 aarch64_classify_symbolic_expression (rtx x)
11020 rtx offset;
11022 split_const (x, &x, &offset);
11023 return aarch64_classify_symbol (x, INTVAL (offset));
11027 /* Return TRUE if X is a legitimate address for accessing memory in
11028 mode MODE. */
11029 static bool
11030 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
11032 struct aarch64_address_info addr;
11034 return aarch64_classify_address (&addr, x, mode, strict_p);
11037 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11038 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11039 bool
11040 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11041 aarch64_addr_query_type type)
11043 struct aarch64_address_info addr;
11045 return aarch64_classify_address (&addr, x, mode, strict_p, type);
11048 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11050 static bool
11051 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11052 poly_int64 orig_offset,
11053 machine_mode mode)
11055 HOST_WIDE_INT size;
11056 if (GET_MODE_SIZE (mode).is_constant (&size))
11058 HOST_WIDE_INT const_offset, second_offset;
11060 /* A general SVE offset is A * VQ + B. Remove the A component from
11061 coefficient 0 in order to get the constant B. */
11062 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11064 /* Split an out-of-range address displacement into a base and
11065 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
11066 range otherwise to increase opportunities for sharing the base
11067 address of different sizes. Unaligned accesses use the signed
11068 9-bit range; TImode/TFmode/TDmode use the intersection of signed
11069 scaled 7-bit and signed 9-bit offset. */
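/* Worked example (illustrative): for an aligned 4-byte access at byte
   offset 0x10010, second_offset is 0x10010 & 0x3ffc == 0x10, so the
   address is split as 0x10000 + 0x10 and the 0x10000 part can be shared
   with neighbouring accesses off the same base.  */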
11070 if (mode == TImode || mode == TFmode || mode == TDmode)
11071 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11072 else if ((const_offset & (size - 1)) != 0)
11073 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11074 else
11075 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11077 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11078 return false;
11080 /* Split the offset into second_offset and the rest. */
11081 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11082 *offset2 = gen_int_mode (second_offset, Pmode);
11083 return true;
11085 else
11087 /* Get the mode we should use as the basis of the range. For structure
11088 modes this is the mode of one vector. */
11089 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11090 machine_mode step_mode
11091 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11093 /* Get the "mul vl" multiplier we'd like to use. */
11094 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11095 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11096 if (vec_flags & VEC_SVE_DATA)
11097 /* LDR supports a 9-bit range, but the move patterns for
11098 structure modes require all vectors to be in range of the
11099 same base. The simplest way of accommodating that while still
11100 promoting reuse of anchor points between different modes is
11101 to use an 8-bit range unconditionally. */
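/* Worked example (illustrative): vnum == 200 is rewritten as
   ((200 + 128) & 255) - 128 == -56, i.e. the offset is split into an
   anchor of 256 vectors plus an in-range offset of -56 vectors.  */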
11102 vnum = ((vnum + 128) & 255) - 128;
11103 else
11104 /* Predicates are only handled singly, so we might as well use
11105 the full range. */
11106 vnum = ((vnum + 256) & 511) - 256;
11107 if (vnum == 0)
11108 return false;
11110 /* Convert the "mul vl" multiplier into a byte offset. */
11111 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11112 if (known_eq (second_offset, orig_offset))
11113 return false;
11115 /* Split the offset into second_offset and the rest. */
11116 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11117 *offset2 = gen_int_mode (second_offset, Pmode);
11118 return true;
11122 /* Return the binary representation of floating point constant VALUE in INTVAL.
11123 If the value cannot be converted, return false without setting INTVAL.
11124 The conversion is done in the given MODE. */
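/* For example (illustrative): DFmode 1.0 is returned as 0x3ff0000000000000
   and SFmode 1.0 as 0x3f800000.  */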
11125 bool
11126 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11129 /* We make a general exception for 0. */
11130 if (aarch64_float_const_zero_rtx_p (value))
11132 *intval = 0;
11133 return true;
11136 scalar_float_mode mode;
11137 if (!CONST_DOUBLE_P (value)
11138 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11139 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11140 /* Only support up to DF mode. */
11141 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11142 return false;
11144 unsigned HOST_WIDE_INT ival = 0;
11146 long res[2];
11147 real_to_target (res,
11148 CONST_DOUBLE_REAL_VALUE (value),
11149 REAL_MODE_FORMAT (mode));
11151 if (mode == DFmode || mode == DDmode)
11153 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11154 ival = zext_hwi (res[order], 32);
11155 ival |= (zext_hwi (res[1 - order], 32) << 32);
11157 else
11158 ival = zext_hwi (res[0], 32);
11160 *intval = ival;
11161 return true;
11164 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11165 single MOV(+MOVK) followed by an FMOV. */
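/* For example (illustrative): DFmode 1.0e6 has the bit pattern
   0x412e848000000000, which the integer move code can build with a single
   MOV+MOVK pair, so it is accepted.  */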
11166 bool
11167 aarch64_float_const_rtx_p (rtx x)
11169 machine_mode mode = GET_MODE (x);
11170 if (mode == VOIDmode)
11171 return false;
11173 /* Determine whether it's cheaper to write float constants as
11174 mov/movk pairs rather than as ldr/adrp pairs. */
11175 unsigned HOST_WIDE_INT ival;
11177 if (CONST_DOUBLE_P (x)
11178 && SCALAR_FLOAT_MODE_P (mode)
11179 && aarch64_reinterpret_float_as_int (x, &ival))
11181 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
11182 int num_instr = aarch64_internal_mov_immediate
11183 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11184 return num_instr < 3;
11187 return false;
11190 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11191 Floating Point). */
11192 bool
11193 aarch64_float_const_zero_rtx_p (rtx x)
11195 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11196 zr as our callers expect, so no need to check the actual
11197 value if X is of Decimal Floating Point type. */
11198 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11199 return false;
11201 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11202 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11203 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11206 /* Return TRUE if rtx X is immediate constant that fits in a single
11207 MOVI immediate operation. */
11208 bool
11209 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11211 if (!TARGET_SIMD)
11212 return false;
11214 machine_mode vmode;
11215 scalar_int_mode imode;
11216 unsigned HOST_WIDE_INT ival;
11218 if (CONST_DOUBLE_P (x)
11219 && SCALAR_FLOAT_MODE_P (mode))
11221 if (!aarch64_reinterpret_float_as_int (x, &ival))
11222 return false;
11224 /* We make a general exception for 0. */
11225 if (aarch64_float_const_zero_rtx_p (x))
11226 return true;
11228 imode = int_mode_for_mode (mode).require ();
11230 else if (CONST_INT_P (x)
11231 && is_a <scalar_int_mode> (mode, &imode))
11232 ival = INTVAL (x);
11233 else
11234 return false;
11236 /* Use a 64-bit vector mode for everything except DI/DF/DD modes, where we
11237 use a 128-bit vector mode. */
11238 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11240 vmode = aarch64_simd_container_mode (imode, width);
11241 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11243 return aarch64_simd_valid_immediate (v_op, NULL);
11247 /* Return the fixed registers used for condition codes. */
11249 static bool
11250 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11252 *p1 = CC_REGNUM;
11253 *p2 = INVALID_REGNUM;
11254 return true;
11257 /* This function is used by the call expanders of the machine description.
11258 RESULT is the register in which the result is returned. It's NULL for
11259 "call" and "sibcall".
11260 MEM is the location of the function call.
11261 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11262 SIBCALL indicates whether this function call is a normal call or a
11263 sibling call; a different call pattern is generated accordingly. */
11265 void
11266 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11268 rtx call, callee, tmp;
11269 rtvec vec;
11270 machine_mode mode;
11272 gcc_assert (MEM_P (mem));
11273 callee = XEXP (mem, 0);
11274 mode = GET_MODE (callee);
11275 gcc_assert (mode == Pmode);
11277 /* Decide if we should generate indirect calls by loading the
11278 address of the callee into a register before performing
11279 the branch-and-link. */
11280 if (SYMBOL_REF_P (callee)
11281 ? (aarch64_is_long_call_p (callee)
11282 || aarch64_is_noplt_call_p (callee))
11283 : !REG_P (callee))
11284 XEXP (mem, 0) = force_reg (mode, callee);
11286 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11288 if (result != NULL_RTX)
11289 call = gen_rtx_SET (result, call);
11291 if (sibcall)
11292 tmp = ret_rtx;
11293 else
11294 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11296 gcc_assert (CONST_INT_P (callee_abi));
11297 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11298 UNSPEC_CALLEE_ABI);
11300 vec = gen_rtvec (3, call, callee_abi, tmp);
11301 call = gen_rtx_PARALLEL (VOIDmode, vec);
11303 aarch64_emit_call_insn (call);
11306 /* Emit call insn with PAT and do aarch64-specific handling. */
11308 void
11309 aarch64_emit_call_insn (rtx pat)
11311 rtx insn = emit_call_insn (pat);
11313 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11314 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11315 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11318 machine_mode
11319 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11321 machine_mode mode_x = GET_MODE (x);
11322 rtx_code code_x = GET_CODE (x);
11324 /* All floating point compares return CCFP if it is an equality
11325 comparison, and CCFPE otherwise. */
11326 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11328 switch (code)
11330 case EQ:
11331 case NE:
11332 case UNORDERED:
11333 case ORDERED:
11334 case UNLT:
11335 case UNLE:
11336 case UNGT:
11337 case UNGE:
11338 case UNEQ:
11339 return CCFPmode;
11341 case LT:
11342 case LE:
11343 case GT:
11344 case GE:
11345 case LTGT:
11346 return CCFPEmode;
11348 default:
11349 gcc_unreachable ();
11353 /* Equality comparisons of short modes against zero can be performed
11354 using the TST instruction with the appropriate bitmask. */
11355 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11356 && (code == EQ || code == NE)
11357 && (mode_x == HImode || mode_x == QImode))
11358 return CC_Zmode;
11360 /* Similarly, comparisons of zero_extends from shorter modes can
11361 be performed using an ANDS with an immediate mask. */
11362 if (y == const0_rtx && code_x == ZERO_EXTEND
11363 && (mode_x == SImode || mode_x == DImode)
11364 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11365 && (code == EQ || code == NE))
11366 return CC_Zmode;
11368 /* Zero extracts support equality comparisons. */
11369 if ((mode_x == SImode || mode_x == DImode)
11370 && y == const0_rtx
11371 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11372 && CONST_INT_P (XEXP (x, 2)))
11373 && (code == EQ || code == NE))
11374 return CC_Zmode;
11376 /* ANDS/BICS/TST support equality and all signed comparisons. */
11377 if ((mode_x == SImode || mode_x == DImode)
11378 && y == const0_rtx
11379 && (code_x == AND)
11380 && (code == EQ || code == NE || code == LT || code == GE
11381 || code == GT || code == LE))
11382 return CC_NZVmode;
11384 /* ADDS/SUBS correctly set N and Z flags. */
11385 if ((mode_x == SImode || mode_x == DImode)
11386 && y == const0_rtx
11387 && (code == EQ || code == NE || code == LT || code == GE)
11388 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11389 return CC_NZmode;
11391 /* A compare with a shifted operand. Because of canonicalization,
11392 the comparison will have to be swapped when we emit the assembly
11393 code. */
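/* For example (illustrative): (compare (ashift:DI (reg x1) (const_int 3))
   (reg x2)) is emitted roughly as "cmp x2, x1, lsl 3", so a GE test must
   be output as LE; see the CC_SWP case in aarch64_get_condition_code_1.  */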
11394 if ((mode_x == SImode || mode_x == DImode)
11395 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11396 && (code_x == ASHIFT || code_x == ASHIFTRT
11397 || code_x == LSHIFTRT
11398 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11399 return CC_SWPmode;
11401 /* Similarly for a negated operand, but we can only do this for
11402 equalities. */
11403 if ((mode_x == SImode || mode_x == DImode)
11404 && (REG_P (y) || SUBREG_P (y))
11405 && (code == EQ || code == NE)
11406 && code_x == NEG)
11407 return CC_Zmode;
11409 /* A test for unsigned overflow from an addition. */
11410 if ((mode_x == DImode || mode_x == TImode)
11411 && (code == LTU || code == GEU)
11412 && code_x == PLUS
11413 && rtx_equal_p (XEXP (x, 0), y))
11414 return CC_Cmode;
11416 /* A test for unsigned overflow from an add with carry. */
11417 if ((mode_x == DImode || mode_x == TImode)
11418 && (code == LTU || code == GEU)
11419 && code_x == PLUS
11420 && CONST_SCALAR_INT_P (y)
11421 && (rtx_mode_t (y, mode_x)
11422 == (wi::shwi (1, mode_x)
11423 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11424 return CC_ADCmode;
11426 /* A test for signed overflow. */
11427 if ((mode_x == DImode || mode_x == TImode)
11428 && code == NE
11429 && code_x == PLUS
11430 && GET_CODE (y) == SIGN_EXTEND)
11431 return CC_Vmode;
11433 /* For everything else, return CCmode. */
11434 return CCmode;
11437 static int
11438 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11441 aarch64_get_condition_code (rtx x)
11443 machine_mode mode = GET_MODE (XEXP (x, 0));
11444 enum rtx_code comp_code = GET_CODE (x);
11446 if (GET_MODE_CLASS (mode) != MODE_CC)
11447 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11448 return aarch64_get_condition_code_1 (mode, comp_code);
11451 static int
11452 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11454 switch (mode)
11456 case E_CCFPmode:
11457 case E_CCFPEmode:
11458 switch (comp_code)
11460 case GE: return AARCH64_GE;
11461 case GT: return AARCH64_GT;
11462 case LE: return AARCH64_LS;
11463 case LT: return AARCH64_MI;
11464 case NE: return AARCH64_NE;
11465 case EQ: return AARCH64_EQ;
11466 case ORDERED: return AARCH64_VC;
11467 case UNORDERED: return AARCH64_VS;
11468 case UNLT: return AARCH64_LT;
11469 case UNLE: return AARCH64_LE;
11470 case UNGT: return AARCH64_HI;
11471 case UNGE: return AARCH64_PL;
11472 default: return -1;
11474 break;
11476 case E_CCmode:
11477 switch (comp_code)
11479 case NE: return AARCH64_NE;
11480 case EQ: return AARCH64_EQ;
11481 case GE: return AARCH64_GE;
11482 case GT: return AARCH64_GT;
11483 case LE: return AARCH64_LE;
11484 case LT: return AARCH64_LT;
11485 case GEU: return AARCH64_CS;
11486 case GTU: return AARCH64_HI;
11487 case LEU: return AARCH64_LS;
11488 case LTU: return AARCH64_CC;
11489 default: return -1;
11491 break;
11493 case E_CC_SWPmode:
11494 switch (comp_code)
11496 case NE: return AARCH64_NE;
11497 case EQ: return AARCH64_EQ;
11498 case GE: return AARCH64_LE;
11499 case GT: return AARCH64_LT;
11500 case LE: return AARCH64_GE;
11501 case LT: return AARCH64_GT;
11502 case GEU: return AARCH64_LS;
11503 case GTU: return AARCH64_CC;
11504 case LEU: return AARCH64_CS;
11505 case LTU: return AARCH64_HI;
11506 default: return -1;
11508 break;
11510 case E_CC_NZCmode:
11511 switch (comp_code)
11513 case NE: return AARCH64_NE; /* = any */
11514 case EQ: return AARCH64_EQ; /* = none */
11515 case GE: return AARCH64_PL; /* = nfrst */
11516 case LT: return AARCH64_MI; /* = first */
11517 case GEU: return AARCH64_CS; /* = nlast */
11518 case GTU: return AARCH64_HI; /* = pmore */
11519 case LEU: return AARCH64_LS; /* = plast */
11520 case LTU: return AARCH64_CC; /* = last */
11521 default: return -1;
11523 break;
11525 case E_CC_NZVmode:
11526 switch (comp_code)
11528 case NE: return AARCH64_NE;
11529 case EQ: return AARCH64_EQ;
11530 case GE: return AARCH64_PL;
11531 case LT: return AARCH64_MI;
11532 case GT: return AARCH64_GT;
11533 case LE: return AARCH64_LE;
11534 default: return -1;
11536 break;
11538 case E_CC_NZmode:
11539 switch (comp_code)
11541 case NE: return AARCH64_NE;
11542 case EQ: return AARCH64_EQ;
11543 case GE: return AARCH64_PL;
11544 case LT: return AARCH64_MI;
11545 default: return -1;
11547 break;
11549 case E_CC_Zmode:
11550 switch (comp_code)
11552 case NE: return AARCH64_NE;
11553 case EQ: return AARCH64_EQ;
11554 default: return -1;
11556 break;
11558 case E_CC_Cmode:
11559 switch (comp_code)
11561 case LTU: return AARCH64_CS;
11562 case GEU: return AARCH64_CC;
11563 default: return -1;
11565 break;
11567 case E_CC_ADCmode:
11568 switch (comp_code)
11570 case GEU: return AARCH64_CS;
11571 case LTU: return AARCH64_CC;
11572 default: return -1;
11574 break;
11576 case E_CC_Vmode:
11577 switch (comp_code)
11579 case NE: return AARCH64_VS;
11580 case EQ: return AARCH64_VC;
11581 default: return -1;
11583 break;
11585 default:
11586 return -1;
11589 return -1;
11592 bool
11593 aarch64_const_vec_all_same_in_range_p (rtx x,
11594 HOST_WIDE_INT minval,
11595 HOST_WIDE_INT maxval)
11597 rtx elt;
11598 return (const_vec_duplicate_p (x, &elt)
11599 && CONST_INT_P (elt)
11600 && IN_RANGE (INTVAL (elt), minval, maxval));
11603 bool
11604 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11606 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11609 /* Return true if VEC is a constant in which every element is in the range
11610 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11612 static bool
11613 aarch64_const_vec_all_in_range_p (rtx vec,
11614 HOST_WIDE_INT minval,
11615 HOST_WIDE_INT maxval)
11617 if (!CONST_VECTOR_P (vec)
11618 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11619 return false;
11621 int nunits;
11622 if (!CONST_VECTOR_STEPPED_P (vec))
11623 nunits = const_vector_encoded_nelts (vec);
11624 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11625 return false;
11627 for (int i = 0; i < nunits; i++)
11629 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11630 if (!CONST_INT_P (vec_elem)
11631 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11632 return false;
11634 return true;
11637 /* N Z C V. */
11638 #define AARCH64_CC_V 1
11639 #define AARCH64_CC_C (1 << 1)
11640 #define AARCH64_CC_Z (1 << 2)
11641 #define AARCH64_CC_N (1 << 3)
11643 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11644 static const int aarch64_nzcv_codes[] =
11646 0, /* EQ, Z == 1. */
11647 AARCH64_CC_Z, /* NE, Z == 0. */
11648 0, /* CS, C == 1. */
11649 AARCH64_CC_C, /* CC, C == 0. */
11650 0, /* MI, N == 1. */
11651 AARCH64_CC_N, /* PL, N == 0. */
11652 0, /* VS, V == 1. */
11653 AARCH64_CC_V, /* VC, V == 0. */
11654 0, /* HI, C == 1 && Z == 0. */
11655 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11656 AARCH64_CC_V, /* GE, N == V. */
11657 0, /* LT, N != V. */
11658 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11659 0, /* LE, !(Z == 0 && N == V). */
11660 0, /* AL, Any. */
11661 0 /* NV, Any. */
11664 /* Print floating-point vector immediate operand X to F, negating it
11665 first if NEGATE is true. Return true on success, false if it isn't
11666 a constant we can handle. */
11668 static bool
11669 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11671 rtx elt;
11673 if (!const_vec_duplicate_p (x, &elt))
11674 return false;
11676 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11677 if (negate)
11678 r = real_value_negate (&r);
11680 /* Handle the SVE single-bit immediates specially, since they have a
11681 fixed form in the assembly syntax. */
11682 if (real_equal (&r, &dconst0))
11683 asm_fprintf (f, "0.0");
11684 else if (real_equal (&r, &dconst2))
11685 asm_fprintf (f, "2.0");
11686 else if (real_equal (&r, &dconst1))
11687 asm_fprintf (f, "1.0");
11688 else if (real_equal (&r, &dconsthalf))
11689 asm_fprintf (f, "0.5");
11690 else
11692 const int buf_size = 20;
11693 char float_buf[buf_size] = {'\0'};
11694 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11695 1, GET_MODE (elt));
11696 asm_fprintf (f, "%s", float_buf);
11699 return true;
11702 /* Return the equivalent letter for size. */
11703 static char
11704 sizetochar (int size)
11706 switch (size)
11708 case 64: return 'd';
11709 case 32: return 's';
11710 case 16: return 'h';
11711 case 8 : return 'b';
11712 default: gcc_unreachable ();
11716 /* Print operand X to file F in a target specific manner according to CODE.
11717 The acceptable formatting commands given by CODE are:
11718 'c': An integer or symbol address without a preceding #
11719 sign.
11720 'C': Take the duplicated element in a vector constant
11721 and print it in hex.
11722 'D': Take the duplicated element in a vector constant
11723 and print it as an unsigned integer, in decimal.
11724 'e': Print the sign/zero-extend size as a character 8->b,
11725 16->h, 32->w. Can also be used for masks:
11726 0xff->b, 0xffff->h, 0xffffffff->w.
11727 'I': If the operand is a duplicated vector constant,
11728 replace it with the duplicated scalar. If the
11729 operand is then a floating-point constant, replace
11730 it with the integer bit representation. Print the
11731 transformed constant as a signed decimal number.
11732 'p': Prints N such that 2^N == X (X must be a power of 2 and
11733 a const_int).
11734 'P': Print the number of non-zero bits in X (a const_int).
11735 'H': Print the higher numbered register of a pair (TImode)
11736 of regs.
11737 'm': Print a condition (eq, ne, etc).
11738 'M': Same as 'm', but invert condition.
11739 'N': Take the duplicated element in a vector constant
11740 and print the negative of it in decimal.
11741 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11742 'S/T/U/V': Print a FP/SIMD register name for a register list.
11743 The register printed is the FP/SIMD register name
11744 of X + 0/1/2/3 for S/T/U/V.
11745 'R': Print a scalar Integer/FP/SIMD register name + 1.
11746 'X': Print bottom 16 bits of integer constant in hex.
11747 'w/x': Print a general register name or the zero register
11748 (32-bit or 64-bit).
11749 '0': Print a normal operand, if it's a general register,
11750 then we assume DImode.
11751 'k': Print NZCV for conditional compare instructions.
11752 'A': Output address constant representing the first
11753 argument of X, specifying a relocation offset
11754 if appropriate.
11755 'L': Output constant address specified by X
11756 with a relocation offset if appropriate.
11757 'G': Prints address of X, specifying a PC relative
11758 relocation mode if appropriate.
11759 'y': Output address of LDP or STP - this is used for
11760 some LDP/STPs which don't use a PARALLEL in their
11761 pattern (so the mode needs to be adjusted).
11762 'z': Output address of a typical LDP or STP. */
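/* Illustrative examples of the codes above, based on the handling below:
   %p of (const_int 8) prints "3"; %X of (const_int 0x12345678) prints
   "0x5678"; %w of register x1 prints "w1" and of const0_rtx prints "wzr".  */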
11764 static void
11765 aarch64_print_operand (FILE *f, rtx x, int code)
11767 rtx elt;
11768 switch (code)
11770 case 'c':
11771 if (CONST_INT_P (x))
11772 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11773 else
11775 poly_int64 offset;
11776 rtx base = strip_offset_and_salt (x, &offset);
11777 if (SYMBOL_REF_P (base))
11778 output_addr_const (f, x);
11779 else
11780 output_operand_lossage ("unsupported operand for code '%c'", code);
11782 break;
11784 case 'e':
11786 x = unwrap_const_vec_duplicate (x);
11787 if (!CONST_INT_P (x))
11789 output_operand_lossage ("invalid operand for '%%%c'", code);
11790 return;
11793 HOST_WIDE_INT val = INTVAL (x);
11794 if ((val & ~7) == 8 || val == 0xff)
11795 fputc ('b', f);
11796 else if ((val & ~7) == 16 || val == 0xffff)
11797 fputc ('h', f);
11798 else if ((val & ~7) == 32 || val == 0xffffffff)
11799 fputc ('w', f);
11800 else
11802 output_operand_lossage ("invalid operand for '%%%c'", code);
11803 return;
11806 break;
11808 case 'p':
11810 int n;
11812 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11814 output_operand_lossage ("invalid operand for '%%%c'", code);
11815 return;
11818 asm_fprintf (f, "%d", n);
11820 break;
11822 case 'P':
11823 if (!CONST_INT_P (x))
11825 output_operand_lossage ("invalid operand for '%%%c'", code);
11826 return;
11829 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11830 break;
11832 case 'H':
11833 if (x == const0_rtx)
11835 asm_fprintf (f, "xzr");
11836 break;
11839 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11841 output_operand_lossage ("invalid operand for '%%%c'", code);
11842 return;
11845 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11846 break;
11848 case 'I':
11850 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11851 if (CONST_INT_P (x))
11852 asm_fprintf (f, "%wd", INTVAL (x));
11853 else
11855 output_operand_lossage ("invalid operand for '%%%c'", code);
11856 return;
11858 break;
11861 case 'M':
11862 case 'm':
11864 int cond_code;
11865 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11866 if (x == const_true_rtx)
11868 if (code == 'M')
11869 fputs ("nv", f);
11870 return;
11873 if (!COMPARISON_P (x))
11875 output_operand_lossage ("invalid operand for '%%%c'", code);
11876 return;
11879 cond_code = aarch64_get_condition_code (x);
11880 gcc_assert (cond_code >= 0);
11881 if (code == 'M')
11882 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11883 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11884 fputs (aarch64_sve_condition_codes[cond_code], f);
11885 else
11886 fputs (aarch64_condition_codes[cond_code], f);
11888 break;
11890 case 'N':
11891 if (!const_vec_duplicate_p (x, &elt))
11893 output_operand_lossage ("invalid vector constant");
11894 return;
11897 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11898 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
11899 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11900 && aarch64_print_vector_float_operand (f, x, true))
11902 else
11904 output_operand_lossage ("invalid vector constant");
11905 return;
11907 break;
11909 case 'b':
11910 case 'h':
11911 case 's':
11912 case 'd':
11913 case 'q':
11914 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11916 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11917 return;
11919 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
11920 break;
11922 case 'S':
11923 case 'T':
11924 case 'U':
11925 case 'V':
11926 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11928 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11929 return;
11931 asm_fprintf (f, "%c%d",
11932 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
11933 REGNO (x) - V0_REGNUM + (code - 'S'));
11934 break;
11936 case 'R':
11937 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
11938 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
11939 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
11940 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
11941 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
11942 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11943 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
11944 else
11945 output_operand_lossage ("incompatible register operand for '%%%c'",
11946 code);
11947 break;
11949 case 'X':
11950 if (!CONST_INT_P (x))
11952 output_operand_lossage ("invalid operand for '%%%c'", code);
11953 return;
11955 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
11956 break;
11958 case 'C':
11960 /* Print a replicated constant in hex. */
11961 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11963 output_operand_lossage ("invalid operand for '%%%c'", code);
11964 return;
11966 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11967 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11969 break;
11971 case 'D':
11973 /* Print a replicated constant in decimal, treating it as
11974 unsigned. */
11975 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11977 output_operand_lossage ("invalid operand for '%%%c'", code);
11978 return;
11980 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11981 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11983 break;
11985 case 'w':
11986 case 'x':
11987 if (x == const0_rtx
11988 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
11990 asm_fprintf (f, "%czr", code);
11991 break;
11994 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11996 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
11997 break;
12000 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12002 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12003 break;
12006 /* Fall through */
12008 case 0:
12009 if (x == NULL)
12011 output_operand_lossage ("missing operand");
12012 return;
12015 switch (GET_CODE (x))
12017 case REG:
12018 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12020 if (REG_NREGS (x) == 1)
12021 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12022 else
12024 char suffix
12025 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12026 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12027 REGNO (x) - V0_REGNUM, suffix,
12028 END_REGNO (x) - V0_REGNUM - 1, suffix);
12031 else
12032 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12033 break;
12035 case MEM:
12036 output_address (GET_MODE (x), XEXP (x, 0));
12037 break;
12039 case LABEL_REF:
12040 case SYMBOL_REF:
12041 output_addr_const (asm_out_file, x);
12042 break;
12044 case CONST_INT:
12045 asm_fprintf (f, "%wd", INTVAL (x));
12046 break;
12048 case CONST:
12049 if (!VECTOR_MODE_P (GET_MODE (x)))
12051 output_addr_const (asm_out_file, x);
12052 break;
12054 /* fall through */
12056 case CONST_VECTOR:
12057 if (!const_vec_duplicate_p (x, &elt))
12059 output_operand_lossage ("invalid vector constant");
12060 return;
12063 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12064 asm_fprintf (f, "%wd", INTVAL (elt));
12065 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12066 && aarch64_print_vector_float_operand (f, x, false))
12068 else
12070 output_operand_lossage ("invalid vector constant");
12071 return;
12073 break;
12075 case CONST_DOUBLE:
12076 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12077 be getting CONST_DOUBLEs holding integers. */
12078 gcc_assert (GET_MODE (x) != VOIDmode);
12079 if (aarch64_float_const_zero_rtx_p (x))
12081 fputc ('0', f);
12082 break;
12084 else if (aarch64_float_const_representable_p (x))
12086 #define buf_size 20
12087 char float_buf[buf_size] = {'\0'};
12088 real_to_decimal_for_mode (float_buf,
12089 CONST_DOUBLE_REAL_VALUE (x),
12090 buf_size, buf_size,
12091 1, GET_MODE (x));
12092 asm_fprintf (asm_out_file, "%s", float_buf);
12093 break;
12094 #undef buf_size
12096 output_operand_lossage ("invalid constant");
12097 return;
12098 default:
12099 output_operand_lossage ("invalid operand");
12100 return;
12102 break;
12104 case 'A':
12105 if (GET_CODE (x) == HIGH)
12106 x = XEXP (x, 0);
12108 switch (aarch64_classify_symbolic_expression (x))
12110 case SYMBOL_SMALL_GOT_4G:
12111 asm_fprintf (asm_out_file, ":got:");
12112 break;
12114 case SYMBOL_SMALL_TLSGD:
12115 asm_fprintf (asm_out_file, ":tlsgd:");
12116 break;
12118 case SYMBOL_SMALL_TLSDESC:
12119 asm_fprintf (asm_out_file, ":tlsdesc:");
12120 break;
12122 case SYMBOL_SMALL_TLSIE:
12123 asm_fprintf (asm_out_file, ":gottprel:");
12124 break;
12126 case SYMBOL_TLSLE24:
12127 asm_fprintf (asm_out_file, ":tprel:");
12128 break;
12130 case SYMBOL_TINY_GOT:
12131 gcc_unreachable ();
12132 break;
12134 default:
12135 break;
12137 output_addr_const (asm_out_file, x);
12138 break;
12140 case 'L':
12141 switch (aarch64_classify_symbolic_expression (x))
12143 case SYMBOL_SMALL_GOT_4G:
12144 asm_fprintf (asm_out_file, ":got_lo12:");
12145 break;
12147 case SYMBOL_SMALL_TLSGD:
12148 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12149 break;
12151 case SYMBOL_SMALL_TLSDESC:
12152 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12153 break;
12155 case SYMBOL_SMALL_TLSIE:
12156 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12157 break;
12159 case SYMBOL_TLSLE12:
12160 asm_fprintf (asm_out_file, ":tprel_lo12:");
12161 break;
12163 case SYMBOL_TLSLE24:
12164 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12165 break;
12167 case SYMBOL_TINY_GOT:
12168 asm_fprintf (asm_out_file, ":got:");
12169 break;
12171 case SYMBOL_TINY_TLSIE:
12172 asm_fprintf (asm_out_file, ":gottprel:");
12173 break;
12175 default:
12176 break;
12178 output_addr_const (asm_out_file, x);
12179 break;
12181 case 'G':
12182 switch (aarch64_classify_symbolic_expression (x))
12184 case SYMBOL_TLSLE24:
12185 asm_fprintf (asm_out_file, ":tprel_hi12:");
12186 break;
12187 default:
12188 break;
12190 output_addr_const (asm_out_file, x);
12191 break;
12193 case 'k':
12195 HOST_WIDE_INT cond_code;
12197 if (!CONST_INT_P (x))
12199 output_operand_lossage ("invalid operand for '%%%c'", code);
12200 return;
12203 cond_code = INTVAL (x);
12204 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12205 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12207 break;
12209 case 'y':
12210 case 'z':
12212 machine_mode mode = GET_MODE (x);
12214 if (!MEM_P (x)
12215 || (code == 'y'
12216 && maybe_ne (GET_MODE_SIZE (mode), 8)
12217 && maybe_ne (GET_MODE_SIZE (mode), 16)))
12219 output_operand_lossage ("invalid operand for '%%%c'", code);
12220 return;
12223 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12224 code == 'y'
12225 ? ADDR_QUERY_LDP_STP_N
12226 : ADDR_QUERY_LDP_STP))
12227 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12229 break;
12231 default:
12232 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12233 return;
12237 /* Print address 'x' of a memory access with mode 'mode'.
12238 'type' is the aarch64_addr_query_type context required by
12239 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for LDP/STP. */
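/* For example (illustrative; x0 is an arbitrary base): an SVE vector access
   at a byte offset of two vector lengths prints as "[x0, #2, mul vl]",
   while a 16-byte base-plus-offset LDP/STP address prints as "[x0, 32]".  */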
12240 static bool
12241 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12242 aarch64_addr_query_type type)
12244 struct aarch64_address_info addr;
12245 unsigned int size, vec_flags;
12247 /* Check all addresses are Pmode - including ILP32. */
12248 if (GET_MODE (x) != Pmode
12249 && (!CONST_INT_P (x)
12250 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12252 output_operand_lossage ("invalid address mode");
12253 return false;
12256 if (aarch64_classify_address (&addr, x, mode, true, type))
12257 switch (addr.type)
12259 case ADDRESS_REG_IMM:
12260 if (known_eq (addr.const_offset, 0))
12262 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12263 return true;
12266 vec_flags = aarch64_classify_vector_mode (mode);
12267 if (vec_flags & VEC_ANY_SVE)
12269 HOST_WIDE_INT vnum
12270 = exact_div (addr.const_offset,
12271 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12272 asm_fprintf (f, "[%s, #%wd, mul vl]",
12273 reg_names[REGNO (addr.base)], vnum);
12274 return true;
12277 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12278 INTVAL (addr.offset));
12279 return true;
12281 case ADDRESS_REG_REG:
12282 if (addr.shift == 0)
12283 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12284 reg_names [REGNO (addr.offset)]);
12285 else
12286 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12287 reg_names [REGNO (addr.offset)], addr.shift);
12288 return true;
12290 case ADDRESS_REG_UXTW:
12291 if (addr.shift == 0)
12292 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12293 REGNO (addr.offset) - R0_REGNUM);
12294 else
12295 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12296 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12297 return true;
12299 case ADDRESS_REG_SXTW:
12300 if (addr.shift == 0)
12301 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12302 REGNO (addr.offset) - R0_REGNUM);
12303 else
12304 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12305 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12306 return true;
12308 case ADDRESS_REG_WB:
12309 /* Writeback is only supported for fixed-width modes. */
12310 size = GET_MODE_SIZE (mode).to_constant ();
12311 switch (GET_CODE (x))
12313 case PRE_INC:
12314 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12315 return true;
12316 case POST_INC:
12317 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12318 return true;
12319 case PRE_DEC:
12320 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12321 return true;
12322 case POST_DEC:
12323 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12324 return true;
12325 case PRE_MODIFY:
12326 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12327 INTVAL (addr.offset));
12328 return true;
12329 case POST_MODIFY:
12330 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12331 INTVAL (addr.offset));
12332 return true;
12333 default:
12334 break;
12336 break;
12338 case ADDRESS_LO_SUM:
12339 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12340 output_addr_const (f, addr.offset);
12341 asm_fprintf (f, "]");
12342 return true;
12344 case ADDRESS_SYMBOLIC:
12345 output_addr_const (f, x);
12346 return true;
12349 return false;
12352 /* Print address 'x' of a memory access with mode 'mode'. */
12353 static void
12354 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12356 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12357 output_addr_const (f, x);
12360 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12362 static bool
12363 aarch64_output_addr_const_extra (FILE *file, rtx x)
12365 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12367 output_addr_const (file, XVECEXP (x, 0, 0));
12368 return true;
12370 return false;
12373 bool
12374 aarch64_label_mentioned_p (rtx x)
12376 const char *fmt;
12377 int i;
12379 if (LABEL_REF_P (x))
12380 return true;
12382 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12383 referencing instruction, but they are constant offsets, not
12384 symbols. */
12385 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12386 return false;
12388 fmt = GET_RTX_FORMAT (GET_CODE (x));
12389 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12391 if (fmt[i] == 'E')
12393 int j;
12395 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12396 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12397 return 1;
12399 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12400 return 1;
12403 return 0;
12406 /* Implement REGNO_REG_CLASS. */
12408 enum reg_class
12409 aarch64_regno_regclass (unsigned regno)
12411 if (STUB_REGNUM_P (regno))
12412 return STUB_REGS;
12414 if (GP_REGNUM_P (regno))
12415 return GENERAL_REGS;
12417 if (regno == SP_REGNUM)
12418 return STACK_REG;
12420 if (regno == FRAME_POINTER_REGNUM
12421 || regno == ARG_POINTER_REGNUM)
12422 return POINTER_REGS;
12424 if (FP_REGNUM_P (regno))
12425 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12426 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12428 if (PR_REGNUM_P (regno))
12429 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12431 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12432 return FFR_REGS;
12434 return NO_REGS;
12437 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12438 If OFFSET is out of range, return an offset of an anchor point
12439 that is in range. Return 0 otherwise. */
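/* Worked example (illustrative): a 4-byte access at offset 0x12344 is
   aligned, so the function returns 0x12344 & ~0x3fff == 0x10000; the
   remaining offset 0x2344 is within the scaled 12-bit range for SImode.  */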
12441 static HOST_WIDE_INT
12442 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12443 machine_mode mode)
12445 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12446 if (size > 16)
12447 return (offset + 0x400) & ~0x7f0;
12449 /* For offsets that aren't a multiple of the access size, the limit is
12450 -256...255. */
12451 if (offset & (size - 1))
12453 /* BLKmode typically uses LDP of X-registers. */
12454 if (mode == BLKmode)
12455 return (offset + 512) & ~0x3ff;
12456 return (offset + 0x100) & ~0x1ff;
12459 /* Small negative offsets are supported. */
12460 if (IN_RANGE (offset, -256, 0))
12461 return 0;
12463 if (mode == TImode || mode == TFmode || mode == TDmode)
12464 return (offset + 0x100) & ~0x1ff;
12466 /* Use the unsigned 12-bit offset, scaled by the access size. */
12467 return offset & (~0xfff * size);
12470 static rtx
12471 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12473 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12474 where mask is selected by alignment and size of the offset.
12475 We try to pick as large a range for the offset as possible to
12476 maximize the chance of a CSE. However, for aligned addresses
12477 we limit the range to 4k so that structures with different sized
12478 elements are likely to use the same base. We need to be careful
12479 not to split a CONST for some forms of address expression, otherwise
12480 it will generate sub-optimal code. */
12482 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12484 rtx base = XEXP (x, 0);
12485 rtx offset_rtx = XEXP (x, 1);
12486 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12488 if (GET_CODE (base) == PLUS)
12490 rtx op0 = XEXP (base, 0);
12491 rtx op1 = XEXP (base, 1);
12493 /* Force any scaling into a temp for CSE. */
12494 op0 = force_reg (Pmode, op0);
12495 op1 = force_reg (Pmode, op1);
12497 /* Let the pointer register be in op0. */
12498 if (REG_POINTER (op1))
12499 std::swap (op0, op1);
12501 /* If the pointer is virtual or frame related, then we know that
12502 virtual register instantiation or register elimination is going
12503 to apply a second constant. We want the two constants folded
12504 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12505 if (virt_or_elim_regno_p (REGNO (op0)))
12507 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12508 NULL_RTX, true, OPTAB_DIRECT);
12509 return gen_rtx_PLUS (Pmode, base, op1);
12512 /* Otherwise, in order to encourage CSE (and thence loop strength
12513 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12514 base = expand_binop (Pmode, add_optab, op0, op1,
12515 NULL_RTX, true, OPTAB_DIRECT);
12516 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12519 HOST_WIDE_INT size;
12520 if (GET_MODE_SIZE (mode).is_constant (&size))
12522 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12523 mode);
12524 if (base_offset != 0)
12526 base = plus_constant (Pmode, base, base_offset);
12527 base = force_operand (base, NULL_RTX);
12528 return plus_constant (Pmode, base, offset - base_offset);
12533 return x;
12536 static reg_class_t
12537 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12538 reg_class_t rclass,
12539 machine_mode mode,
12540 secondary_reload_info *sri)
12542 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12543 LDR and STR. See the comment at the head of aarch64-sve.md for
12544 more details about the big-endian handling. */
12545 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12546 if (reg_class_subset_p (rclass, FP_REGS)
12547 && !((REG_P (x) && HARD_REGISTER_P (x))
12548 || aarch64_simd_valid_immediate (x, NULL))
12549 && mode != VNx16QImode
12550 && (vec_flags & VEC_SVE_DATA)
12551 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12553 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12554 return NO_REGS;
12557 /* If we have to disable direct literal pool loads and stores because the
12558 function is too big, then we need a scratch register. */
12559 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12560 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12561 || targetm.vector_mode_supported_p (GET_MODE (x)))
12562 && !aarch64_pcrelative_literal_loads)
12564 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12565 return NO_REGS;
12568 /* Without the TARGET_SIMD instructions we cannot move a Q register
12569 to a Q register directly. We need a scratch. */
12570 if (REG_P (x)
12571 && (mode == TFmode
12572 || mode == TImode
12573 || mode == TDmode
12574 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12575 && mode == GET_MODE (x)
12576 && !TARGET_SIMD
12577 && FP_REGNUM_P (REGNO (x))
12578 && reg_class_subset_p (rclass, FP_REGS))
12580 sri->icode = code_for_aarch64_reload_mov (mode);
12581 return NO_REGS;
12584 /* A TFmode, TImode or TDmode memory access should be handled via FP_REGS
12585 because AArch64 has richer addressing modes for LDR/STR instructions
12586 than for LDP/STP instructions. */
12587 if (TARGET_FLOAT && rclass == GENERAL_REGS
12588 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12589 return FP_REGS;
12591 if (rclass == FP_REGS
12592 && (mode == TImode || mode == TFmode || mode == TDmode)
12593 && CONSTANT_P(x))
12594 return GENERAL_REGS;
12596 return NO_REGS;
12599 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12601 static bool
12602 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12603 reg_class_t class2)
12605 if (!TARGET_SIMD
12606 && reg_classes_intersect_p (class1, FP_REGS)
12607 && reg_classes_intersect_p (class2, FP_REGS))
12609 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12610 so we can't easily split a move involving tuples of 128-bit
12611 vectors. Force the copy through memory instead.
12613 (Tuples of 64-bit vectors are fine.) */
12614 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12615 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12616 return true;
12618 return false;
12621 static bool
12622 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12624 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12626 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12627 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12628 if (frame_pointer_needed)
12629 return to == HARD_FRAME_POINTER_REGNUM;
12630 return true;
12633 poly_int64
12634 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12636 if (to == HARD_FRAME_POINTER_REGNUM)
12638 if (from == ARG_POINTER_REGNUM)
12639 return cfun->machine->frame.hard_fp_offset;
12641 if (from == FRAME_POINTER_REGNUM)
12642 return cfun->machine->frame.hard_fp_offset
12643 - cfun->machine->frame.locals_offset;
12646 if (to == STACK_POINTER_REGNUM)
12648 if (from == FRAME_POINTER_REGNUM)
12649 return cfun->machine->frame.frame_size
12650 - cfun->machine->frame.locals_offset;
12653 return cfun->machine->frame.frame_size;
12657 /* Get return address without mangling. */
12660 aarch64_return_addr_rtx (void)
12662 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12663 /* Note: aarch64_return_address_signing_enabled only
12664 works after cfun->machine->frame.laid_out is set,
12665 so here we don't know if the return address will
12666 be signed or not. */
12667 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12668 emit_move_insn (lr, val);
12669 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12670 return lr;
12674 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12675 previous frame. */
12678 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12680 if (count != 0)
12681 return const0_rtx;
12682 return aarch64_return_addr_rtx ();
12685 static void
12686 aarch64_asm_trampoline_template (FILE *f)
12688 /* Even if the current function doesn't have branch protection, some
12689 later function might, so since this template is only generated once
12690 we have to add a BTI just in case. */
12691 asm_fprintf (f, "\thint\t34 // bti c\n");
12693 if (TARGET_ILP32)
12695 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12696 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12698 else
12700 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12701 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12703 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12705 /* We always emit a speculation barrier.
12706 This is because the same trampoline template is used for every nested
12707 function. Since nested functions are not particularly common or
12708 performance-critical we don't worry too much about the extra instructions
12709 that get copied around.
12710 This is not yet a problem, since we have not yet implemented function
12711 specific attributes to choose between hardening against straight line
12712 speculation or not, but such function specific attributes are likely to
12713 happen in the future. */
12714 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12716 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12717 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12720 static void
12721 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12723 rtx fnaddr, mem, a_tramp;
12724 const int tramp_code_sz = 24;
12726 /* Don't need to copy the trailing D-words, we fill those in below. */
12727 /* We create our own memory address in Pmode so that `emit_block_move` can
12728 use parts of the backend which expect Pmode addresses. */
12729 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12730 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12731 assemble_trampoline_template (),
12732 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12733 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12734 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12735 if (GET_MODE (fnaddr) != ptr_mode)
12736 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12737 emit_move_insn (mem, fnaddr);
12739 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12740 emit_move_insn (mem, chain_value);
12742 /* XXX We should really define a "clear_cache" pattern and use
12743 gen_clear_cache(). */
12744 a_tramp = XEXP (m_tramp, 0);
12745 maybe_emit_call_builtin___clear_cache (a_tramp,
12746 plus_constant (ptr_mode,
12747 a_tramp,
12748 TRAMPOLINE_SIZE));
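/* The resulting trampoline layout, as set up above: bytes [0, 24) hold a
   copy of the code template emitted by aarch64_asm_trampoline_template,
   the next POINTER_BYTES hold the target function address, and the
   POINTER_BYTES after that hold the static chain value; the whole
   TRAMPOLINE_SIZE region is then flushed through the __clear_cache
   machinery.  */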
12751 static unsigned char
12752 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12754 /* ??? Logically we should only need to provide a value when
12755 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12756 can hold MODE, but at the moment we need to handle all modes.
12757 Just ignore any runtime parts for registers that can't store them. */
12758 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12759 unsigned int nregs, vec_flags;
12760 switch (regclass)
12762 case STUB_REGS:
12763 case TAILCALL_ADDR_REGS:
12764 case POINTER_REGS:
12765 case GENERAL_REGS:
12766 case ALL_REGS:
12767 case POINTER_AND_FP_REGS:
12768 case FP_REGS:
12769 case FP_LO_REGS:
12770 case FP_LO8_REGS:
12771 vec_flags = aarch64_classify_vector_mode (mode);
12772 if ((vec_flags & VEC_SVE_DATA)
12773 && constant_multiple_p (GET_MODE_SIZE (mode),
12774 aarch64_vl_bytes (mode, vec_flags), &nregs))
12775 return nregs;
12776 return (vec_flags & VEC_ADVSIMD
12777 ? CEIL (lowest_size, UNITS_PER_VREG)
12778 : CEIL (lowest_size, UNITS_PER_WORD));
12779 case STACK_REG:
12780 case PR_REGS:
12781 case PR_LO_REGS:
12782 case PR_HI_REGS:
12783 case FFR_REGS:
12784 case PR_AND_FFR_REGS:
12785 return 1;
12787 case NO_REGS:
12788 return 0;
12790 default:
12791 break;
12793 gcc_unreachable ();
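/* For instance, with the LP64 defaults this gives 2 for TImode in
   GENERAL_REGS (CEIL (16, UNITS_PER_WORD)) and 2 for the Advanced SIMD
   pair mode V2x16QImode in FP_REGS (CEIL (32, UNITS_PER_VREG)), while
   predicate classes such as PR_REGS always report a single register.  */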
12796 static reg_class_t
12797 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12799 if (regclass == POINTER_REGS)
12800 return GENERAL_REGS;
12802 if (regclass == STACK_REG)
12804 if (REG_P(x)
12805 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12806 return regclass;
12808 return NO_REGS;
12811 /* Register elimination can result in a request for
12812 SP+constant->FP_REGS. We cannot support such operations, which
12813 use SP as source and an FP_REG as destination, so reject them
12814 outright. */
12815 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12817 rtx lhs = XEXP (x, 0);
12819 /* Look through a possible SUBREG introduced by ILP32. */
12820 if (SUBREG_P (lhs))
12821 lhs = SUBREG_REG (lhs);
12823 gcc_assert (REG_P (lhs));
12824 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12825 POINTER_REGS));
12826 return NO_REGS;
12829 return regclass;
12832 void
12833 aarch64_asm_output_labelref (FILE* f, const char *name)
12835 asm_fprintf (f, "%U%s", name);
12838 static void
12839 aarch64_elf_asm_constructor (rtx symbol, int priority)
12841 if (priority == DEFAULT_INIT_PRIORITY)
12842 default_ctor_section_asm_out_constructor (symbol, priority);
12843 else
12845 section *s;
12846 /* The priority is known to be in the range [0, 65535], so 18 bytes
12847 would be enough, but the compiler might not know that. To avoid
12848 a -Wformat-truncation false positive, use a larger size. */
12849 char buf[23];
12850 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12851 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12852 switch_to_section (s);
12853 assemble_align (POINTER_SIZE);
12854 assemble_aligned_integer (POINTER_BYTES, symbol);
12858 static void
12859 aarch64_elf_asm_destructor (rtx symbol, int priority)
12861 if (priority == DEFAULT_INIT_PRIORITY)
12862 default_dtor_section_asm_out_destructor (symbol, priority);
12863 else
12865 section *s;
12866 /* The priority is known to be in the range [0, 65535], so 18 bytes
12867 would be enough, but the compiler might not know that. To avoid
12868 a -Wformat-truncation false positive, use a larger size. */
12869 char buf[23];
12870 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12871 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12872 switch_to_section (s);
12873 assemble_align (POINTER_SIZE);
12874 assemble_aligned_integer (POINTER_BYTES, symbol);
12878 const char*
12879 aarch64_output_casesi (rtx *operands)
12881 char buf[100];
12882 char label[100];
12883 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12884 int index;
12885 static const char *const patterns[4][2] =
12888 "ldrb\t%w3, [%0,%w1,uxtw]",
12889 "add\t%3, %4, %w3, sxtb #2"
12892 "ldrh\t%w3, [%0,%w1,uxtw #1]",
12893 "add\t%3, %4, %w3, sxth #2"
12896 "ldr\t%w3, [%0,%w1,uxtw #2]",
12897 "add\t%3, %4, %w3, sxtw #2"
12899 /* We assume that DImode is only generated when not optimizing and
12900 that we don't really need 64-bit address offsets. That would
12901 imply an object file with 8GB of code in a single function! */
12903 "ldr\t%w3, [%0,%w1,uxtw #2]",
12904 "add\t%3, %4, %w3, sxtw #2"
12908 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
12910 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
12911 index = exact_log2 (GET_MODE_SIZE (mode));
12913 gcc_assert (index >= 0 && index <= 3);
12915 /* Need to implement table size reduction, by changing the code below. */
12916 output_asm_insn (patterns[index][0], operands);
12917 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
12918 snprintf (buf, sizeof (buf),
12919 "adr\t%%4, %s", targetm.strip_name_encoding (label));
12920 output_asm_insn (buf, operands);
12921 output_asm_insn (patterns[index][1], operands);
12922 output_asm_insn ("br\t%3", operands);
12923 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
12924 operands);
12925 assemble_label (asm_out_file, label);
12926 return "";
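/* For a 4-byte dispatch table (index 2 above) the sequence emitted here is
   roughly:
       ldr   %w3, [%0, %w1, uxtw #2]
       adr   %4, .Lrtx<N>
       add   %3, %4, %w3, sxtw #2
       br    %3
       <optional SLS speculation barrier>
   .Lrtx<N>:
   i.e. the loaded table entry, scaled by 4, is added to the address of the
   local label placed after the dispatch code to form the branch target.  */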
12930 /* Return size in bits of an arithmetic operand which is shifted/scaled and
12931 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
12932 operator. */
12935 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
12937 if (shift >= 0 && shift <= 3)
12939 int size;
12940 for (size = 8; size <= 32; size *= 2)
12942 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
12943 if (mask == bits << shift)
12944 return size;
12947 return 0;
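/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, because
   0xff << 2 == 0x3fc: an operand of the form (and (mult x 4) 0x3fc)
   therefore maps onto a UXTB with LSL #2 in an extended-register
   ADD/SUB.  */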
12950 /* Constant pools are per-function only when PC-relative
12951 literal loads are enabled or we are using the large memory
12952 model. */
12954 static inline bool
12955 aarch64_can_use_per_function_literal_pools_p (void)
12957 return (aarch64_pcrelative_literal_loads
12958 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
12961 static bool
12962 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
12964 /* We can't use blocks for constants when we're using a per-function
12965 constant pool. */
12966 return !aarch64_can_use_per_function_literal_pools_p ();
12969 /* Select appropriate section for constants depending
12970 on where we place literal pools. */
12972 static section *
12973 aarch64_select_rtx_section (machine_mode mode,
12974 rtx x,
12975 unsigned HOST_WIDE_INT align)
12977 if (aarch64_can_use_per_function_literal_pools_p ())
12978 return function_section (current_function_decl);
12980 return default_elf_select_rtx_section (mode, x, align);
12983 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
12984 void
12985 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
12986 HOST_WIDE_INT offset)
12988 /* When using per-function literal pools, we must ensure that any code
12989 section is aligned to the minimal instruction length, lest we get
12990 errors from the assembler about "unaligned instructions". */
12991 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
12992 ASM_OUTPUT_ALIGN (f, 2);
12995 /* Costs. */
12997 /* Helper function for rtx cost calculation. Strip a shift expression
12998 from X. Returns the inner operand if successful, or the original
12999 expression on failure. */
13000 static rtx
13001 aarch64_strip_shift (rtx x)
13003 rtx op = x;
13005 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13006 we can convert both to ROR during final output. */
13007 if ((GET_CODE (op) == ASHIFT
13008 || GET_CODE (op) == ASHIFTRT
13009 || GET_CODE (op) == LSHIFTRT
13010 || GET_CODE (op) == ROTATERT
13011 || GET_CODE (op) == ROTATE)
13012 && CONST_INT_P (XEXP (op, 1)))
13013 return XEXP (op, 0);
13015 if (GET_CODE (op) == MULT
13016 && CONST_INT_P (XEXP (op, 1))
13017 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13018 return XEXP (op, 0);
13020 return x;
13023 /* Helper function for rtx cost calculation. Strip an extend
13024 expression from X. Returns the inner operand if successful, or the
13025 original expression on failure. We deal with a number of possible
13026 canonicalization variations here. If STRIP_SHIFT is true, then
13027 we can strip off a shift also. */
13028 static rtx
13029 aarch64_strip_extend (rtx x, bool strip_shift)
13031 scalar_int_mode mode;
13032 rtx op = x;
13034 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13035 return op;
13037 if (GET_CODE (op) == AND
13038 && GET_CODE (XEXP (op, 0)) == MULT
13039 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13040 && CONST_INT_P (XEXP (op, 1))
13041 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13042 INTVAL (XEXP (op, 1))) != 0)
13043 return XEXP (XEXP (op, 0), 0);
13045 /* Now handle extended register, as this may also have an optional
13046 left shift by 1..4. */
13047 if (strip_shift
13048 && GET_CODE (op) == ASHIFT
13049 && CONST_INT_P (XEXP (op, 1))
13050 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13051 op = XEXP (op, 0);
13053 if (GET_CODE (op) == ZERO_EXTEND
13054 || GET_CODE (op) == SIGN_EXTEND)
13055 op = XEXP (op, 0);
13057 if (op != x)
13058 return op;
13060 return x;
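/* For example, with STRIP_SHIFT true the operand
   (ashift:DI (zero_extend:DI (reg:SI x)) (const_int 2)) is stripped down
   to (reg:SI x), matching the extended-register form
   "add x0, x1, w2, uxtw #2"; the extend and shift are then not costed
   twice by the callers.  */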
13063 /* Helper function for rtx cost calculation. Strip extension as well as any
13064 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13065 successful, or the original expression on failure. */
13066 static rtx
13067 aarch64_strip_extend_vec_half (rtx x)
13069 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13071 x = XEXP (x, 0);
13072 if (GET_CODE (x) == VEC_SELECT
13073 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13074 XEXP (x, 1)))
13075 x = XEXP (x, 0);
13077 return x;
13080 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13081 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13082 operand if successful, or the original expression on failure. */
13083 static rtx
13084 aarch64_strip_duplicate_vec_elt (rtx x)
13086 if (GET_CODE (x) == VEC_DUPLICATE
13087 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13089 x = XEXP (x, 0);
13090 if (GET_CODE (x) == VEC_SELECT)
13091 x = XEXP (x, 0);
13092 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13093 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13094 x = XEXP (XEXP (x, 0), 0);
13096 return x;
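/* For example, the by-element multiply operand
   (vec_duplicate:V4SI (vec_select:SI (reg:V4SI v) (parallel [(const_int 2)])))
   is stripped down to (reg:V4SI v), so the lane selection is not costed
   on top of the multiply itself.  */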
13099 /* Return true iff CODE is a shift supported in combination
13100 with arithmetic instructions. */
13102 static bool
13103 aarch64_shift_p (enum rtx_code code)
13105 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13109 /* Return true iff X is a cheap shift without a sign extend. */
13111 static bool
13112 aarch64_cheap_mult_shift_p (rtx x)
13114 rtx op0, op1;
13116 op0 = XEXP (x, 0);
13117 op1 = XEXP (x, 1);
13119 if (!(aarch64_tune_params.extra_tuning_flags
13120 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13121 return false;
13123 if (GET_CODE (op0) == SIGN_EXTEND)
13124 return false;
13126 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13127 && UINTVAL (op1) <= 4)
13128 return true;
13130 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13131 return false;
13133 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13135 if (l2 > 0 && l2 <= 4)
13136 return true;
13138 return false;
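/* In other words, on tunings with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND a
   left shift by at most 4 (or an equivalent multiply by 2..16) feeding a
   PLUS or MINUS adds no extra cost, matching e.g. "add x0, x1, x2, lsl #3",
   unless the shifted operand is itself a sign extension.  */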
13141 /* Helper function for rtx cost calculation. Calculate the cost of
13142 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13143 Return the calculated cost of the expression, recursing manually in to
13144 operands where needed. */
13146 static int
13147 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13149 rtx op0, op1;
13150 const struct cpu_cost_table *extra_cost
13151 = aarch64_tune_params.insn_extra_cost;
13152 int cost = 0;
13153 bool compound_p = (outer == PLUS || outer == MINUS);
13154 machine_mode mode = GET_MODE (x);
13156 gcc_checking_assert (code == MULT);
13158 op0 = XEXP (x, 0);
13159 op1 = XEXP (x, 1);
13161 if (VECTOR_MODE_P (mode))
13163 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13164 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13166 /* The select-operand-high-half versions of the instruction have the
13167 same cost as the three vector version - don't add the costs of the
13168 extension or selection into the costs of the multiply. */
13169 op0 = aarch64_strip_extend_vec_half (op0);
13170 op1 = aarch64_strip_extend_vec_half (op1);
13171 /* The by-element versions of the instruction have the same costs as
13172 the normal 3-vector version. We make an assumption that the input
13173 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13174 costing of a MUL by element pre RA is a bit optimistic. */
13175 op0 = aarch64_strip_duplicate_vec_elt (op0);
13176 op1 = aarch64_strip_duplicate_vec_elt (op1);
13178 cost += rtx_cost (op0, mode, MULT, 0, speed);
13179 cost += rtx_cost (op1, mode, MULT, 1, speed);
13180 if (speed)
13182 if (GET_CODE (x) == MULT)
13183 cost += extra_cost->vect.mult;
13184 /* This is to catch the SSRA costing currently flowing here. */
13185 else
13186 cost += extra_cost->vect.alu;
13188 return cost;
13191 /* Integer multiply/fma. */
13192 if (GET_MODE_CLASS (mode) == MODE_INT)
13194 /* The multiply will be canonicalized as a shift, cost it as such. */
13195 if (aarch64_shift_p (GET_CODE (x))
13196 || (CONST_INT_P (op1)
13197 && exact_log2 (INTVAL (op1)) > 0))
13199 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13200 || GET_CODE (op0) == SIGN_EXTEND;
13201 if (speed)
13203 if (compound_p)
13205 /* If the shift is considered cheap,
13206 then don't add any cost. */
13207 if (aarch64_cheap_mult_shift_p (x))
13209 else if (REG_P (op1))
13210 /* ARITH + shift-by-register. */
13211 cost += extra_cost->alu.arith_shift_reg;
13212 else if (is_extend)
13213 /* ARITH + extended register. We don't have a cost field
13214 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13215 cost += extra_cost->alu.extend_arith;
13216 else
13217 /* ARITH + shift-by-immediate. */
13218 cost += extra_cost->alu.arith_shift;
13220 else
13221 /* LSL (immediate). */
13222 cost += extra_cost->alu.shift;
13225 /* Strip extends as we will have costed them in the case above. */
13226 if (is_extend)
13227 op0 = aarch64_strip_extend (op0, true);
13229 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13231 return cost;
13234 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13235 compound and let the below cases handle it. After all, MNEG is a
13236 special-case alias of MSUB. */
13237 if (GET_CODE (op0) == NEG)
13239 op0 = XEXP (op0, 0);
13240 compound_p = true;
13243 /* Integer multiplies or FMAs have zero/sign extending variants. */
13244 if ((GET_CODE (op0) == ZERO_EXTEND
13245 && GET_CODE (op1) == ZERO_EXTEND)
13246 || (GET_CODE (op0) == SIGN_EXTEND
13247 && GET_CODE (op1) == SIGN_EXTEND))
13249 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13250 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13252 if (speed)
13254 if (compound_p)
13255 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13256 cost += extra_cost->mult[0].extend_add;
13257 else
13258 /* MUL/SMULL/UMULL. */
13259 cost += extra_cost->mult[0].extend;
13262 return cost;
13265 /* This is either an integer multiply or a MADD. In both cases
13266 we want to recurse and cost the operands. */
13267 cost += rtx_cost (op0, mode, MULT, 0, speed);
13268 cost += rtx_cost (op1, mode, MULT, 1, speed);
13270 if (speed)
13272 if (compound_p)
13273 /* MADD/MSUB. */
13274 cost += extra_cost->mult[mode == DImode].add;
13275 else
13276 /* MUL. */
13277 cost += extra_cost->mult[mode == DImode].simple;
13280 return cost;
13282 else
13284 if (speed)
13286 /* Floating-point FMA/FMUL can also support negations of the
13287 operands, unless the rounding mode is upward or downward in
13288 which case FNMUL is different from FMUL with operand negation. */
13289 bool neg0 = GET_CODE (op0) == NEG;
13290 bool neg1 = GET_CODE (op1) == NEG;
13291 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13293 if (neg0)
13294 op0 = XEXP (op0, 0);
13295 if (neg1)
13296 op1 = XEXP (op1, 0);
13299 if (compound_p)
13300 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13301 cost += extra_cost->fp[mode == DFmode].fma;
13302 else
13303 /* FMUL/FNMUL. */
13304 cost += extra_cost->fp[mode == DFmode].mult;
13307 cost += rtx_cost (op0, mode, MULT, 0, speed);
13308 cost += rtx_cost (op1, mode, MULT, 1, speed);
13309 return cost;
13313 static int
13314 aarch64_address_cost (rtx x,
13315 machine_mode mode,
13316 addr_space_t as ATTRIBUTE_UNUSED,
13317 bool speed)
13319 enum rtx_code c = GET_CODE (x);
13320 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13321 struct aarch64_address_info info;
13322 int cost = 0;
13323 info.shift = 0;
13325 if (!aarch64_classify_address (&info, x, mode, false))
13327 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13329 /* This is a CONST or SYMBOL ref which will be split
13330 in a different way depending on the code model in use.
13331 Cost it through the generic infrastructure. */
13332 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13333 /* Divide through by the cost of one instruction to
13334 bring it to the same units as the address costs. */
13335 cost_symbol_ref /= COSTS_N_INSNS (1);
13336 /* The cost is then the cost of preparing the address,
13337 followed by an immediate (possibly 0) offset. */
13338 return cost_symbol_ref + addr_cost->imm_offset;
13340 else
13342 /* This is most likely a jump table from a case
13343 statement. */
13344 return addr_cost->register_offset;
13348 switch (info.type)
13350 case ADDRESS_LO_SUM:
13351 case ADDRESS_SYMBOLIC:
13352 case ADDRESS_REG_IMM:
13353 cost += addr_cost->imm_offset;
13354 break;
13356 case ADDRESS_REG_WB:
13357 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13358 cost += addr_cost->pre_modify;
13359 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13361 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13362 if (nvectors == 3)
13363 cost += addr_cost->post_modify_ld3_st3;
13364 else if (nvectors == 4)
13365 cost += addr_cost->post_modify_ld4_st4;
13366 else
13367 cost += addr_cost->post_modify;
13369 else
13370 gcc_unreachable ();
13372 break;
13374 case ADDRESS_REG_REG:
13375 cost += addr_cost->register_offset;
13376 break;
13378 case ADDRESS_REG_SXTW:
13379 cost += addr_cost->register_sextend;
13380 break;
13382 case ADDRESS_REG_UXTW:
13383 cost += addr_cost->register_zextend;
13384 break;
13386 default:
13387 gcc_unreachable ();
13391 if (info.shift > 0)
13393 /* For the sake of calculating the cost of the shifted register
13394 component, we can treat same sized modes in the same way. */
13395 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13396 cost += addr_cost->addr_scale_costs.hi;
13397 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13398 cost += addr_cost->addr_scale_costs.si;
13399 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13400 cost += addr_cost->addr_scale_costs.di;
13401 else
13402 /* We can't tell, or this is a 128-bit vector. */
13403 cost += addr_cost->addr_scale_costs.ti;
13406 return cost;
13409 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13410 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13411 to be taken. */
13414 aarch64_branch_cost (bool speed_p, bool predictable_p)
13416 /* When optimizing for speed, use the cost of unpredictable branches. */
13417 const struct cpu_branch_cost *branch_costs =
13418 aarch64_tune_params.branch_costs;
13420 if (!speed_p || predictable_p)
13421 return branch_costs->predictable;
13422 else
13423 return branch_costs->unpredictable;
13426 /* Return true if X is a zero or sign extract
13427 usable in an ADD or SUB (extended register) instruction. */
13428 static bool
13429 aarch64_rtx_arith_op_extract_p (rtx x)
13431 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13432 No shift. */
13433 if (GET_CODE (x) == SIGN_EXTEND
13434 || GET_CODE (x) == ZERO_EXTEND)
13435 return REG_P (XEXP (x, 0));
13437 return false;
13440 static bool
13441 aarch64_frint_unspec_p (unsigned int u)
13443 switch (u)
13445 case UNSPEC_FRINTZ:
13446 case UNSPEC_FRINTP:
13447 case UNSPEC_FRINTM:
13448 case UNSPEC_FRINTA:
13449 case UNSPEC_FRINTN:
13450 case UNSPEC_FRINTX:
13451 case UNSPEC_FRINTI:
13452 return true;
13454 default:
13455 return false;
13459 /* Return true iff X is an rtx that will match an extr instruction
13460 i.e. as described in the *extr<mode>5_insn family of patterns.
13461 OP0 and OP1 will be set to the operands of the shifts involved
13462 on success and will be NULL_RTX otherwise. */
13464 static bool
13465 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13467 rtx op0, op1;
13468 scalar_int_mode mode;
13469 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13470 return false;
13472 *res_op0 = NULL_RTX;
13473 *res_op1 = NULL_RTX;
13475 if (GET_CODE (x) != IOR)
13476 return false;
13478 op0 = XEXP (x, 0);
13479 op1 = XEXP (x, 1);
13481 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13482 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13484 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13485 if (GET_CODE (op1) == ASHIFT)
13486 std::swap (op0, op1);
13488 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13489 return false;
13491 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13492 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13494 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13495 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13497 *res_op0 = XEXP (op0, 0);
13498 *res_op1 = XEXP (op1, 0);
13499 return true;
13503 return false;
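/* For example, in DImode (ior (ashift x (const_int 48))
   (lshiftrt y (const_int 16))) satisfies 48 + 16 == 64 and so matches
   the EXTR pattern, roughly "extr x0, x, y, #16".  */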
13506 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13507 storing it in *COST. Result is true if the total cost of the operation
13508 has now been calculated. */
13509 static bool
13510 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13512 rtx inner;
13513 rtx comparator;
13514 enum rtx_code cmpcode;
13515 const struct cpu_cost_table *extra_cost
13516 = aarch64_tune_params.insn_extra_cost;
13518 if (COMPARISON_P (op0))
13520 inner = XEXP (op0, 0);
13521 comparator = XEXP (op0, 1);
13522 cmpcode = GET_CODE (op0);
13524 else
13526 inner = op0;
13527 comparator = const0_rtx;
13528 cmpcode = NE;
13531 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13533 /* Conditional branch. */
13534 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13535 return true;
13536 else
13538 if (cmpcode == NE || cmpcode == EQ)
13540 if (comparator == const0_rtx)
13542 /* TBZ/TBNZ/CBZ/CBNZ. */
13543 if (GET_CODE (inner) == ZERO_EXTRACT)
13544 /* TBZ/TBNZ. */
13545 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13546 ZERO_EXTRACT, 0, speed);
13547 else
13548 /* CBZ/CBNZ. */
13549 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13551 return true;
13553 if (register_operand (inner, VOIDmode)
13554 && aarch64_imm24 (comparator, VOIDmode))
13556 /* SUB and SUBS. */
13557 *cost += COSTS_N_INSNS (2);
13558 if (speed)
13559 *cost += extra_cost->alu.arith * 2;
13560 return true;
13563 else if (cmpcode == LT || cmpcode == GE)
13565 /* TBZ/TBNZ. */
13566 if (comparator == const0_rtx)
13567 return true;
13571 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13573 /* CCMP. */
13574 if (GET_CODE (op1) == COMPARE)
13576 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13577 if (XEXP (op1, 1) == const0_rtx)
13578 *cost += 1;
13579 if (speed)
13581 machine_mode mode = GET_MODE (XEXP (op1, 0));
13583 if (GET_MODE_CLASS (mode) == MODE_INT)
13584 *cost += extra_cost->alu.arith;
13585 else
13586 *cost += extra_cost->fp[mode == DFmode].compare;
13588 return true;
13591 /* It's a conditional operation based on the status flags,
13592 so it must be some flavor of CSEL. */
13594 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13595 if (GET_CODE (op1) == NEG
13596 || GET_CODE (op1) == NOT
13597 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13598 op1 = XEXP (op1, 0);
13599 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13601 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13602 op1 = XEXP (op1, 0);
13603 op2 = XEXP (op2, 0);
13605 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13607 inner = XEXP (op1, 0);
13608 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13609 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13610 op1 = XEXP (inner, 0);
13613 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13614 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13615 return true;
13618 /* We don't know what this is, cost all operands. */
13619 return false;
13622 /* Check whether X is a bitfield operation of the form shift + extend that
13623 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13624 operand to which the bitfield operation is applied. Otherwise return
13625 NULL_RTX. */
13627 static rtx
13628 aarch64_extend_bitfield_pattern_p (rtx x)
13630 rtx_code outer_code = GET_CODE (x);
13631 machine_mode outer_mode = GET_MODE (x);
13633 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13634 && outer_mode != SImode && outer_mode != DImode)
13635 return NULL_RTX;
13637 rtx inner = XEXP (x, 0);
13638 rtx_code inner_code = GET_CODE (inner);
13639 machine_mode inner_mode = GET_MODE (inner);
13640 rtx op = NULL_RTX;
13642 switch (inner_code)
13644 case ASHIFT:
13645 if (CONST_INT_P (XEXP (inner, 1))
13646 && (inner_mode == QImode || inner_mode == HImode))
13647 op = XEXP (inner, 0);
13648 break;
13649 case LSHIFTRT:
13650 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13651 && (inner_mode == QImode || inner_mode == HImode))
13652 op = XEXP (inner, 0);
13653 break;
13654 case ASHIFTRT:
13655 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13656 && (inner_mode == QImode || inner_mode == HImode))
13657 op = XEXP (inner, 0);
13658 break;
13659 default:
13660 break;
13663 return op;
13666 /* Return true if the mask and a shift amount from an RTX of the form
13667 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13668 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13670 bool
13671 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13672 rtx shft_amnt)
13674 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13675 && INTVAL (mask) > 0
13676 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13677 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13678 && (UINTVAL (mask)
13679 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
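/* For example, in SImode a mask of 0x00ffff00 with a shift amount of 8
   passes these checks: (x << 8) & 0x00ffff00 keeps a contiguous 16-bit
   field and clears the low 8 bits, so it can be emitted as
   "ubfiz w0, w1, #8, #16".  */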
13682 /* Return true if the masks and a shift amount from an RTX of the form
13683 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13684 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
13686 bool
13687 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13688 unsigned HOST_WIDE_INT mask1,
13689 unsigned HOST_WIDE_INT shft_amnt,
13690 unsigned HOST_WIDE_INT mask2)
13692 unsigned HOST_WIDE_INT t;
13694 /* Verify that there is no overlap in what bits are set in the two masks. */
13695 if (mask1 != ~mask2)
13696 return false;
13698 /* Verify that mask2 is not all zeros or ones. */
13699 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13700 return false;
13702 /* The shift amount should always be less than the mode size. */
13703 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13705 /* Verify that the mask being shifted is contiguous and would be in the
13706 least significant bits after shifting by shft_amnt. */
13707 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13708 return (t == (t & -t));
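/* For example, with MASK2 == 0x0000ff00, SHFT_AMNT == 8 and
   MASK1 == ~0x0000ff00 the test above gives t == 0x10000, a power of two,
   so ((x & MASK1) | ((y << 8) & 0x0000ff00)) can become
   "bfi x0, x1, #8, #8" (x0 holding x, x1 holding y).  */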
13711 /* Calculate the cost of calculating X, storing it in *COST. Result
13712 is true if the total cost of the operation has now been calculated. */
13713 static bool
13714 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13715 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13717 rtx op0, op1, op2;
13718 const struct cpu_cost_table *extra_cost
13719 = aarch64_tune_params.insn_extra_cost;
13720 rtx_code code = GET_CODE (x);
13721 scalar_int_mode int_mode;
13723 /* By default, assume that everything has equivalent cost to the
13724 cheapest instruction. Any additional costs are applied as a delta
13725 above this default. */
13726 *cost = COSTS_N_INSNS (1);
13728 switch (code)
13730 case SET:
13731 /* The cost depends entirely on the operands to SET. */
13732 *cost = 0;
13733 op0 = SET_DEST (x);
13734 op1 = SET_SRC (x);
13736 switch (GET_CODE (op0))
13738 case MEM:
13739 if (speed)
13741 rtx address = XEXP (op0, 0);
13742 if (VECTOR_MODE_P (mode))
13743 *cost += extra_cost->ldst.storev;
13744 else if (GET_MODE_CLASS (mode) == MODE_INT)
13745 *cost += extra_cost->ldst.store;
13746 else if (mode == SFmode || mode == SDmode)
13747 *cost += extra_cost->ldst.storef;
13748 else if (mode == DFmode || mode == DDmode)
13749 *cost += extra_cost->ldst.stored;
13751 *cost +=
13752 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13753 0, speed));
13756 *cost += rtx_cost (op1, mode, SET, 1, speed);
13757 return true;
13759 case SUBREG:
13760 if (! REG_P (SUBREG_REG (op0)))
13761 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13763 /* Fall through. */
13764 case REG:
13765 /* The cost is one per vector-register copied. */
13766 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13768 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13769 *cost = COSTS_N_INSNS (nregs);
13771 /* const0_rtx is in general free, but we will use an
13772 instruction to set a register to 0. */
13773 else if (REG_P (op1) || op1 == const0_rtx)
13775 /* The cost is 1 per register copied. */
13776 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13777 *cost = COSTS_N_INSNS (nregs);
13779 else
13780 /* Cost is just the cost of the RHS of the set. */
13781 *cost += rtx_cost (op1, mode, SET, 1, speed);
13782 return true;
13784 case ZERO_EXTRACT:
13785 case SIGN_EXTRACT:
13786 /* Bit-field insertion. Strip any redundant widening of
13787 the RHS to meet the width of the target. */
13788 if (SUBREG_P (op1))
13789 op1 = SUBREG_REG (op1);
13790 if ((GET_CODE (op1) == ZERO_EXTEND
13791 || GET_CODE (op1) == SIGN_EXTEND)
13792 && CONST_INT_P (XEXP (op0, 1))
13793 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13794 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13795 op1 = XEXP (op1, 0);
13797 if (CONST_INT_P (op1))
13799 /* MOV immediate is assumed to always be cheap. */
13800 *cost = COSTS_N_INSNS (1);
13802 else
13804 /* BFM. */
13805 if (speed)
13806 *cost += extra_cost->alu.bfi;
13807 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13810 return true;
13812 default:
13813 /* We can't make sense of this, assume default cost. */
13814 *cost = COSTS_N_INSNS (1);
13815 return false;
13817 return false;
13819 case CONST_INT:
13820 /* If an instruction can incorporate a constant within the
13821 instruction, the instruction's expression avoids calling
13822 rtx_cost() on the constant. If rtx_cost() is called on a
13823 constant, then it is usually because the constant must be
13824 moved into a register by one or more instructions.
13826 The exception is constant 0, which can be expressed
13827 as XZR/WZR and is therefore free. The exception to this is
13828 if we have (set (reg) (const0_rtx)) in which case we must cost
13829 the move. However, we can catch that when we cost the SET, so
13830 we don't need to consider that here. */
13831 if (x == const0_rtx)
13832 *cost = 0;
13833 else
13835 /* To an approximation, building any other constant is
13836 proportionally expensive to the number of instructions
13837 required to build that constant. This is true whether we
13838 are compiling for SPEED or otherwise. */
13839 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
13840 ? SImode : DImode;
13841 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13842 (NULL_RTX, x, false, imode));
13844 return true;
13846 case CONST_DOUBLE:
13848 /* First determine number of instructions to do the move
13849 as an integer constant. */
13850 if (!aarch64_float_const_representable_p (x)
13851 && !aarch64_can_const_movi_rtx_p (x, mode)
13852 && aarch64_float_const_rtx_p (x))
13854 unsigned HOST_WIDE_INT ival;
13855 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13856 gcc_assert (succeed);
13858 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
13859 ? DImode : SImode;
13860 int ncost = aarch64_internal_mov_immediate
13861 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13862 *cost += COSTS_N_INSNS (ncost);
13863 return true;
13866 if (speed)
13868 /* mov[df,sf]_aarch64. */
13869 if (aarch64_float_const_representable_p (x))
13870 /* FMOV (scalar immediate). */
13871 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
13872 else if (!aarch64_float_const_zero_rtx_p (x))
13874 /* This will be a load from memory. */
13875 if (mode == DFmode || mode == DDmode)
13876 *cost += extra_cost->ldst.loadd;
13877 else
13878 *cost += extra_cost->ldst.loadf;
13880 else
13881 /* Otherwise this is +0.0. We get this using MOVI d0, #0
13882 or MOV v0.s[0], wzr - neither of which are modeled by the
13883 cost tables. Just use the default cost. */
13888 return true;
13890 case MEM:
13891 if (speed)
13893 /* For loads we want the base cost of a load, plus an
13894 approximation for the additional cost of the addressing
13895 mode. */
13896 rtx address = XEXP (x, 0);
13897 if (VECTOR_MODE_P (mode))
13898 *cost += extra_cost->ldst.loadv;
13899 else if (GET_MODE_CLASS (mode) == MODE_INT)
13900 *cost += extra_cost->ldst.load;
13901 else if (mode == SFmode || mode == SDmode)
13902 *cost += extra_cost->ldst.loadf;
13903 else if (mode == DFmode || mode == DDmode)
13904 *cost += extra_cost->ldst.loadd;
13906 *cost +=
13907 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13908 0, speed));
13911 return true;
13913 case NEG:
13914 op0 = XEXP (x, 0);
13916 if (VECTOR_MODE_P (mode))
13918 if (speed)
13920 /* FNEG. */
13921 *cost += extra_cost->vect.alu;
13923 return false;
13926 if (GET_MODE_CLASS (mode) == MODE_INT)
13928 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
13929 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
13931 /* CSETM. */
13932 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
13933 return true;
13936 /* Cost this as SUB wzr, X. */
13937 op0 = CONST0_RTX (mode);
13938 op1 = XEXP (x, 0);
13939 goto cost_minus;
13942 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13944 /* Support (neg(fma...)) as a single instruction only if
13945 sign of zeros is unimportant. This matches the decision
13946 making in aarch64.md. */
13947 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
13949 /* FNMADD. */
13950 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13951 return true;
13953 if (GET_CODE (op0) == MULT)
13955 /* FNMUL. */
13956 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13957 return true;
13959 if (speed)
13960 /* FNEG. */
13961 *cost += extra_cost->fp[mode == DFmode].neg;
13962 return false;
13965 return false;
13967 case CLRSB:
13968 case CLZ:
13969 if (speed)
13971 if (VECTOR_MODE_P (mode))
13972 *cost += extra_cost->vect.alu;
13973 else
13974 *cost += extra_cost->alu.clz;
13977 return false;
13979 case CTZ:
13980 *cost = COSTS_N_INSNS (2);
13982 if (speed)
13983 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
13984 return false;
13986 case COMPARE:
13987 op0 = XEXP (x, 0);
13988 op1 = XEXP (x, 1);
13990 if (op1 == const0_rtx
13991 && GET_CODE (op0) == AND)
13993 x = op0;
13994 mode = GET_MODE (op0);
13995 goto cost_logic;
13998 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14000 /* TODO: A write to the CC flags possibly costs extra, this
14001 needs encoding in the cost tables. */
14003 mode = GET_MODE (op0);
14004 /* ANDS. */
14005 if (GET_CODE (op0) == AND)
14007 x = op0;
14008 goto cost_logic;
14011 if (GET_CODE (op0) == PLUS)
14013 /* ADDS (and CMN alias). */
14014 x = op0;
14015 goto cost_plus;
14018 if (GET_CODE (op0) == MINUS)
14020 /* SUBS. */
14021 x = op0;
14022 goto cost_minus;
14025 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14026 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14027 && CONST_INT_P (XEXP (op0, 2)))
14029 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14030 Handle it here directly rather than going to cost_logic
14031 since we know the immediate generated for the TST is valid
14032 so we can avoid creating an intermediate rtx for it only
14033 for costing purposes. */
14034 if (speed)
14035 *cost += extra_cost->alu.logical;
14037 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14038 ZERO_EXTRACT, 0, speed);
14039 return true;
14042 if (GET_CODE (op1) == NEG)
14044 /* CMN. */
14045 if (speed)
14046 *cost += extra_cost->alu.arith;
14048 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14049 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14050 return true;
14053 /* CMP.
14055 Compare can freely swap the order of operands, and
14056 canonicalization puts the more complex operation first.
14057 But the integer MINUS logic expects the shift/extend
14058 operation in op1. */
14059 if (! (REG_P (op0)
14060 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14062 op0 = XEXP (x, 1);
14063 op1 = XEXP (x, 0);
14065 goto cost_minus;
14068 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14070 /* FCMP. */
14071 if (speed)
14072 *cost += extra_cost->fp[mode == DFmode].compare;
14074 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14076 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14077 /* FCMP supports constant 0.0 for no extra cost. */
14078 return true;
14080 return false;
14083 if (VECTOR_MODE_P (mode))
14085 /* Vector compare. */
14086 if (speed)
14087 *cost += extra_cost->vect.alu;
14089 if (aarch64_float_const_zero_rtx_p (op1))
14091 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14092 cost. */
14093 return true;
14095 return false;
14097 return false;
14099 case MINUS:
14101 op0 = XEXP (x, 0);
14102 op1 = XEXP (x, 1);
14104 cost_minus:
14105 if (VECTOR_MODE_P (mode))
14107 /* SUBL2 and SUBW2. */
14108 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14109 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14111 /* The select-operand-high-half versions of the sub instruction
14112 have the same cost as the regular three vector version -
14113 don't add the costs of the select into the costs of the sub. */
14115 op0 = aarch64_strip_extend_vec_half (op0);
14116 op1 = aarch64_strip_extend_vec_half (op1);
14120 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14122 /* Detect valid immediates. */
14123 if ((GET_MODE_CLASS (mode) == MODE_INT
14124 || (GET_MODE_CLASS (mode) == MODE_CC
14125 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14126 && CONST_INT_P (op1)
14127 && aarch64_uimm12_shift (INTVAL (op1)))
14129 if (speed)
14130 /* SUB(S) (immediate). */
14131 *cost += extra_cost->alu.arith;
14132 return true;
14135 /* Look for SUB (extended register). */
14136 if (is_a <scalar_int_mode> (mode)
14137 && aarch64_rtx_arith_op_extract_p (op1))
14139 if (speed)
14140 *cost += extra_cost->alu.extend_arith;
14142 op1 = aarch64_strip_extend (op1, true);
14143 *cost += rtx_cost (op1, VOIDmode,
14144 (enum rtx_code) GET_CODE (op1), 0, speed);
14145 return true;
14148 rtx new_op1 = aarch64_strip_extend (op1, false);
14150 /* Cost this as an FMA-alike operation. */
14151 if ((GET_CODE (new_op1) == MULT
14152 || aarch64_shift_p (GET_CODE (new_op1)))
14153 && code != COMPARE)
14155 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14156 (enum rtx_code) code,
14157 speed);
14158 return true;
14161 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14163 if (speed)
14165 if (VECTOR_MODE_P (mode))
14167 /* Vector SUB. */
14168 *cost += extra_cost->vect.alu;
14170 else if (GET_MODE_CLASS (mode) == MODE_INT)
14172 /* SUB(S). */
14173 *cost += extra_cost->alu.arith;
14175 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14177 /* FSUB. */
14178 *cost += extra_cost->fp[mode == DFmode].addsub;
14181 return true;
14184 case PLUS:
14186 rtx new_op0;
14188 op0 = XEXP (x, 0);
14189 op1 = XEXP (x, 1);
14191 cost_plus:
14192 if (VECTOR_MODE_P (mode))
14194 /* ADDL2 and ADDW2. */
14195 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14196 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14198 /* The select-operand-high-half versions of the add instruction
14199 have the same cost as the regular three vector version -
14200 don't add the costs of the select into the costs of the add. */
14202 op0 = aarch64_strip_extend_vec_half (op0);
14203 op1 = aarch64_strip_extend_vec_half (op1);
14207 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14208 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14210 /* CSINC. */
14211 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14212 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14213 return true;
14216 if (GET_MODE_CLASS (mode) == MODE_INT
14217 && (aarch64_plus_immediate (op1, mode)
14218 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14220 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14222 if (speed)
14224 /* ADD (immediate). */
14225 *cost += extra_cost->alu.arith;
14227 /* Some tunings prefer to not use the VL-based scalar ops.
14228 Increase the cost of the poly immediate to prevent their
14229 formation. */
14230 if (GET_CODE (op1) == CONST_POLY_INT
14231 && (aarch64_tune_params.extra_tuning_flags
14232 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14233 *cost += COSTS_N_INSNS (1);
14235 return true;
14238 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14240 /* Look for ADD (extended register). */
14241 if (is_a <scalar_int_mode> (mode)
14242 && aarch64_rtx_arith_op_extract_p (op0))
14244 if (speed)
14245 *cost += extra_cost->alu.extend_arith;
14247 op0 = aarch64_strip_extend (op0, true);
14248 *cost += rtx_cost (op0, VOIDmode,
14249 (enum rtx_code) GET_CODE (op0), 0, speed);
14250 return true;
14253 /* Strip any extend, leave shifts behind as we will
14254 cost them through mult_cost. */
14255 new_op0 = aarch64_strip_extend (op0, false);
14257 if (GET_CODE (new_op0) == MULT
14258 || aarch64_shift_p (GET_CODE (new_op0)))
14260 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14261 speed);
14262 return true;
14265 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14267 if (speed)
14269 if (VECTOR_MODE_P (mode))
14271 /* Vector ADD. */
14272 *cost += extra_cost->vect.alu;
14274 else if (GET_MODE_CLASS (mode) == MODE_INT)
14276 /* ADD. */
14277 *cost += extra_cost->alu.arith;
14279 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14281 /* FADD. */
14282 *cost += extra_cost->fp[mode == DFmode].addsub;
14285 return true;
14288 case BSWAP:
14289 *cost = COSTS_N_INSNS (1);
14291 if (speed)
14293 if (VECTOR_MODE_P (mode))
14294 *cost += extra_cost->vect.alu;
14295 else
14296 *cost += extra_cost->alu.rev;
14298 return false;
14300 case IOR:
14301 if (aarch_rev16_p (x))
14303 *cost = COSTS_N_INSNS (1);
14305 if (speed)
14307 if (VECTOR_MODE_P (mode))
14308 *cost += extra_cost->vect.alu;
14309 else
14310 *cost += extra_cost->alu.rev;
14312 return true;
14315 if (aarch64_extr_rtx_p (x, &op0, &op1))
14317 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14318 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14319 if (speed)
14320 *cost += extra_cost->alu.shift;
14322 return true;
14324 /* Fall through. */
14325 case XOR:
14326 case AND:
14327 cost_logic:
14328 op0 = XEXP (x, 0);
14329 op1 = XEXP (x, 1);
14331 if (VECTOR_MODE_P (mode))
14333 if (speed)
14334 *cost += extra_cost->vect.alu;
14335 return true;
14338 if (code == AND
14339 && GET_CODE (op0) == MULT
14340 && CONST_INT_P (XEXP (op0, 1))
14341 && CONST_INT_P (op1)
14342 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14343 INTVAL (op1)) != 0)
14345 /* This is a UBFM/SBFM. */
14346 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14347 if (speed)
14348 *cost += extra_cost->alu.bfx;
14349 return true;
14352 if (is_int_mode (mode, &int_mode))
14354 if (CONST_INT_P (op1))
14356 /* We have a mask + shift version of a UBFIZ
14357 i.e. the *andim_ashift<mode>_bfiz pattern. */
14358 if (GET_CODE (op0) == ASHIFT
14359 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14360 XEXP (op0, 1)))
14362 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14363 (enum rtx_code) code, 0, speed);
14364 if (speed)
14365 *cost += extra_cost->alu.bfx;
14367 return true;
14369 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14371 /* We possibly get the immediate for free, this is not
14372 modelled. */
14373 *cost += rtx_cost (op0, int_mode,
14374 (enum rtx_code) code, 0, speed);
14375 if (speed)
14376 *cost += extra_cost->alu.logical;
14378 return true;
14381 else
14383 rtx new_op0 = op0;
14385 /* Handle ORN, EON, or BIC. */
14386 if (GET_CODE (op0) == NOT)
14387 op0 = XEXP (op0, 0);
14389 new_op0 = aarch64_strip_shift (op0);
14391 /* If we had a shift on op0 then this is a logical-shift-
14392 by-register/immediate operation. Otherwise, this is just
14393 a logical operation. */
14394 if (speed)
14396 if (new_op0 != op0)
14398 /* Shift by immediate. */
14399 if (CONST_INT_P (XEXP (op0, 1)))
14400 *cost += extra_cost->alu.log_shift;
14401 else
14402 *cost += extra_cost->alu.log_shift_reg;
14404 else
14405 *cost += extra_cost->alu.logical;
14408 /* In both cases we want to cost both operands. */
14409 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14410 0, speed);
14411 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14412 1, speed);
14414 return true;
14417 return false;
14419 case NOT:
14420 x = XEXP (x, 0);
14421 op0 = aarch64_strip_shift (x);
14423 if (VECTOR_MODE_P (mode))
14425 /* Vector NOT. */
14426 *cost += extra_cost->vect.alu;
14427 return false;
14430 /* MVN-shifted-reg. */
14431 if (op0 != x)
14433 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14435 if (speed)
14436 *cost += extra_cost->alu.log_shift;
14438 return true;
14440 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14441 Handle the second form here taking care that 'a' in the above can
14442 be a shift. */
14443 else if (GET_CODE (op0) == XOR)
14445 rtx newop0 = XEXP (op0, 0);
14446 rtx newop1 = XEXP (op0, 1);
14447 rtx op0_stripped = aarch64_strip_shift (newop0);
14449 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14450 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14452 if (speed)
14454 if (op0_stripped != newop0)
14455 *cost += extra_cost->alu.log_shift;
14456 else
14457 *cost += extra_cost->alu.logical;
14460 return true;
14462 /* MVN. */
14463 if (speed)
14464 *cost += extra_cost->alu.logical;
14466 return false;
14468 case ZERO_EXTEND:
14470 op0 = XEXP (x, 0);
14471 /* If a value is written in SI mode, then zero extended to DI
14472 mode, the operation will in general be free as a write to
14473 a 'w' register implicitly zeroes the upper bits of an 'x'
14474 register. However, if this is
14476 (set (reg) (zero_extend (reg)))
14478 we must cost the explicit register move. */
14479 if (mode == DImode
14480 && GET_MODE (op0) == SImode)
14482 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14484 /* If OP_COST is non-zero, then the cost of the zero extend
14485 is effectively the cost of the inner operation. Otherwise
14486 we have a MOV instruction and we take the cost from the MOV
14487 itself. This is true independently of whether we are
14488 optimizing for space or time. */
14489 if (op_cost)
14490 *cost = op_cost;
14492 return true;
14494 else if (MEM_P (op0))
14496 /* All loads can zero extend to any size for free. */
14497 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14498 return true;
14501 op0 = aarch64_extend_bitfield_pattern_p (x);
14502 if (op0)
14504 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14505 if (speed)
14506 *cost += extra_cost->alu.bfx;
14507 return true;
14510 if (speed)
14512 if (VECTOR_MODE_P (mode))
14514 /* UMOV. */
14515 *cost += extra_cost->vect.alu;
14517 else
14519 /* We generate an AND instead of UXTB/UXTH. */
14520 *cost += extra_cost->alu.logical;
14523 return false;
14525 case SIGN_EXTEND:
14526 if (MEM_P (XEXP (x, 0)))
14528 /* LDRSH. */
14529 if (speed)
14531 rtx address = XEXP (XEXP (x, 0), 0);
14532 *cost += extra_cost->ldst.load_sign_extend;
14534 *cost +=
14535 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14536 0, speed));
14538 return true;
14541 op0 = aarch64_extend_bitfield_pattern_p (x);
14542 if (op0)
14544 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14545 if (speed)
14546 *cost += extra_cost->alu.bfx;
14547 return true;
14550 if (speed)
14552 if (VECTOR_MODE_P (mode))
14553 *cost += extra_cost->vect.alu;
14554 else
14555 *cost += extra_cost->alu.extend;
14557 return false;
14559 case ASHIFT:
14560 op0 = XEXP (x, 0);
14561 op1 = XEXP (x, 1);
14563 if (CONST_INT_P (op1))
14565 if (speed)
14567 if (VECTOR_MODE_P (mode))
14569 /* Vector shift (immediate). */
14570 *cost += extra_cost->vect.alu;
14572 else
14574 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
14575 aliases. */
14576 *cost += extra_cost->alu.shift;
14580 /* We can incorporate zero/sign extend for free. */
14581 if (GET_CODE (op0) == ZERO_EXTEND
14582 || GET_CODE (op0) == SIGN_EXTEND)
14583 op0 = XEXP (op0, 0);
14585 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14586 return true;
14588 else
14590 if (VECTOR_MODE_P (mode))
14592 if (speed)
14593 /* Vector shift (register). */
14594 *cost += extra_cost->vect.alu;
14596 else
14598 if (speed)
14599 /* LSLV. */
14600 *cost += extra_cost->alu.shift_reg;
14602 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14603 && CONST_INT_P (XEXP (op1, 1))
14604 && known_eq (INTVAL (XEXP (op1, 1)),
14605 GET_MODE_BITSIZE (mode) - 1))
14607 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14608 /* We already demanded XEXP (op1, 0) to be REG_P, so
14609 don't recurse into it. */
14610 return true;
14613 return false; /* All arguments need to be in registers. */
14616 case ROTATE:
14617 case ROTATERT:
14618 case LSHIFTRT:
14619 case ASHIFTRT:
14620 op0 = XEXP (x, 0);
14621 op1 = XEXP (x, 1);
14623 if (CONST_INT_P (op1))
14625 /* ASR (immediate) and friends. */
14626 if (speed)
14628 if (VECTOR_MODE_P (mode))
14629 *cost += extra_cost->vect.alu;
14630 else
14631 *cost += extra_cost->alu.shift;
14634 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14635 return true;
14637 else
14639 if (VECTOR_MODE_P (mode))
14641 if (speed)
14642 /* Vector shift (register). */
14643 *cost += extra_cost->vect.alu;
14645 else
14647 if (speed)
14648 /* ASR (register) and friends. */
14649 *cost += extra_cost->alu.shift_reg;
14651 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14652 && CONST_INT_P (XEXP (op1, 1))
14653 && known_eq (INTVAL (XEXP (op1, 1)),
14654 GET_MODE_BITSIZE (mode) - 1))
14656 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14657 /* We already demanded XEXP (op1, 0) to be REG_P, so
14658 don't recurse into it. */
14659 return true;
14662 return false; /* All arguments need to be in registers. */
14665 case SYMBOL_REF:
14667 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14668 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14670 /* LDR. */
14671 if (speed)
14672 *cost += extra_cost->ldst.load;
14674 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14675 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14677 /* ADRP, followed by ADD. */
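/* E.g. for the small code model:

	adrp	x0, sym
	add	x0, x0, :lo12:sym  */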
14678 *cost += COSTS_N_INSNS (1);
14679 if (speed)
14680 *cost += 2 * extra_cost->alu.arith;
14682 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14683 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14685 /* ADR. */
14686 if (speed)
14687 *cost += extra_cost->alu.arith;
14690 if (flag_pic)
14692 /* One extra load instruction, after accessing the GOT. */
14693 *cost += COSTS_N_INSNS (1);
14694 if (speed)
14695 *cost += extra_cost->ldst.load;
14697 return true;
14699 case HIGH:
14700 case LO_SUM:
14701 /* ADRP/ADD (immediate). */
14702 if (speed)
14703 *cost += extra_cost->alu.arith;
14704 return true;
14706 case ZERO_EXTRACT:
14707 case SIGN_EXTRACT:
14708 /* UBFX/SBFX. */
14709 if (speed)
14711 if (VECTOR_MODE_P (mode))
14712 *cost += extra_cost->vect.alu;
14713 else
14714 *cost += extra_cost->alu.bfx;
14717 /* We can trust that the immediates used will be correct (there
14718 are no by-register forms), so we need only cost op0. */
14719 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14720 return true;
14722 case MULT:
14723 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14724 /* aarch64_rtx_mult_cost always handles recursion to its
14725 operands. */
14726 return true;
14728 case MOD:
14729 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14730 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
14731 an unconditional negate. This case should only ever be reached through
14732 the set_smod_pow2_cheap check in expmed.cc. */
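/* For example, a signed "x % 4" can be emitted roughly as:

	negs	w1, w0
	and	w0, w0, 3
	and	w1, w1, 3
	csneg	w0, w0, w1, mi

   (register allocation and operand order may differ).  */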
14733 if (CONST_INT_P (XEXP (x, 1))
14734 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14735 && (mode == SImode || mode == DImode))
14737 /* We expand to 4 instructions. Reset the baseline. */
14738 *cost = COSTS_N_INSNS (4);
14740 if (speed)
14741 *cost += 2 * extra_cost->alu.logical
14742 + 2 * extra_cost->alu.arith;
14744 return true;
14747 /* Fall-through. */
14748 case UMOD:
14749 if (speed)
14751 /* Slightly prefer UMOD over SMOD. */
14752 if (VECTOR_MODE_P (mode))
14753 *cost += extra_cost->vect.alu;
14754 else if (GET_MODE_CLASS (mode) == MODE_INT)
14755 *cost += (extra_cost->mult[mode == DImode].add
14756 + extra_cost->mult[mode == DImode].idiv
14757 + (code == MOD ? 1 : 0));
14759 return false; /* All arguments need to be in registers. */
14761 case DIV:
14762 case UDIV:
14763 case SQRT:
14764 if (speed)
14766 if (VECTOR_MODE_P (mode))
14767 *cost += extra_cost->vect.alu;
14768 else if (GET_MODE_CLASS (mode) == MODE_INT)
14769 /* There is no integer SQRT, so only DIV and UDIV can get
14770 here. */
14771 *cost += (extra_cost->mult[mode == DImode].idiv
14772 /* Slightly prefer UDIV over SDIV. */
14773 + (code == DIV ? 1 : 0));
14774 else
14775 *cost += extra_cost->fp[mode == DFmode].div;
14777 return false; /* All arguments need to be in registers. */
14779 case IF_THEN_ELSE:
14780 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14781 XEXP (x, 2), cost, speed);
14783 case EQ:
14784 case NE:
14785 case GT:
14786 case GTU:
14787 case LT:
14788 case LTU:
14789 case GE:
14790 case GEU:
14791 case LE:
14792 case LEU:
14794 return false; /* All arguments must be in registers. */
14796 case FMA:
14797 op0 = XEXP (x, 0);
14798 op1 = XEXP (x, 1);
14799 op2 = XEXP (x, 2);
14801 if (speed)
14803 if (VECTOR_MODE_P (mode))
14804 *cost += extra_cost->vect.alu;
14805 else
14806 *cost += extra_cost->fp[mode == DFmode].fma;
14809 /* FMSUB, FNMADD, and FNMSUB are free. */
14810 if (GET_CODE (op0) == NEG)
14811 op0 = XEXP (op0, 0);
14813 if (GET_CODE (op2) == NEG)
14814 op2 = XEXP (op2, 0);
14816 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14817 and the by-element operand as operand 0. */
14818 if (GET_CODE (op1) == NEG)
14819 op1 = XEXP (op1, 0);
14821 /* Catch vector-by-element operations. The by-element operand can
14822 either be (vec_duplicate (vec_select (x))) or just
14823 (vec_select (x)), depending on whether we are multiplying by
14824 a vector or a scalar.
14826 Canonicalization is not very good in these cases: FMA4 will put the
14827 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
14828 if (GET_CODE (op0) == VEC_DUPLICATE)
14829 op0 = XEXP (op0, 0);
14830 else if (GET_CODE (op1) == VEC_DUPLICATE)
14831 op1 = XEXP (op1, 0);
14833 if (GET_CODE (op0) == VEC_SELECT)
14834 op0 = XEXP (op0, 0);
14835 else if (GET_CODE (op1) == VEC_SELECT)
14836 op1 = XEXP (op1, 0);
14838 /* If the remaining parameters are not registers,
14839 get the cost to put them into registers. */
14840 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14841 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14842 *cost += rtx_cost (op2, mode, FMA, 2, speed);
14843 return true;
14845 case FLOAT:
14846 case UNSIGNED_FLOAT:
14847 if (speed)
14848 *cost += extra_cost->fp[mode == DFmode].fromint;
14849 return false;
14851 case FLOAT_EXTEND:
14852 if (speed)
14854 if (VECTOR_MODE_P (mode))
14856 /* Vector widen. */
14857 *cost += extra_cost->vect.alu;
14859 else
14860 *cost += extra_cost->fp[mode == DFmode].widen;
14862 return false;
14864 case FLOAT_TRUNCATE:
14865 if (speed)
14867 if (VECTOR_MODE_P (mode))
14869 /* Vector narrow. */
14870 *cost += extra_cost->vect.alu;
14872 else
14873 *cost += extra_cost->fp[mode == DFmode].narrow;
14875 return false;
14877 case FIX:
14878 case UNSIGNED_FIX:
14879 x = XEXP (x, 0);
14880 /* Strip the rounding part. They will all be implemented
14881 by the fcvt* family of instructions anyway. */
14882 if (GET_CODE (x) == UNSPEC)
14884 unsigned int uns_code = XINT (x, 1);
14886 if (uns_code == UNSPEC_FRINTA
14887 || uns_code == UNSPEC_FRINTM
14888 || uns_code == UNSPEC_FRINTN
14889 || uns_code == UNSPEC_FRINTP
14890 || uns_code == UNSPEC_FRINTZ)
14891 x = XVECEXP (x, 0, 0);
14894 if (speed)
14896 if (VECTOR_MODE_P (mode))
14897 *cost += extra_cost->vect.alu;
14898 else
14899 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
14902 /* We can combine fmul by a power of 2 followed by a fcvt into a single
14903 fixed-point fcvt. */
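/* E.g. a multiply by 16.0 followed by a convert-to-integer can become a
   single "fcvtzs w0, s0, #4", which scales by 2^4 as part of the
   conversion.  */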
14904 if (GET_CODE (x) == MULT
14905 && ((VECTOR_MODE_P (mode)
14906 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
14907 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
14909 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
14910 0, speed);
14911 return true;
14914 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
14915 return true;
14917 case ABS:
14918 if (VECTOR_MODE_P (mode))
14920 /* ABS (vector). */
14921 if (speed)
14922 *cost += extra_cost->vect.alu;
14924 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14926 op0 = XEXP (x, 0);
14928 /* FABD, which is analogous to FADD. */
14929 if (GET_CODE (op0) == MINUS)
14931 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
14932 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
14933 if (speed)
14934 *cost += extra_cost->fp[mode == DFmode].addsub;
14936 return true;
14938 /* Simple FABS is analogous to FNEG. */
14939 if (speed)
14940 *cost += extra_cost->fp[mode == DFmode].neg;
14942 else
14944 /* Integer ABS will either be split into
14945 two arithmetic instructions, or will be an ABS
14946 (scalar), which we don't model. */
14947 *cost = COSTS_N_INSNS (2);
14948 if (speed)
14949 *cost += 2 * extra_cost->alu.arith;
14951 return false;
14953 case SMAX:
14954 case SMIN:
14955 if (speed)
14957 if (VECTOR_MODE_P (mode))
14958 *cost += extra_cost->vect.alu;
14959 else
14961 /* FMAXNM/FMINNM/FMAX/FMIN.
14962 TODO: This may not be accurate for all implementations, but
14963 we do not model this in the cost tables. */
14964 *cost += extra_cost->fp[mode == DFmode].addsub;
14967 return false;
14969 case UNSPEC:
14970 /* The floating point round to integer frint* instructions. */
14971 if (aarch64_frint_unspec_p (XINT (x, 1)))
14973 if (speed)
14974 *cost += extra_cost->fp[mode == DFmode].roundint;
14976 return false;
14979 if (XINT (x, 1) == UNSPEC_RBIT)
14981 if (speed)
14982 *cost += extra_cost->alu.rev;
14984 return false;
14986 break;
14988 case TRUNCATE:
14990 /* Decompose <su>muldi3_highpart. */
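/* The unsigned form of the pattern being matched is:

     (truncate:DI
       (lshiftrt:TI
	 (mult:TI (zero_extend:TI (reg:DI)) (zero_extend:TI (reg:DI)))
	 (const_int 64)))

   which corresponds to a single UMULH instruction (SMULH for the
   sign-extended form).  */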
14991 if (/* (truncate:DI */
14992 mode == DImode
14993 /* (lshiftrt:TI */
14994 && GET_MODE (XEXP (x, 0)) == TImode
14995 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
14996 /* (mult:TI */
14997 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
14998 /* (ANY_EXTEND:TI (reg:DI))
14999 (ANY_EXTEND:TI (reg:DI))) */
15000 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15001 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15002 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15003 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15004 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15005 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15006 /* (const_int 64) */
15007 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15008 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15010 /* UMULH/SMULH. */
15011 if (speed)
15012 *cost += extra_cost->mult[mode == DImode].extend;
15013 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15014 mode, MULT, 0, speed);
15015 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15016 mode, MULT, 1, speed);
15017 return true;
15019 break;
15020 case CONST_VECTOR:
15022 /* Load using MOVI/MVNI. */
15023 if (aarch64_simd_valid_immediate (x, NULL))
15024 *cost = extra_cost->vect.movi;
15025 else /* Load using constant pool. */
15026 *cost = extra_cost->ldst.load;
15027 break;
15029 case VEC_CONCAT:
15030 /* Depending on the operation, either a DUP or an INS.
15031 For now, keep the default costing. */
15032 break;
15033 case VEC_DUPLICATE:
15034 /* Load using a DUP. */
15035 *cost = extra_cost->vect.dup;
15036 return false;
15037 case VEC_SELECT:
15039 rtx op0 = XEXP (x, 0);
15040 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15042 /* Cost a lowpart select as a free subreg, a highpart select as a DUP, and anything else as an extract. */
15043 rtx op1 = XEXP (x, 1);
15044 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15046 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15047 *cost = extra_cost->vect.dup;
15048 else
15049 *cost = extra_cost->vect.extract;
15050 return true;
15052 default:
15053 break;
15056 if (dump_file
15057 && flag_aarch64_verbose_cost)
15058 fprintf (dump_file,
15059 "\nFailed to cost RTX. Assuming default cost.\n");
15061 return true;
15064 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15065 calculated for X. This cost is stored in *COST. Returns true
15066 if the total cost of X was calculated. */
15067 static bool
15068 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15069 int param, int *cost, bool speed)
15071 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15073 if (dump_file
15074 && flag_aarch64_verbose_cost)
15076 print_rtl_single (dump_file, x);
15077 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15078 speed ? "Hot" : "Cold",
15079 *cost, result ? "final" : "partial");
15082 return result;
15085 static int
15086 aarch64_register_move_cost (machine_mode mode,
15087 reg_class_t from_i, reg_class_t to_i)
15089 enum reg_class from = (enum reg_class) from_i;
15090 enum reg_class to = (enum reg_class) to_i;
15091 const struct cpu_regmove_cost *regmove_cost
15092 = aarch64_tune_params.regmove_cost;
15094 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
15095 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
15096 || to == STUB_REGS)
15097 to = GENERAL_REGS;
15099 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
15100 || from == STUB_REGS)
15101 from = GENERAL_REGS;
15103 /* Make RDFFR very expensive. In particular, if we know that the FFR
15104 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15105 as a way of obtaining a PTRUE. */
15106 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15107 && hard_reg_set_subset_p (reg_class_contents[from_i],
15108 reg_class_contents[FFR_REGS]))
15109 return 80;
15111 /* The cost of moving between GPRs and the stack register is the same as GP2GP. */
15112 if ((from == GENERAL_REGS && to == STACK_REG)
15113 || (to == GENERAL_REGS && from == STACK_REG))
15114 return regmove_cost->GP2GP;
15116 /* To/From the stack register, we move via the gprs. */
15117 if (to == STACK_REG || from == STACK_REG)
15118 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15119 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15121 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15122 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15123 && known_eq (GET_MODE_SIZE (mode), 16))
15125 /* 128-bit operations on general registers require 2 instructions. */
15126 if (from == GENERAL_REGS && to == GENERAL_REGS)
15127 return regmove_cost->GP2GP * 2;
15128 else if (from == GENERAL_REGS)
15129 return regmove_cost->GP2FP * 2;
15130 else if (to == GENERAL_REGS)
15131 return regmove_cost->FP2GP * 2;
15133 /* When AdvSIMD instructions are disabled it is not possible to move
15134 a 128-bit value directly between Q registers. This is handled in
15135 secondary reload. A general register is used as a scratch to move
15136 the upper DI value and the lower DI value is moved directly,
15137 hence the cost is the sum of three moves. */
15138 if (! TARGET_SIMD)
15139 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15141 return regmove_cost->FP2FP;
15144 if (from == GENERAL_REGS && to == GENERAL_REGS)
15145 return regmove_cost->GP2GP;
15146 else if (from == GENERAL_REGS)
15147 return regmove_cost->GP2FP;
15148 else if (to == GENERAL_REGS)
15149 return regmove_cost->FP2GP;
15151 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15153 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15154 The cost must be greater than 2 units to indicate that direct
15155 moves aren't possible. */
15156 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15157 + aarch64_tune_params.memmov_cost.store_fp);
15158 return MIN (CEIL (per_vector, 2), 4);
15161 return regmove_cost->FP2FP;
15164 /* Implements TARGET_MEMORY_MOVE_COST. */
15165 static int
15166 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15168 enum reg_class rclass = (enum reg_class) rclass_i;
15169 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15170 ? reg_classes_intersect_p (rclass, PR_REGS)
15171 : reg_class_subset_p (rclass, PR_REGS))
15172 return (in
15173 ? aarch64_tune_params.memmov_cost.load_pred
15174 : aarch64_tune_params.memmov_cost.store_pred);
15176 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15177 ? reg_classes_intersect_p (rclass, FP_REGS)
15178 : reg_class_subset_p (rclass, FP_REGS))
15179 return (in
15180 ? aarch64_tune_params.memmov_cost.load_fp
15181 : aarch64_tune_params.memmov_cost.store_fp);
15183 return (in
15184 ? aarch64_tune_params.memmov_cost.load_int
15185 : aarch64_tune_params.memmov_cost.store_int);
15188 /* Implement TARGET_INIT_BUILTINS. */
15189 static void
15190 aarch64_init_builtins ()
15192 aarch64_general_init_builtins ();
15193 aarch64_sve::init_builtins ();
15194 #ifdef SUBTARGET_INIT_BUILTINS
15195 SUBTARGET_INIT_BUILTINS;
15196 #endif
15199 /* Implement TARGET_FOLD_BUILTIN. */
15200 static tree
15201 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15203 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15204 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15205 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15206 switch (code & AARCH64_BUILTIN_CLASS)
15208 case AARCH64_BUILTIN_GENERAL:
15209 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15211 case AARCH64_BUILTIN_SVE:
15212 return NULL_TREE;
15214 gcc_unreachable ();
15217 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15218 static bool
15219 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15221 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15222 tree fndecl = gimple_call_fndecl (stmt);
15223 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15224 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15225 gimple *new_stmt = NULL;
15226 switch (code & AARCH64_BUILTIN_CLASS)
15228 case AARCH64_BUILTIN_GENERAL:
15229 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15230 break;
15232 case AARCH64_BUILTIN_SVE:
15233 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15234 break;
15237 if (!new_stmt)
15238 return false;
15240 gsi_replace (gsi, new_stmt, true);
15241 return true;
15244 /* Implement TARGET_EXPAND_BUILTIN. */
15245 static rtx
15246 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15248 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15249 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15250 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15251 switch (code & AARCH64_BUILTIN_CLASS)
15253 case AARCH64_BUILTIN_GENERAL:
15254 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15256 case AARCH64_BUILTIN_SVE:
15257 return aarch64_sve::expand_builtin (subcode, exp, target);
15259 gcc_unreachable ();
15262 /* Implement TARGET_BUILTIN_DECL. */
15263 static tree
15264 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15266 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15267 switch (code & AARCH64_BUILTIN_CLASS)
15269 case AARCH64_BUILTIN_GENERAL:
15270 return aarch64_general_builtin_decl (subcode, initialize_p);
15272 case AARCH64_BUILTIN_SVE:
15273 return aarch64_sve::builtin_decl (subcode, initialize_p);
15275 gcc_unreachable ();
15278 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15279 to optimize 1.0/sqrt. */
15281 static bool
15282 use_rsqrt_p (machine_mode mode)
15284 return (!flag_trapping_math
15285 && flag_unsafe_math_optimizations
15286 && ((aarch64_tune_params.approx_modes->recip_sqrt
15287 & AARCH64_APPROX_MODE (mode))
15288 || flag_mrecip_low_precision_sqrt));
15291 /* Function to decide when to use the approximate reciprocal square root
15292 builtin. */
15294 static tree
15295 aarch64_builtin_reciprocal (tree fndecl)
15297 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15299 if (!use_rsqrt_p (mode))
15300 return NULL_TREE;
15301 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15302 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15303 switch (code & AARCH64_BUILTIN_CLASS)
15305 case AARCH64_BUILTIN_GENERAL:
15306 return aarch64_general_builtin_rsqrt (subcode);
15308 case AARCH64_BUILTIN_SVE:
15309 return NULL_TREE;
15311 gcc_unreachable ();
15314 /* Emit code to perform the floating-point operation:
15316 DST = SRC1 * SRC2
15318 where all three operands are already known to be registers.
15319 If the operation is an SVE one, PTRUE is a suitable all-true
15320 predicate. */
15322 static void
15323 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15325 if (ptrue)
15326 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15327 dst, ptrue, src1, src2,
15328 gen_int_mode (SVE_RELAXED_GP, SImode)));
15329 else
15330 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15333 /* Emit instruction sequence to compute either the approximate square root
15334 or its approximate reciprocal, depending on the flag RECP, and return
15335 whether the sequence was emitted or not. */
15337 bool
15338 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15340 machine_mode mode = GET_MODE (dst);
15342 if (GET_MODE_INNER (mode) == HFmode)
15344 gcc_assert (!recp);
15345 return false;
15348 if (!recp)
15350 if (!(flag_mlow_precision_sqrt
15351 || (aarch64_tune_params.approx_modes->sqrt
15352 & AARCH64_APPROX_MODE (mode))))
15353 return false;
15355 if (!flag_finite_math_only
15356 || flag_trapping_math
15357 || !flag_unsafe_math_optimizations
15358 || optimize_function_for_size_p (cfun))
15359 return false;
15361 else
15362 /* Caller assumes we cannot fail. */
15363 gcc_assert (use_rsqrt_p (mode));
15365 rtx pg = NULL_RTX;
15366 if (aarch64_sve_mode_p (mode))
15367 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15368 machine_mode mmsk = (VECTOR_MODE_P (mode)
15369 ? related_int_vector_mode (mode).require ()
15370 : int_mode_for_mode (mode).require ());
15371 rtx xmsk = NULL_RTX;
15372 if (!recp)
15374 /* When calculating the approximate square root, compare the
15375 argument with 0.0 and create a mask. */
15376 rtx zero = CONST0_RTX (mode);
15377 if (pg)
15379 xmsk = gen_reg_rtx (GET_MODE (pg));
15380 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15381 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15382 xmsk, pg, hint, src, zero));
15384 else
15386 xmsk = gen_reg_rtx (mmsk);
15387 emit_insn (gen_rtx_SET (xmsk,
15388 gen_rtx_NEG (mmsk,
15389 gen_rtx_EQ (mmsk, src, zero))));
15393 /* Estimate the approximate reciprocal square root. */
15394 rtx xdst = gen_reg_rtx (mode);
15395 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15397 /* Iterate over the series twice for SF and thrice for DF. */
15398 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15400 /* Optionally iterate over the series once less for faster performance
15401 while sacrificing some accuracy. */
15402 if ((recp && flag_mrecip_low_precision_sqrt)
15403 || (!recp && flag_mlow_precision_sqrt))
15404 iterations--;
15406 /* Iterate over the series to calculate the approximate reciprocal square
15407 root. */
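/* Each iteration refines the estimate using the Newton-Raphson step

     x(n+1) = x(n) * (3 - src * x(n)^2) / 2

   with FRSQRTS computing (3 - a * b) / 2 and the multiplications
   emitted through aarch64_emit_mult.  */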
15408 rtx x1 = gen_reg_rtx (mode);
15409 while (iterations--)
15411 rtx x2 = gen_reg_rtx (mode);
15412 aarch64_emit_mult (x2, pg, xdst, xdst);
15414 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15416 if (iterations > 0)
15417 aarch64_emit_mult (xdst, pg, xdst, x1);
15420 if (!recp)
15422 if (pg)
15423 /* Multiply nonzero source values by the corresponding intermediate
15424 result elements, so that the final calculation is the approximate
15425 square root rather than its reciprocal. Select a zero result for
15426 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15427 otherwise. */
15428 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15429 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15430 else
15432 /* Qualify the approximate reciprocal square root when the
15433 argument is 0.0 by squashing the intermediate result to 0.0. */
15434 rtx xtmp = gen_reg_rtx (mmsk);
15435 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15436 gen_rtx_SUBREG (mmsk, xdst, 0)));
15437 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15439 /* Calculate the approximate square root. */
15440 aarch64_emit_mult (xdst, pg, xdst, src);
15444 /* Finalize the approximation. */
15445 aarch64_emit_mult (dst, pg, xdst, x1);
15447 return true;
15450 /* Emit the instruction sequence to compute the approximation for the division
15451 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
15453 bool
15454 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15456 machine_mode mode = GET_MODE (quo);
15458 if (GET_MODE_INNER (mode) == HFmode)
15459 return false;
15461 bool use_approx_division_p = (flag_mlow_precision_div
15462 || (aarch64_tune_params.approx_modes->division
15463 & AARCH64_APPROX_MODE (mode)));
15465 if (!flag_finite_math_only
15466 || flag_trapping_math
15467 || !flag_unsafe_math_optimizations
15468 || optimize_function_for_size_p (cfun)
15469 || !use_approx_division_p)
15470 return false;
15472 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15473 return false;
15475 rtx pg = NULL_RTX;
15476 if (aarch64_sve_mode_p (mode))
15477 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15479 /* Estimate the approximate reciprocal. */
15480 rtx xrcp = gen_reg_rtx (mode);
15481 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15483 /* Iterate over the series twice for SF and thrice for DF. */
15484 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15486 /* Optionally iterate over the series less for faster performance,
15487 while sacrificing some accuracy. The default is 2 for DF and 1 for SF. */
15488 if (flag_mlow_precision_div)
15489 iterations = (GET_MODE_INNER (mode) == DFmode
15490 ? aarch64_double_recp_precision
15491 : aarch64_float_recp_precision);
15493 /* Iterate over the series to calculate the approximate reciprocal. */
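/* Each iteration refines the estimate using the Newton-Raphson step

     x(n+1) = x(n) * (2 - den * x(n))

   with FRECPS computing 2 - a * b.  */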
15494 rtx xtmp = gen_reg_rtx (mode);
15495 while (iterations--)
15497 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15499 if (iterations > 0)
15500 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15503 if (num != CONST1_RTX (mode))
15505 /* As the approximate reciprocal of DEN is already calculated, only
15506 calculate the approximate division when NUM is not 1.0. */
15507 rtx xnum = force_reg (mode, num);
15508 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15511 /* Finalize the approximation. */
15512 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15513 return true;
15516 /* Return the number of instructions that can be issued per cycle. */
15517 static int
15518 aarch64_sched_issue_rate (void)
15520 return aarch64_tune_params.issue_rate;
15523 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15524 static int
15525 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15527 if (DEBUG_INSN_P (insn))
15528 return more;
15530 rtx_code code = GET_CODE (PATTERN (insn));
15531 if (code == USE || code == CLOBBER)
15532 return more;
15534 if (get_attr_type (insn) == TYPE_NO_INSN)
15535 return more;
15537 return more - 1;
15540 static int
15541 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15543 int issue_rate = aarch64_sched_issue_rate ();
15545 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15549 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15550 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15551 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15553 static int
15554 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15555 int ready_index)
15557 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15561 /* Vectorizer cost model target hooks. */
15563 /* Information about how the CPU would issue the scalar, Advanced SIMD
15564 or SVE version of a vector loop, using the scheme defined by the
15565 aarch64_base_vec_issue_info hierarchy of structures. */
15566 class aarch64_vec_op_count
15568 public:
15569 aarch64_vec_op_count () = default;
15570 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15571 unsigned int = 1);
15573 unsigned int vec_flags () const { return m_vec_flags; }
15574 unsigned int vf_factor () const { return m_vf_factor; }
15576 const aarch64_base_vec_issue_info *base_issue_info () const;
15577 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15578 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15580 fractional_cost rename_cycles_per_iter () const;
15581 fractional_cost min_nonpred_cycles_per_iter () const;
15582 fractional_cost min_pred_cycles_per_iter () const;
15583 fractional_cost min_cycles_per_iter () const;
15585 void dump () const;
15587 /* The number of individual "general" operations. See the comments
15588 in aarch64_base_vec_issue_info for details. */
15589 unsigned int general_ops = 0;
15591 /* The number of load and store operations, under the same scheme
15592 as above. */
15593 unsigned int loads = 0;
15594 unsigned int stores = 0;
15596 /* The minimum number of cycles needed to execute all loop-carried
15597 operations, which in the vector code become associated with
15598 reductions. */
15599 unsigned int reduction_latency = 0;
15601 /* The number of individual predicate operations. See the comments
15602 in aarch64_sve_vec_issue_info for details. */
15603 unsigned int pred_ops = 0;
15605 private:
15606 /* The issue information for the core. */
15607 const aarch64_vec_issue_info *m_issue_info = nullptr;
15609 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15610 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15611 Advanced SIMD code.
15612 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15613 SVE code. */
15614 unsigned int m_vec_flags = 0;
15616 /* Assume that, when the code is executing on the core described
15617 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15618 times more data than the vectorizer anticipates.
15620 This is only ever different from 1 for SVE. It allows us to consider
15621 what would happen on a 256-bit SVE target even when the -mtune
15622 parameters say that the “likely” SVE length is 128 bits. */
15623 unsigned int m_vf_factor = 1;
15626 aarch64_vec_op_count::
15627 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15628 unsigned int vec_flags, unsigned int vf_factor)
15629 : m_issue_info (issue_info),
15630 m_vec_flags (vec_flags),
15631 m_vf_factor (vf_factor)
15635 /* Return the base issue information (i.e. the parts that make sense
15636 for both scalar and vector code). Return null if we have no issue
15637 information. */
15638 const aarch64_base_vec_issue_info *
15639 aarch64_vec_op_count::base_issue_info () const
15641 if (auto *ret = simd_issue_info ())
15642 return ret;
15643 return m_issue_info->scalar;
15646 /* If the structure describes vector code and we have associated issue
15647 information, return that issue information, otherwise return null. */
15648 const aarch64_simd_vec_issue_info *
15649 aarch64_vec_op_count::simd_issue_info () const
15651 if (auto *ret = sve_issue_info ())
15652 return ret;
15653 if (m_vec_flags)
15654 return m_issue_info->advsimd;
15655 return nullptr;
15658 /* If the structure describes SVE code and we have associated issue
15659 information, return that issue information, otherwise return null. */
15660 const aarch64_sve_vec_issue_info *
15661 aarch64_vec_op_count::sve_issue_info () const
15663 if (m_vec_flags & VEC_ANY_SVE)
15664 return m_issue_info->sve;
15665 return nullptr;
15668 /* Estimate the minimum number of cycles per iteration needed to rename
15669 the instructions.
15671 ??? For now this is done inline rather than via cost tables, since it
15672 isn't clear how it should be parameterized for the general case. */
15673 fractional_cost
15674 aarch64_vec_op_count::rename_cycles_per_iter () const
15676 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15677 || sve_issue_info () == &neoversen2_sve_issue_info
15678 || sve_issue_info () == &neoversev2_sve_issue_info)
15679 /* + 1 for an addition. We've already counted a general op for each
15680 store, so we don't need to account for stores separately. The branch
15681 reads no registers and so does not need to be counted either.
15683 ??? This value is very much on the pessimistic side, but seems to work
15684 pretty well in practice. */
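/* I.e. assume these cores can rename roughly five such operations
   per cycle.  */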
15685 return { general_ops + loads + pred_ops + 1, 5 };
15687 return 0;
15690 /* Like min_cycles_per_iter, but excluding predicate operations. */
15691 fractional_cost
15692 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15694 auto *issue_info = base_issue_info ();
15696 fractional_cost cycles = MAX (reduction_latency, 1);
15697 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15698 cycles = std::max (cycles, { loads + stores,
15699 issue_info->loads_stores_per_cycle });
15700 cycles = std::max (cycles, { general_ops,
15701 issue_info->general_ops_per_cycle });
15702 cycles = std::max (cycles, rename_cycles_per_iter ());
15703 return cycles;
15706 /* Like min_cycles_per_iter, but including only the predicate operations. */
15707 fractional_cost
15708 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15710 if (auto *issue_info = sve_issue_info ())
15711 return { pred_ops, issue_info->pred_ops_per_cycle };
15712 return 0;
15715 /* Estimate the minimum number of cycles needed to issue the operations.
15716 This is a very simplistic model! */
15717 fractional_cost
15718 aarch64_vec_op_count::min_cycles_per_iter () const
15720 return std::max (min_nonpred_cycles_per_iter (),
15721 min_pred_cycles_per_iter ());
15724 /* Dump information about the structure. */
15725 void
15726 aarch64_vec_op_count::dump () const
15728 dump_printf_loc (MSG_NOTE, vect_location,
15729 " load operations = %d\n", loads);
15730 dump_printf_loc (MSG_NOTE, vect_location,
15731 " store operations = %d\n", stores);
15732 dump_printf_loc (MSG_NOTE, vect_location,
15733 " general operations = %d\n", general_ops);
15734 if (sve_issue_info ())
15735 dump_printf_loc (MSG_NOTE, vect_location,
15736 " predicate operations = %d\n", pred_ops);
15737 dump_printf_loc (MSG_NOTE, vect_location,
15738 " reduction latency = %d\n", reduction_latency);
15739 if (auto rcpi = rename_cycles_per_iter ())
15740 dump_printf_loc (MSG_NOTE, vect_location,
15741 " estimated cycles per iteration to rename = %f\n",
15742 rcpi.as_double ());
15743 if (auto pred_cpi = min_pred_cycles_per_iter ())
15745 dump_printf_loc (MSG_NOTE, vect_location,
15746 " estimated min cycles per iteration"
15747 " without predication = %f\n",
15748 min_nonpred_cycles_per_iter ().as_double ());
15749 dump_printf_loc (MSG_NOTE, vect_location,
15750 " estimated min cycles per iteration"
15751 " for predication = %f\n", pred_cpi.as_double ());
15753 if (auto cpi = min_cycles_per_iter ())
15754 dump_printf_loc (MSG_NOTE, vect_location,
15755 " estimated min cycles per iteration = %f\n",
15756 cpi.as_double ());
15759 /* Information about vector code that we're in the process of costing. */
15760 class aarch64_vector_costs : public vector_costs
15762 public:
15763 aarch64_vector_costs (vec_info *, bool);
15765 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15766 stmt_vec_info stmt_info, slp_tree, tree vectype,
15767 int misalign,
15768 vect_cost_model_location where) override;
15769 void finish_cost (const vector_costs *) override;
15770 bool better_main_loop_than_p (const vector_costs *other) const override;
15772 private:
15773 void record_potential_advsimd_unrolling (loop_vec_info);
15774 void analyze_loop_vinfo (loop_vec_info);
15775 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15776 aarch64_vec_op_count *);
15777 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15778 fractional_cost, unsigned int,
15779 unsigned int *, bool *);
15780 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15781 unsigned int);
15782 bool prefer_unrolled_loop () const;
15783 unsigned int determine_suggested_unroll_factor ();
15785 /* True if we have performed one-time initialization based on the
15786 vec_info. */
15787 bool m_analyzed_vinfo = false;
15789 /* This loop uses an average operation that is not supported by SVE, but is
15790 supported by Advanced SIMD and SVE2. */
15791 bool m_has_avg = false;
15793 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15794 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15795 SIMD code.
15796 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15797 unsigned int m_vec_flags = 0;
15799 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15800 This means that code such as:
15802 a[0] = x;
15803 a[1] = x;
15805 will be costed as two scalar instructions and two vector instructions
15806 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15807 wins if the costs are equal, because the vector costs
15808 include constant initializations whereas the scalar costs don't.
15809 We would therefore tend to vectorize the code above, even though
15810 the scalar version can use a single STP.
15812 We should eventually fix this and model LDP and STP in the main costs;
15813 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15814 Until then, we look specifically for code that does nothing more than
15815 STP-like operations. We cost them on that basis in addition to the
15816 normal latency-based costs.
15818 If the scalar or vector code could be a sequence of STPs +
15819 initialization, this variable counts the cost of the sequence,
15820 with 2 units per instruction. The variable is ~0U for other
15821 kinds of code. */
15822 unsigned int m_stp_sequence_cost = 0;
15824 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15825 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15826 situations, we try to predict whether an Advanced SIMD implementation
15827 of the loop could be completely unrolled and become straight-line code.
15828 If so, it is generally better to use the Advanced SIMD version rather
15829 than length-agnostic SVE, since the SVE loop would execute an unknown
15830 number of times and so could not be completely unrolled in the same way.
15832 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15833 number of Advanced SIMD loop iterations that would be unrolled and
15834 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15835 in the unrolled loop. Both values are zero if we're not applying
15836 the heuristic. */
15837 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15838 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15840 /* If we're vectorizing a loop that executes a constant number of times,
15841 this variable gives the number of times that the vector loop would
15842 iterate, otherwise it is zero. */
15843 uint64_t m_num_vector_iterations = 0;
15845 /* Used only when vectorizing loops. Estimates the number and kind of
15846 operations that would be needed by one iteration of the scalar
15847 or vector loop. There is one entry for each tuning option of
15848 interest. */
15849 auto_vec<aarch64_vec_op_count, 2> m_ops;
15852 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15853 bool costing_for_scalar)
15854 : vector_costs (vinfo, costing_for_scalar),
15855 m_vec_flags (costing_for_scalar ? 0
15856 : aarch64_classify_vector_mode (vinfo->vector_mode))
15858 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15860 m_ops.quick_push ({ issue_info, m_vec_flags });
15861 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15863 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15864 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15865 vf_factor });
15870 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15871 vector_costs *
15872 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15874 return new aarch64_vector_costs (vinfo, costing_for_scalar);
15877 /* Return true if the current CPU should use the new costs defined
15878 in GCC 11. This should be removed for GCC 12 and above, with the
15879 costs applying to all CPUs instead. */
15880 static bool
15881 aarch64_use_new_vector_costs_p ()
15883 return (aarch64_tune_params.extra_tuning_flags
15884 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
15887 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
15888 static const simd_vec_cost *
15889 aarch64_simd_vec_costs (tree vectype)
15891 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15892 if (vectype != NULL
15893 && aarch64_sve_mode_p (TYPE_MODE (vectype))
15894 && costs->sve != NULL)
15895 return costs->sve;
15896 return costs->advsimd;
15899 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
15900 static const simd_vec_cost *
15901 aarch64_simd_vec_costs_for_flags (unsigned int flags)
15903 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15904 if ((flags & VEC_ANY_SVE) && costs->sve)
15905 return costs->sve;
15906 return costs->advsimd;
15909 /* If STMT_INFO is a memory reference, return the scalar memory type,
15910 otherwise return null. */
15911 static tree
15912 aarch64_dr_type (stmt_vec_info stmt_info)
15914 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
15915 return TREE_TYPE (DR_REF (dr));
15916 return NULL_TREE;
15919 /* Decide whether to use the unrolling heuristic described above
15920 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
15921 describes the loop that we're vectorizing. */
15922 void
15923 aarch64_vector_costs::
15924 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
15926 /* The heuristic only makes sense on targets that have the same
15927 vector throughput for SVE and Advanced SIMD. */
15928 if (!(aarch64_tune_params.extra_tuning_flags
15929 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
15930 return;
15932 /* We only want to apply the heuristic if LOOP_VINFO is being
15933 vectorized for SVE. */
15934 if (!(m_vec_flags & VEC_ANY_SVE))
15935 return;
15937 /* Check whether it is possible in principle to use Advanced SIMD
15938 instead. */
15939 if (aarch64_autovec_preference == 2)
15940 return;
15942 /* We don't want to apply the heuristic to outer loops, since it's
15943 harder to track two levels of unrolling. */
15944 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
15945 return;
15947 /* Only handle cases in which the number of Advanced SIMD iterations
15948 would be known at compile time but the number of SVE iterations
15949 would not. */
15950 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
15951 || aarch64_sve_vg.is_constant ())
15952 return;
15954 /* Guess how many times the Advanced SIMD loop would iterate and make
15955 sure that it is within the complete unrolling limit. Even if the
15956 number of iterations is small enough, the number of statements might
15957 not be, which is why we need to estimate the number of statements too. */
15958 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
15959 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
15960 unsigned HOST_WIDE_INT unrolled_advsimd_niters
15961 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
15962 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
15963 return;
15965 /* Record that we're applying the heuristic and should try to estimate
15966 the number of statements in the Advanced SIMD loop. */
15967 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
15970 /* Do one-time initialization of the aarch64_vector_costs given that we're
15971 costing the loop vectorization described by LOOP_VINFO. */
15972 void
15973 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
15975 /* Record the number of times that the vector loop would execute,
15976 if known. */
15977 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
15978 auto scalar_niters = max_stmt_executions_int (loop);
15979 if (scalar_niters >= 0)
15981 unsigned int vf = vect_vf_for_cost (loop_vinfo);
15982 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
15983 m_num_vector_iterations = scalar_niters / vf;
15984 else
15985 m_num_vector_iterations = CEIL (scalar_niters, vf);
15988 /* Detect whether we're vectorizing for SVE and should apply the unrolling
15989 heuristic described above m_unrolled_advsimd_niters. */
15990 record_potential_advsimd_unrolling (loop_vinfo);
15992 /* Record the issue information for any SVE WHILE instructions that the
15993 loop needs. */
15994 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
15996 unsigned int num_masks = 0;
15997 rgroup_controls *rgm;
15998 unsigned int num_vectors_m1;
15999 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
16000 if (rgm->type)
16001 num_masks += num_vectors_m1 + 1;
16002 for (auto &ops : m_ops)
16003 if (auto *issue = ops.sve_issue_info ())
16004 ops.pred_ops += num_masks * issue->while_pred_ops;
16008 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16009 static int
16010 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16011 tree vectype,
16012 int misalign ATTRIBUTE_UNUSED)
16014 unsigned elements;
16015 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16016 bool fp = false;
16018 if (vectype != NULL)
16019 fp = FLOAT_TYPE_P (vectype);
16021 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16023 switch (type_of_cost)
16025 case scalar_stmt:
16026 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16028 case scalar_load:
16029 return costs->scalar_load_cost;
16031 case scalar_store:
16032 return costs->scalar_store_cost;
16034 case vector_stmt:
16035 return fp ? simd_costs->fp_stmt_cost
16036 : simd_costs->int_stmt_cost;
16038 case vector_load:
16039 return simd_costs->align_load_cost;
16041 case vector_store:
16042 return simd_costs->store_cost;
16044 case vec_to_scalar:
16045 return simd_costs->vec_to_scalar_cost;
16047 case scalar_to_vec:
16048 return simd_costs->scalar_to_vec_cost;
16050 case unaligned_load:
16051 case vector_gather_load:
16052 return simd_costs->unalign_load_cost;
16054 case unaligned_store:
16055 case vector_scatter_store:
16056 return simd_costs->unalign_store_cost;
16058 case cond_branch_taken:
16059 return costs->cond_taken_branch_cost;
16061 case cond_branch_not_taken:
16062 return costs->cond_not_taken_branch_cost;
16064 case vec_perm:
16065 return simd_costs->permute_cost;
16067 case vec_promote_demote:
16068 return fp ? simd_costs->fp_stmt_cost
16069 : simd_costs->int_stmt_cost;
16071 case vec_construct:
16072 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16073 return elements / 2 + 1;
16075 default:
16076 gcc_unreachable ();
16080 /* If an access of kind KIND for STMT_INFO represents one vector of an
16081 LD[234] or ST[234] operation, return the total number of vectors
16082 (2, 3 or 4), otherwise return a value outside that range. */
16083 static int
16084 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16086 if ((kind == vector_load
16087 || kind == unaligned_load
16088 || kind == vector_store
16089 || kind == unaligned_store)
16090 && STMT_VINFO_DATA_REF (stmt_info))
16092 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16093 if (stmt_info
16094 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16095 return DR_GROUP_SIZE (stmt_info);
16097 return 0;
16100 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16101 vectors would produce a series of LDP or STP operations. KIND is the
16102 kind of statement that STMT_INFO represents. */
16103 static bool
16104 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16105 stmt_vec_info stmt_info)
16107 switch (kind)
16109 case vector_load:
16110 case vector_store:
16111 case unaligned_load:
16112 case unaligned_store:
16113 break;
16115 default:
16116 return false;
16119 if (aarch64_tune_params.extra_tuning_flags
16120 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16121 return false;
16123 return is_gimple_assign (stmt_info->stmt);
16126 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16127 or multiply-subtract sequence that might be suitable for fusing into a
16128 single instruction. If VEC_FLAGS is zero, analyze the operation as
16129 a scalar one, otherwise analyze it as an operation on vectors with those
16130 VEC_* flags. */
16131 static bool
16132 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16133 unsigned int vec_flags)
16135 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16136 if (!assign)
16137 return false;
16138 tree_code code = gimple_assign_rhs_code (assign);
16139 if (code != PLUS_EXPR && code != MINUS_EXPR)
16140 return false;
16142 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
16143 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
16144 return false;
16146 for (int i = 1; i < 3; ++i)
16148 tree rhs = gimple_op (assign, i);
16149 /* ??? Should we try to check for a single use as well? */
16150 if (TREE_CODE (rhs) != SSA_NAME)
16151 continue;
16153 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16154 if (!def_stmt_info
16155 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16156 continue;
16157 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16158 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16159 continue;
16161 if (vec_flags & VEC_ADVSIMD)
16163 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16164 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16165 only supports MLA forms, so will require a move if the result
16166 cannot be tied to the accumulator. The most important case in
16167 which this is true is when the accumulator input is invariant. */
16168 rhs = gimple_op (assign, 3 - i);
16169 if (TREE_CODE (rhs) != SSA_NAME)
16170 return false;
16171 def_stmt_info = vinfo->lookup_def (rhs);
16172 if (!def_stmt_info
16173 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16174 return false;
16177 return true;
16179 return false;
16182 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16183 in-loop reduction that SVE supports directly, return its latency in cycles,
16184 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16185 instructions. */
16186 static unsigned int
16187 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16188 stmt_vec_info stmt_info,
16189 const sve_vec_cost *sve_costs)
16191 switch (vect_reduc_type (vinfo, stmt_info))
16193 case EXTRACT_LAST_REDUCTION:
16194 return sve_costs->clast_cost;
16196 case FOLD_LEFT_REDUCTION:
16197 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16199 case E_HFmode:
16200 case E_BFmode:
16201 return sve_costs->fadda_f16_cost;
16203 case E_SFmode:
16204 return sve_costs->fadda_f32_cost;
16206 case E_DFmode:
16207 return sve_costs->fadda_f64_cost;
16209 default:
16210 break;
16212 break;
16215 return 0;
16218 /* STMT_INFO describes a loop-carried operation in the original scalar code
16219 that we are considering implementing as a reduction. Return one of the
16220 following values, depending on VEC_FLAGS:
16222 - If VEC_FLAGS is zero, return the loop carry latency of the original
16223 scalar operation.
16225 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16226 Advanced SIMD implementation.
16228 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16229 SVE implementation. */
16230 static unsigned int
16231 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16232 unsigned int vec_flags)
16234 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16235 const sve_vec_cost *sve_costs = nullptr;
16236 if (vec_flags & VEC_ANY_SVE)
16237 sve_costs = aarch64_tune_params.vec_costs->sve;
16239 /* If the caller is asking for the SVE latency, check for forms of reduction
16240 that only SVE can handle directly. */
16241 if (sve_costs)
16243 unsigned int latency
16244 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16245 if (latency)
16246 return latency;
16249 /* Handle scalar costs. */
16250 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16251 if (vec_flags == 0)
16253 if (is_float)
16254 return vec_costs->scalar_fp_stmt_cost;
16255 return vec_costs->scalar_int_stmt_cost;
16258 /* Otherwise, the loop body just contains normal integer or FP operations,
16259 with a vector reduction outside the loop. */
16260 const simd_vec_cost *simd_costs
16261 = aarch64_simd_vec_costs_for_flags (vec_flags);
16262 if (is_float)
16263 return simd_costs->fp_stmt_cost;
16264 return simd_costs->int_stmt_cost;
16267 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16268 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16269 try to subdivide the target-independent categorization provided by KIND
16270 to get a more accurate cost. */
16271 static fractional_cost
16272 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16273 stmt_vec_info stmt_info,
16274 fractional_cost stmt_cost)
16276 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16277 the extension with the load. */
16278 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16279 return 0;
16281 return stmt_cost;
16284 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16285 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16286 when vectorized would operate on vector type VECTYPE. Try to subdivide
16287 the target-independent categorization provided by KIND to get a more
16288 accurate cost. WHERE specifies where the cost associated with KIND
16289 occurs. */
16290 static fractional_cost
16291 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16292 stmt_vec_info stmt_info, tree vectype,
16293 enum vect_cost_model_location where,
16294 fractional_cost stmt_cost)
16296 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16297 const sve_vec_cost *sve_costs = nullptr;
16298 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16299 sve_costs = aarch64_tune_params.vec_costs->sve;
16301 /* It's generally better to avoid costing inductions, since the induction
16302 will usually be hidden by other operations. This is particularly true
16303 for things like COND_REDUCTIONS. */
16304 if (is_a<gphi *> (stmt_info->stmt))
16305 return 0;
16307 /* Detect cases in which vec_to_scalar is describing the extraction of a
16308 vector element in preparation for a scalar store. The store itself is
16309 costed separately. */
16310 if (vect_is_store_elt_extraction (kind, stmt_info))
16311 return simd_costs->store_elt_extra_cost;
16313 /* Detect SVE gather loads, which are costed as a single scalar_load
16314 for each element. We therefore need to divide the full-instruction
16315 cost by the number of elements in the vector. */
16316 if (kind == scalar_load
16317 && sve_costs
16318 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16320 unsigned int nunits = vect_nunits_for_cost (vectype);
16321 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16322 return { sve_costs->gather_load_x64_cost, nunits };
16323 return { sve_costs->gather_load_x32_cost, nunits };
16326 /* Detect cases in which a scalar_store is really storing one element
16327 in a scatter operation. */
16328 if (kind == scalar_store
16329 && sve_costs
16330 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16331 return sve_costs->scatter_store_elt_cost;
16333 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16334 if (kind == vec_to_scalar
16335 && where == vect_body
16336 && sve_costs)
16338 unsigned int latency
16339 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16340 if (latency)
16341 return latency;
16344 /* Detect cases in which vec_to_scalar represents a single reduction
16345 instruction like FADDP or MAXV. */
16346 if (kind == vec_to_scalar
16347 && where == vect_epilogue
16348 && vect_is_reduction (stmt_info))
16349 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16351 case E_QImode:
16352 return simd_costs->reduc_i8_cost;
16354 case E_HImode:
16355 return simd_costs->reduc_i16_cost;
16357 case E_SImode:
16358 return simd_costs->reduc_i32_cost;
16360 case E_DImode:
16361 return simd_costs->reduc_i64_cost;
16363 case E_HFmode:
16364 case E_BFmode:
16365 return simd_costs->reduc_f16_cost;
16367 case E_SFmode:
16368 return simd_costs->reduc_f32_cost;
16370 case E_DFmode:
16371 return simd_costs->reduc_f64_cost;
16373 default:
16374 break;
16377 /* Otherwise stick with the original categorization. */
16378 return stmt_cost;
16381 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16382 for STMT_INFO, which has cost kind KIND and which when vectorized would
16383 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16384 targets. */
16385 static fractional_cost
16386 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16387 stmt_vec_info stmt_info, tree vectype,
16388 fractional_cost stmt_cost)
16390 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16391 vector register size or number of units. Integer promotions of this
16392 type therefore map to SXT[BHW] or UXT[BHW].
16394 Most loads have extending forms that can do the sign or zero extension
16395 on the fly. Optimistically assume that a load followed by an extension
16396 will fold to this form during combine, and that the extension therefore
16397 comes for free. */
16398 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16399 stmt_cost = 0;
16401 /* For similar reasons, vector_stmt integer truncations are a no-op,
16402 because we can just ignore the unused upper bits of the source. */
16403 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16404 stmt_cost = 0;
16406 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16407 but there are no equivalent instructions for SVE. This means that
16408 (all other things being equal) 128-bit SVE needs twice as many load
16409 and store instructions as Advanced SIMD in order to process vector pairs.
16411 Also, scalar code can often use LDP and STP to access pairs of values,
16412 so it is too simplistic to say that one SVE load or store replaces
16413 VF scalar loads and stores.
16415 Ideally we would account for this in the scalar and Advanced SIMD
16416 costs by making suitable load/store pairs as cheap as a single
16417 load/store. However, that would be a very invasive change and in
16418 practice it tends to stress other parts of the cost model too much.
16419 E.g. stores of scalar constants currently count just a store,
16420 whereas stores of vector constants count a store and a vec_init.
16421 This is an artificial distinction for AArch64, where stores of
16422 nonzero scalar constants need the same kind of register invariant
16423 as vector stores.
16425 An alternative would be to double the cost of any SVE loads and stores
16426 that could be paired in Advanced SIMD (and possibly also paired in
16427 scalar code). But this tends to stress other parts of the cost model
16428 in the same way. It also means that we can fall back to Advanced SIMD
16429 even if full-loop predication would have been useful.
16431 Here we go for a more conservative version: double the costs of SVE
16432 loads and stores if one iteration of the scalar loop processes enough
16433 elements for it to use a whole number of Advanced SIMD LDP or STP
16434 instructions. This makes it very likely that the VF would be 1 for
16435 Advanced SIMD, and so no epilogue should be needed. */
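/* Worked example (hypothetical layout): a grouped access of four DImode
   elements per scalar iteration gives count * elt_bits == 4 * 64 == 256
   bits, a whole number of Advanced SIMD LDP/STP pairs, so the SVE
   load/store cost is doubled below (assuming aarch64_advsimd_ldp_stp_p
   also holds).  A group of three SImode elements (96 bits) is not a
   multiple of 256 bits and is left unchanged.  */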
16436 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16438 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16439 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16440 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16441 if (multiple_p (count * elt_bits, 256)
16442 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16443 stmt_cost *= 2;
16446 return stmt_cost;
16449 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16450 and which when vectorized would operate on vector type VECTYPE. Add the
16451 cost of any embedded operations. */
16452 static fractional_cost
16453 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16454 tree vectype, fractional_cost stmt_cost)
16456 if (vectype)
16458 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16460 /* Detect cases in which a vector load or store represents an
16461 LD[234] or ST[234] instruction. */
16462 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16464 case 2:
16465 stmt_cost += simd_costs->ld2_st2_permute_cost;
16466 break;
16468 case 3:
16469 stmt_cost += simd_costs->ld3_st3_permute_cost;
16470 break;
16472 case 4:
16473 stmt_cost += simd_costs->ld4_st4_permute_cost;
16474 break;
16477 if (kind == vector_stmt || kind == vec_to_scalar)
16478 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16480 if (FLOAT_TYPE_P (cmp_type))
16481 stmt_cost += simd_costs->fp_stmt_cost;
16482 else
16483 stmt_cost += simd_costs->int_stmt_cost;
16487 if (kind == scalar_stmt)
16488 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16490 if (FLOAT_TYPE_P (cmp_type))
16491 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16492 else
16493 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16496 return stmt_cost;
16499 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16500 and they describe an operation in the body of a vector loop. Record issue
16501 information relating to the vector operation in OPS. */
16502 void
16503 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16504 stmt_vec_info stmt_info,
16505 aarch64_vec_op_count *ops)
16507 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16508 if (!base_issue)
16509 return;
16510 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16511 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16513 /* Calculate the minimum cycles per iteration imposed by a reduction
16514 operation. */
16515 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16516 && vect_is_reduction (stmt_info))
16518 unsigned int base
16519 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16521 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16522 that's not yet the case. */
16523 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
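/* Illustrative example (hypothetical latencies): if the in-loop reduction
   has a latency of 4 cycles and COUNT is 2, the loop cannot complete an
   iteration in fewer than 8 cycles, so the MAX above raises
   reduction_latency to at least 8.  */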
16526 /* Assume that multiply-adds will become a single operation. */
16527 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16528 return;
16530 /* Count the basic operation cost associated with KIND. */
16531 switch (kind)
16533 case cond_branch_taken:
16534 case cond_branch_not_taken:
16535 case vector_gather_load:
16536 case vector_scatter_store:
16537 /* We currently don't expect these to be used in a loop body. */
16538 break;
16540 case vec_perm:
16541 case vec_promote_demote:
16542 case vec_construct:
16543 case vec_to_scalar:
16544 case scalar_to_vec:
16545 case vector_stmt:
16546 case scalar_stmt:
16547 ops->general_ops += count;
16548 break;
16550 case scalar_load:
16551 case vector_load:
16552 case unaligned_load:
16553 ops->loads += count;
16554 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16555 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16556 break;
16558 case vector_store:
16559 case unaligned_store:
16560 case scalar_store:
16561 ops->stores += count;
16562 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16563 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16564 break;
16567 /* Add any embedded comparison operations. */
16568 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16569 && vect_embedded_comparison_type (stmt_info))
16570 ops->general_ops += count;
16572 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16573 have only accounted for one. */
16574 if ((kind == vector_stmt || kind == vec_to_scalar)
16575 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16576 ops->general_ops += count;
16578 /* Count the predicate operations needed by an SVE comparison. */
16579 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16580 if (tree type = vect_comparison_type (stmt_info))
16582 unsigned int base = (FLOAT_TYPE_P (type)
16583 ? sve_issue->fp_cmp_pred_ops
16584 : sve_issue->int_cmp_pred_ops);
16585 ops->pred_ops += base * count;
16588 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16589 if (simd_issue)
16590 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16592 case 2:
16593 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16594 break;
16596 case 3:
16597 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16598 break;
16600 case 4:
16601 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16602 break;
16605 /* Add any overhead associated with gather loads and scatter stores. */
16606 if (sve_issue
16607 && (kind == scalar_load || kind == scalar_store)
16608 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16610 unsigned int pairs = CEIL (count, 2);
16611 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16612 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
16616 /* Return true if STMT_INFO contains a memory access and if the constant
16617 component of the memory address is aligned to SIZE bytes. */
16618 static bool
16619 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16620 poly_uint64 size)
16622 if (!STMT_VINFO_DATA_REF (stmt_info))
16623 return false;
16625 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16626 stmt_info = first_stmt;
16627 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16628 /* Needed for gathers & scatters, for example. */
16629 if (!constant_offset)
16630 return false;
16632 return multiple_p (wi::to_poly_offset (constant_offset), size);
16635 /* Check if a scalar or vector stmt could be part of a region of code
16636 that does nothing more than store values to memory, in the scalar
16637 case using STP. Return the cost of the stmt if so, counting 2 for
16638 one instruction. Return ~0U otherwise.
16640 The arguments are a subset of those passed to add_stmt_cost. */
16641 unsigned int
16642 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16643 stmt_vec_info stmt_info, tree vectype)
16645 /* Code that stores vector constants uses a vector_load to create
16646 the constant. We don't apply the heuristic to that case for two
16647 main reasons:
16649 - At the moment, STPs are only formed via peephole2, and the
16650 constant scalar moves would often come between STRs and so
16651 prevent STP formation.
16653 - The scalar code also has to load the constant somehow, and that
16654 isn't costed. */
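/* Illustrative examples of the "2 units per instruction" convention used
   below (hypothetical element counts): a scalar_to_vec from a GPR counts
   4 units (2 instructions) and one from an FPR counts 2 units (1
   instruction), while a vec_construct of a 4-element FP vector counts
   (4 - 1) * 2 == 6 units, i.e. at most 3 INS instructions.  */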
16655 switch (kind)
16657 case scalar_to_vec:
16658 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16659 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16661 case vec_construct:
16662 if (FLOAT_TYPE_P (vectype))
16663 /* Count 1 insn for the maximum number of FP->SIMD INS
16664 instructions. */
16665 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16667 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16668 maximum number of GPR->SIMD INS instructions. */
16669 return vect_nunits_for_cost (vectype) * 4 * count;
16671 case vector_store:
16672 case unaligned_store:
16673 /* Count 1 insn per vector if we can't form STP Q pairs. */
16674 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16675 return count * 2;
16676 if (aarch64_tune_params.extra_tuning_flags
16677 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16678 return count * 2;
16680 if (stmt_info)
16682 /* Assume we won't be able to use STP if the constant offset
16683 component of the address is misaligned. ??? This could be
16684 removed if we formed STP pairs earlier, rather than relying
16685 on peephole2. */
16686 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16687 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16688 return count * 2;
16690 return CEIL (count, 2) * 2;
16692 case scalar_store:
16693 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16695 /* Check for a mode in which STP pairs can be formed. */
16696 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16697 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16698 return ~0U;
16700 /* Assume we won't be able to use STP if the constant offset
16701 component of the address is misaligned. ??? This could be
16702 removed if we formed STP pairs earlier, rather than relying
16703 on peephole2. */
16704 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16705 return ~0U;
16707 return count;
16709 default:
16710 return ~0U;
16714 unsigned
16715 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16716 stmt_vec_info stmt_info, slp_tree,
16717 tree vectype, int misalign,
16718 vect_cost_model_location where)
16720 fractional_cost stmt_cost
16721 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16723 bool in_inner_loop_p = (where == vect_body
16724 && stmt_info
16725 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16727 /* Do one-time initialization based on the vinfo. */
16728 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16729 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16731 if (loop_vinfo)
16732 analyze_loop_vinfo (loop_vinfo);
16734 m_analyzed_vinfo = true;
16737 /* Apply the heuristic described above m_stp_sequence_cost. */
16738 if (m_stp_sequence_cost != ~0U)
16740 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16741 stmt_info, vectype);
16742 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16745 /* Try to get a more accurate cost by looking at STMT_INFO instead
16746 of just looking at KIND. */
16747 if (stmt_info && aarch64_use_new_vector_costs_p ())
16749 /* If we scalarize a strided store, the vectorizer costs one
16750 vec_to_scalar for each element. However, we can store the first
16751 element using an FP store without a separate extract step. */
16752 if (vect_is_store_elt_extraction (kind, stmt_info))
16753 count -= 1;
16755 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16756 stmt_info, stmt_cost);
16758 if (vectype && m_vec_flags)
16759 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16760 stmt_info, vectype,
16761 where, stmt_cost);
16764 /* Do any SVE-specific adjustments to the cost. */
16765 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16766 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16767 vectype, stmt_cost);
16769 if (stmt_info && aarch64_use_new_vector_costs_p ())
16771 /* Account for any extra "embedded" costs that apply additively
16772 to the base cost calculated above. */
16773 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16774 stmt_cost);
16776 /* If we're recording a nonzero vector loop body cost for the
16777 innermost loop, also estimate the operations that would need
16778 to be issued by all relevant implementations of the loop. */
16779 if (loop_vinfo
16780 && (m_costing_for_scalar || where == vect_body)
16781 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16782 && stmt_cost != 0)
16783 for (auto &ops : m_ops)
16784 count_ops (count, kind, stmt_info, &ops);
16786 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16787 estimate the number of statements in the unrolled Advanced SIMD
16788 loop. For simplicity, we assume that one iteration of the
16789 Advanced SIMD loop would need the same number of statements
16790 as one iteration of the SVE loop. */
16791 if (where == vect_body && m_unrolled_advsimd_niters)
16792 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16794 /* Detect the use of an averaging operation. */
16795 gimple *stmt = stmt_info->stmt;
16796 if (is_gimple_call (stmt)
16797 && gimple_call_internal_p (stmt))
16799 switch (gimple_call_internal_fn (stmt))
16801 case IFN_AVG_FLOOR:
16802 case IFN_AVG_CEIL:
16803 m_has_avg = true;
16804 default:
16805 break;
16809 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16812 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16813 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16814 says that we should prefer the Advanced SIMD loop. */
16815 bool
16816 aarch64_vector_costs::prefer_unrolled_loop () const
16818 if (!m_unrolled_advsimd_stmts)
16819 return false;
16821 if (dump_enabled_p ())
16822 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16823 " unrolled Advanced SIMD loop = "
16824 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
16825 m_unrolled_advsimd_stmts);
16827 /* The balance here is tricky. On the one hand, we can't be sure whether
16828 the code is vectorizable with Advanced SIMD or not. However, even if
16829 it isn't vectorizable with Advanced SIMD, there's a possibility that
16830 the scalar code could also be unrolled. Some of the code might then
16831 benefit from SLP, or from using LDP and STP. We therefore apply
16832 the heuristic regardless of can_use_advsimd_p. */
16833 return (m_unrolled_advsimd_stmts
16834 && (m_unrolled_advsimd_stmts
16835 <= (unsigned int) param_max_completely_peeled_insns));
16838 /* Subroutine of adjust_body_cost for handling SVE. Use OPS to work out
16839 how fast the SVE code can be issued and compare it to the equivalent
16840 value for scalar code (SCALAR_CYCLES_PER_ITER).
16844 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16845 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16846 is true if we think the loop body is too expensive. */
16848 fractional_cost
16849 aarch64_vector_costs::
16850 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16851 fractional_cost scalar_cycles_per_iter,
16852 unsigned int orig_body_cost, unsigned int *body_cost,
16853 bool *should_disparage)
16855 if (dump_enabled_p ())
16856 ops->dump ();
16858 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16859 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16861 /* If the scalar version of the loop could issue at least as
16862 quickly as the predicate parts of the SVE loop, make the SVE loop
16863 prohibitively expensive. In this case vectorization is adding an
16864 overhead that the original scalar code didn't have.
16866 This is mostly intended to detect cases in which WHILELOs dominate
16867 for very tight loops, which is something that normal latency-based
16868 costs would not model. Adding this kind of cliffedge would be
16869 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16870 code in the caller handles that case in a more conservative way. */
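/* Example with made-up numbers: if the scalar code could issue an
   iteration in 3 cycles (after scaling by the VF) while the SVE predicate
   operations alone need 3 cycles per iteration, sve_estimate below is 4,
   3 < 4 holds, and the body cost is raised to at least
   orig_body_cost * the estimated number of bytes per SVE vector
   (16 for a 128-bit vector length).  */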
16871 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
16872 if (scalar_cycles_per_iter < sve_estimate)
16874 unsigned int min_cost
16875 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
16876 if (*body_cost < min_cost)
16878 if (dump_enabled_p ())
16879 dump_printf_loc (MSG_NOTE, vect_location,
16880 "Increasing body cost to %d because the"
16881 " scalar code could issue within the limit"
16882 " imposed by predicate operations\n",
16883 min_cost);
16884 *body_cost = min_cost;
16885 *should_disparage = true;
16889 return sve_cycles_per_iter;
16892 unsigned int
16893 aarch64_vector_costs::determine_suggested_unroll_factor ()
16895 bool sve = m_vec_flags & VEC_ANY_SVE;
16896 /* If we are trying to unroll an Advanced SIMD main loop that contains
16897 an averaging operation that we do not support with SVE, and we might use
16898 a predicated epilogue, we need to be conservative and block unrolling,
16899 as this might lead to a less optimal loop for the first and only
16900 epilogue, which uses the original loop's vectorization factor.
16901 TODO: Remove this constraint when we add support for multiple epilogue
16902 vectorization. */
16903 if (!sve && !TARGET_SVE2 && m_has_avg)
16904 return 1;
16906 unsigned int max_unroll_factor = 1;
16907 for (auto vec_ops : m_ops)
16909 aarch64_simd_vec_issue_info const *vec_issue
16910 = vec_ops.simd_issue_info ();
16911 if (!vec_issue)
16912 return 1;
16913 /* Limit the unroll factor to a value adjustable by the user; the default
16914 value is 4. */
16915 unsigned int unroll_factor = aarch64_vect_unroll_limit;
16916 unsigned int factor
16917 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
16918 unsigned int temp;
16920 /* Sanity check; this should never happen. */
16921 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
16922 return 1;
16924 /* Check stores. */
16925 if (vec_ops.stores > 0)
16927 temp = CEIL (factor * vec_issue->stores_per_cycle,
16928 vec_ops.stores);
16929 unroll_factor = MIN (unroll_factor, temp);
16932 /* Check loads + stores. */
16933 if (vec_ops.loads > 0)
16935 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
16936 vec_ops.loads + vec_ops.stores);
16937 unroll_factor = MIN (unroll_factor, temp);
16940 /* Check general ops. */
16941 if (vec_ops.general_ops > 0)
16943 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
16944 vec_ops.general_ops);
16945 unroll_factor = MIN (unroll_factor, temp);
16947 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
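/* Worked example (hypothetical issue rates): with reduction_latency <= 1
   (factor == 1), 1 store, 2 loads, 4 general ops, and an issue rate of
   2 stores, 3 loads+stores and 4 general ops per cycle, the checks above
   give CEIL (2, 1) == 2, CEIL (3, 3) == 1 and CEIL (4, 4) == 1, so this
   (sub)tuning limits the unroll factor to 1.  */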
16950 /* Make sure unroll factor is power of 2. */
16951 return 1 << ceil_log2 (max_unroll_factor);
16954 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
16955 and return the new cost. */
16956 unsigned int
16957 aarch64_vector_costs::
16958 adjust_body_cost (loop_vec_info loop_vinfo,
16959 const aarch64_vector_costs *scalar_costs,
16960 unsigned int body_cost)
16962 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
16963 return body_cost;
16965 const auto &scalar_ops = scalar_costs->m_ops[0];
16966 const auto &vector_ops = m_ops[0];
16967 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
16968 unsigned int orig_body_cost = body_cost;
16969 bool should_disparage = false;
16971 if (dump_enabled_p ())
16972 dump_printf_loc (MSG_NOTE, vect_location,
16973 "Original vector body cost = %d\n", body_cost);
16975 fractional_cost scalar_cycles_per_iter
16976 = scalar_ops.min_cycles_per_iter () * estimated_vf;
16978 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
16980 if (dump_enabled_p ())
16982 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
16983 dump_printf_loc (MSG_NOTE, vect_location,
16984 "Vector loop iterates at most %wd times\n",
16985 m_num_vector_iterations);
16986 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
16987 scalar_ops.dump ();
16988 dump_printf_loc (MSG_NOTE, vect_location,
16989 " estimated cycles per vector iteration"
16990 " (for VF %d) = %f\n",
16991 estimated_vf, scalar_cycles_per_iter.as_double ());
16994 if (vector_ops.sve_issue_info ())
16996 if (dump_enabled_p ())
16997 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
16998 vector_cycles_per_iter
16999 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17000 orig_body_cost, &body_cost, &should_disparage);
17002 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17004 /* Also take Neoverse V1 tuning into account, doubling the
17005 scalar and Advanced SIMD estimates to account for the
17006 doubling in SVE vector length. */
17007 if (dump_enabled_p ())
17008 dump_printf_loc (MSG_NOTE, vect_location,
17009 "Neoverse V1 estimate:\n");
17010 auto vf_factor = m_ops[1].vf_factor ();
17011 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17012 orig_body_cost, &body_cost, &should_disparage);
17015 else
17017 if (dump_enabled_p ())
17019 dump_printf_loc (MSG_NOTE, vect_location,
17020 "Vector issue estimate:\n");
17021 vector_ops.dump ();
17025 /* Decide whether to stick to latency-based costs or whether to try to
17026 take issue rates into account. */
17027 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17028 if (m_vec_flags & VEC_ANY_SVE)
17029 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17031 if (m_num_vector_iterations >= 1
17032 && m_num_vector_iterations < threshold)
17034 if (dump_enabled_p ())
17035 dump_printf_loc (MSG_NOTE, vect_location,
17036 "Low iteration count, so using pure latency"
17037 " costs\n");
17039 /* Increase the cost of the vector code if it looks like the scalar code
17040 could issue more quickly. These values are only rough estimates,
17041 so minor differences should only result in minor changes. */
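/* For instance (illustrative numbers): a body cost of 20 with an estimated
   4 vector cycles per iteration against 3 scalar cycles per iteration is
   scaled below to roughly 20 * 4 / 3 ~= 27, subject to rounding.  */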
17042 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17044 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17045 scalar_cycles_per_iter);
17046 if (dump_enabled_p ())
17047 dump_printf_loc (MSG_NOTE, vect_location,
17048 "Increasing body cost to %d because scalar code"
17049 " would issue more quickly\n", body_cost);
17051 /* In general, it's expected that the proposed vector code would be able
17052 to issue more quickly than the original scalar code. This should
17053 already be reflected to some extent in the latency-based costs.
17055 However, the latency-based costs effectively assume that the scalar
17056 code and the vector code execute serially, which tends to underplay
17057 one important case: if the real (non-serialized) execution time of
17058 a scalar iteration is dominated by loop-carried dependencies,
17059 and if the vector code is able to reduce both the length of
17060 the loop-carried dependencies *and* the number of cycles needed
17061 to issue the code in general, we can be more confident that the
17062 vector code is an improvement, even if adding the other (non-loop-carried)
17063 latencies tends to hide this saving. We therefore reduce the cost of the
17064 vector loop body in proportion to the saving. */
17065 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17066 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17067 && scalar_cycles_per_iter > vector_cycles_per_iter
17068 && !should_disparage)
17070 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17071 scalar_cycles_per_iter);
17072 if (dump_enabled_p ())
17073 dump_printf_loc (MSG_NOTE, vect_location,
17074 "Decreasing body cost to %d account for smaller"
17075 " reduction latency\n", body_cost);
17078 return body_cost;
17081 void
17082 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17084 auto *scalar_costs
17085 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17086 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17087 if (loop_vinfo
17088 && m_vec_flags
17089 && aarch64_use_new_vector_costs_p ())
17091 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17092 m_costs[vect_body]);
17093 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17096 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17097 the scalar code in the event of a tie, since there is more chance
17098 of scalar code being optimized with surrounding operations. */
17099 if (!loop_vinfo
17100 && scalar_costs
17101 && m_stp_sequence_cost != ~0U
17102 && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17103 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17105 vector_costs::finish_cost (scalar_costs);
17108 bool
17109 aarch64_vector_costs::
17110 better_main_loop_than_p (const vector_costs *uncast_other) const
17112 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17114 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17115 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17117 if (dump_enabled_p ())
17118 dump_printf_loc (MSG_NOTE, vect_location,
17119 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17120 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17121 vect_vf_for_cost (this_loop_vinfo),
17122 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17123 vect_vf_for_cost (other_loop_vinfo));
17125 /* Apply the unrolling heuristic described above
17126 m_unrolled_advsimd_niters. */
17127 if (bool (m_unrolled_advsimd_stmts)
17128 != bool (other->m_unrolled_advsimd_stmts))
17130 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17131 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17132 if (this_prefer_unrolled != other_prefer_unrolled)
17134 if (dump_enabled_p ())
17135 dump_printf_loc (MSG_NOTE, vect_location,
17136 "Preferring Advanced SIMD loop because"
17137 " it can be unrolled\n");
17138 return other_prefer_unrolled;
17142 for (unsigned int i = 0; i < m_ops.length (); ++i)
17144 if (dump_enabled_p ())
17146 if (i)
17147 dump_printf_loc (MSG_NOTE, vect_location,
17148 "Reconsidering with subtuning %d\n", i);
17149 dump_printf_loc (MSG_NOTE, vect_location,
17150 "Issue info for %s loop:\n",
17151 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17152 this->m_ops[i].dump ();
17153 dump_printf_loc (MSG_NOTE, vect_location,
17154 "Issue info for %s loop:\n",
17155 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17156 other->m_ops[i].dump ();
17159 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17160 * this->m_ops[i].vf_factor ());
17161 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17162 * other->m_ops[i].vf_factor ());
17164 /* If it appears that one loop could process the same amount of data
17165 in fewer cycles, prefer that loop over the other one. */
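/* The cross-multiplication below compares cycles per element without
   dividing: e.g. (hypothetical numbers) 4 cycles/iteration at VF 8 versus
   3 cycles/iteration at VF 4 gives 4 * 4 == 16 against 3 * 8 == 24, so
   the first loop is preferred.  */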
17166 fractional_cost this_cost
17167 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17168 fractional_cost other_cost
17169 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17170 if (dump_enabled_p ())
17172 dump_printf_loc (MSG_NOTE, vect_location,
17173 "Weighted cycles per iteration of %s loop ~= %f\n",
17174 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17175 this_cost.as_double ());
17176 dump_printf_loc (MSG_NOTE, vect_location,
17177 "Weighted cycles per iteration of %s loop ~= %f\n",
17178 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17179 other_cost.as_double ());
17181 if (this_cost != other_cost)
17183 if (dump_enabled_p ())
17184 dump_printf_loc (MSG_NOTE, vect_location,
17185 "Preferring loop with lower cycles"
17186 " per iteration\n");
17187 return this_cost < other_cost;
17190 /* If the issue rate of SVE code is limited by predicate operations
17191 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17192 and if Advanced SIMD code could issue within the limit imposed
17193 by the predicate operations, the predicate operations are adding an
17194 overhead that the original code didn't have and so we should prefer
17195 the Advanced SIMD version. */
17196 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17197 const aarch64_vec_op_count &b) -> bool
17199 if (a.pred_ops == 0
17200 && (b.min_pred_cycles_per_iter ()
17201 > b.min_nonpred_cycles_per_iter ()))
17203 if (dump_enabled_p ())
17204 dump_printf_loc (MSG_NOTE, vect_location,
17205 "Preferring Advanced SIMD loop since"
17206 " SVE loop is predicate-limited\n");
17207 return true;
17209 return false;
17211 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17212 return true;
17213 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17214 return false;
17217 return vector_costs::better_main_loop_than_p (other);
17220 static void initialize_aarch64_code_model (struct gcc_options *);
17222 /* Parse the TO_PARSE string and put the architecture struct that it
17223 selects into RES and the architectural features into ISA_FLAGS.
17224 Return an aarch64_parse_opt_result describing the parse result.
17225 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17226 When the TO_PARSE string contains an invalid extension,
17227 a copy of the string is created and stored to INVALID_EXTENSION. */
17229 static enum aarch64_parse_opt_result
17230 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17231 aarch64_feature_flags *isa_flags,
17232 std::string *invalid_extension)
17234 const char *ext;
17235 const struct processor *arch;
17236 size_t len;
17238 ext = strchr (to_parse, '+');
17240 if (ext != NULL)
17241 len = ext - to_parse;
17242 else
17243 len = strlen (to_parse);
17245 if (len == 0)
17246 return AARCH64_PARSE_MISSING_ARG;
17249 /* Loop through the list of supported ARCHes to find a match. */
17250 for (arch = all_architectures; arch->name != NULL; arch++)
17252 if (strlen (arch->name) == len
17253 && strncmp (arch->name, to_parse, len) == 0)
17255 auto isa_temp = arch->flags;
17257 if (ext != NULL)
17259 /* TO_PARSE string contains at least one extension. */
17260 enum aarch64_parse_opt_result ext_res
17261 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17263 if (ext_res != AARCH64_PARSE_OK)
17264 return ext_res;
17266 /* Extension parsing was successful. Confirm the result
17267 arch and ISA flags. */
17268 *res = arch;
17269 *isa_flags = isa_temp;
17270 return AARCH64_PARSE_OK;
17274 /* ARCH name not found in list. */
17275 return AARCH64_PARSE_INVALID_ARG;
17278 /* Parse the TO_PARSE string and put the result tuning in RES and the
17279 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
17280 describing the parse result. If there is an error parsing, RES and
17281 ISA_FLAGS are left unchanged.
17282 When the TO_PARSE string contains an invalid extension,
17283 a copy of the string is created and stored to INVALID_EXTENSION. */
17285 static enum aarch64_parse_opt_result
17286 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17287 aarch64_feature_flags *isa_flags,
17288 std::string *invalid_extension)
17290 const char *ext;
17291 const struct processor *cpu;
17292 size_t len;
17294 ext = strchr (to_parse, '+');
17296 if (ext != NULL)
17297 len = ext - to_parse;
17298 else
17299 len = strlen (to_parse);
17301 if (len == 0)
17302 return AARCH64_PARSE_MISSING_ARG;
17305 /* Loop through the list of supported CPUs to find a match. */
17306 for (cpu = all_cores; cpu->name != NULL; cpu++)
17308 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17310 auto isa_temp = cpu->flags;
17312 if (ext != NULL)
17314 /* TO_PARSE string contains at least one extension. */
17315 enum aarch64_parse_opt_result ext_res
17316 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17318 if (ext_res != AARCH64_PARSE_OK)
17319 return ext_res;
17321 /* Extension parsing was successful. Confirm the result
17322 cpu and ISA flags. */
17323 *res = cpu;
17324 *isa_flags = isa_temp;
17325 return AARCH64_PARSE_OK;
17329 /* CPU name not found in list. */
17330 return AARCH64_PARSE_INVALID_ARG;
17333 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17334 Return an aarch64_parse_opt_result describing the parse result.
17335 If parsing fails, RES does not change. */
17337 static enum aarch64_parse_opt_result
17338 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17340 const struct processor *cpu;
17342 /* Loop through the list of supported CPUs to find a match. */
17343 for (cpu = all_cores; cpu->name != NULL; cpu++)
17345 if (strcmp (cpu->name, to_parse) == 0)
17347 *res = cpu;
17348 return AARCH64_PARSE_OK;
17352 /* CPU name not found in list. */
17353 return AARCH64_PARSE_INVALID_ARG;
17356 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17357 described in FLAG. If it is, return the index bit for that fusion type.
17358 If not, report an error (printing OPTION_NAME) and return zero. */
17360 static unsigned int
17361 aarch64_parse_one_option_token (const char *token,
17362 size_t length,
17363 const struct aarch64_flag_desc *flag,
17364 const char *option_name)
17366 for (; flag->name != NULL; flag++)
17368 if (length == strlen (flag->name)
17369 && !strncmp (flag->name, token, length))
17370 return flag->flag;
17373 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17374 return 0;
17377 /* Parse OPTION which is a comma-separated list of flags to enable.
17378 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17379 default state we inherit from the CPU tuning structures. OPTION_NAME
17380 gives the top-level option we are parsing in the -moverride string,
17381 for use in error messages. */
17383 static unsigned int
17384 aarch64_parse_boolean_options (const char *option,
17385 const struct aarch64_flag_desc *flags,
17386 unsigned int initial_state,
17387 const char *option_name)
17389 const char separator = '.';
17390 const char* specs = option;
17391 const char* ntoken = option;
17392 unsigned int found_flags = initial_state;
17394 while ((ntoken = strchr (specs, separator)))
17396 size_t token_length = ntoken - specs;
17397 unsigned token_ops = aarch64_parse_one_option_token (specs,
17398 token_length,
17399 flags,
17400 option_name);
17401 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17402 in the token stream, reset the supported operations. So:
17404 adrp+add.cmp+branch.none.adrp+add
17406 would have the result of turning on only adrp+add fusion. */
17407 if (!token_ops)
17408 found_flags = 0;
17410 found_flags |= token_ops;
17411 specs = ++ntoken;
17414 /* We ended with a trailing separator; report an error. */
17415 if (!(*specs))
17417 error ("%qs string ill-formed", option_name);
17418 return 0;
17421 /* We still have one more token to parse. */
17422 size_t token_length = strlen (specs);
17423 unsigned token_ops = aarch64_parse_one_option_token (specs,
17424 token_length,
17425 flags,
17426 option_name);
17427 if (!token_ops)
17428 found_flags = 0;
17430 found_flags |= token_ops;
17431 return found_flags;
17434 /* Support for overriding instruction fusion. */
17436 static void
17437 aarch64_parse_fuse_string (const char *fuse_string,
17438 struct tune_params *tune)
17440 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17441 aarch64_fusible_pairs,
17442 tune->fusible_ops,
17443 "fuse=");
17446 /* Support for overriding other tuning flags. */
17448 static void
17449 aarch64_parse_tune_string (const char *tune_string,
17450 struct tune_params *tune)
17452 tune->extra_tuning_flags
17453 = aarch64_parse_boolean_options (tune_string,
17454 aarch64_tuning_flags,
17455 tune->extra_tuning_flags,
17456 "tune=");
17459 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17460 Accept the valid SVE vector widths allowed by
17461 aarch64_sve_vector_bits_enum and use it to override sve_width
17462 in TUNE. */
17464 static void
17465 aarch64_parse_sve_width_string (const char *tune_string,
17466 struct tune_params *tune)
17468 int width = -1;
17470 int n = sscanf (tune_string, "%d", &width);
17471 if (n == EOF)
17473 error ("invalid format for %<sve_width%>");
17474 return;
17476 switch (width)
17478 case SVE_128:
17479 case SVE_256:
17480 case SVE_512:
17481 case SVE_1024:
17482 case SVE_2048:
17483 break;
17484 default:
17485 error ("invalid %<sve_width%> value: %d", width);
17487 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17490 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
17491 we understand. If it is, extract the option string and hand it off to
17492 the appropriate function. */
17494 void
17495 aarch64_parse_one_override_token (const char* token,
17496 size_t length,
17497 struct tune_params *tune)
17499 const struct aarch64_tuning_override_function *fn
17500 = aarch64_tuning_override_functions;
17502 const char *option_part = strchr (token, '=');
17503 if (!option_part)
17505 error ("tuning string missing in option (%s)", token);
17506 return;
17509 /* Get the length of the option name. */
17510 length = option_part - token;
17511 /* Skip the '=' to get to the option string. */
17512 option_part++;
17514 for (; fn->name != NULL; fn++)
17516 if (!strncmp (fn->name, token, length))
17518 fn->parse_override (option_part, tune);
17519 return;
17523 error ("unknown tuning option (%s)",token);
17524 return;
17527 /* Set the default TLS size and clamp it according to the code model. */
17529 static void
17530 initialize_aarch64_tls_size (struct gcc_options *opts)
17532 if (aarch64_tls_size == 0)
17533 aarch64_tls_size = 24;
17535 switch (opts->x_aarch64_cmodel_var)
17537 case AARCH64_CMODEL_TINY:
17538 /* Both the default and maximum TLS size allowed under tiny are 1M, which
17539 needs two instructions to address, so we clamp the size to 24. */
17540 if (aarch64_tls_size > 24)
17541 aarch64_tls_size = 24;
17542 break;
17543 case AARCH64_CMODEL_SMALL:
17544 /* The maximum TLS size allowed under small is 4G. */
17545 if (aarch64_tls_size > 32)
17546 aarch64_tls_size = 32;
17547 break;
17548 case AARCH64_CMODEL_LARGE:
17549 /* The maximum TLS size allowed under large is 16E.
17550 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
17551 if (aarch64_tls_size > 48)
17552 aarch64_tls_size = 48;
17553 break;
17554 default:
17555 gcc_unreachable ();
17558 return;
17561 /* Return the CPU corresponding to the enum CPU. */
17563 static const struct processor *
17564 aarch64_get_tune_cpu (enum aarch64_processor cpu)
17566 gcc_assert (cpu != aarch64_none);
17568 return &all_cores[cpu];
17571 /* Return the architecture corresponding to the enum ARCH. */
17573 static const struct processor *
17574 aarch64_get_arch (enum aarch64_arch arch)
17576 gcc_assert (arch != aarch64_no_arch);
17578 return &all_architectures[arch];
17581 /* Parse STRING looking for options in the format:
17582 string :: option:string
17583 option :: name=substring
17584 name :: {a-z}
17585 substring :: defined by option. */
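/* For instance, an override string such as
   -moverride=fuse=adrp+add.cmp+branch:sve_width=256
   is split on ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256",
   each of which is handed to the matching parser above; the fuse tokens
   themselves are separated by '.'.  (Illustrative only; the accepted
   option names are defined by aarch64_tuning_override_functions.)  */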
17587 static void
17588 aarch64_parse_override_string (const char* input_string,
17589 struct tune_params* tune)
17591 const char separator = ':';
17592 size_t string_length = strlen (input_string) + 1;
17593 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17594 char *string = string_root;
17595 strncpy (string, input_string, string_length);
17596 string[string_length - 1] = '\0';
17598 char* ntoken = string;
17600 while ((ntoken = strchr (string, separator)))
17602 size_t token_length = ntoken - string;
17603 /* Make this substring look like a string. */
17604 *ntoken = '\0';
17605 aarch64_parse_one_override_token (string, token_length, tune);
17606 string = ++ntoken;
17609 /* One last option to parse. */
17610 aarch64_parse_one_override_token (string, strlen (string), tune);
17611 free (string_root);
17614 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17615 are best for a generic target with the currently-enabled architecture
17616 extensions. */
17617 static void
17618 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17620 /* Neoverse V1 is the only core that is known to benefit from
17621 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17622 point enabling it for SVE2 and above. */
17623 if (TARGET_SVE2)
17624 current_tune.extra_tuning_flags
17625 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17628 static void
17629 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17631 if (accepted_branch_protection_string)
17633 opts->x_aarch64_branch_protection_string
17634 = xstrdup (accepted_branch_protection_string);
17637 /* PR 70044: We have to be careful about being called multiple times for the
17638 same function. This means all changes should be repeatable. */
17640 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17641 Disable the frame pointer flag so the mid-end will not use a frame
17642 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17643 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17644 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17645 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17646 if (opts->x_flag_omit_frame_pointer == 0)
17647 opts->x_flag_omit_frame_pointer = 2;
17649 /* If not optimizing for size, set the default
17650 alignment to what the target wants. */
17651 if (!opts->x_optimize_size)
17653 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17654 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17655 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17656 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17657 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17658 opts->x_str_align_functions = aarch64_tune_params.function_align;
17661 /* We default to no pc-relative literal loads. */
17663 aarch64_pcrelative_literal_loads = false;
17665 /* If -mpc-relative-literal-loads is set on the command line, this
17666 implies that the user asked for PC relative literal loads. */
17667 if (opts->x_pcrelative_literal_loads == 1)
17668 aarch64_pcrelative_literal_loads = true;
17670 /* In the tiny memory model it makes no sense to disallow PC relative
17671 literal pool loads. */
17672 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17673 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17674 aarch64_pcrelative_literal_loads = true;
17676 /* When enabling the lower precision Newton series for the square root, also
17677 enable it for the reciprocal square root, since the latter is an
17678 intermediary step for the former. */
17679 if (flag_mlow_precision_sqrt)
17680 flag_mrecip_low_precision_sqrt = true;
17683 /* 'Unpack' the internal tuning structs and update the options
17684 in OPTS. The caller must have set up selected_tune and selected_arch
17685 as all the other target-specific codegen decisions are
17686 derived from them. */
17688 void
17689 aarch64_override_options_internal (struct gcc_options *opts)
17691 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17692 aarch64_tune_flags = tune->flags;
17693 aarch64_tune = tune->sched_core;
17694 /* Make a copy of the tuning parameters attached to the core, which
17695 we may later overwrite. */
17696 aarch64_tune_params = *(tune->tune);
17697 if (tune->tune == &generic_tunings)
17698 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17700 if (opts->x_aarch64_override_tune_string)
17701 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17702 &aarch64_tune_params);
17704 /* This target defaults to strict volatile bitfields. */
17705 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17706 opts->x_flag_strict_volatile_bitfields = 1;
17708 if (aarch64_stack_protector_guard == SSP_GLOBAL
17709 && opts->x_aarch64_stack_protector_guard_offset_str)
17711 error ("incompatible options %<-mstack-protector-guard=global%> and "
17712 "%<-mstack-protector-guard-offset=%s%>",
17713 aarch64_stack_protector_guard_offset_str);
17716 if (aarch64_stack_protector_guard == SSP_SYSREG
17717 && !(opts->x_aarch64_stack_protector_guard_offset_str
17718 && opts->x_aarch64_stack_protector_guard_reg_str))
17720 error ("both %<-mstack-protector-guard-offset%> and "
17721 "%<-mstack-protector-guard-reg%> must be used "
17722 "with %<-mstack-protector-guard=sysreg%>");
17725 if (opts->x_aarch64_stack_protector_guard_reg_str)
17727 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17728 error ("specify a system register with a small string length");
17731 if (opts->x_aarch64_stack_protector_guard_offset_str)
17733 char *end;
17734 const char *str = aarch64_stack_protector_guard_offset_str;
17735 errno = 0;
17736 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17737 if (!*str || *end || errno)
17738 error ("%qs is not a valid offset in %qs", str,
17739 "-mstack-protector-guard-offset=");
17740 aarch64_stack_protector_guard_offset = offs;
17743 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17744 && !fixed_regs[R18_REGNUM])
17745 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17747 initialize_aarch64_code_model (opts);
17748 initialize_aarch64_tls_size (opts);
17750 int queue_depth = 0;
17751 switch (aarch64_tune_params.autoprefetcher_model)
17753 case tune_params::AUTOPREFETCHER_OFF:
17754 queue_depth = -1;
17755 break;
17756 case tune_params::AUTOPREFETCHER_WEAK:
17757 queue_depth = 0;
17758 break;
17759 case tune_params::AUTOPREFETCHER_STRONG:
17760 queue_depth = max_insn_queue_index + 1;
17761 break;
17762 default:
17763 gcc_unreachable ();
17766 /* We don't mind passing in global_options_set here as we don't use
17767 the *options_set structs anyway. */
17768 SET_OPTION_IF_UNSET (opts, &global_options_set,
17769 param_sched_autopref_queue_depth, queue_depth);
17771 /* If using only Advanced SIMD for autovectorization, disable the SVE
17772 vector cost comparison. */
17773 if (aarch64_autovec_preference == 1)
17774 SET_OPTION_IF_UNSET (opts, &global_options_set,
17775 aarch64_sve_compare_costs, 0);
17777 /* Set up parameters to be used in prefetching algorithm. Do not
17778 override the defaults unless we are tuning for a core we have
17779 researched values for. */
17780 if (aarch64_tune_params.prefetch->num_slots > 0)
17781 SET_OPTION_IF_UNSET (opts, &global_options_set,
17782 param_simultaneous_prefetches,
17783 aarch64_tune_params.prefetch->num_slots);
17784 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17785 SET_OPTION_IF_UNSET (opts, &global_options_set,
17786 param_l1_cache_size,
17787 aarch64_tune_params.prefetch->l1_cache_size);
17788 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17789 SET_OPTION_IF_UNSET (opts, &global_options_set,
17790 param_l1_cache_line_size,
17791 aarch64_tune_params.prefetch->l1_cache_line_size);
17793 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17795 SET_OPTION_IF_UNSET (opts, &global_options_set,
17796 param_destruct_interfere_size,
17797 aarch64_tune_params.prefetch->l1_cache_line_size);
17798 SET_OPTION_IF_UNSET (opts, &global_options_set,
17799 param_construct_interfere_size,
17800 aarch64_tune_params.prefetch->l1_cache_line_size);
17802 else
17804 /* For a generic AArch64 target, cover the current range of cache line
17805 sizes. */
17806 SET_OPTION_IF_UNSET (opts, &global_options_set,
17807 param_destruct_interfere_size,
17808 256);
17809 SET_OPTION_IF_UNSET (opts, &global_options_set,
17810 param_construct_interfere_size,
17811 64);
17814 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17815 SET_OPTION_IF_UNSET (opts, &global_options_set,
17816 param_l2_cache_size,
17817 aarch64_tune_params.prefetch->l2_cache_size);
17818 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17819 SET_OPTION_IF_UNSET (opts, &global_options_set,
17820 param_prefetch_dynamic_strides, 0);
17821 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17822 SET_OPTION_IF_UNSET (opts, &global_options_set,
17823 param_prefetch_minimum_stride,
17824 aarch64_tune_params.prefetch->minimum_stride);
17826 /* Use the alternative scheduling-pressure algorithm by default. */
17827 SET_OPTION_IF_UNSET (opts, &global_options_set,
17828 param_sched_pressure_algorithm,
17829 SCHED_PRESSURE_MODEL);
17831 /* Validate the guard size. */
17832 int guard_size = param_stack_clash_protection_guard_size;
17834 if (guard_size != 12 && guard_size != 16)
17835 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17836 "size. Given value %d (%llu KB) is out of range",
17837 guard_size, (1ULL << guard_size) / 1024ULL);
17839 /* Enforce that the probing interval is the same as the guard size so the
17840 mid-end does the right thing. */
17841 SET_OPTION_IF_UNSET (opts, &global_options_set,
17842 param_stack_clash_protection_probe_interval,
17843 guard_size);
17845 /* The maybe_set calls won't update the value if the user has explicitly set
17846 one. This means we need to validate that the probing interval and guard size
17847 are equal. */
17848 int probe_interval
17849 = param_stack_clash_protection_probe_interval;
17850 if (guard_size != probe_interval)
17851 error ("stack clash guard size %<%d%> must be equal to probing interval "
17852 "%<%d%>", guard_size, probe_interval);
17854 /* Enable software prefetching at the specified optimization level for
17855 CPUs that have prefetch. Lower the optimization level threshold by 1
17856 when profiling is enabled. */
17857 if (opts->x_flag_prefetch_loop_arrays < 0
17858 && !opts->x_optimize_size
17859 && aarch64_tune_params.prefetch->default_opt_level >= 0
17860 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
17861 opts->x_flag_prefetch_loop_arrays = 1;
17863 aarch64_override_options_after_change_1 (opts);
17866 /* Print a hint with a suggestion for a core or architecture name that
17867 most closely resembles what the user passed in STR. ARCH is true if
17868 the user is asking for an architecture name. ARCH is false if the user
17869 is asking for a core name. */
17871 static void
17872 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
17874 auto_vec<const char *> candidates;
17875 const struct processor *entry = arch ? all_architectures : all_cores;
17876 for (; entry->name != NULL; entry++)
17877 candidates.safe_push (entry->name);
17879 #ifdef HAVE_LOCAL_CPU_DETECT
17880 /* Also add "native" as a possible value. */
17881 if (arch)
17882 candidates.safe_push ("native");
17883 #endif
17885 char *s;
17886 const char *hint = candidates_list_and_hint (str, s, candidates);
17887 if (hint)
17888 inform (input_location, "valid arguments are: %s;"
17889 " did you mean %qs?", s, hint);
17890 else
17891 inform (input_location, "valid arguments are: %s", s);
17893 XDELETEVEC (s);
17896 /* Print a hint with a suggestion for a core name that most closely resembles
17897 what the user passed in STR. */
17899 inline static void
17900 aarch64_print_hint_for_core (const char *str)
17902 aarch64_print_hint_for_core_or_arch (str, false);
17905 /* Print a hint with a suggestion for an architecture name that most closely
17906 resembles what the user passed in STR. */
17908 inline static void
17909 aarch64_print_hint_for_arch (const char *str)
17911 aarch64_print_hint_for_core_or_arch (str, true);
17915 /* Print a hint with a suggestion for an extension name
17916 that most closely resembles what the user passed in STR. */
17918 void
17919 aarch64_print_hint_for_extensions (const std::string &str)
17921 auto_vec<const char *> candidates;
17922 aarch64_get_all_extension_candidates (&candidates);
17923 char *s;
17924 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
17925 if (hint)
17926 inform (input_location, "valid arguments are: %s;"
17927 " did you mean %qs?", s, hint);
17928 else
17929 inform (input_location, "valid arguments are: %s", s);
17931 XDELETEVEC (s);
17934 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
17935 specified in STR and throw errors if appropriate. Put the results if
17936 they are valid in RES and ISA_FLAGS. Return whether the option is
17937 valid. */
17939 static bool
17940 aarch64_validate_mcpu (const char *str, const struct processor **res,
17941 aarch64_feature_flags *isa_flags)
17943 std::string invalid_extension;
17944 enum aarch64_parse_opt_result parse_res
17945 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
17947 if (parse_res == AARCH64_PARSE_OK)
17948 return true;
17950 switch (parse_res)
17952 case AARCH64_PARSE_MISSING_ARG:
17953 error ("missing cpu name in %<-mcpu=%s%>", str);
17954 break;
17955 case AARCH64_PARSE_INVALID_ARG:
17956 error ("unknown value %qs for %<-mcpu%>", str);
17957 aarch64_print_hint_for_core (str);
17958 break;
17959 case AARCH64_PARSE_INVALID_FEATURE:
17960 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
17961 invalid_extension.c_str (), str);
17962 aarch64_print_hint_for_extensions (invalid_extension);
17963 break;
17964 default:
17965 gcc_unreachable ();
17968 return false;
17971 /* Straight line speculation indicators. */
17972 enum aarch64_sls_hardening_type
17974 SLS_NONE = 0,
17975 SLS_RETBR = 1,
17976 SLS_BLR = 2,
17977 SLS_ALL = 3,
17979 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
17981 /* Return whether we should mitigate Straight Line Speculation for the RET
17982 and BR instructions. */
17983 bool
17984 aarch64_harden_sls_retbr_p (void)
17986 return aarch64_sls_hardening & SLS_RETBR;
17989 /* Return whether we should mitigate Straight Line Speculation for the BLR
17990 instruction. */
17991 bool
17992 aarch64_harden_sls_blr_p (void)
17994 return aarch64_sls_hardening & SLS_BLR;
17997 /* For now we only allow setting these options globally; in the future we may
17998 allow setting them per function. */
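/* For example, -mharden-sls=retbr,blr enables both mitigations and is
   equivalent to -mharden-sls=all (SLS_RETBR | SLS_BLR == SLS_ALL), while
   "none" and "all" must appear on their own, as checked below.  */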
17999 static void
18000 aarch64_validate_sls_mitigation (const char *const_str)
18002 char *token_save = NULL;
18003 char *str = NULL;
18005 if (strcmp (const_str, "none") == 0)
18007 aarch64_sls_hardening = SLS_NONE;
18008 return;
18010 if (strcmp (const_str, "all") == 0)
18012 aarch64_sls_hardening = SLS_ALL;
18013 return;
18016 char *str_root = xstrdup (const_str);
18017 str = strtok_r (str_root, ",", &token_save);
18018 if (!str)
18019 error ("invalid argument given to %<-mharden-sls=%>");
18021 int temp = SLS_NONE;
18022 while (str)
18024 if (strcmp (str, "blr") == 0)
18025 temp |= SLS_BLR;
18026 else if (strcmp (str, "retbr") == 0)
18027 temp |= SLS_RETBR;
18028 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18030 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18031 break;
18033 else
18035 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18036 break;
18038 str = strtok_r (NULL, ",", &token_save);
18040 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18041 free (str_root);
18044 /* Parses CONST_STR for branch protection features specified in
18045    aarch64_branch_protect_types, and sets any global variables required.  Returns
18046 the parsing result and assigns LAST_STR to the last processed token from
18047 CONST_STR so that it can be used for error reporting. */
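/* Illustrative example of the grammar handled below (assuming the usual
   entries in aarch64_branch_protect_types): "pac-ret+leaf+bti" is split at
   each '+'; "pac-ret" matches a top-level type and its handler runs, "leaf"
   is then consumed as a subtype of "pac-ret", and "bti" falls back to the
   top-level table again.  */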
18049 static enum
18050 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
18051 char** last_str)
18053 char *str_root = xstrdup (const_str);
18054 char* token_save = NULL;
18055 char *str = strtok_r (str_root, "+", &token_save);
18056 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
18057 if (!str)
18058 res = AARCH64_PARSE_MISSING_ARG;
18059 else
18061 char *next_str = strtok_r (NULL, "+", &token_save);
18062 /* Reset the branch protection features to their defaults. */
18063 aarch64_handle_no_branch_protection (NULL, NULL);
18065 while (str && res == AARCH64_PARSE_OK)
18067 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
18068 bool found = false;
18069 /* Search for this type. */
18070 while (type && type->name && !found && res == AARCH64_PARSE_OK)
18072 if (strcmp (str, type->name) == 0)
18074 found = true;
18075 res = type->handler (str, next_str);
18076 str = next_str;
18077 next_str = strtok_r (NULL, "+", &token_save);
18079 else
18080 type++;
18082 if (found && res == AARCH64_PARSE_OK)
18084 bool found_subtype = true;
18085 /* Loop through each token until we find one that isn't a
18086 subtype. */
18087 while (found_subtype)
18089 found_subtype = false;
18090 const aarch64_branch_protect_type *subtype = type->subtypes;
18091 /* Search for the subtype. */
18092 while (str && subtype && subtype->name && !found_subtype
18093 && res == AARCH64_PARSE_OK)
18095 if (strcmp (str, subtype->name) == 0)
18097 found_subtype = true;
18098 res = subtype->handler (str, next_str);
18099 str = next_str;
18100 next_str = strtok_r (NULL, "+", &token_save);
18102 else
18103 subtype++;
18107 else if (!found)
18108 res = AARCH64_PARSE_INVALID_ARG;
18111 /* Copy the last processed token into the argument to pass it back.
18112 Used by option and attribute validation to print the offending token. */
18113 if (last_str)
18115 if (str) strcpy (*last_str, str);
18116 else *last_str = NULL;
18118 if (res == AARCH64_PARSE_OK)
18120 /* If needed, alloc the accepted string then copy in const_str.
18121 Used by override_option_after_change_1. */
18122 if (!accepted_branch_protection_string)
18123 accepted_branch_protection_string = (char *) xmalloc (
18124 BRANCH_PROTECT_STR_MAX
18125 + 1);
18126 strncpy (accepted_branch_protection_string, const_str,
18127 BRANCH_PROTECT_STR_MAX + 1);
18128 /* Forcibly null-terminate. */
18129 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
18131 return res;
18134 static bool
18135 aarch64_validate_mbranch_protection (const char *const_str)
18137   char *str = (char *) xmalloc (strlen (const_str) + 1); /* +1 for the terminating NUL.  */
18138 enum aarch64_parse_opt_result res =
18139 aarch64_parse_branch_protection (const_str, &str);
18140 if (res == AARCH64_PARSE_INVALID_ARG)
18141 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
18142 else if (res == AARCH64_PARSE_MISSING_ARG)
18143 error ("missing argument for %<-mbranch-protection=%>");
18144 free (str);
18145 return res == AARCH64_PARSE_OK;
18148 /* Validate a command-line -march option. Parse the arch and extensions
18149 (if any) specified in STR and throw errors if appropriate. Put the
18150 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18151 option is valid. */
18153 static bool
18154 aarch64_validate_march (const char *str, const struct processor **res,
18155 aarch64_feature_flags *isa_flags)
18157 std::string invalid_extension;
18158 enum aarch64_parse_opt_result parse_res
18159 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18161 if (parse_res == AARCH64_PARSE_OK)
18162 return true;
18164 switch (parse_res)
18166 case AARCH64_PARSE_MISSING_ARG:
18167 error ("missing arch name in %<-march=%s%>", str);
18168 break;
18169 case AARCH64_PARSE_INVALID_ARG:
18170 error ("unknown value %qs for %<-march%>", str);
18171 aarch64_print_hint_for_arch (str);
18172 /* A common user error is confusing -march and -mcpu.
18173 If the -march string matches a known CPU suggest -mcpu. */
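      /* e.g. "-march=cortex-a53" is not an architecture name, but because it
	 parses successfully as a CPU the user is pointed at
	 "-mcpu=cortex-a53" instead.  */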
18174 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18175 if (parse_res == AARCH64_PARSE_OK)
18176 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18177 break;
18178 case AARCH64_PARSE_INVALID_FEATURE:
18179 error ("invalid feature modifier %qs in %<-march=%s%>",
18180 invalid_extension.c_str (), str);
18181 aarch64_print_hint_for_extensions (invalid_extension);
18182 break;
18183 default:
18184 gcc_unreachable ();
18187 return false;
18190 /* Validate a command-line -mtune option. Parse the cpu
18191 specified in STR and throw errors if appropriate. Put the
18192 result, if it is valid, in RES. Return whether the option is
18193 valid. */
18195 static bool
18196 aarch64_validate_mtune (const char *str, const struct processor **res)
18198 enum aarch64_parse_opt_result parse_res
18199 = aarch64_parse_tune (str, res);
18201 if (parse_res == AARCH64_PARSE_OK)
18202 return true;
18204 switch (parse_res)
18206 case AARCH64_PARSE_MISSING_ARG:
18207 error ("missing cpu name in %<-mtune=%s%>", str);
18208 break;
18209 case AARCH64_PARSE_INVALID_ARG:
18210 error ("unknown value %qs for %<-mtune%>", str);
18211 aarch64_print_hint_for_core (str);
18212 break;
18213 default:
18214 gcc_unreachable ();
18216 return false;
18219 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18221 static poly_uint16
18222 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18224 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18225 on big-endian targets, so we would need to forbid subregs that convert
18226 from one to the other. By default a reinterpret sequence would then
18227 involve a store to memory in one mode and a load back in the other.
18228 Even if we optimize that sequence using reverse instructions,
18229 it would still be a significant potential overhead.
18231 For now, it seems better to generate length-agnostic code for that
18232 case instead. */
18233 if (value == SVE_SCALABLE
18234 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18235 return poly_uint16 (2, 2);
18236 else
18237 return (int) value / 64;
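/* A short worked example (assuming the enum values encode the vector width
   in bits, as the division above implies): -msve-vector-bits=256 yields
   VG = 256 / 64 = 4, i.e. four 64-bit granules per SVE vector.  */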
18240 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18241 aarch64_isa_flags accordingly. */
18243 void
18244 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18246 aarch64_set_asm_isa_flags (&global_options, flags);
18249 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18250 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18251 tuning structs. In particular it must set selected_tune and
18252 aarch64_asm_isa_flags that define the available ISA features and tuning
18253 decisions. It must also set selected_arch as this will be used to
18254 output the .arch asm tags for each function. */
18256 static void
18257 aarch64_override_options (void)
18259 aarch64_feature_flags cpu_isa = 0;
18260 aarch64_feature_flags arch_isa = 0;
18261 aarch64_set_asm_isa_flags (0);
18263 const struct processor *cpu = NULL;
18264 const struct processor *arch = NULL;
18265 const struct processor *tune = NULL;
18267 if (aarch64_harden_sls_string)
18268 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18270 if (aarch64_branch_protection_string)
18271 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
18273 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18274 If either of -march or -mtune is given, they override their
18275 respective component of -mcpu. */
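  /* For example (illustrative only): "-mcpu=cortex-a53 -mtune=cortex-a72"
     takes the architecture and ISA flags from cortex-a53 but the tuning
     decisions from cortex-a72, whereas "-mcpu=cortex-a53 -march=armv8.2-a"
     takes the ISA from -march and only the tuning from -mcpu.  */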
18276 if (aarch64_cpu_string)
18277 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18279 if (aarch64_arch_string)
18280 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18282 if (aarch64_tune_string)
18283 aarch64_validate_mtune (aarch64_tune_string, &tune);
18285 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18286 SUBTARGET_OVERRIDE_OPTIONS;
18287 #endif
18289 if (cpu && arch)
18291 /* If both -mcpu and -march are specified, warn if they are not
18292 architecturally compatible and prefer the -march ISA flags. */
18293 if (arch->arch != cpu->arch)
18295 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18296 aarch64_cpu_string,
18297 aarch64_arch_string);
18300 selected_arch = arch->arch;
18301 aarch64_set_asm_isa_flags (arch_isa);
18303 else if (cpu)
18305 selected_arch = cpu->arch;
18306 aarch64_set_asm_isa_flags (cpu_isa);
18308 else if (arch)
18310 cpu = &all_cores[arch->ident];
18311 selected_arch = arch->arch;
18312 aarch64_set_asm_isa_flags (arch_isa);
18314 else
18316 /* No -mcpu or -march specified, so use the default CPU. */
18317 cpu = &all_cores[TARGET_CPU_DEFAULT];
18318 selected_arch = cpu->arch;
18319 aarch64_set_asm_isa_flags (cpu->flags);
18322 selected_tune = tune ? tune->ident : cpu->ident;
18324 if (aarch64_enable_bti == 2)
18326 #ifdef TARGET_ENABLE_BTI
18327 aarch64_enable_bti = 1;
18328 #else
18329 aarch64_enable_bti = 0;
18330 #endif
18333 /* Return address signing is currently not supported for ILP32 targets. For
18334 LP64 targets use the configured option in the absence of a command-line
18335 option for -mbranch-protection. */
18336 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18338 #ifdef TARGET_ENABLE_PAC_RET
18339 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
18340 #else
18341 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
18342 #endif
18345 #ifndef HAVE_AS_MABI_OPTION
18346 /* The compiler may have been configured with 2.23.* binutils, which does
18347 not have support for ILP32. */
18348 if (TARGET_ILP32)
18349 error ("assembler does not support %<-mabi=ilp32%>");
18350 #endif
18352 /* Convert -msve-vector-bits to a VG count. */
18353 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18355 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
18356 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18358 /* The pass to insert speculation tracking runs before
18359 shrink-wrapping and the latter does not know how to update the
18360      tracking status.  So disable shrink-wrapping in this case.  */
18361 if (aarch64_track_speculation)
18362 flag_shrink_wrap = 0;
18364 aarch64_override_options_internal (&global_options);
18366 /* Save these options as the default ones in case we push and pop them later
18367 while processing functions with potential target attributes. */
18368 target_option_default_node = target_option_current_node
18369 = build_target_option_node (&global_options, &global_options_set);
18372 /* Implement targetm.override_options_after_change. */
18374 static void
18375 aarch64_override_options_after_change (void)
18377 aarch64_override_options_after_change_1 (&global_options);
18380 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18381 static char *
18382 aarch64_offload_options (void)
18384 if (TARGET_ILP32)
18385 return xstrdup ("-foffload-abi=ilp32");
18386 else
18387 return xstrdup ("-foffload-abi=lp64");
18390 static struct machine_function *
18391 aarch64_init_machine_status (void)
18393 struct machine_function *machine;
18394 machine = ggc_cleared_alloc<machine_function> ();
18395 return machine;
18398 void
18399 aarch64_init_expanders (void)
18401 init_machine_status = aarch64_init_machine_status;
18404 /* A checking mechanism for the implementation of the various code models. */
18405 static void
18406 initialize_aarch64_code_model (struct gcc_options *opts)
18408 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18409 switch (opts->x_aarch64_cmodel_var)
18411 case AARCH64_CMODEL_TINY:
18412 if (opts->x_flag_pic)
18413 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18414 break;
18415 case AARCH64_CMODEL_SMALL:
18416 if (opts->x_flag_pic)
18418 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18419 aarch64_cmodel = (flag_pic == 2
18420 ? AARCH64_CMODEL_SMALL_PIC
18421 : AARCH64_CMODEL_SMALL_SPIC);
18422 #else
18423 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18424 #endif
18426 break;
18427 case AARCH64_CMODEL_LARGE:
18428 if (opts->x_flag_pic)
18429 sorry ("code model %qs with %<-f%s%>", "large",
18430 opts->x_flag_pic > 1 ? "PIC" : "pic");
18431 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18432 sorry ("code model %qs not supported in ilp32 mode", "large");
18433 break;
18434 case AARCH64_CMODEL_TINY_PIC:
18435 case AARCH64_CMODEL_SMALL_PIC:
18436 case AARCH64_CMODEL_SMALL_SPIC:
18437 gcc_unreachable ();
18441 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18442 using the information saved in PTR. */
18444 static void
18445 aarch64_option_restore (struct gcc_options *opts,
18446 struct gcc_options * /* opts_set */,
18447 struct cl_target_option * /* ptr */)
18449 aarch64_override_options_internal (opts);
18452 /* Implement TARGET_OPTION_PRINT. */
18454 static void
18455 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18457 const struct processor *cpu
18458 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18459 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18460 std::string extension
18461 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
18462 arch->flags);
18464 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18465 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18466 arch->name, extension.c_str ());
18469 static GTY(()) tree aarch64_previous_fndecl;
18471 void
18472 aarch64_reset_previous_fndecl (void)
18474 aarch64_previous_fndecl = NULL;
18477 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18478 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18479 make sure optab availability predicates are recomputed when necessary. */
18481 void
18482 aarch64_save_restore_target_globals (tree new_tree)
18484 if (TREE_TARGET_GLOBALS (new_tree))
18485 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18486 else if (new_tree == target_option_default_node)
18487 restore_target_globals (&default_target_globals);
18488 else
18489 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18492 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18493 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18494 of the function, if such exists. This function may be called multiple
18495 times on a single function so use aarch64_previous_fndecl to avoid
18496 setting up identical state. */
18498 static void
18499 aarch64_set_current_function (tree fndecl)
18501 if (!fndecl || fndecl == aarch64_previous_fndecl)
18502 return;
18504 tree old_tree = (aarch64_previous_fndecl
18505 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18506 : NULL_TREE);
18508 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18510 /* If current function has no attributes but the previous one did,
18511 use the default node. */
18512 if (!new_tree && old_tree)
18513 new_tree = target_option_default_node;
18515 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18516 the default have been handled by aarch64_save_restore_target_globals from
18517 aarch64_pragma_target_parse. */
18518 if (old_tree == new_tree)
18519 return;
18521 aarch64_previous_fndecl = fndecl;
18523 /* First set the target options. */
18524 cl_target_option_restore (&global_options, &global_options_set,
18525 TREE_TARGET_OPTION (new_tree));
18527 aarch64_save_restore_target_globals (new_tree);
18530 /* Enum describing the various ways we can handle attributes.
18531 In many cases we can reuse the generic option handling machinery. */
18533 enum aarch64_attr_opt_type
18535 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18536 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18537 aarch64_attr_enum, /* Attribute sets an enum variable. */
18538 aarch64_attr_custom /* Attribute requires a custom handling function. */
18541 /* All the information needed to handle a target attribute.
18542 NAME is the name of the attribute.
18543 ATTR_TYPE specifies the type of behavior of the attribute as described
18544 in the definition of enum aarch64_attr_opt_type.
18545 ALLOW_NEG is true if the attribute supports a "no-" form.
18546 HANDLER is the function that takes the attribute string as an argument
18547 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18548 OPT_NUM is the enum specifying the option that the attribute modifies.
18549 This is needed for attributes that mirror the behavior of a command-line
18550 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18551 aarch64_attr_enum. */
18553 struct aarch64_attribute_info
18555 const char *name;
18556 enum aarch64_attr_opt_type attr_type;
18557 bool allow_neg;
18558 bool (*handler) (const char *);
18559 enum opt_code opt_num;
18562 /* Handle the ARCH_STR argument to the arch= target attribute. */
18564 static bool
18565 aarch64_handle_attr_arch (const char *str)
18567 const struct processor *tmp_arch = NULL;
18568 std::string invalid_extension;
18569 aarch64_feature_flags tmp_flags;
18570 enum aarch64_parse_opt_result parse_res
18571 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
18573 if (parse_res == AARCH64_PARSE_OK)
18575 gcc_assert (tmp_arch);
18576 selected_arch = tmp_arch->arch;
18577 aarch64_set_asm_isa_flags (tmp_flags);
18578 return true;
18581 switch (parse_res)
18583 case AARCH64_PARSE_MISSING_ARG:
18584 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18585 break;
18586 case AARCH64_PARSE_INVALID_ARG:
18587 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18588 aarch64_print_hint_for_arch (str);
18589 break;
18590 case AARCH64_PARSE_INVALID_FEATURE:
18591 error ("invalid feature modifier %s of value %qs in "
18592 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18593 aarch64_print_hint_for_extensions (invalid_extension);
18594 break;
18595 default:
18596 gcc_unreachable ();
18599 return false;
18602 /* Handle the argument CPU_STR to the cpu= target attribute. */
18604 static bool
18605 aarch64_handle_attr_cpu (const char *str)
18607 const struct processor *tmp_cpu = NULL;
18608 std::string invalid_extension;
18609 aarch64_feature_flags tmp_flags;
18610 enum aarch64_parse_opt_result parse_res
18611 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
18613 if (parse_res == AARCH64_PARSE_OK)
18615 gcc_assert (tmp_cpu);
18616 selected_tune = tmp_cpu->ident;
18617 selected_arch = tmp_cpu->arch;
18618 aarch64_set_asm_isa_flags (tmp_flags);
18619 return true;
18622 switch (parse_res)
18624 case AARCH64_PARSE_MISSING_ARG:
18625 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18626 break;
18627 case AARCH64_PARSE_INVALID_ARG:
18628 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18629 aarch64_print_hint_for_core (str);
18630 break;
18631 case AARCH64_PARSE_INVALID_FEATURE:
18632 error ("invalid feature modifier %qs of value %qs in "
18633 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18634 aarch64_print_hint_for_extensions (invalid_extension);
18635 break;
18636 default:
18637 gcc_unreachable ();
18640 return false;
18643 /* Handle the argument STR to the branch-protection= attribute. */
18645 static bool
18646 aarch64_handle_attr_branch_protection (const char* str)
18648 char *err_str = (char *) xmalloc (strlen (str) + 1);
18649 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
18650 &err_str);
18651 bool success = false;
18652 switch (res)
18654 case AARCH64_PARSE_MISSING_ARG:
18655 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18656 " attribute");
18657 break;
18658 case AARCH64_PARSE_INVALID_ARG:
18659 error ("invalid protection type %qs in %<target(\"branch-protection"
18660 "=\")%> pragma or attribute", err_str);
18661 break;
18662 case AARCH64_PARSE_OK:
18663 success = true;
18664 /* Fall through. */
18665 case AARCH64_PARSE_INVALID_FEATURE:
18666 break;
18667 default:
18668 gcc_unreachable ();
18670 free (err_str);
18671 return success;
18674 /* Handle the argument STR to the tune= target attribute. */
18676 static bool
18677 aarch64_handle_attr_tune (const char *str)
18679 const struct processor *tmp_tune = NULL;
18680 enum aarch64_parse_opt_result parse_res
18681 = aarch64_parse_tune (str, &tmp_tune);
18683 if (parse_res == AARCH64_PARSE_OK)
18685 gcc_assert (tmp_tune);
18686 selected_tune = tmp_tune->ident;
18687 return true;
18690 switch (parse_res)
18692 case AARCH64_PARSE_INVALID_ARG:
18693 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18694 aarch64_print_hint_for_core (str);
18695 break;
18696 default:
18697 gcc_unreachable ();
18700 return false;
18703 /* Parse an architecture extensions target attribute string specified in STR.
18704 For example "+fp+nosimd". Show any errors if needed. Return TRUE
18705 if successful. Update aarch64_isa_flags to reflect the ISA features
18706 modified. */
18708 static bool
18709 aarch64_handle_attr_isa_flags (char *str)
18711 enum aarch64_parse_opt_result parse_res;
18712 auto isa_flags = aarch64_asm_isa_flags;
18714 /* We allow "+nothing" in the beginning to clear out all architectural
18715 features if the user wants to handpick specific features. */
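  /* For instance (a rough illustration): __attribute__ ((target ("+nothing+fp")))
     starts from an empty feature set and then enables only what the "+fp"
     modifier (and anything it implies) turns on.  */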
18716 if (strncmp ("+nothing", str, 8) == 0)
18718 isa_flags = 0;
18719 str += 8;
18722 std::string invalid_extension;
18723 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18725 if (parse_res == AARCH64_PARSE_OK)
18727 aarch64_set_asm_isa_flags (isa_flags);
18728 return true;
18731 switch (parse_res)
18733 case AARCH64_PARSE_MISSING_ARG:
18734 error ("missing value in %<target()%> pragma or attribute");
18735 break;
18737 case AARCH64_PARSE_INVALID_FEATURE:
18738 error ("invalid feature modifier %qs of value %qs in "
18739 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18740 break;
18742 default:
18743 gcc_unreachable ();
18746 return false;
18749 /* The target attributes that we support. On top of these we also support just
18750 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18751 handled explicitly in aarch64_process_one_target_attr. */
18753 static const struct aarch64_attribute_info aarch64_attributes[] =
18755 { "general-regs-only", aarch64_attr_mask, false, NULL,
18756 OPT_mgeneral_regs_only },
18757 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18758 OPT_mfix_cortex_a53_835769 },
18759 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18760 OPT_mfix_cortex_a53_843419 },
18761 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18762 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18763 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18764 OPT_momit_leaf_frame_pointer },
18765 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18766 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18767 OPT_march_ },
18768 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18769 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18770 OPT_mtune_ },
18771 { "branch-protection", aarch64_attr_custom, false,
18772 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18773 { "sign-return-address", aarch64_attr_enum, false, NULL,
18774 OPT_msign_return_address_ },
18775 { "outline-atomics", aarch64_attr_bool, true, NULL,
18776 OPT_moutline_atomics},
18777 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
18780 /* Parse ARG_STR which contains the definition of one target attribute.
18781 Show appropriate errors if any or return true if the attribute is valid. */
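/* For example (illustrative only): "arch=armv8.2-a" is dispatched to
   aarch64_handle_attr_arch with ARG "armv8.2-a", while
   "no-omit-leaf-frame-pointer" matches the "omit-leaf-frame-pointer" entry
   with INVERT set.  */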
18783 static bool
18784 aarch64_process_one_target_attr (char *arg_str)
18786 bool invert = false;
18788 size_t len = strlen (arg_str);
18790 if (len == 0)
18792 error ("malformed %<target()%> pragma or attribute");
18793 return false;
18796 char *str_to_check = (char *) alloca (len + 1);
18797 strcpy (str_to_check, arg_str);
18799 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18800 It is easier to detect and handle it explicitly here rather than going
18801 through the machinery for the rest of the target attributes in this
18802 function. */
18803 if (*str_to_check == '+')
18804 return aarch64_handle_attr_isa_flags (str_to_check);
18806 if (len > 3 && startswith (str_to_check, "no-"))
18808 invert = true;
18809 str_to_check += 3;
18811 char *arg = strchr (str_to_check, '=');
18813 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18814 and point ARG to "foo". */
18815 if (arg)
18817 *arg = '\0';
18818 arg++;
18820 const struct aarch64_attribute_info *p_attr;
18821 bool found = false;
18822 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18824 /* If the names don't match up, or the user has given an argument
18825 to an attribute that doesn't accept one, or didn't give an argument
18826 to an attribute that expects one, fail to match. */
18827 if (strcmp (str_to_check, p_attr->name) != 0)
18828 continue;
18830 found = true;
18831 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18832 || p_attr->attr_type == aarch64_attr_enum;
18834 if (attr_need_arg_p ^ (arg != NULL))
18836 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18837 return false;
18840 /* If the name matches but the attribute does not allow "no-" versions
18841 then we can't match. */
18842 if (invert && !p_attr->allow_neg)
18844 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18845 return false;
18848 switch (p_attr->attr_type)
18850 /* Has a custom handler registered.
18851 For example, cpu=, arch=, tune=. */
18852 case aarch64_attr_custom:
18853 gcc_assert (p_attr->handler);
18854 if (!p_attr->handler (arg))
18855 return false;
18856 break;
18858 /* Either set or unset a boolean option. */
18859 case aarch64_attr_bool:
18861 struct cl_decoded_option decoded;
18863 generate_option (p_attr->opt_num, NULL, !invert,
18864 CL_TARGET, &decoded);
18865 aarch64_handle_option (&global_options, &global_options_set,
18866 &decoded, input_location);
18867 break;
18869 /* Set or unset a bit in the target_flags. aarch64_handle_option
18870 should know what mask to apply given the option number. */
18871 case aarch64_attr_mask:
18873 struct cl_decoded_option decoded;
18874 /* We only need to specify the option number.
18875 aarch64_handle_option will know which mask to apply. */
18876 decoded.opt_index = p_attr->opt_num;
18877 decoded.value = !invert;
18878 aarch64_handle_option (&global_options, &global_options_set,
18879 &decoded, input_location);
18880 break;
18882 /* Use the option setting machinery to set an option to an enum. */
18883 case aarch64_attr_enum:
18885 gcc_assert (arg);
18886 bool valid;
18887 int value;
18888 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
18889 &value, CL_TARGET);
18890 if (valid)
18892 set_option (&global_options, NULL, p_attr->opt_num, value,
18893 NULL, DK_UNSPECIFIED, input_location,
18894 global_dc);
18896 else
18898 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
18900 break;
18902 default:
18903 gcc_unreachable ();
18907 /* If we reached here we either have found an attribute and validated
18908 it or didn't match any. If we matched an attribute but its arguments
18909 were malformed we will have returned false already. */
18910 return found;
18913 /* Count how many times the character C appears in
18914 NULL-terminated string STR. */
18916 static unsigned int
18917 num_occurences_in_str (char c, char *str)
18919 unsigned int res = 0;
18920 while (*str != '\0')
18922 if (*str == c)
18923 res++;
18925 str++;
18928 return res;
18931 /* Parse the tree in ARGS that contains the target attribute information
18932 and update the global target options space. */
18934 bool
18935 aarch64_process_target_attr (tree args)
18937 if (TREE_CODE (args) == TREE_LIST)
18941 tree head = TREE_VALUE (args);
18942 if (head)
18944 if (!aarch64_process_target_attr (head))
18945 return false;
18947 args = TREE_CHAIN (args);
18948 } while (args);
18950 return true;
18953 if (TREE_CODE (args) != STRING_CST)
18955 error ("attribute %<target%> argument not a string");
18956 return false;
18959 size_t len = strlen (TREE_STRING_POINTER (args));
18960 char *str_to_check = (char *) alloca (len + 1);
18961 strcpy (str_to_check, TREE_STRING_POINTER (args));
18963 if (len == 0)
18965 error ("malformed %<target()%> pragma or attribute");
18966 return false;
18969 /* Used to catch empty spaces between commas i.e.
18970 attribute ((target ("attr1,,attr2"))). */
18971 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
18973 /* Handle multiple target attributes separated by ','. */
18974 char *token = strtok_r (str_to_check, ",", &str_to_check);
18976 unsigned int num_attrs = 0;
18977 while (token)
18979 num_attrs++;
18980 if (!aarch64_process_one_target_attr (token))
18982 /* Check if token is possibly an arch extension without
18983 leading '+'. */
18984 aarch64_feature_flags isa_temp = 0;
18985 auto with_plus = std::string ("+") + token;
18986 enum aarch64_parse_opt_result ext_res
18987 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
18989 if (ext_res == AARCH64_PARSE_OK)
18990 error ("arch extension %<%s%> should be prefixed by %<+%>",
18991 token);
18992 else
18993 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
18994 return false;
18997 token = strtok_r (NULL, ",", &str_to_check);
19000 if (num_attrs != num_commas + 1)
19002 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19003 return false;
19006 return true;
19009 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19010 process attribute ((target ("..."))). */
19012 static bool
19013 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19015 struct cl_target_option cur_target;
19016 bool ret;
19017 tree old_optimize;
19018 tree new_target, new_optimize;
19019 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19021 /* If what we're processing is the current pragma string then the
19022 target option node is already stored in target_option_current_node
19023 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19024 having to re-parse the string. This is especially useful to keep
19025 arm_neon.h compile times down since that header contains a lot
19026 of intrinsics enclosed in pragmas. */
19027 if (!existing_target && args == current_target_pragma)
19029 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19030 return true;
19032 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19034 old_optimize
19035 = build_optimization_node (&global_options, &global_options_set);
19036 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19038 /* If the function changed the optimization levels as well as setting
19039 target options, start with the optimizations specified. */
19040 if (func_optimize && func_optimize != old_optimize)
19041 cl_optimization_restore (&global_options, &global_options_set,
19042 TREE_OPTIMIZATION (func_optimize));
19044 /* Save the current target options to restore at the end. */
19045 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19047 /* If fndecl already has some target attributes applied to it, unpack
19048 them so that we add this attribute on top of them, rather than
19049 overwriting them. */
19050 if (existing_target)
19052 struct cl_target_option *existing_options
19053 = TREE_TARGET_OPTION (existing_target);
19055 if (existing_options)
19056 cl_target_option_restore (&global_options, &global_options_set,
19057 existing_options);
19059 else
19060 cl_target_option_restore (&global_options, &global_options_set,
19061 TREE_TARGET_OPTION (target_option_current_node));
19063 ret = aarch64_process_target_attr (args);
19065 /* Set up any additional state. */
19066 if (ret)
19068 aarch64_override_options_internal (&global_options);
19069 new_target = build_target_option_node (&global_options,
19070 &global_options_set);
19072 else
19073 new_target = NULL;
19075 new_optimize = build_optimization_node (&global_options,
19076 &global_options_set);
19078 if (fndecl && ret)
19080 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19082 if (old_optimize != new_optimize)
19083 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19086 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19088 if (old_optimize != new_optimize)
19089 cl_optimization_restore (&global_options, &global_options_set,
19090 TREE_OPTIMIZATION (old_optimize));
19091 return ret;
19094 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
19095 tri-bool options (yes, no, don't care) and the default value is
19096 DEF, determine whether to reject inlining. */
19098 static bool
19099 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
19100 int dont_care, int def)
19102 /* If the callee doesn't care, always allow inlining. */
19103 if (callee == dont_care)
19104 return true;
19106 /* If the caller doesn't care, always allow inlining. */
19107 if (caller == dont_care)
19108 return true;
19110 /* Otherwise, allow inlining if either the callee and caller values
19111 agree, or if the callee is using the default value. */
19112 return (callee == caller || callee == def);
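/* A small illustration of the rule above, using the call sites' convention
   of 2 for "don't care":
     callee == don't care          -> inlining allowed
     caller == don't care          -> inlining allowed
     caller == callee              -> inlining allowed
     caller != callee, both set    -> allowed only if callee == DEF.  */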
19115 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
19116 to inline CALLEE into CALLER based on target-specific info.
19117 Make sure that the caller and callee have compatible architectural
19118 features. Then go through the other possible target attributes
19119 and see if they can block inlining. Try not to reject always_inline
19120 callees unless they are incompatible architecturally. */
19122 static bool
19123 aarch64_can_inline_p (tree caller, tree callee)
19125 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
19126 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
19128 struct cl_target_option *caller_opts
19129 = TREE_TARGET_OPTION (caller_tree ? caller_tree
19130 : target_option_default_node);
19132 struct cl_target_option *callee_opts
19133 = TREE_TARGET_OPTION (callee_tree ? callee_tree
19134 : target_option_default_node);
19136 /* Callee's ISA flags should be a subset of the caller's. */
19137 if ((caller_opts->x_aarch64_asm_isa_flags
19138 & callee_opts->x_aarch64_asm_isa_flags)
19139 != callee_opts->x_aarch64_asm_isa_flags)
19140 return false;
19141 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
19142 != callee_opts->x_aarch64_isa_flags)
19143 return false;
19145 /* Allow non-strict aligned functions inlining into strict
19146 aligned ones. */
19147 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
19148 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
19149 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
19150 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
19151 return false;
19153 bool always_inline = lookup_attribute ("always_inline",
19154 DECL_ATTRIBUTES (callee));
19156 /* If the architectural features match up and the callee is always_inline
19157 then the other attributes don't matter. */
19158 if (always_inline)
19159 return true;
19161 if (caller_opts->x_aarch64_cmodel_var
19162 != callee_opts->x_aarch64_cmodel_var)
19163 return false;
19165 if (caller_opts->x_aarch64_tls_dialect
19166 != callee_opts->x_aarch64_tls_dialect)
19167 return false;
19169 /* Honour explicit requests to workaround errata. */
19170 if (!aarch64_tribools_ok_for_inlining_p (
19171 caller_opts->x_aarch64_fix_a53_err835769,
19172 callee_opts->x_aarch64_fix_a53_err835769,
19173 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19174 return false;
19176 if (!aarch64_tribools_ok_for_inlining_p (
19177 caller_opts->x_aarch64_fix_a53_err843419,
19178 callee_opts->x_aarch64_fix_a53_err843419,
19179 2, TARGET_FIX_ERR_A53_843419))
19180 return false;
19182 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19183      caller and callee and they don't match up, reject inlining.  */
19184 if (!aarch64_tribools_ok_for_inlining_p (
19185 caller_opts->x_flag_omit_leaf_frame_pointer,
19186 callee_opts->x_flag_omit_leaf_frame_pointer,
19187 2, 1))
19188 return false;
19190 /* If the callee has specific tuning overrides, respect them. */
19191 if (callee_opts->x_aarch64_override_tune_string != NULL
19192 && caller_opts->x_aarch64_override_tune_string == NULL)
19193 return false;
19195 /* If the user specified tuning override strings for the
19196 caller and callee and they don't match up, reject inlining.
19197 We just do a string compare here, we don't analyze the meaning
19198 of the string, as it would be too costly for little gain. */
19199 if (callee_opts->x_aarch64_override_tune_string
19200 && caller_opts->x_aarch64_override_tune_string
19201 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19202 caller_opts->x_aarch64_override_tune_string) != 0))
19203 return false;
19205 return true;
19208 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
19209    hasn't been already.  */
19211 unsigned int
19212 aarch64_tlsdesc_abi_id ()
19214 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19215 if (!tlsdesc_abi.initialized_p ())
19217 HARD_REG_SET full_reg_clobbers;
19218 CLEAR_HARD_REG_SET (full_reg_clobbers);
19219 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19220 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19221 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19222 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19223 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19225 return tlsdesc_abi.id ();
19228 /* Return true if SYMBOL_REF X binds locally. */
19230 static bool
19231 aarch64_symbol_binds_local_p (const_rtx x)
19233 return (SYMBOL_REF_DECL (x)
19234 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19235 : SYMBOL_REF_LOCAL_P (x));
19238 /* Return true if SYMBOL_REF X is thread local */
19239 static bool
19240 aarch64_tls_symbol_p (rtx x)
19242 if (! TARGET_HAVE_TLS)
19243 return false;
19245 x = strip_salt (x);
19246 if (!SYMBOL_REF_P (x))
19247 return false;
19249 return SYMBOL_REF_TLS_MODEL (x) != 0;
19252 /* Classify a TLS symbol into one of the TLS kinds. */
19253 enum aarch64_symbol_type
19254 aarch64_classify_tls_symbol (rtx x)
19256 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19258 switch (tls_kind)
19260 case TLS_MODEL_GLOBAL_DYNAMIC:
19261 case TLS_MODEL_LOCAL_DYNAMIC:
19262 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19264 case TLS_MODEL_INITIAL_EXEC:
19265 switch (aarch64_cmodel)
19267 case AARCH64_CMODEL_TINY:
19268 case AARCH64_CMODEL_TINY_PIC:
19269 return SYMBOL_TINY_TLSIE;
19270 default:
19271 return SYMBOL_SMALL_TLSIE;
19274 case TLS_MODEL_LOCAL_EXEC:
19275 if (aarch64_tls_size == 12)
19276 return SYMBOL_TLSLE12;
19277 else if (aarch64_tls_size == 24)
19278 return SYMBOL_TLSLE24;
19279 else if (aarch64_tls_size == 32)
19280 return SYMBOL_TLSLE32;
19281 else if (aarch64_tls_size == 48)
19282 return SYMBOL_TLSLE48;
19283 else
19284 gcc_unreachable ();
19286 case TLS_MODEL_EMULATED:
19287 case TLS_MODEL_NONE:
19288 return SYMBOL_FORCE_TO_MEM;
19290 default:
19291 gcc_unreachable ();
19295 /* Return the correct method for accessing X + OFFSET, where X is either
19296 a SYMBOL_REF or LABEL_REF. */
19298 enum aarch64_symbol_type
19299 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19301 x = strip_salt (x);
19303 if (LABEL_REF_P (x))
19305 switch (aarch64_cmodel)
19307 case AARCH64_CMODEL_LARGE:
19308 return SYMBOL_FORCE_TO_MEM;
19310 case AARCH64_CMODEL_TINY_PIC:
19311 case AARCH64_CMODEL_TINY:
19312 return SYMBOL_TINY_ABSOLUTE;
19314 case AARCH64_CMODEL_SMALL_SPIC:
19315 case AARCH64_CMODEL_SMALL_PIC:
19316 case AARCH64_CMODEL_SMALL:
19317 return SYMBOL_SMALL_ABSOLUTE;
19319 default:
19320 gcc_unreachable ();
19324 if (SYMBOL_REF_P (x))
19326 if (aarch64_tls_symbol_p (x))
19327 return aarch64_classify_tls_symbol (x);
19329 switch (aarch64_cmodel)
19331 case AARCH64_CMODEL_TINY_PIC:
19332 case AARCH64_CMODEL_TINY:
19333 /* With -fPIC non-local symbols use the GOT. For orthogonality
19334 always use the GOT for extern weak symbols. */
19335 if ((flag_pic || SYMBOL_REF_WEAK (x))
19336 && !aarch64_symbol_binds_local_p (x))
19337 return SYMBOL_TINY_GOT;
19339 /* When we retrieve symbol + offset address, we have to make sure
19340 the offset does not cause overflow of the final address. But
19341 	 we have no way of knowing the address of the symbol at compile time,
19342 	 so we can't accurately say if the distance between the PC and
19343 	 symbol + offset is outside the addressable range of +/-1MB in the
19344 TINY code model. So we limit the maximum offset to +/-64KB and
19345 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19346 If offset_within_block_p is true we allow larger offsets. */
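	/* A concrete example: an offset of 0x20000 (128KB) exceeds the 64KB
	   cap, so unless offset_within_block_p shows the address stays inside
	   the symbol's own block, the constant is forced to memory.  */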
19347 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19348 || offset_within_block_p (x, offset)))
19349 return SYMBOL_FORCE_TO_MEM;
19351 return SYMBOL_TINY_ABSOLUTE;
19354 case AARCH64_CMODEL_SMALL_SPIC:
19355 case AARCH64_CMODEL_SMALL_PIC:
19356 case AARCH64_CMODEL_SMALL:
19357 if ((flag_pic || SYMBOL_REF_WEAK (x))
19358 && !aarch64_symbol_binds_local_p (x))
19359 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19360 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19362 /* Same reasoning as the tiny code model, but the offset cap here is
19363 1MB, allowing +/-3.9GB for the offset to the symbol. */
19364 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19365 || offset_within_block_p (x, offset)))
19366 return SYMBOL_FORCE_TO_MEM;
19368 return SYMBOL_SMALL_ABSOLUTE;
19370 case AARCH64_CMODEL_LARGE:
19371 /* This is alright even in PIC code as the constant
19372 pool reference is always PC relative and within
19373 the same translation unit. */
19374 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19375 return SYMBOL_SMALL_ABSOLUTE;
19376 else
19377 return SYMBOL_FORCE_TO_MEM;
19379 default:
19380 gcc_unreachable ();
19384 /* By default push everything into the constant pool. */
19385 return SYMBOL_FORCE_TO_MEM;
19388 bool
19389 aarch64_constant_address_p (rtx x)
19391 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19394 bool
19395 aarch64_legitimate_pic_operand_p (rtx x)
19397 poly_int64 offset;
19398 x = strip_offset_and_salt (x, &offset);
19399 if (SYMBOL_REF_P (x))
19400 return false;
19402 return true;
19405 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19406 that should be rematerialized rather than spilled. */
19408 static bool
19409 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19411 /* Support CSE and rematerialization of common constants. */
19412 if (CONST_INT_P (x)
19413 || CONST_DOUBLE_P (x))
19414 return true;
19416 /* Only accept variable-length vector constants if they can be
19417 handled directly.
19419 ??? It would be possible (but complex) to handle rematerialization
19420 of other constants via secondary reloads. */
19421 if (!GET_MODE_SIZE (mode).is_constant ())
19422 return aarch64_simd_valid_immediate (x, NULL);
19424 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19425 least be forced to memory and loaded from there. */
19426 if (CONST_VECTOR_P (x))
19427 return !targetm.cannot_force_const_mem (mode, x);
19429 /* Do not allow vector struct mode constants for Advanced SIMD.
19430 We could support 0 and -1 easily, but they need support in
19431 aarch64-simd.md. */
19432 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19433 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19434 return false;
19436 if (GET_CODE (x) == HIGH)
19437 x = XEXP (x, 0);
19439 /* Accept polynomial constants that can be calculated by using the
19440 destination of a move as the sole temporary. Constants that
19441 require a second temporary cannot be rematerialized (they can't be
19442 forced to memory and also aren't legitimate constants). */
19443 poly_int64 offset;
19444 if (poly_int_rtx_p (x, &offset))
19445 return aarch64_offset_temporaries (false, offset) <= 1;
19447 /* If an offset is being added to something else, we need to allow the
19448 base to be moved into the destination register, meaning that there
19449 are no free temporaries for the offset. */
19450 x = strip_offset_and_salt (x, &offset);
19451 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19452 return false;
19454 /* Do not allow const (plus (anchor_symbol, const_int)). */
19455 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19456 return false;
19458 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19459 so spilling them is better than rematerialization. */
19460 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19461 return true;
19463 /* Label references are always constant. */
19464 if (LABEL_REF_P (x))
19465 return true;
19467 return false;
19471 aarch64_load_tp (rtx target)
19473 if (!target
19474 || GET_MODE (target) != Pmode
19475 || !register_operand (target, Pmode))
19476 target = gen_reg_rtx (Pmode);
19478 /* Can return in any reg. */
19479 emit_insn (gen_aarch64_load_tp_hard (target));
19480 return target;
19483 /* On AAPCS systems, this is the "struct __va_list". */
19484 static GTY(()) tree va_list_type;
19486 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19487 Return the type to use as __builtin_va_list.
19489 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19491 struct __va_list
19493 void *__stack;
19494 void *__gr_top;
19495 void *__vr_top;
19496 int __gr_offs;
19497 int __vr_offs;
19498 }; */
19500 static tree
19501 aarch64_build_builtin_va_list (void)
19503 tree va_list_name;
19504 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19506 /* Create the type. */
19507 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19508 /* Give it the required name. */
19509 va_list_name = build_decl (BUILTINS_LOCATION,
19510 TYPE_DECL,
19511 get_identifier ("__va_list"),
19512 va_list_type);
19513 DECL_ARTIFICIAL (va_list_name) = 1;
19514 TYPE_NAME (va_list_type) = va_list_name;
19515 TYPE_STUB_DECL (va_list_type) = va_list_name;
19517 /* Create the fields. */
19518 f_stack = build_decl (BUILTINS_LOCATION,
19519 FIELD_DECL, get_identifier ("__stack"),
19520 ptr_type_node);
19521 f_grtop = build_decl (BUILTINS_LOCATION,
19522 FIELD_DECL, get_identifier ("__gr_top"),
19523 ptr_type_node);
19524 f_vrtop = build_decl (BUILTINS_LOCATION,
19525 FIELD_DECL, get_identifier ("__vr_top"),
19526 ptr_type_node);
19527 f_groff = build_decl (BUILTINS_LOCATION,
19528 FIELD_DECL, get_identifier ("__gr_offs"),
19529 integer_type_node);
19530 f_vroff = build_decl (BUILTINS_LOCATION,
19531 FIELD_DECL, get_identifier ("__vr_offs"),
19532 integer_type_node);
19534   /* Tell the tree-stdarg pass about our internal offset fields.
19535      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19536      purposes, to identify whether the code is updating va_list internal
19537      offset fields in an irregular way.  */
19538 va_list_gpr_counter_field = f_groff;
19539 va_list_fpr_counter_field = f_vroff;
19541 DECL_ARTIFICIAL (f_stack) = 1;
19542 DECL_ARTIFICIAL (f_grtop) = 1;
19543 DECL_ARTIFICIAL (f_vrtop) = 1;
19544 DECL_ARTIFICIAL (f_groff) = 1;
19545 DECL_ARTIFICIAL (f_vroff) = 1;
19547 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19548 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19549 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19550 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19551 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19553 TYPE_FIELDS (va_list_type) = f_stack;
19554 DECL_CHAIN (f_stack) = f_grtop;
19555 DECL_CHAIN (f_grtop) = f_vrtop;
19556 DECL_CHAIN (f_vrtop) = f_groff;
19557 DECL_CHAIN (f_groff) = f_vroff;
19559 /* Compute its layout. */
19560 layout_type (va_list_type);
19562 return va_list_type;
19565 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19566 static void
19567 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19569 const CUMULATIVE_ARGS *cum;
19570 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19571 tree stack, grtop, vrtop, groff, vroff;
19572 tree t;
19573 int gr_save_area_size = cfun->va_list_gpr_size;
19574 int vr_save_area_size = cfun->va_list_fpr_size;
19575 int vr_offset;
19577 cum = &crtl->args.info;
19578 if (cfun->va_list_gpr_size)
19579 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19580 cfun->va_list_gpr_size);
19581 if (cfun->va_list_fpr_size)
19582 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19583 * UNITS_PER_VREG, cfun->va_list_fpr_size);
19585 if (!TARGET_FLOAT)
19587 gcc_assert (cum->aapcs_nvrn == 0);
19588 vr_save_area_size = 0;
19591 f_stack = TYPE_FIELDS (va_list_type_node);
19592 f_grtop = DECL_CHAIN (f_stack);
19593 f_vrtop = DECL_CHAIN (f_grtop);
19594 f_groff = DECL_CHAIN (f_vrtop);
19595 f_vroff = DECL_CHAIN (f_groff);
19597 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19598 NULL_TREE);
19599 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19600 NULL_TREE);
19601 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19602 NULL_TREE);
19603 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19604 NULL_TREE);
19605 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19606 NULL_TREE);
19608 /* Emit code to initialize STACK, which points to the next varargs stack
19609 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19610 by named arguments. STACK is 8-byte aligned. */
19611 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19612 if (cum->aapcs_stack_size > 0)
19613 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19614 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19615 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19617 /* Emit code to initialize GRTOP, the top of the GR save area.
19618 virtual_incoming_args_rtx should have been 16 byte aligned. */
19619 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19620 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19621 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19623 /* Emit code to initialize VRTOP, the top of the VR save area.
19624 This address is gr_save_area_bytes below GRTOP, rounded
19625 down to the next 16-byte boundary. */
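  /* For instance (assuming STACK_BOUNDARY is 128 bits, i.e. 16 bytes): a GR
     save area of 40 bytes gives vr_offset = ROUND_UP (40, 16) = 48, so VRTOP
     sits 48 bytes below GRTOP.  */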
19626 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19627 vr_offset = ROUND_UP (gr_save_area_size,
19628 STACK_BOUNDARY / BITS_PER_UNIT);
19630 if (vr_offset)
19631 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19632 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19633 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19635 /* Emit code to initialize GROFF, the offset from GRTOP of the
19636 next GPR argument. */
19637 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19638 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19639 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19641   /* Likewise emit code to initialize VROFF, the offset from VRTOP
19642      of the next VR argument.  */
19643 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19644 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19645 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19648 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19650 static tree
19651 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19652 gimple_seq *post_p ATTRIBUTE_UNUSED)
19654 tree addr;
19655 bool indirect_p;
19656 bool is_ha; /* is HFA or HVA. */
19657 bool dw_align; /* double-word align. */
19658 machine_mode ag_mode = VOIDmode;
19659 int nregs;
19660 machine_mode mode;
19662 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19663 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19664 HOST_WIDE_INT size, rsize, adjust, align;
19665 tree t, u, cond1, cond2;
19667 indirect_p = pass_va_arg_by_reference (type);
19668 if (indirect_p)
19669 type = build_pointer_type (type);
19671 mode = TYPE_MODE (type);
19673 f_stack = TYPE_FIELDS (va_list_type_node);
19674 f_grtop = DECL_CHAIN (f_stack);
19675 f_vrtop = DECL_CHAIN (f_grtop);
19676 f_groff = DECL_CHAIN (f_vrtop);
19677 f_vroff = DECL_CHAIN (f_groff);
19679 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19680 f_stack, NULL_TREE);
19681 size = int_size_in_bytes (type);
19683 unsigned int abi_break;
19684 align
19685 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
19687 dw_align = false;
19688 adjust = 0;
19689 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19690 &is_ha, false))
19692 /* No frontends can create types with variable-sized modes, so we
19693 shouldn't be asked to pass or return them. */
19694 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19696 /* TYPE passed in fp/simd registers. */
19697 if (!TARGET_FLOAT)
19698 aarch64_err_no_fpadvsimd (mode);
19700 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19701 unshare_expr (valist), f_vrtop, NULL_TREE);
19702 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19703 unshare_expr (valist), f_vroff, NULL_TREE);
19705 rsize = nregs * UNITS_PER_VREG;
19707 if (is_ha)
19709 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19710 adjust = UNITS_PER_VREG - ag_size;
19712 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19713 && size < UNITS_PER_VREG)
19715 adjust = UNITS_PER_VREG - size;
19718 else
19720 /* TYPE passed in general registers. */
19721 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19722 unshare_expr (valist), f_grtop, NULL_TREE);
19723 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19724 unshare_expr (valist), f_groff, NULL_TREE);
19725 rsize = ROUND_UP (size, UNITS_PER_WORD);
19726 nregs = rsize / UNITS_PER_WORD;
19728 if (align > 8)
19730 if (abi_break && warn_psabi)
19731 inform (input_location, "parameter passing for argument of type "
19732 "%qT changed in GCC 9.1", type);
19733 dw_align = true;
19736 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19737 && size < UNITS_PER_WORD)
19739 adjust = UNITS_PER_WORD - size;
19743 /* Get a local temporary for the field value. */
19744 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19746 /* Emit code to branch if off >= 0. */
19747 t = build2 (GE_EXPR, boolean_type_node, off,
19748 build_int_cst (TREE_TYPE (off), 0));
19749 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19751 if (dw_align)
19753 /* Emit: offs = (offs + 15) & -16. */
19754 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19755 build_int_cst (TREE_TYPE (off), 15));
19756 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19757 build_int_cst (TREE_TYPE (off), -16));
19758 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19760 else
19761 roundup = NULL;
19763 /* Update ap.__[g|v]r_offs */
19764 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19765 build_int_cst (TREE_TYPE (off), rsize));
19766 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19768 /* String up. */
19769 if (roundup)
19770 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19772 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19773 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19774 build_int_cst (TREE_TYPE (f_off), 0));
19775 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19777 /* String up: make sure the assignment happens before the use. */
19778 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19779 COND_EXPR_ELSE (cond1) = t;
19781 /* Prepare the trees handling the argument that is passed on the stack;
19782 the top level node will store in ON_STACK. */
19783 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19784 if (align > 8)
19786 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19787 t = fold_build_pointer_plus_hwi (arg, 15);
19788 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19789 build_int_cst (TREE_TYPE (t), -16));
19790 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19792 else
19793 roundup = NULL;
19794 /* Advance ap.__stack */
19795 t = fold_build_pointer_plus_hwi (arg, size + 7);
19796 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19797 build_int_cst (TREE_TYPE (t), -8));
19798 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19799 /* String up roundup and advance. */
19800 if (roundup)
19801 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19802 /* String up with arg */
19803 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19804 /* Big-endianness related address adjustment. */
19805 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19806 && size < UNITS_PER_WORD)
19808 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19809 size_int (UNITS_PER_WORD - size));
19810 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19813 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19814 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19816 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19817 t = off;
19818 if (adjust)
19819 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19820 build_int_cst (TREE_TYPE (off), adjust));
19822 t = fold_convert (sizetype, t);
19823 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19825 if (is_ha)
19827 /* type ha; // treat as "struct {ftype field[n];}"
19828 ... [computing offs]
19829 for (i = 0; i <nregs; ++i, offs += 16)
19830 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19831 return ha; */
19832 int i;
19833 tree tmp_ha, field_t, field_ptr_t;
19835 /* Declare a local variable. */
19836 tmp_ha = create_tmp_var_raw (type, "ha");
19837 gimple_add_tmp_var (tmp_ha);
19839 /* Establish the base type. */
19840 switch (ag_mode)
19842 case E_SFmode:
19843 field_t = float_type_node;
19844 field_ptr_t = float_ptr_type_node;
19845 break;
19846 case E_DFmode:
19847 field_t = double_type_node;
19848 field_ptr_t = double_ptr_type_node;
19849 break;
19850 case E_TFmode:
19851 field_t = long_double_type_node;
19852 field_ptr_t = long_double_ptr_type_node;
19853 break;
19854 case E_SDmode:
19855 field_t = dfloat32_type_node;
19856 field_ptr_t = build_pointer_type (dfloat32_type_node);
19857 break;
19858 case E_DDmode:
19859 field_t = dfloat64_type_node;
19860 field_ptr_t = build_pointer_type (dfloat64_type_node);
19861 break;
19862 case E_TDmode:
19863 field_t = dfloat128_type_node;
19864 field_ptr_t = build_pointer_type (dfloat128_type_node);
19865 break;
19866 case E_HFmode:
19867 field_t = aarch64_fp16_type_node;
19868 field_ptr_t = aarch64_fp16_ptr_type_node;
19869 break;
19870 case E_BFmode:
19871 field_t = aarch64_bf16_type_node;
19872 field_ptr_t = aarch64_bf16_ptr_type_node;
19873 break;
19874 case E_V2SImode:
19875 case E_V4SImode:
19877 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
19878 field_t = build_vector_type_for_mode (innertype, ag_mode);
19879 field_ptr_t = build_pointer_type (field_t);
19881 break;
19882 default:
19883 gcc_assert (0);
19886 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
19887 TREE_ADDRESSABLE (tmp_ha) = 1;
19888 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
19889 addr = t;
19890 t = fold_convert (field_ptr_t, addr);
19891 t = build2 (MODIFY_EXPR, field_t,
19892 build1 (INDIRECT_REF, field_t, tmp_ha),
19893 build1 (INDIRECT_REF, field_t, t));
19895 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
19896 for (i = 1; i < nregs; ++i)
19898 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
19899 u = fold_convert (field_ptr_t, addr);
19900 u = build2 (MODIFY_EXPR, field_t,
19901 build2 (MEM_REF, field_t, tmp_ha,
19902 build_int_cst (field_ptr_t,
19903 (i *
19904 int_size_in_bytes (field_t)))),
19905 build1 (INDIRECT_REF, field_t, u));
19906 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
19909 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
19910 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
19913 COND_EXPR_ELSE (cond2) = t;
19914 addr = fold_convert (build_pointer_type (type), cond1);
19915 addr = build_va_arg_indirect_ref (addr);
19917 if (indirect_p)
19918 addr = build_va_arg_indirect_ref (addr);
19920 return addr;
19923 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
19925 static void
19926 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
19927 const function_arg_info &arg,
19928 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
19930 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
19931 CUMULATIVE_ARGS local_cum;
19932 int gr_saved = cfun->va_list_gpr_size;
19933 int vr_saved = cfun->va_list_fpr_size;
19935 /* The caller has advanced CUM up to, but not beyond, the last named
19936 argument. Advance a local copy of CUM past the last "real" named
19937 argument, to find out how many registers are left over. */
19938 local_cum = *cum;
19939 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
19940 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
19942 /* Find out how many registers we need to save.
19943 Honor tree-stdarg analysis results. */
19944 if (cfun->va_list_gpr_size)
19945 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
19946 cfun->va_list_gpr_size / UNITS_PER_WORD);
19947 if (cfun->va_list_fpr_size)
19948 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
19949 cfun->va_list_fpr_size / UNITS_PER_VREG);
19951 if (!TARGET_FLOAT)
19953 gcc_assert (local_cum.aapcs_nvrn == 0);
19954 vr_saved = 0;
19957 if (!no_rtl)
19959 if (gr_saved > 0)
19961 rtx ptr, mem;
19963 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
19964 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
19965 - gr_saved * UNITS_PER_WORD);
19966 mem = gen_frame_mem (BLKmode, ptr);
19967 set_mem_alias_set (mem, get_varargs_alias_set ());
19969 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
19970 mem, gr_saved);
19972 if (vr_saved > 0)
19974 /* We can't use move_block_from_reg, because it will use
19975 the wrong mode, storing D regs only. */
19976 machine_mode mode = TImode;
19977 int off, i, vr_start;
19979 /* Set OFF to the offset from virtual_incoming_args_rtx of
19980 the first vector register. The VR save area lies below
19981 the GR one, and is aligned to 16 bytes. */
19982 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
19983 STACK_BOUNDARY / BITS_PER_UNIT);
19984 off -= vr_saved * UNITS_PER_VREG;
19986 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
19987 for (i = 0; i < vr_saved; ++i)
19989 rtx ptr, mem;
19991 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
19992 mem = gen_frame_mem (mode, ptr);
19993 set_mem_alias_set (mem, get_varargs_alias_set ());
19994 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
19995 off += UNITS_PER_VREG;
20000 /* We don't save the size into *PRETEND_SIZE because we want to avoid
20001 any complication of having crtl->args.pretend_args_size changed. */
20002 cfun->machine->frame.saved_varargs_size
20003 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
20004 STACK_BOUNDARY / BITS_PER_UNIT)
20005 + vr_saved * UNITS_PER_VREG);
20008 static void
20009 aarch64_conditional_register_usage (void)
20011 int i;
20012 if (!TARGET_FLOAT)
20014 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
20016 fixed_regs[i] = 1;
20017 call_used_regs[i] = 1;
20018 CLEAR_HARD_REG_BIT (operand_reg_set, i);
20021 if (!TARGET_SVE)
20022 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
20024 fixed_regs[i] = 1;
20025 call_used_regs[i] = 1;
20028 /* Only allow the FFR and FFRT to be accessed via special patterns. */
20029 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
20030 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
20032 /* When tracking speculation, we need a couple of call-clobbered registers
20033 to track the speculation state. It would be nice to just use
20034 IP0 and IP1, but currently there are numerous places that just
20035 assume these registers are free for other uses (eg pointer
20036 authentication). */
20037 if (aarch64_track_speculation)
20039 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
20040 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
20041 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20042 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20046 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
20048 bool
20049 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
20051 /* For records we're passed a FIELD_DECL, for arrays we're passed
20052 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
20053 const_tree type = TREE_TYPE (field_or_array);
20055 /* Assign BLKmode to anything that contains multiple SVE predicates.
20056 For structures, the "multiple" case is indicated by MODE being
20057 VOIDmode. */
20058 unsigned int num_zr, num_pr;
20059 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
20061 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
20062 return !simple_cst_equal (TYPE_SIZE (field_or_array),
20063 TYPE_SIZE (type));
20064 return mode == VOIDmode;
20067 return default_member_type_forces_blk (field_or_array, mode);
20070 /* Bitmasks that indicate whether earlier versions of GCC would have
20071 taken a different path through the ABI logic. This should result in
20072 a -Wpsabi warning if the earlier path led to a different ABI decision.
20074 WARN_PSABI_EMPTY_CXX17_BASE
20075 Indicates that the type includes an artificial empty C++17 base field
20076 that, prior to GCC 10.1, would prevent the type from being treated as
20077 an HFA or HVA. See PR94383 for details.
20079 WARN_PSABI_NO_UNIQUE_ADDRESS
20080 Indicates that the type includes an empty [[no_unique_address]] field
20081 that, prior to GCC 10.1, would prevent the type from being treated as
20082 an HFA or HVA. */
20083 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
20084 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
20085 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
20087 /* Walk down the type tree of TYPE counting consecutive base elements.
20088 If *MODEP is VOIDmode, then set it to the first valid floating point
20089 type. If a non-floating point type is found, or if a floating point
20090 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
20091 otherwise return the count in the sub-tree.
20093 The WARN_PSABI_FLAGS argument allows the caller to check whether this
20094 function has changed its behavior relative to earlier versions of GCC.
20095 Normally the argument should be nonnull and point to a zero-initialized
20096 variable. The function then records whether the ABI decision might
20097 be affected by a known fix to the ABI logic, setting the associated
20098 WARN_PSABI_* bits if so.
20100 When the argument is instead a null pointer, the function tries to
20101 simulate the behavior of GCC before all such ABI fixes were made.
20102 This is useful to check whether the function returns something
20103 different after the ABI fixes. */
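/* For example (illustrative, derived from the code below): for
   struct { float x, y, z; } this returns 3 with *MODEP set to SFmode,
   whereas struct { float f; double d; } returns -1 because the element
   modes do not match.  */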
20104 static int
20105 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
20106 unsigned int *warn_psabi_flags)
20108 machine_mode mode;
20109 HOST_WIDE_INT size;
20111 if (aarch64_sve::builtin_type_p (type))
20112 return -1;
20114 switch (TREE_CODE (type))
20116 case REAL_TYPE:
20117 mode = TYPE_MODE (type);
20118 if (mode != DFmode && mode != SFmode
20119 && mode != TFmode && mode != HFmode
20120 && mode != SDmode && mode != DDmode && mode != TDmode)
20121 return -1;
20123 if (*modep == VOIDmode)
20124 *modep = mode;
20126 if (*modep == mode)
20127 return 1;
20129 break;
20131 case COMPLEX_TYPE:
20132 mode = TYPE_MODE (TREE_TYPE (type));
20133 if (mode != DFmode && mode != SFmode
20134 && mode != TFmode && mode != HFmode)
20135 return -1;
20137 if (*modep == VOIDmode)
20138 *modep = mode;
20140 if (*modep == mode)
20141 return 2;
20143 break;
20145 case VECTOR_TYPE:
20146 /* Use V2SImode and V4SImode as representatives of all 64-bit
20147 and 128-bit vector types. */
20148 size = int_size_in_bytes (type);
20149 switch (size)
20151 case 8:
20152 mode = V2SImode;
20153 break;
20154 case 16:
20155 mode = V4SImode;
20156 break;
20157 default:
20158 return -1;
20161 if (*modep == VOIDmode)
20162 *modep = mode;
20164 /* Vector modes are considered to be opaque: two vectors are
20165 equivalent for the purposes of being homogeneous aggregates
20166 if they are the same size. */
20167 if (*modep == mode)
20168 return 1;
20170 break;
20172 case ARRAY_TYPE:
20174 int count;
20175 tree index = TYPE_DOMAIN (type);
20177 /* Can't handle incomplete types nor sizes that are not
20178 fixed. */
20179 if (!COMPLETE_TYPE_P (type)
20180 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20181 return -1;
20183 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20184 warn_psabi_flags);
20185 if (count == -1
20186 || !index
20187 || !TYPE_MAX_VALUE (index)
20188 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20189 || !TYPE_MIN_VALUE (index)
20190 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20191 || count < 0)
20192 return -1;
20194 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20195 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20197 /* There must be no padding. */
20198 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20199 count * GET_MODE_BITSIZE (*modep)))
20200 return -1;
20202 return count;
20205 case RECORD_TYPE:
20207 int count = 0;
20208 int sub_count;
20209 tree field;
20211 /* Can't handle incomplete types nor sizes that are not
20212 fixed. */
20213 if (!COMPLETE_TYPE_P (type)
20214 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20215 return -1;
20217 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20219 if (TREE_CODE (field) != FIELD_DECL)
20220 continue;
20222 if (DECL_FIELD_ABI_IGNORED (field))
20224 /* See whether this is something that earlier versions of
20225 GCC failed to ignore. */
20226 unsigned int flag;
20227 if (lookup_attribute ("no_unique_address",
20228 DECL_ATTRIBUTES (field)))
20229 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20230 else if (cxx17_empty_base_field_p (field))
20231 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20232 else
20233 /* No compatibility problem. */
20234 continue;
20236 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20237 if (warn_psabi_flags)
20239 *warn_psabi_flags |= flag;
20240 continue;
20243 /* A zero-width bitfield may affect layout in some
20244 circumstances, but adds no members. The determination
20245 of whether or not a type is an HFA is performed after
20246 layout is complete, so if the type still looks like an
20247 HFA afterwards, it is still classed as one. This is
20248 potentially an ABI break for the hard-float ABI. */
20249 else if (DECL_BIT_FIELD (field)
20250 && integer_zerop (DECL_SIZE (field)))
20252 /* Prior to GCC-12 these fields were stripped early,
20253 hiding them from the back-end entirely and
20254 resulting in the correct behaviour for argument
20255 passing. Simulate that old behaviour without
20256 generating a warning. */
20257 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20258 continue;
20259 if (warn_psabi_flags)
20261 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20262 continue;
20266 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20267 warn_psabi_flags);
20268 if (sub_count < 0)
20269 return -1;
20270 count += sub_count;
20273 /* There must be no padding. */
20274 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20275 count * GET_MODE_BITSIZE (*modep)))
20276 return -1;
20278 return count;
20281 case UNION_TYPE:
20282 case QUAL_UNION_TYPE:
20284 /* These aren't very interesting except in a degenerate case. */
20285 int count = 0;
20286 int sub_count;
20287 tree field;
20289 /* Can't handle incomplete types nor sizes that are not
20290 fixed. */
20291 if (!COMPLETE_TYPE_P (type)
20292 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20293 return -1;
20295 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20297 if (TREE_CODE (field) != FIELD_DECL)
20298 continue;
20300 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20301 warn_psabi_flags);
20302 if (sub_count < 0)
20303 return -1;
20304 count = count > sub_count ? count : sub_count;
20307 /* There must be no padding. */
20308 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20309 count * GET_MODE_BITSIZE (*modep)))
20310 return -1;
20312 return count;
20315 default:
20316 break;
20319 return -1;
20322 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20323 type as described in AAPCS64 \S 4.1.2.
20325 See the comment above aarch64_composite_type_p for the notes on MODE. */
20327 static bool
20328 aarch64_short_vector_p (const_tree type,
20329 machine_mode mode)
20331 poly_int64 size = -1;
20333 if (type && TREE_CODE (type) == VECTOR_TYPE)
20335 if (aarch64_sve::builtin_type_p (type))
20336 return false;
20337 size = int_size_in_bytes (type);
20339 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20340 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20342 /* The containing "else if" is too loose: it means that we look at TYPE
20343 if the type is a vector type (good), but that we otherwise ignore TYPE
20344 and look only at the mode. This is wrong because the type describes
20345 the language-level information whereas the mode is purely an internal
20346 GCC concept. We can therefore reach here for types that are not
20347 vectors in the AAPCS64 sense.
20349 We can't "fix" that for the traditional Advanced SIMD vector modes
20350 without breaking backwards compatibility. However, there's no such
20351 baggage for the structure modes, which were introduced in GCC 12. */
20352 if (aarch64_advsimd_struct_mode_p (mode))
20353 return false;
20355 /* For similar reasons, rely only on the type, not the mode, when
20356 processing SVE types. */
20357 if (type && aarch64_some_values_include_pst_objects_p (type))
20358 /* Leave later code to report an error if SVE is disabled. */
20359 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20360 else
20361 size = GET_MODE_SIZE (mode);
20363 if (known_eq (size, 8) || known_eq (size, 16))
20365 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20366 they are being treated as scalable AAPCS64 types. */
20367 gcc_assert (!aarch64_sve_mode_p (mode)
20368 && !aarch64_advsimd_struct_mode_p (mode));
20369 return true;
20371 return false;
20374 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20375 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20376 array types. The C99 floating-point complex types are also considered
20377 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20378 types, which are GCC extensions and out of the scope of AAPCS64, are
20379 treated as composite types here as well.
20381 Note that MODE itself is not sufficient in determining whether a type
20382 is such a composite type or not. This is because
20383 stor-layout.cc:compute_record_mode may have already changed the MODE
20384 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20385 structure with only one field may have its MODE set to the mode of the
20386 field. Also an integer mode whose size matches the size of the
20387 RECORD_TYPE type may be used to substitute the original mode
20388 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20389 solely relied on. */
20391 static bool
20392 aarch64_composite_type_p (const_tree type,
20393 machine_mode mode)
20395 if (aarch64_short_vector_p (type, mode))
20396 return false;
20398 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20399 return true;
20401 if (mode == BLKmode
20402 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20403 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20404 return true;
20406 return false;
20409 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20410 shall be passed or returned in simd/fp register(s) (providing these
20411 parameter passing registers are available).
20413 Upon successful return, *COUNT returns the number of needed registers,
20414 *BASE_MODE returns the mode of the individual register and when IS_HA
20415 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20416 floating-point aggregate or a homogeneous short-vector aggregate.
20418 SILENT_P is true if the function should refrain from reporting any
20419 diagnostics. This should only be used if the caller is certain that
20420 any ABI decisions would eventually come through this function with
20421 SILENT_P set to false. */
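/* Illustrative examples: a structure of four floats is accepted with
   *BASE_MODE == SFmode, *COUNT == 4 and *IS_HA set, while a
   double _Complex argument is accepted with *BASE_MODE == DFmode
   and *COUNT == 2.  */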
20423 static bool
20424 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20425 const_tree type,
20426 machine_mode *base_mode,
20427 int *count,
20428 bool *is_ha,
20429 bool silent_p)
20431 if (is_ha != NULL) *is_ha = false;
20433 machine_mode new_mode = VOIDmode;
20434 bool composite_p = aarch64_composite_type_p (type, mode);
20436 if ((!composite_p
20437 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20438 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
20439 || aarch64_short_vector_p (type, mode))
20441 *count = 1;
20442 new_mode = mode;
20444 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20446 if (is_ha != NULL) *is_ha = true;
20447 *count = 2;
20448 new_mode = GET_MODE_INNER (mode);
20450 else if (type && composite_p)
20452 unsigned int warn_psabi_flags = 0;
20453 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20454 &warn_psabi_flags);
20455 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20457 static unsigned last_reported_type_uid;
20458 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20459 int alt;
20460 if (!silent_p
20461 && warn_psabi
20462 && warn_psabi_flags
20463 && uid != last_reported_type_uid
20464 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20465 != ag_count))
20467 const char *url10
20468 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20469 const char *url12
20470 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20471 gcc_assert (alt == -1);
20472 last_reported_type_uid = uid;
20473 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20474 qualification. */
20475 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20476 inform (input_location, "parameter passing for argument of "
20477 "type %qT with %<[[no_unique_address]]%> members "
20478 "changed %{in GCC 10.1%}",
20479 TYPE_MAIN_VARIANT (type), url10);
20480 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20481 inform (input_location, "parameter passing for argument of "
20482 "type %qT when C++17 is enabled changed to match "
20483 "C++14 %{in GCC 10.1%}",
20484 TYPE_MAIN_VARIANT (type), url10);
20485 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20486 inform (input_location, "parameter passing for argument of "
20487 "type %qT changed %{in GCC 12.1%}",
20488 TYPE_MAIN_VARIANT (type), url12);
20491 if (is_ha != NULL) *is_ha = true;
20492 *count = ag_count;
20494 else
20495 return false;
20497 else
20498 return false;
20500 gcc_assert (!aarch64_sve_mode_p (new_mode));
20501 *base_mode = new_mode;
20502 return true;
20505 /* Implement TARGET_STRUCT_VALUE_RTX. */
20507 static rtx
20508 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20509 int incoming ATTRIBUTE_UNUSED)
20511 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20514 /* Implements target hook vector_mode_supported_p. */
20515 static bool
20516 aarch64_vector_mode_supported_p (machine_mode mode)
20518 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20519 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20522 /* Return the full-width SVE vector mode for element mode MODE, if one
20523 exists. */
20524 opt_machine_mode
20525 aarch64_full_sve_mode (scalar_mode mode)
20527 switch (mode)
20529 case E_DFmode:
20530 return VNx2DFmode;
20531 case E_SFmode:
20532 return VNx4SFmode;
20533 case E_HFmode:
20534 return VNx8HFmode;
20535 case E_BFmode:
20536 return VNx8BFmode;
20537 case E_DImode:
20538 return VNx2DImode;
20539 case E_SImode:
20540 return VNx4SImode;
20541 case E_HImode:
20542 return VNx8HImode;
20543 case E_QImode:
20544 return VNx16QImode;
20545 default:
20546 return opt_machine_mode ();
20550 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20551 if it exists. */
20552 opt_machine_mode
20553 aarch64_vq_mode (scalar_mode mode)
20555 switch (mode)
20557 case E_DFmode:
20558 return V2DFmode;
20559 case E_SFmode:
20560 return V4SFmode;
20561 case E_HFmode:
20562 return V8HFmode;
20563 case E_BFmode:
20564 return V8BFmode;
20565 case E_SImode:
20566 return V4SImode;
20567 case E_HImode:
20568 return V8HImode;
20569 case E_QImode:
20570 return V16QImode;
20571 case E_DImode:
20572 return V2DImode;
20573 default:
20574 return opt_machine_mode ();
20578 /* Return appropriate SIMD container
20579 for MODE within a vector of WIDTH bits. */
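/* For example, SImode elements give V2SImode for a 64-bit container and
   V4SImode for a 128-bit one, while the SVE path maps SImode to the
   corresponding VNx4SImode (illustrative).  */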
20580 static machine_mode
20581 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20583 if (TARGET_SVE
20584 && maybe_ne (width, 128)
20585 && known_eq (width, BITS_PER_SVE_VECTOR))
20586 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20588 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20589 if (TARGET_SIMD)
20591 if (known_eq (width, 128))
20592 return aarch64_vq_mode (mode).else_mode (word_mode);
20593 else
20594 switch (mode)
20596 case E_SFmode:
20597 return V2SFmode;
20598 case E_HFmode:
20599 return V4HFmode;
20600 case E_BFmode:
20601 return V4BFmode;
20602 case E_SImode:
20603 return V2SImode;
20604 case E_HImode:
20605 return V4HImode;
20606 case E_QImode:
20607 return V8QImode;
20608 default:
20609 break;
20612 return word_mode;
20615 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20616 and return whether the SVE mode should be preferred over the
20617 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20618 static bool
20619 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20621 /* Take into account the aarch64-autovec-preference param if non-zero. */
20622 bool only_asimd_p = aarch64_autovec_preference == 1;
20623 bool only_sve_p = aarch64_autovec_preference == 2;
20625 if (only_asimd_p)
20626 return false;
20627 if (only_sve_p)
20628 return true;
20630 /* The preference in case of a tie in costs. */
20631 bool prefer_asimd = aarch64_autovec_preference == 3;
20632 bool prefer_sve = aarch64_autovec_preference == 4;
20634 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20635 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20636 /* If the CPU information does not have an SVE width registered, use the
20637 generic poly_int comparison that prefers SVE. If a preference is
20638 explicitly requested avoid this path. */
20639 if (aarch64_tune_params.sve_width == SVE_SCALABLE
20640 && !prefer_asimd
20641 && !prefer_sve)
20642 return maybe_gt (nunits_sve, nunits_asimd);
20644 /* Otherwise estimate the runtime width of the modes involved. */
20645 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20646 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20648 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20649 is clearly wider. */
20650 if (prefer_sve)
20651 return est_sve >= est_asimd;
20652 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20653 is clearly wider. */
20654 if (prefer_asimd)
20655 return est_sve > est_asimd;
20657 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20658 return est_sve > est_asimd;
20661 /* Return 128-bit container as the preferred SIMD mode for MODE. */
20662 static machine_mode
20663 aarch64_preferred_simd_mode (scalar_mode mode)
20665 /* Take into account explicit auto-vectorization ISA preferences through
20666 aarch64_cmp_autovec_modes. */
20667 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20668 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20669 if (TARGET_SIMD)
20670 return aarch64_vq_mode (mode).else_mode (word_mode);
20671 return word_mode;
20674 /* Return a list of possible vector sizes for the vectorizer
20675 to iterate over. */
20676 static unsigned int
20677 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20679 static const machine_mode sve_modes[] = {
20680 /* Try using full vectors for all element types. */
20681 VNx16QImode,
20683 /* Try using 16-bit containers for 8-bit elements and full vectors
20684 for wider elements. */
20685 VNx8QImode,
20687 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20688 full vectors for wider elements. */
20689 VNx4QImode,
20691 /* Try using 64-bit containers for all element types. */
20692 VNx2QImode
20695 static const machine_mode advsimd_modes[] = {
20696 /* Try using 128-bit vectors for all element types. */
20697 V16QImode,
20699 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20700 for wider elements. */
20701 V8QImode,
20703 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20704 for wider elements.
20706 TODO: We could support a limited form of V4QImode too, so that
20707 we use 32-bit vectors for 8-bit elements. */
20708 V4HImode,
20710 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20711 for 64-bit elements.
20713 TODO: We could similarly support limited forms of V2QImode and V2HImode
20714 for this case. */
20715 V2SImode
20718 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20719 This is because:
20721 - If we can't use N-byte Advanced SIMD vectors then the placement
20722 doesn't matter; we'll just continue as though the Advanced SIMD
20723 entry didn't exist.
20725 - If an SVE main loop with N bytes ends up being cheaper than an
20726 Advanced SIMD main loop with N bytes then by default we'll replace
20727 the Advanced SIMD version with the SVE one.
20729 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20730 than an SVE main loop with N bytes then by default we'll try to
20731 use the SVE loop to vectorize the epilogue instead. */
20733 bool only_asimd_p = aarch64_autovec_preference == 1;
20734 bool only_sve_p = aarch64_autovec_preference == 2;
20736 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20737 unsigned int advsimd_i = 0;
20739 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20741 if (sve_i < ARRAY_SIZE (sve_modes)
20742 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20743 advsimd_modes[advsimd_i]))
20744 modes->safe_push (sve_modes[sve_i++]);
20745 else
20746 modes->safe_push (advsimd_modes[advsimd_i++]);
20748 while (sve_i < ARRAY_SIZE (sve_modes))
20749 modes->safe_push (sve_modes[sve_i++]);
20751 unsigned int flags = 0;
20752 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20753 can compare SVE against Advanced SIMD and so that we can compare
20754 multiple SVE vectorization approaches against each other. There's
20755 not really any point doing this for Advanced SIMD only, since the
20756 first mode that works should always be the best. */
20757 if (TARGET_SVE && aarch64_sve_compare_costs)
20758 flags |= VECT_COMPARE_COSTS;
20759 return flags;
20762 /* Implement TARGET_MANGLE_TYPE. */
20764 static const char *
20765 aarch64_mangle_type (const_tree type)
20767 /* The AArch64 ABI documents say that "__va_list" has to be
20768 mangled as if it is in the "std" namespace. */
20769 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20770 return "St9__va_list";
20772 /* Half-precision floating point types. */
20773 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20775 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
20776 return NULL;
20777 if (TYPE_MODE (type) == BFmode)
20778 return "u6__bf16";
20779 else
20780 return "Dh";
20783 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20784 builtin types. */
20785 if (TYPE_NAME (type) != NULL)
20787 const char *res;
20788 if ((res = aarch64_general_mangle_builtin_type (type))
20789 || (res = aarch64_sve::mangle_builtin_type (type)))
20790 return res;
20793 /* Use the default mangling. */
20794 return NULL;
20797 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20799 static bool
20800 aarch64_verify_type_context (location_t loc, type_context_kind context,
20801 const_tree type, bool silent_p)
20803 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20806 /* Find the first rtx_insn before insn that will generate an assembly
20807 instruction. */
20809 static rtx_insn *
20810 aarch64_prev_real_insn (rtx_insn *insn)
20812 if (!insn)
20813 return NULL;
20815 do
20817 insn = prev_real_insn (insn);
20819 while (insn && recog_memoized (insn) < 0);
20821 return insn;
20824 static bool
20825 is_madd_op (enum attr_type t1)
20827 unsigned int i;
20828 /* A number of these may be AArch32 only. */
20829 enum attr_type mlatypes[] = {
20830 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20831 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20832 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20835 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
20837 if (t1 == mlatypes[i])
20838 return true;
20841 return false;
20844 /* Check if there is a register dependency between a load and the insn
20845 for which we hold recog_data. */
20847 static bool
20848 dep_between_memop_and_curr (rtx memop)
20850 rtx load_reg;
20851 int opno;
20853 gcc_assert (GET_CODE (memop) == SET);
20855 if (!REG_P (SET_DEST (memop)))
20856 return false;
20858 load_reg = SET_DEST (memop);
20859 for (opno = 1; opno < recog_data.n_operands; opno++)
20861 rtx operand = recog_data.operand[opno];
20862 if (REG_P (operand)
20863 && reg_overlap_mentioned_p (load_reg, operand))
20864 return true;
20867 return false;
20871 /* When working around the Cortex-A53 erratum 835769,
20872 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
20873 instruction and has a preceding memory instruction such that a NOP
20874 should be inserted between them. */
20876 bool
20877 aarch64_madd_needs_nop (rtx_insn* insn)
20879 enum attr_type attr_type;
20880 rtx_insn *prev;
20881 rtx body;
20883 if (!TARGET_FIX_ERR_A53_835769)
20884 return false;
20886 if (!INSN_P (insn) || recog_memoized (insn) < 0)
20887 return false;
20889 attr_type = get_attr_type (insn);
20890 if (!is_madd_op (attr_type))
20891 return false;
20893 prev = aarch64_prev_real_insn (insn);
20894 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
20895 Restore recog state to INSN to avoid state corruption. */
20896 extract_constrain_insn_cached (insn);
20898 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
20899 return false;
20901 body = single_set (prev);
20903 /* If the previous insn is a memory op and there is no dependency between
20904 it and the DImode madd, emit a NOP between them. If body is NULL then we
20905 have a complex memory operation, probably a load/store pair.
20906 Be conservative for now and emit a NOP. */
20907 if (GET_MODE (recog_data.operand[0]) == DImode
20908 && (!body || !dep_between_memop_and_curr (body)))
20909 return true;
20911 return false;
20916 /* Implement FINAL_PRESCAN_INSN. */
20918 void
20919 aarch64_final_prescan_insn (rtx_insn *insn)
20921 if (aarch64_madd_needs_nop (insn))
20922 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
20926 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
20927 instruction. */
20929 bool
20930 aarch64_sve_index_immediate_p (rtx base_or_step)
20932 return (CONST_INT_P (base_or_step)
20933 && IN_RANGE (INTVAL (base_or_step), -16, 15));
20936 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
20937 when applied to mode MODE. Negate X first if NEGATE_P is true. */
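/* Illustrative values for 16-bit or wider elements: 5 and 0xff are
   accepted directly, 0x300 is accepted as 3 << 8, but 0x101 is rejected
   because it would need both the low byte and the shifted byte at once.  */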
20939 bool
20940 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
20942 rtx elt = unwrap_const_vec_duplicate (x);
20943 if (!CONST_INT_P (elt))
20944 return false;
20946 HOST_WIDE_INT val = INTVAL (elt);
20947 if (negate_p)
20948 val = -val;
20949 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
20951 if (val & 0xff)
20952 return IN_RANGE (val, 0, 0xff);
20953 return IN_RANGE (val, 0, 0xff00);
20956 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
20957 instructions when applied to mode MODE. Negate X first if NEGATE_P
20958 is true. */
20960 bool
20961 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
20963 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
20964 return false;
20966 /* After the optional negation, the immediate must be nonnegative.
20967 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
20968 instead of SQADD Zn.B, Zn.B, #129. */
20969 rtx elt = unwrap_const_vec_duplicate (x);
20970 return negate_p == (INTVAL (elt) < 0);
20973 /* Return true if X is a valid immediate operand for an SVE logical
20974 instruction such as AND. */
20976 bool
20977 aarch64_sve_bitmask_immediate_p (rtx x)
20979 rtx elt;
20981 return (const_vec_duplicate_p (x, &elt)
20982 && CONST_INT_P (elt)
20983 && aarch64_bitmask_imm (INTVAL (elt),
20984 GET_MODE_INNER (GET_MODE (x))));
20987 /* Return true if X is a valid immediate for the SVE DUP and CPY
20988 instructions. */
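/* Illustrative values: -0x80 and 0x7f use the unshifted form, 0x7f00
   uses the LSL #8 form, and 0x180 is rejected because it has nonzero
   bits in both bytes.  */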
20990 bool
20991 aarch64_sve_dup_immediate_p (rtx x)
20993 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
20994 if (!CONST_INT_P (x))
20995 return false;
20997 HOST_WIDE_INT val = INTVAL (x);
20998 if (val & 0xff)
20999 return IN_RANGE (val, -0x80, 0x7f);
21000 return IN_RANGE (val, -0x8000, 0x7f00);
21003 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
21004 SIGNED_P says whether the operand is signed rather than unsigned. */
21006 bool
21007 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
21009 x = unwrap_const_vec_duplicate (x);
21010 return (CONST_INT_P (x)
21011 && (signed_p
21012 ? IN_RANGE (INTVAL (x), -16, 15)
21013 : IN_RANGE (INTVAL (x), 0, 127)));
21016 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21017 instruction. Negate X first if NEGATE_P is true. */
21019 bool
21020 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
21022 rtx elt;
21023 REAL_VALUE_TYPE r;
21025 if (!const_vec_duplicate_p (x, &elt)
21026 || !CONST_DOUBLE_P (elt))
21027 return false;
21029 r = *CONST_DOUBLE_REAL_VALUE (elt);
21031 if (negate_p)
21032 r = real_value_negate (&r);
21034 if (real_equal (&r, &dconst1))
21035 return true;
21036 if (real_equal (&r, &dconsthalf))
21037 return true;
21038 return false;
21041 /* Return true if X is a valid immediate operand for an SVE FMUL
21042 instruction. */
21044 bool
21045 aarch64_sve_float_mul_immediate_p (rtx x)
21047 rtx elt;
21049 return (const_vec_duplicate_p (x, &elt)
21050 && CONST_DOUBLE_P (elt)
21051 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
21052 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
21055 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
21056 for the Advanced SIMD operation described by WHICH and INSN. If INFO
21057 is nonnull, use it to describe valid immediates. */
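/* For example, replicating 0x00ab0000 matches the 4-byte form as 0xab
   with LSL #16, and replicating 0xab00ab00 matches the 2-byte form as
   0xab with LSL #8 (illustrative values).  */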
21058 static bool
21059 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
21060 simd_immediate_info *info,
21061 enum simd_immediate_check which,
21062 simd_immediate_info::insn_type insn)
21064 /* Try a 4-byte immediate with LSL. */
21065 for (unsigned int shift = 0; shift < 32; shift += 8)
21066 if ((val32 & (0xff << shift)) == val32)
21068 if (info)
21069 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21070 simd_immediate_info::LSL, shift);
21071 return true;
21074 /* Try a 2-byte immediate with LSL. */
21075 unsigned int imm16 = val32 & 0xffff;
21076 if (imm16 == (val32 >> 16))
21077 for (unsigned int shift = 0; shift < 16; shift += 8)
21078 if ((imm16 & (0xff << shift)) == imm16)
21080 if (info)
21081 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
21082 simd_immediate_info::LSL, shift);
21083 return true;
21086 /* Try a 4-byte immediate with MSL, except for cases that MVN
21087 can handle. */
21088 if (which == AARCH64_CHECK_MOV)
21089 for (unsigned int shift = 8; shift < 24; shift += 8)
21091 unsigned int low = (1 << shift) - 1;
21092 if (((val32 & (0xff << shift)) | low) == val32)
21094 if (info)
21095 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21096 simd_immediate_info::MSL, shift);
21097 return true;
21101 return false;
21104 /* Return true if replicating VAL64 is a valid immediate for the
21105 Advanced SIMD operation described by WHICH. If INFO is nonnull,
21106 use it to describe valid immediates. */
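/* Illustrative values for the AARCH64_CHECK_MOV case: a VAL64 with every
   byte equal to 0x2a is accepted as a replicated QImode byte, and
   0x00ffff00ff0000ff (each byte either 0x00 or 0xff) is accepted via the
   bit-to-bytemask test below.  */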
21107 static bool
21108 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
21109 simd_immediate_info *info,
21110 enum simd_immediate_check which)
21112 unsigned int val32 = val64 & 0xffffffff;
21113 unsigned int val16 = val64 & 0xffff;
21114 unsigned int val8 = val64 & 0xff;
21116 if (val32 == (val64 >> 32))
21118 if ((which & AARCH64_CHECK_ORR) != 0
21119 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
21120 simd_immediate_info::MOV))
21121 return true;
21123 if ((which & AARCH64_CHECK_BIC) != 0
21124 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
21125 simd_immediate_info::MVN))
21126 return true;
21128 /* Try using a replicated byte. */
21129 if (which == AARCH64_CHECK_MOV
21130 && val16 == (val32 >> 16)
21131 && val8 == (val16 >> 8))
21133 if (info)
21134 *info = simd_immediate_info (QImode, val8);
21135 return true;
21139 /* Try using a bit-to-bytemask. */
21140 if (which == AARCH64_CHECK_MOV)
21142 unsigned int i;
21143 for (i = 0; i < 64; i += 8)
21145 unsigned char byte = (val64 >> i) & 0xff;
21146 if (byte != 0 && byte != 0xff)
21147 break;
21149 if (i == 64)
21151 if (info)
21152 *info = simd_immediate_info (DImode, val64);
21153 return true;
21156 return false;
21159 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21160 instruction. If INFO is nonnull, use it to describe valid immediates. */
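/* Illustrative values: replicating the byte 0x01 gives a QImode DUP of
   #1, a repeating halfword of 0x7f00 corresponds to the DUP ... LSL #8
   form, and a repeating halfword of 0x0ff0 is only representable as a
   DUPM bitmask immediate.  */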
21162 static bool
21163 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21164 simd_immediate_info *info)
21166 scalar_int_mode mode = DImode;
21167 unsigned int val32 = val64 & 0xffffffff;
21168 if (val32 == (val64 >> 32))
21170 mode = SImode;
21171 unsigned int val16 = val32 & 0xffff;
21172 if (val16 == (val32 >> 16))
21174 mode = HImode;
21175 unsigned int val8 = val16 & 0xff;
21176 if (val8 == (val16 >> 8))
21177 mode = QImode;
21180 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21181 if (IN_RANGE (val, -0x80, 0x7f))
21183 /* DUP with no shift. */
21184 if (info)
21185 *info = simd_immediate_info (mode, val);
21186 return true;
21188 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21190 /* DUP with LSL #8. */
21191 if (info)
21192 *info = simd_immediate_info (mode, val);
21193 return true;
21195 if (aarch64_bitmask_imm (val64, mode))
21197 /* DUPM. */
21198 if (info)
21199 *info = simd_immediate_info (mode, val);
21200 return true;
21202 return false;
21205 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21207 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21209 where PATTERN is the svpattern as a CONST_INT and where ZERO
21210 is a zero constant of the required PTRUE mode (which can have
21211 fewer elements than X's mode, if zero bits are significant).
21213 If so, and if INFO is nonnull, describe the immediate in INFO. */
21214 bool
21215 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21217 if (GET_CODE (x) != CONST)
21218 return false;
21220 x = XEXP (x, 0);
21221 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21222 return false;
21224 if (info)
21226 aarch64_svpattern pattern
21227 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21228 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21229 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21230 *info = simd_immediate_info (int_mode, pattern);
21232 return true;
21235 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21236 it to describe valid immediates. */
21238 static bool
21239 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21241 if (aarch64_sve_ptrue_svpattern_p (x, info))
21242 return true;
21244 if (x == CONST0_RTX (GET_MODE (x)))
21246 if (info)
21247 *info = simd_immediate_info (DImode, 0);
21248 return true;
21251 /* Analyze the value as a VNx16BImode. This should be relatively
21252 efficient, since rtx_vector_builder has enough built-in capacity
21253 to store all VLA predicate constants without needing the heap. */
21254 rtx_vector_builder builder;
21255 if (!aarch64_get_sve_pred_bits (builder, x))
21256 return false;
21258 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21259 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21261 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21262 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21263 if (pattern != AARCH64_NUM_SVPATTERNS)
21265 if (info)
21267 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21268 *info = simd_immediate_info (int_mode, pattern);
21270 return true;
21273 return false;
21276 /* Return true if OP is a valid SIMD immediate for the operation
21277 described by WHICH. If INFO is nonnull, use it to describe valid
21278 immediates. */
21279 bool
21280 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21281 enum simd_immediate_check which)
21283 machine_mode mode = GET_MODE (op);
21284 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21285 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21286 return false;
21288 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
21289 return false;
21291 if (vec_flags & VEC_SVE_PRED)
21292 return aarch64_sve_pred_valid_immediate (op, info);
21294 scalar_mode elt_mode = GET_MODE_INNER (mode);
21295 rtx base, step;
21296 unsigned int n_elts;
21297 if (CONST_VECTOR_P (op)
21298 && CONST_VECTOR_DUPLICATE_P (op))
21299 n_elts = CONST_VECTOR_NPATTERNS (op);
21300 else if ((vec_flags & VEC_SVE_DATA)
21301 && const_vec_series_p (op, &base, &step))
21303 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21304 if (!aarch64_sve_index_immediate_p (base)
21305 || !aarch64_sve_index_immediate_p (step))
21306 return false;
21308 if (info)
21310 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21311 should yield two integer values per 128-bit block, meaning
21312 that we need to treat it in the same way as V2DI and then
21313 ignore the upper 32 bits of each element. */
21314 elt_mode = aarch64_sve_container_int_mode (mode);
21315 *info = simd_immediate_info (elt_mode, base, step);
21317 return true;
21319 else if (CONST_VECTOR_P (op)
21320 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21321 /* N_ELTS set above. */;
21322 else
21323 return false;
21325 scalar_float_mode elt_float_mode;
21326 if (n_elts == 1
21327 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21329 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21330 if (aarch64_float_const_zero_rtx_p (elt)
21331 || aarch64_float_const_representable_p (elt))
21333 if (info)
21334 *info = simd_immediate_info (elt_float_mode, elt);
21335 return true;
21339 /* If all elements in an SVE vector have the same value, we have a free
21340 choice between using the element mode and using the container mode.
21341 Using the element mode means that unused parts of the vector are
21342 duplicates of the used elements, while using the container mode means
21343 that the unused parts are an extension of the used elements. Using the
21344 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21345 for its container mode VNx4SI while 0x00000101 isn't.
21347 If not all elements in an SVE vector have the same value, we need the
21348 transition from one element to the next to occur at container boundaries.
21349 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21350 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21351 scalar_int_mode elt_int_mode;
21352 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21353 elt_int_mode = aarch64_sve_container_int_mode (mode);
21354 else
21355 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21357 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21358 if (elt_size > 8)
21359 return false;
21361 /* Expand the vector constant out into a byte vector, with the least
21362 significant byte of the register first. */
21363 auto_vec<unsigned char, 16> bytes;
21364 bytes.reserve (n_elts * elt_size);
21365 for (unsigned int i = 0; i < n_elts; i++)
21367 /* The vector is provided in gcc endian-neutral fashion.
21368 For aarch64_be Advanced SIMD, it must be laid out in the vector
21369 register in reverse order. */
21370 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21371 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21373 if (elt_mode != elt_int_mode)
21374 elt = gen_lowpart (elt_int_mode, elt);
21376 if (!CONST_INT_P (elt))
21377 return false;
21379 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21380 for (unsigned int byte = 0; byte < elt_size; byte++)
21382 bytes.quick_push (elt_val & 0xff);
21383 elt_val >>= BITS_PER_UNIT;
21387 /* The immediate must repeat every eight bytes. */
21388 unsigned int nbytes = bytes.length ();
21389 for (unsigned i = 8; i < nbytes; ++i)
21390 if (bytes[i] != bytes[i - 8])
21391 return false;
21393 /* Get the repeating 8-byte value as an integer. No endian correction
21394 is needed here because bytes is already in lsb-first order. */
21395 unsigned HOST_WIDE_INT val64 = 0;
21396 for (unsigned int i = 0; i < 8; i++)
21397 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21398 << (i * BITS_PER_UNIT));
21400 if (vec_flags & VEC_SVE_DATA)
21401 return aarch64_sve_valid_immediate (val64, info);
21402 else
21403 return aarch64_advsimd_valid_immediate (val64, info, which);
21406 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21407 has a step in the range of INDEX. Return the index expression if so,
21408 otherwise return null. */
21409 rtx
21410 aarch64_check_zero_based_sve_index_immediate (rtx x)
21412 rtx base, step;
21413 if (const_vec_series_p (x, &base, &step)
21414 && base == const0_rtx
21415 && aarch64_sve_index_immediate_p (step))
21416 return step;
21417 return NULL_RTX;
21420 /* Check if immediate shift constants are within range. */
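/* For example, for V4SImode a left shift accepts 0..31 while a right
   shift accepts 1..32 (illustrative).  */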
21421 bool
21422 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21424 x = unwrap_const_vec_duplicate (x);
21425 if (!CONST_INT_P (x))
21426 return false;
21427 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21428 if (left)
21429 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21430 else
21431 return IN_RANGE (INTVAL (x), 1, bit_width);
21434 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21435 operation of width WIDTH at bit position POS. */
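/* For example, WIDTH = 8 and POS = 16 give the mask 0xff0000
   (illustrative values).  */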
21437 rtx
21438 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21440 gcc_assert (CONST_INT_P (width));
21441 gcc_assert (CONST_INT_P (pos));
21443 unsigned HOST_WIDE_INT mask
21444 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21445 return GEN_INT (mask << UINTVAL (pos));
21448 bool
21449 aarch64_mov_operand_p (rtx x, machine_mode mode)
21451 if (GET_CODE (x) == HIGH
21452 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21453 return true;
21455 if (CONST_INT_P (x))
21456 return true;
21458 if (VECTOR_MODE_P (GET_MODE (x)))
21460 /* Require predicate constants to be VNx16BI before RA, so that we
21461 force everything to have a canonical form. */
21462 if (!lra_in_progress
21463 && !reload_completed
21464 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21465 && GET_MODE (x) != VNx16BImode)
21466 return false;
21468 return aarch64_simd_valid_immediate (x, NULL);
21471 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21472 x = strip_salt (x);
21474 /* GOT accesses are valid moves. */
21475 if (SYMBOL_REF_P (x)
21476 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21477 return true;
21479 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21480 return true;
21482 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21483 return true;
21485 return aarch64_classify_symbolic_expression (x)
21486 == SYMBOL_TINY_ABSOLUTE;
21489 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21490 the constant creation. */
21492 rtx
21493 aarch64_gen_shareable_zero (machine_mode mode)
21495 machine_mode zmode = V4SImode;
21496 rtx tmp = gen_reg_rtx (zmode);
21497 emit_move_insn (tmp, CONST0_RTX (zmode));
21498 return lowpart_subreg (mode, tmp, zmode);
21501 /* Return a const_int vector of VAL. */
21502 rtx
21503 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21505 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21506 return gen_const_vec_duplicate (mode, c);
21509 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21511 bool
21512 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21514 machine_mode vmode;
21516 vmode = aarch64_simd_container_mode (mode, 64);
21517 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21518 return aarch64_simd_valid_immediate (op_v, NULL);
21521 /* Construct and return a PARALLEL RTX vector with elements numbering the
21522 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21523 the vector - from the perspective of the architecture. This does not
21524 line up with GCC's perspective on lane numbers, so we end up with
21525 different masks depending on our target endian-ness. The diagram
21526 below may help. We must draw the distinction when building masks
21527 which select one half of the vector. An instruction selecting
21528 architectural low-lanes for a big-endian target must be described using
21529 a mask selecting GCC high-lanes.
21531 Big-Endian Little-Endian
21533 GCC 0 1 2 3 3 2 1 0
21534 | x | x | x | x | | x | x | x | x |
21535 Architecture 3 2 1 0 3 2 1 0
21537 Low Mask: { 2, 3 } { 0, 1 }
21538 High Mask: { 0, 1 } { 2, 3 }
21540 MODE Is the mode of the vector and NUNITS is the number of units in it. */
21542 rtx
21543 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21545 rtvec v = rtvec_alloc (nunits / 2);
21546 int high_base = nunits / 2;
21547 int low_base = 0;
21548 int base;
21549 rtx t1;
21550 int i;
21552 if (BYTES_BIG_ENDIAN)
21553 base = high ? low_base : high_base;
21554 else
21555 base = high ? high_base : low_base;
21557 for (i = 0; i < nunits / 2; i++)
21558 RTVEC_ELT (v, i) = GEN_INT (base + i);
21560 t1 = gen_rtx_PARALLEL (mode, v);
21561 return t1;
21564 /* Check OP for validity as a PARALLEL RTX vector with elements
21565 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21566 from the perspective of the architecture. See the diagram above
21567 aarch64_simd_vect_par_cnst_half for more details. */
21569 bool
21570 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21571 bool high)
21573 int nelts;
21574 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21575 return false;
21577 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21578 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21579 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21580 int i = 0;
21582 if (count_op != count_ideal)
21583 return false;
21585 for (i = 0; i < count_ideal; i++)
21587 rtx elt_op = XVECEXP (op, 0, i);
21588 rtx elt_ideal = XVECEXP (ideal, 0, i);
21590 if (!CONST_INT_P (elt_op)
21591 || INTVAL (elt_ideal) != INTVAL (elt_op))
21592 return false;
21594 return true;
21597 /* Return a PARALLEL containing NELTS elements, with element I equal
21598 to BASE + I * STEP. */
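/* For example, NELTS = 4, BASE = 1 and STEP = 2 give a PARALLEL of the
   constants 1, 3, 5 and 7 (illustrative values).  */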
21600 rtx
21601 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21603 rtvec vec = rtvec_alloc (nelts);
21604 for (unsigned int i = 0; i < nelts; ++i)
21605 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21606 return gen_rtx_PARALLEL (VOIDmode, vec);
21609 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21610 series with step STEP. */
21612 bool
21613 aarch64_stepped_int_parallel_p (rtx op, int step)
21615 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21616 return false;
21618 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21619 for (int i = 1; i < XVECLEN (op, 0); ++i)
21620 if (!CONST_INT_P (XVECEXP (op, 0, i))
21621 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21622 return false;
21624 return true;
21627 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21628 HIGH (exclusive). */
21629 void
21630 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21631 const_tree exp)
21633 HOST_WIDE_INT lane;
21634 gcc_assert (CONST_INT_P (operand));
21635 lane = INTVAL (operand);
21637 if (lane < low || lane >= high)
21639 if (exp)
21640 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21641 lane, low, high - 1);
21642 else
21643 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21647 /* Perform endian correction on lane number N, which indexes a vector
21648 of mode MODE, and return the result as an SImode rtx. */
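/* For example, lane 0 of a 4-element vector is expected to map to lane 3
   when compiling for big-endian and to stay 0 for little-endian
   (illustrative, based on ENDIAN_LANE_N).  */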
21651 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21653 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
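/* Example, assuming the usual ENDIAN_LANE_N mapping of
BYTES_BIG_ENDIAN ? nunits - 1 - n : n: lane 1 of a V4SImode vector
becomes (const_int 1) on little-endian and (const_int 2) on big-endian. */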
21656 /* Return TRUE if OP is a valid vector addressing mode. */
21658 bool
21659 aarch64_simd_mem_operand_p (rtx op)
21661 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21662 || REG_P (XEXP (op, 0)));
21665 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21667 bool
21668 aarch64_sve_ld1r_operand_p (rtx op)
21670 struct aarch64_address_info addr;
21671 scalar_mode mode;
21673 return (MEM_P (op)
21674 && is_a <scalar_mode> (GET_MODE (op), &mode)
21675 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21676 && addr.type == ADDRESS_REG_IMM
21677 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21680 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21681 where the size of the read data is specified by `mode` and the size of the
21682 vector elements is specified by `elem_mode`. */
21683 bool
21684 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21685 scalar_mode elem_mode)
21687 struct aarch64_address_info addr;
21688 if (!MEM_P (op)
21689 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21690 return false;
21692 if (addr.type == ADDRESS_REG_IMM)
21693 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21695 if (addr.type == ADDRESS_REG_REG)
21696 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21698 return false;
21701 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21702 bool
21703 aarch64_sve_ld1rq_operand_p (rtx op)
21705 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21706 GET_MODE_INNER (GET_MODE (op)));
21709 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21710 accessing a vector where the element size is specified by `elem_mode`. */
21711 bool
21712 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21714 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21717 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21718 bool
21719 aarch64_sve_ldff1_operand_p (rtx op)
21721 if (!MEM_P (op))
21722 return false;
21724 struct aarch64_address_info addr;
21725 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21726 return false;
21728 if (addr.type == ADDRESS_REG_IMM)
21729 return known_eq (addr.const_offset, 0);
21731 return addr.type == ADDRESS_REG_REG;
21734 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21735 bool
21736 aarch64_sve_ldnf1_operand_p (rtx op)
21738 struct aarch64_address_info addr;
21740 return (MEM_P (op)
21741 && aarch64_classify_address (&addr, XEXP (op, 0),
21742 GET_MODE (op), false)
21743 && addr.type == ADDRESS_REG_IMM);
21746 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21747 The conditions for STR are the same. */
21748 bool
21749 aarch64_sve_ldr_operand_p (rtx op)
21751 struct aarch64_address_info addr;
21753 return (MEM_P (op)
21754 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21755 false, ADDR_QUERY_ANY)
21756 && addr.type == ADDRESS_REG_IMM);
21759 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21760 addressing memory of mode MODE. */
21761 bool
21762 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21764 struct aarch64_address_info addr;
21765 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21766 return false;
21768 if (addr.type == ADDRESS_REG_IMM)
21769 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21771 return addr.type == ADDRESS_REG_REG;
21774 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21775 We need to be able to access the individual pieces, so the range
21776 is different from LD[234] and ST[234]. */
21777 bool
21778 aarch64_sve_struct_memory_operand_p (rtx op)
21780 if (!MEM_P (op))
21781 return false;
21783 machine_mode mode = GET_MODE (op);
21784 struct aarch64_address_info addr;
21785 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21786 ADDR_QUERY_ANY)
21787 || addr.type != ADDRESS_REG_IMM)
21788 return false;
21790 poly_int64 first = addr.const_offset;
21791 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21792 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21793 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21796 /* Emit a register copy from operand to operand, taking care not to
21797 early-clobber source registers in the process.
21799 COUNT is the number of components into which the copy needs to be
21800 decomposed. */
21801 void
21802 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21803 unsigned int count)
21805 unsigned int i;
21806 int rdest = REGNO (operands[0]);
21807 int rsrc = REGNO (operands[1]);
21809 if (!reg_overlap_mentioned_p (operands[0], operands[1])
21810 || rdest < rsrc)
21811 for (i = 0; i < count; i++)
21812 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21813 gen_rtx_REG (mode, rsrc + i));
21814 else
21815 for (i = 0; i < count; i++)
21816 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21817 gen_rtx_REG (mode, rsrc + count - i - 1));
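/* Clarifying example, not part of the original source: with rdest == 1,
rsrc == 2 and COUNT == 2 the ranges overlap and rdest < rsrc, so the
forward loop copies reg 2 into reg 1 before reg 2 is overwritten by
reg 3; the reverse loop handles the symmetric rdest > rsrc case. */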
21820 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21821 one of the VSTRUCT modes: OI, CI, or XI. */
21823 aarch64_simd_attr_length_rglist (machine_mode mode)
21825 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21826 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21829 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21830 alignment of a vector to 128 bits. SVE predicates have an alignment of
21831 16 bits. */
21832 static HOST_WIDE_INT
21833 aarch64_simd_vector_alignment (const_tree type)
21835 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21836 be set for non-predicate vectors of booleans. Modes are the most
21837 direct way we have of identifying real SVE predicate types. */
21838 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21839 return 16;
21840 widest_int min_size
21841 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21842 return wi::umin (min_size, 128).to_uhwi ();
21845 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21846 static poly_uint64
21847 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21849 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21851 /* If the length of the vector is a fixed power of 2, try to align
21852 to that length, otherwise don't try to align at all. */
21853 HOST_WIDE_INT result;
21854 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21855 || !pow2p_hwi (result))
21856 result = TYPE_ALIGN (TREE_TYPE (type));
21857 return result;
21859 return TYPE_ALIGN (type);
21862 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21863 static bool
21864 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21866 if (is_packed)
21867 return false;
21869 /* For fixed-length vectors, check that the vectorizer will aim for
21870 full-vector alignment. This isn't true for generic GCC vectors
21871 that are wider than the ABI maximum of 128 bits. */
21872 poly_uint64 preferred_alignment =
21873 aarch64_vectorize_preferred_vector_alignment (type);
21874 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
21875 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
21876 preferred_alignment))
21877 return false;
21879 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
21880 return true;
21883 /* Return true if the vector misalignment factor is supported by the
21884 target. */
21885 static bool
21886 aarch64_builtin_support_vector_misalignment (machine_mode mode,
21887 const_tree type, int misalignment,
21888 bool is_packed)
21890 if (TARGET_SIMD && STRICT_ALIGNMENT)
21892 /* Return false if the movmisalign pattern is not supported for this mode. */
21893 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
21894 return false;
21896 /* Misalignment factor is unknown at compile time. */
21897 if (misalignment == -1)
21898 return false;
21900 return default_builtin_support_vector_misalignment (mode, type, misalignment,
21901 is_packed);
21904 /* If VALS is a vector constant that can be loaded into a register
21905 using DUP, generate instructions to do so and return an RTX to
21906 assign to the register. Otherwise return NULL_RTX. */
21907 static rtx
21908 aarch64_simd_dup_constant (rtx vals)
21910 machine_mode mode = GET_MODE (vals);
21911 machine_mode inner_mode = GET_MODE_INNER (mode);
21912 rtx x;
21914 if (!const_vec_duplicate_p (vals, &x))
21915 return NULL_RTX;
21917 /* We can load this constant by using DUP and a constant in a
21918 single ARM register. This will be cheaper than a vector
21919 load. */
21920 x = copy_to_mode_reg (inner_mode, x);
21921 return gen_vec_duplicate (mode, x);
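/* Rough sketch of the expected expansion, for illustration only: a
constant such as (const_vector:V4SI [5 5 5 5]) becomes a move of 5 into
a general register followed by (vec_duplicate:V4SI ...) of that
register, i.e. approximately "mov w0, 5; dup v0.4s, w0". */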
21925 /* Generate code to load VALS, which is a PARALLEL containing only
21926 constants (for vec_init) or CONST_VECTOR, efficiently into a
21927 register. Returns an RTX to copy into the register, or NULL_RTX
21928 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
21929 static rtx
21930 aarch64_simd_make_constant (rtx vals)
21932 machine_mode mode = GET_MODE (vals);
21933 rtx const_dup;
21934 rtx const_vec = NULL_RTX;
21935 int n_const = 0;
21936 int i;
21938 if (CONST_VECTOR_P (vals))
21939 const_vec = vals;
21940 else if (GET_CODE (vals) == PARALLEL)
21942 /* A CONST_VECTOR must contain only CONST_INTs and
21943 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
21944 Only store valid constants in a CONST_VECTOR. */
21945 int n_elts = XVECLEN (vals, 0);
21946 for (i = 0; i < n_elts; ++i)
21948 rtx x = XVECEXP (vals, 0, i);
21949 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
21950 n_const++;
21952 if (n_const == n_elts)
21953 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
21955 else
21956 gcc_unreachable ();
21958 if (const_vec != NULL_RTX
21959 && aarch64_simd_valid_immediate (const_vec, NULL))
21960 /* Load using MOVI/MVNI. */
21961 return const_vec;
21962 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
21963 /* Loaded using DUP. */
21964 return const_dup;
21965 else if (const_vec != NULL_RTX)
21966 /* Load from constant pool. We cannot take advantage of single-cycle
21967 LD1 because we need a PC-relative addressing mode. */
21968 return const_vec;
21969 else
21970 /* A PARALLEL containing something not valid inside CONST_VECTOR.
21971 We cannot construct an initializer. */
21972 return NULL_RTX;
21975 /* Expand a vector initialisation sequence, such that TARGET is
21976 initialised to contain VALS. */
21978 void
21979 aarch64_expand_vector_init (rtx target, rtx vals)
21981 machine_mode mode = GET_MODE (target);
21982 scalar_mode inner_mode = GET_MODE_INNER (mode);
21983 /* The number of vector elements. */
21984 int n_elts = XVECLEN (vals, 0);
21985 /* The number of vector elements which are not constant. */
21986 int n_var = 0;
21987 rtx any_const = NULL_RTX;
21988 /* The first element of vals. */
21989 rtx v0 = XVECEXP (vals, 0, 0);
21990 bool all_same = true;
21992 /* This is a special vec_init<M><N> where N is not an element mode but a
21993 vector mode with half the elements of M. We expect to find two entries
21994 of mode N in VALS and we must put their concatenation into TARGET. */
21995 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
21997 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
21998 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
21999 && known_eq (GET_MODE_SIZE (mode),
22000 2 * GET_MODE_SIZE (narrow_mode)));
22001 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
22002 XVECEXP (vals, 0, 0),
22003 XVECEXP (vals, 0, 1)));
22004 return;
22007 /* Count the number of variable elements to initialise. */
22008 for (int i = 0; i < n_elts; ++i)
22010 rtx x = XVECEXP (vals, 0, i);
22011 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
22012 ++n_var;
22013 else
22014 any_const = x;
22016 all_same &= rtx_equal_p (x, v0);
22019 /* No variable elements, hand off to aarch64_simd_make_constant which knows
22020 how best to handle this. */
22021 if (n_var == 0)
22023 rtx constant = aarch64_simd_make_constant (vals);
22024 if (constant != NULL_RTX)
22026 emit_move_insn (target, constant);
22027 return;
22031 /* Splat a single non-constant element if we can. */
22032 if (all_same)
22034 rtx x = copy_to_mode_reg (inner_mode, v0);
22035 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22036 return;
22039 /* Check for the interleaving case.
22040 E.g. if the initializer is (int16x8_t) {x, y, x, y, x, y, x, y},
22041 generate the following code:
22042 dup v0.h, x
22043 dup v1.h, y
22044 zip1 v0.h, v0.h, v1.h
22045 for "large enough" initializer. */
22047 if (n_elts >= 8)
22049 int i;
22050 for (i = 2; i < n_elts; i++)
22051 if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
22052 break;
22054 if (i == n_elts)
22056 machine_mode mode = GET_MODE (target);
22057 rtx dest[2];
22059 for (int i = 0; i < 2; i++)
22061 rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
22062 dest[i] = force_reg (mode, x);
22065 rtvec v = gen_rtvec (2, dest[0], dest[1]);
22066 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22067 return;
22071 enum insn_code icode = optab_handler (vec_set_optab, mode);
22072 gcc_assert (icode != CODE_FOR_nothing);
22074 /* If there are only variable elements, try to optimize
22075 the insertion using dup for the most common element
22076 followed by insertions. */
22078 /* The algorithm will fill matches[*][0] with the earliest matching element,
22079 and matches[X][1] with the count of duplicate elements (if X is the
22080 earliest element which has duplicates). */
22082 if (n_var == n_elts && n_elts <= 16)
22084 int matches[16][2] = {0};
22085 for (int i = 0; i < n_elts; i++)
22087 for (int j = 0; j <= i; j++)
22089 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
22091 matches[i][0] = j;
22092 matches[j][1]++;
22093 break;
22097 int maxelement = 0;
22098 int maxv = 0;
22099 for (int i = 0; i < n_elts; i++)
22100 if (matches[i][1] > maxv)
22102 maxelement = i;
22103 maxv = matches[i][1];
22106 /* Create a duplicate of the most common element, unless all elements
22107 are equally useless to us, in which case just immediately set the
22108 vector register using the first element. */
22110 if (maxv == 1)
22112 /* For vectors of two 64-bit elements, we can do even better. */
22113 if (n_elts == 2
22114 && (inner_mode == E_DImode
22115 || inner_mode == E_DFmode))
22118 rtx x0 = XVECEXP (vals, 0, 0);
22119 rtx x1 = XVECEXP (vals, 0, 1);
22120 /* Combine can pick up this case, but handling it directly
22121 here leaves clearer RTL.
22123 This is load_pair_lanes<mode>, and also gives us a clean-up
22124 for store_pair_lanes<mode>. */
22125 if (memory_operand (x0, inner_mode)
22126 && memory_operand (x1, inner_mode)
22127 && aarch64_mergeable_load_pair_p (mode, x0, x1))
22129 rtx t;
22130 if (inner_mode == DFmode)
22131 t = gen_load_pair_lanesdf (target, x0, x1);
22132 else
22133 t = gen_load_pair_lanesdi (target, x0, x1);
22134 emit_insn (t);
22135 return;
22138 /* The subreg-move sequence below will move into lane zero of the
22139 vector register. For big-endian we want that position to hold
22140 the last element of VALS. */
22141 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
22142 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22143 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
22145 else
22147 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22148 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22151 /* Insert the rest. */
22152 for (int i = 0; i < n_elts; i++)
22154 rtx x = XVECEXP (vals, 0, i);
22155 if (matches[i][0] == maxelement)
22156 continue;
22157 x = copy_to_mode_reg (inner_mode, x);
22158 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22160 return;
22163 /* Initialise a vector which is part-variable. We want to first try
22164 to build those lanes which are constant in the most efficient way we
22165 can. */
22166 if (n_var != n_elts)
22168 rtx copy = copy_rtx (vals);
22170 /* Load constant part of vector. We really don't care what goes into the
22171 parts we will overwrite, but we're more likely to be able to load the
22172 constant efficiently if it has fewer, larger, repeating parts
22173 (see aarch64_simd_valid_immediate). */
22174 for (int i = 0; i < n_elts; i++)
22176 rtx x = XVECEXP (vals, 0, i);
22177 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22178 continue;
22179 rtx subst = any_const;
22180 for (int bit = n_elts / 2; bit > 0; bit /= 2)
22182 /* Look in the copied vector, as more elements are const. */
22183 rtx test = XVECEXP (copy, 0, i ^ bit);
22184 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
22186 subst = test;
22187 break;
22190 XVECEXP (copy, 0, i) = subst;
22192 aarch64_expand_vector_init (target, copy);
22195 /* Insert the variable lanes directly. */
22196 for (int i = 0; i < n_elts; i++)
22198 rtx x = XVECEXP (vals, 0, i);
22199 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22200 continue;
22201 x = copy_to_mode_reg (inner_mode, x);
22202 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22206 /* Emit RTL corresponding to:
22207 insr TARGET, ELEM. */
22209 static void
22210 emit_insr (rtx target, rtx elem)
22212 machine_mode mode = GET_MODE (target);
22213 scalar_mode elem_mode = GET_MODE_INNER (mode);
22214 elem = force_reg (elem_mode, elem);
22216 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22217 gcc_assert (icode != CODE_FOR_nothing);
22218 emit_insn (GEN_FCN (icode) (target, target, elem));
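/* Note added for clarity, based on the vec_shl_insert optab semantics:
each INSR shifts the existing lanes of TARGET up by one element and
writes ELEM into lane 0, which is why the callers below insert
elements in reverse order. */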
22221 /* Subroutine of aarch64_sve_expand_vector_init for handling
22222 trailing constants.
22223 This function works as follows:
22224 (a) Create a new vector consisting of trailing constants.
22225 (b) Initialize TARGET with the constant vector using emit_move_insn.
22226 (c) Insert remaining elements in TARGET using insr.
22227 NELTS is the total number of elements in the original vector, while
22228 NELTS_REQD is the number of elements that are actually
22229 significant.
22231 ??? The heuristic used is to do the above only if the number of constants
22232 is at least half the total number of elements. May need fine-tuning. */
22234 static bool
22235 aarch64_sve_expand_vector_init_handle_trailing_constants
22236 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22238 machine_mode mode = GET_MODE (target);
22239 scalar_mode elem_mode = GET_MODE_INNER (mode);
22240 int n_trailing_constants = 0;
22242 for (int i = nelts_reqd - 1;
22243 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22244 i--)
22245 n_trailing_constants++;
22247 if (n_trailing_constants >= nelts_reqd / 2)
22249 /* Try to use the natural pattern of BUILDER to extend the trailing
22250 constant elements to a full vector. Replace any variables in the
22251 extra elements with zeros.
22253 ??? It would be better if the builders supported "don't care"
22254 elements, with the builder filling in whichever elements
22255 give the most compact encoding. */
22256 rtx_vector_builder v (mode, nelts, 1);
22257 for (int i = 0; i < nelts; i++)
22259 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22260 if (!valid_for_const_vector_p (elem_mode, x))
22261 x = CONST0_RTX (elem_mode);
22262 v.quick_push (x);
22264 rtx const_vec = v.build ();
22265 emit_move_insn (target, const_vec);
22267 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22268 emit_insr (target, builder.elt (i));
22270 return true;
22273 return false;
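/* Hypothetical example, not from the original source: for a builder
holding {x, y, 1, 2} with NELTS == NELTS_REQD == 4, the two trailing
constants meet the NELTS_REQD / 2 threshold, so TARGET is first loaded
from a constant vector derived from the trailing pattern (variable
lanes replaced by zeros) and then y and x are inserted with INSR. */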
22276 /* Subroutine of aarch64_sve_expand_vector_init.
22277 Works as follows:
22278 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22279 (b) Skip trailing elements from BUILDER, which are the same as
22280 element NELTS_REQD - 1.
22281 (c) Insert earlier elements in reverse order in TARGET using insr. */
22283 static void
22284 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22285 const rtx_vector_builder &builder,
22286 int nelts_reqd)
22288 machine_mode mode = GET_MODE (target);
22289 scalar_mode elem_mode = GET_MODE_INNER (mode);
22291 struct expand_operand ops[2];
22292 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22293 gcc_assert (icode != CODE_FOR_nothing);
22295 create_output_operand (&ops[0], target, mode);
22296 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22297 expand_insn (icode, 2, ops);
22299 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22300 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22301 emit_insr (target, builder.elt (i));
22304 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22305 when all trailing elements of builder are same.
22306 This works as follows:
22307 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22308 (b) Insert remaining elements in TARGET using insr.
22310 ??? The heuristic used is to do the above if the number of identical
22311 trailing elements is at least 3/4 of the total number of elements,
22312 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
22314 static bool
22315 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22316 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22318 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22319 if (ndups >= (3 * nelts_reqd) / 4)
22321 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22322 nelts_reqd - ndups + 1);
22323 return true;
22326 return false;
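/* Hypothetical example: for {a, b, c, c, c, c, c, c} with
NELTS_REQD == 8, NDUPS == 6 >= 3 * 8 / 4, so the expansion above
reduces to a broadcast of c followed by "insr b" and "insr a". */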
22329 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22330 of elements in BUILDER.
22332 The function tries to initialize TARGET from BUILDER if it fits one
22333 of the special cases outlined below.
22335 Failing that, the function divides BUILDER into two sub-vectors:
22336 v_even = even elements of BUILDER;
22337 v_odd = odd elements of BUILDER;
22339 and recursively calls itself with v_even and v_odd.
22341 if (recursive call succeeded for v_even or v_odd)
22342 TARGET = zip (v_even, v_odd)
22344 The function returns true if it managed to build TARGET from BUILDER
22345 with one of the special cases, false otherwise.
22347 Example: {a, 1, b, 2, c, 3, d, 4}
22349 The vector gets divided into:
22350 v_even = {a, b, c, d}
22351 v_odd = {1, 2, 3, 4}
22353 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22354 initializes tmp2 from the constant vector v_odd using emit_move_insn.
22356 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22357 no constants and only 4 elements, so we construct tmp1 from v_even using insr:
22358 tmp1 = dup(d)
22359 insr tmp1, c
22360 insr tmp1, b
22361 insr tmp1, a
22363 And finally:
22364 TARGET = zip (tmp1, tmp2)
22365 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22367 static bool
22368 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22369 int nelts, int nelts_reqd)
22371 machine_mode mode = GET_MODE (target);
22373 /* Case 1: Vector contains trailing constants. */
22375 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22376 (target, builder, nelts, nelts_reqd))
22377 return true;
22379 /* Case 2: Vector contains leading constants. */
22381 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22382 for (int i = 0; i < nelts_reqd; i++)
22383 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22384 rev_builder.finalize ();
22386 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22387 (target, rev_builder, nelts, nelts_reqd))
22389 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22390 return true;
22393 /* Case 3: Vector contains trailing same element. */
22395 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22396 (target, builder, nelts_reqd))
22397 return true;
22399 /* Case 4: Vector contains leading same element. */
22401 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22402 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22404 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22405 return true;
22408 /* Avoid recursing below 4-elements.
22409 ??? The threshold 4 may need fine-tuning. */
22411 if (nelts_reqd <= 4)
22412 return false;
22414 rtx_vector_builder v_even (mode, nelts, 1);
22415 rtx_vector_builder v_odd (mode, nelts, 1);
22417 for (int i = 0; i < nelts * 2; i += 2)
22419 v_even.quick_push (builder.elt (i));
22420 v_odd.quick_push (builder.elt (i + 1));
22423 v_even.finalize ();
22424 v_odd.finalize ();
22426 rtx tmp1 = gen_reg_rtx (mode);
22427 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22428 nelts, nelts_reqd / 2);
22430 rtx tmp2 = gen_reg_rtx (mode);
22431 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22432 nelts, nelts_reqd / 2);
22434 if (!did_even_p && !did_odd_p)
22435 return false;
22437 /* Initialize v_even and v_odd using INSR if they didn't match any of the
22438 special cases, and zip v_even and v_odd. */
22440 if (!did_even_p)
22441 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22443 if (!did_odd_p)
22444 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22446 rtvec v = gen_rtvec (2, tmp1, tmp2);
22447 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22448 return true;
22451 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22453 void
22454 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22456 machine_mode mode = GET_MODE (target);
22457 int nelts = XVECLEN (vals, 0);
22459 rtx_vector_builder v (mode, nelts, 1);
22460 for (int i = 0; i < nelts; i++)
22461 v.quick_push (XVECEXP (vals, 0, i));
22462 v.finalize ();
22464 /* If neither sub-vector of v could be initialized specially,
22465 then use INSR to insert all elements from v into TARGET.
22466 ??? This might not be optimal for vectors with large
22467 initializers like 16-element or above.
22468 For nelts < 4, it probably isn't useful to handle specially. */
22470 if (nelts < 4
22471 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22472 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22475 /* Check whether VALUE is a vector constant in which every element
22476 is either a power of 2 or a negated power of 2. If so, return
22477 a constant vector of log2s, and flip CODE between PLUS and MINUS
22478 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
22480 static rtx
22481 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22483 if (!CONST_VECTOR_P (value))
22484 return NULL_RTX;
22486 rtx_vector_builder builder;
22487 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22488 return NULL_RTX;
22490 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22491 /* 1 if the result of the multiplication must be negated,
22492 0 if it mustn't, or -1 if we don't yet care. */
22493 int negate = -1;
22494 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22495 for (unsigned int i = 0; i < encoded_nelts; ++i)
22497 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22498 if (!CONST_SCALAR_INT_P (elt))
22499 return NULL_RTX;
22500 rtx_mode_t val (elt, int_mode);
22501 wide_int pow2 = wi::neg (val);
22502 if (val != pow2)
22504 /* It matters whether we negate or not. Make that choice,
22505 and make sure that it's consistent with previous elements. */
22506 if (negate == !wi::neg_p (val))
22507 return NULL_RTX;
22508 negate = wi::neg_p (val);
22509 if (!negate)
22510 pow2 = val;
22512 /* POW2 is now the value that we want to be a power of 2. */
22513 int shift = wi::exact_log2 (pow2);
22514 if (shift < 0)
22515 return NULL_RTX;
22516 builder.quick_push (gen_int_mode (shift, int_mode));
22518 if (negate == -1)
22519 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22520 code = PLUS;
22521 else if (negate == 1)
22522 code = code == PLUS ? MINUS : PLUS;
22523 return builder.build ();
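/* Worked example (added for illustration): a CONST_VECTOR whose elements
are all 4 yields a vector of shift counts of 2 with CODE unchanged,
while a CONST_VECTOR whose elements are all -8 yields shift counts of 3
and flips CODE between PLUS and MINUS. */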
22526 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22527 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22528 operands array, in the same order as for fma_optab. Return true if
22529 the function emitted all the necessary instructions, false if the caller
22530 should generate the pattern normally with the new OPERANDS array. */
22532 bool
22533 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22535 machine_mode mode = GET_MODE (operands[0]);
22536 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22538 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22539 NULL_RTX, true, OPTAB_DIRECT);
22540 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22541 operands[3], product, operands[0], true,
22542 OPTAB_DIRECT);
22543 return true;
22545 operands[2] = force_reg (mode, operands[2]);
22546 return false;
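/* Illustrative sketch only: when the multiplier operand is the constant
vector {4, 4, ...}, the path above emits a vector shift left by 2
followed by an add (or subtract) of operands[3], rather than forcing
the constant into a register for the generic pattern. */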
22549 /* Likewise, but for a conditional pattern. */
22551 bool
22552 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22554 machine_mode mode = GET_MODE (operands[0]);
22555 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22557 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22558 NULL_RTX, true, OPTAB_DIRECT);
22559 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22560 operands[4], product, operands[5]));
22561 return true;
22563 operands[3] = force_reg (mode, operands[3]);
22564 return false;
22567 static unsigned HOST_WIDE_INT
22568 aarch64_shift_truncation_mask (machine_mode mode)
22570 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22571 return 0;
22572 return GET_MODE_UNIT_BITSIZE (mode) - 1;
22575 /* Select a format to encode pointers in exception handling data. */
22577 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22579 int type;
22580 switch (aarch64_cmodel)
22582 case AARCH64_CMODEL_TINY:
22583 case AARCH64_CMODEL_TINY_PIC:
22584 case AARCH64_CMODEL_SMALL:
22585 case AARCH64_CMODEL_SMALL_PIC:
22586 case AARCH64_CMODEL_SMALL_SPIC:
22587 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22588 for everything. */
22589 type = DW_EH_PE_sdata4;
22590 break;
22591 default:
22592 /* No assumptions here. 8-byte relocs required. */
22593 type = DW_EH_PE_sdata8;
22594 break;
22596 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22599 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22601 static void
22602 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22604 if (TREE_CODE (decl) == FUNCTION_DECL)
22606 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22607 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22609 fprintf (stream, "\t.variant_pcs\t");
22610 assemble_name (stream, name);
22611 fprintf (stream, "\n");
22616 /* The last .arch and .tune assembly strings that we printed. */
22617 static std::string aarch64_last_printed_arch_string;
22618 static std::string aarch64_last_printed_tune_string;
22620 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22621 by the function fndecl. */
22623 void
22624 aarch64_declare_function_name (FILE *stream, const char* name,
22625 tree fndecl)
22627 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22629 struct cl_target_option *targ_options;
22630 if (target_parts)
22631 targ_options = TREE_TARGET_OPTION (target_parts);
22632 else
22633 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22634 gcc_assert (targ_options);
22636 const struct processor *this_arch
22637 = aarch64_get_arch (targ_options->x_selected_arch);
22639 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
22640 std::string extension
22641 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22642 this_arch->flags);
22643 /* Only update the assembler .arch string if it is distinct from the last
22644 such string we printed. */
22645 std::string to_print = this_arch->name + extension;
22646 if (to_print != aarch64_last_printed_arch_string)
22648 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22649 aarch64_last_printed_arch_string = to_print;
22652 /* Print the cpu name we're tuning for in the comments; this might be
22653 useful to readers of the generated asm. Do it only when it changes
22654 from function to function and verbose assembly is requested. */
22655 const struct processor *this_tune
22656 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
22658 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22660 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22661 this_tune->name);
22662 aarch64_last_printed_tune_string = this_tune->name;
22665 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22667 /* Don't forget the type directive for ELF. */
22668 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22669 ASM_OUTPUT_LABEL (stream, name);
22671 cfun->machine->label_is_assembled = true;
22674 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
22675 the function label and emit a BTI if necessary. */
22677 void
22678 aarch64_print_patchable_function_entry (FILE *file,
22679 unsigned HOST_WIDE_INT patch_area_size,
22680 bool record_p)
22682 if (cfun->machine->label_is_assembled
22683 && aarch64_bti_enabled ()
22684 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
22686 /* Remove the BTI that follows the patch area and insert a new BTI
22687 before the patch area right after the function label. */
22688 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22689 if (insn
22690 && INSN_P (insn)
22691 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
22692 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
22693 delete_insn (insn);
22694 asm_fprintf (file, "\thint\t34 // bti c\n");
22697 default_print_patchable_function_entry (file, patch_area_size, record_p);
22700 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22702 void
22703 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22705 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22706 const char *value = IDENTIFIER_POINTER (target);
22707 aarch64_asm_output_variant_pcs (stream, decl, name);
22708 ASM_OUTPUT_DEF (stream, name, value);
22711 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22712 function symbol references. */
22714 void
22715 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22717 default_elf_asm_output_external (stream, decl, name);
22718 aarch64_asm_output_variant_pcs (stream, decl, name);
22721 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22722 Used to output the .cfi_b_key_frame directive when signing the current
22723 function with the B key. */
22725 void
22726 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22728 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22729 && aarch64_ra_sign_key == AARCH64_KEY_B)
22730 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22733 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22735 static void
22736 aarch64_start_file (void)
22738 struct cl_target_option *default_options
22739 = TREE_TARGET_OPTION (target_option_default_node);
22741 const struct processor *default_arch
22742 = aarch64_get_arch (default_options->x_selected_arch);
22743 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
22744 std::string extension
22745 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22746 default_arch->flags);
22748 aarch64_last_printed_arch_string = default_arch->name + extension;
22749 aarch64_last_printed_tune_string = "";
22750 asm_fprintf (asm_out_file, "\t.arch %s\n",
22751 aarch64_last_printed_arch_string.c_str ());
22753 default_file_start ();
22756 /* Emit load exclusive. */
22758 static void
22759 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22760 rtx mem, rtx model_rtx)
22762 if (mode == TImode)
22763 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22764 gen_highpart (DImode, rval),
22765 mem, model_rtx));
22766 else
22767 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22770 /* Emit store exclusive. */
22772 static void
22773 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22774 rtx mem, rtx rval, rtx model_rtx)
22776 if (mode == TImode)
22777 emit_insn (gen_aarch64_store_exclusive_pair
22778 (bval, mem, operand_subword (rval, 0, 0, TImode),
22779 operand_subword (rval, 1, 0, TImode), model_rtx));
22780 else
22781 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22784 /* Mark the previous jump instruction as unlikely. */
22786 static void
22787 aarch64_emit_unlikely_jump (rtx insn)
22789 rtx_insn *jump = emit_jump_insn (insn);
22790 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22793 /* We store the names of the various atomic helpers in a 5x5 array.
22794 Return the libcall function given MODE, MODEL and NAMES. */
22797 aarch64_atomic_ool_func (machine_mode mode, rtx model_rtx,
22798 const atomic_ool_names *names)
22800 memmodel model = memmodel_from_int (INTVAL (model_rtx));
22801 int mode_idx, model_idx;
22803 switch (mode)
22805 case E_QImode:
22806 mode_idx = 0;
22807 break;
22808 case E_HImode:
22809 mode_idx = 1;
22810 break;
22811 case E_SImode:
22812 mode_idx = 2;
22813 break;
22814 case E_DImode:
22815 mode_idx = 3;
22816 break;
22817 case E_TImode:
22818 mode_idx = 4;
22819 break;
22820 default:
22821 gcc_unreachable ();
22824 switch (model)
22826 case MEMMODEL_RELAXED:
22827 model_idx = 0;
22828 break;
22829 case MEMMODEL_CONSUME:
22830 case MEMMODEL_ACQUIRE:
22831 model_idx = 1;
22832 break;
22833 case MEMMODEL_RELEASE:
22834 model_idx = 2;
22835 break;
22836 case MEMMODEL_ACQ_REL:
22837 case MEMMODEL_SEQ_CST:
22838 model_idx = 3;
22839 break;
22840 case MEMMODEL_SYNC_ACQUIRE:
22841 case MEMMODEL_SYNC_RELEASE:
22842 case MEMMODEL_SYNC_SEQ_CST:
22843 model_idx = 4;
22844 break;
22845 default:
22846 gcc_unreachable ();
22849 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
22850 VISIBILITY_HIDDEN);
22853 #define DEF0(B, N) \
22854 { "__aarch64_" #B #N "_relax", \
22855 "__aarch64_" #B #N "_acq", \
22856 "__aarch64_" #B #N "_rel", \
22857 "__aarch64_" #B #N "_acq_rel", \
22858 "__aarch64_" #B #N "_sync" }
22860 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
22861 { NULL, NULL, NULL, NULL }
22862 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
22864 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
22865 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
22866 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
22867 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
22868 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
22869 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
22871 #undef DEF0
22872 #undef DEF4
22873 #undef DEF5
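/* Example of how the tables above are indexed, for illustration: an SImode
LDADD with MEMMODEL_ACQUIRE selects mode index 2 and model index 1,
i.e. the out-of-line helper "__aarch64_ldadd4_acq". */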
22875 /* Expand a compare and swap pattern. */
22877 void
22878 aarch64_expand_compare_and_swap (rtx operands[])
22880 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
22881 machine_mode mode, r_mode;
22883 bval = operands[0];
22884 rval = operands[1];
22885 mem = operands[2];
22886 oldval = operands[3];
22887 newval = operands[4];
22888 is_weak = operands[5];
22889 mod_s = operands[6];
22890 mod_f = operands[7];
22891 mode = GET_MODE (mem);
22893 /* Normally the succ memory model must be stronger than fail, but in the
22894 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
22895 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
22896 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
22897 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
22898 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
22900 r_mode = mode;
22901 if (mode == QImode || mode == HImode)
22903 r_mode = SImode;
22904 rval = gen_reg_rtx (r_mode);
22907 if (TARGET_LSE)
22909 /* The CAS insn requires oldval and rval overlap, but we need to
22910 have a copy of oldval saved across the operation to tell if
22911 the operation is successful. */
22912 if (reg_overlap_mentioned_p (rval, oldval))
22913 rval = copy_to_mode_reg (r_mode, oldval);
22914 else
22915 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
22917 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
22918 newval, mod_s));
22919 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22921 else if (TARGET_OUTLINE_ATOMICS)
22923 /* Oldval must satisfy compare afterward. */
22924 if (!aarch64_plus_operand (oldval, mode))
22925 oldval = force_reg (mode, oldval);
22926 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
22927 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
22928 oldval, mode, newval, mode,
22929 XEXP (mem, 0), Pmode);
22930 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22932 else
22934 /* The oldval predicate varies by mode. Test it and force to reg. */
22935 insn_code code = code_for_aarch64_compare_and_swap (mode);
22936 if (!insn_data[code].operand[2].predicate (oldval, mode))
22937 oldval = force_reg (mode, oldval);
22939 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
22940 is_weak, mod_s, mod_f));
22941 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
22944 if (r_mode != mode)
22945 rval = gen_lowpart (mode, rval);
22946 emit_move_insn (operands[1], rval);
22948 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
22949 emit_insn (gen_rtx_SET (bval, x));
22952 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
22953 sequence implementing an atomic operation. */
22955 static void
22956 aarch64_emit_post_barrier (enum memmodel model)
22958 const enum memmodel base_model = memmodel_base (model);
22960 if (is_mm_sync (model)
22961 && (base_model == MEMMODEL_ACQUIRE
22962 || base_model == MEMMODEL_ACQ_REL
22963 || base_model == MEMMODEL_SEQ_CST))
22965 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
22969 /* Split a compare and swap pattern. */
22971 void
22972 aarch64_split_compare_and_swap (rtx operands[])
22974 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
22975 gcc_assert (epilogue_completed);
22977 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
22978 machine_mode mode;
22979 bool is_weak;
22980 rtx_code_label *label1, *label2;
22981 enum memmodel model;
22983 rval = operands[0];
22984 mem = operands[1];
22985 oldval = operands[2];
22986 newval = operands[3];
22987 is_weak = (operands[4] != const0_rtx);
22988 model_rtx = operands[5];
22989 scratch = operands[7];
22990 mode = GET_MODE (mem);
22991 model = memmodel_from_int (INTVAL (model_rtx));
22993 /* When OLDVAL is zero and we want the strong version we can emit a tighter
22994 loop:
22995 .label1:
22996 LD[A]XR rval, [mem]
22997 CBNZ rval, .label2
22998 ST[L]XR scratch, newval, [mem]
22999 CBNZ scratch, .label1
23000 .label2:
23001 CMP rval, 0. */
23002 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
23003 oldval == const0_rtx && mode != TImode);
23005 label1 = NULL;
23006 if (!is_weak)
23008 label1 = gen_label_rtx ();
23009 emit_label (label1);
23011 label2 = gen_label_rtx ();
23013 /* The initial load can be relaxed for a __sync operation since a final
23014 barrier will be emitted to stop code hoisting. */
23015 if (is_mm_sync (model))
23016 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
23017 else
23018 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
23020 if (strong_zero_p)
23021 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
23022 else
23024 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23025 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
23027 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23028 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
23029 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23031 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
23033 if (!is_weak)
23035 if (aarch64_track_speculation)
23037 /* Emit an explicit compare instruction, so that we can correctly
23038 track the condition codes. */
23039 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23040 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23042 else
23043 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
23045 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23046 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
23047 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23049 else
23050 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23052 emit_label (label2);
23054 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
23055 to set the condition flags. If this is not used it will be removed by
23056 later passes. */
23057 if (strong_zero_p)
23058 aarch64_gen_compare_reg (NE, rval, const0_rtx);
23060 /* Emit any final barrier needed for a __sync operation. */
23061 if (is_mm_sync (model))
23062 aarch64_emit_post_barrier (model);
23065 /* Split an atomic operation. */
23067 void
23068 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
23069 rtx value, rtx model_rtx, rtx cond)
23071 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23072 gcc_assert (epilogue_completed);
23074 machine_mode mode = GET_MODE (mem);
23075 machine_mode wmode = (mode == DImode ? DImode : SImode);
23076 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
23077 const bool is_sync = is_mm_sync (model);
23078 rtx_code_label *label;
23079 rtx x;
23081 /* Split the atomic operation into a sequence. */
23082 label = gen_label_rtx ();
23083 emit_label (label);
23085 if (new_out)
23086 new_out = gen_lowpart (wmode, new_out);
23087 if (old_out)
23088 old_out = gen_lowpart (wmode, old_out);
23089 else
23090 old_out = new_out;
23091 value = simplify_gen_subreg (wmode, value, mode, 0);
23093 /* The initial load can be relaxed for a __sync operation since a final
23094 barrier will be emitted to stop code hoisting. */
23095 if (is_sync)
23096 aarch64_emit_load_exclusive (mode, old_out, mem,
23097 GEN_INT (MEMMODEL_RELAXED));
23098 else
23099 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
23101 switch (code)
23103 case SET:
23104 new_out = value;
23105 break;
23107 case NOT:
23108 x = gen_rtx_AND (wmode, old_out, value);
23109 emit_insn (gen_rtx_SET (new_out, x));
23110 x = gen_rtx_NOT (wmode, new_out);
23111 emit_insn (gen_rtx_SET (new_out, x));
23112 break;
23114 case MINUS:
23115 if (CONST_INT_P (value))
23117 value = GEN_INT (-UINTVAL (value));
23118 code = PLUS;
23120 /* Fall through. */
23122 default:
23123 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
23124 emit_insn (gen_rtx_SET (new_out, x));
23125 break;
23128 aarch64_emit_store_exclusive (mode, cond, mem,
23129 gen_lowpart (mode, new_out), model_rtx);
23131 if (aarch64_track_speculation)
23133 /* Emit an explicit compare instruction, so that we can correctly
23134 track the condition codes. */
23135 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
23136 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23138 else
23139 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
23141 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23142 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
23143 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23145 /* Emit any final barrier needed for a __sync operation. */
23146 if (is_sync)
23147 aarch64_emit_post_barrier (model);
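/* Rough shape of the emitted sequence, shown only as a sketch (label name
is made up): for an atomic add the split above produces
.Lretry:
ldxr old, [mem]
add new, old, value
stxr tmp, new, [mem]
cbnz tmp, .Lretry
with acquire/release variants of the exclusives chosen from MODEL_RTX. */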
23150 static void
23151 aarch64_init_libfuncs (void)
23153 /* Half-precision float operations. The compiler handles all operations
23154 with NULL libfuncs by converting to SFmode. */
23156 /* Conversions. */
23157 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
23158 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
23160 /* Arithmetic. */
23161 set_optab_libfunc (add_optab, HFmode, NULL);
23162 set_optab_libfunc (sdiv_optab, HFmode, NULL);
23163 set_optab_libfunc (smul_optab, HFmode, NULL);
23164 set_optab_libfunc (neg_optab, HFmode, NULL);
23165 set_optab_libfunc (sub_optab, HFmode, NULL);
23167 /* Comparisons. */
23168 set_optab_libfunc (eq_optab, HFmode, NULL);
23169 set_optab_libfunc (ne_optab, HFmode, NULL);
23170 set_optab_libfunc (lt_optab, HFmode, NULL);
23171 set_optab_libfunc (le_optab, HFmode, NULL);
23172 set_optab_libfunc (ge_optab, HFmode, NULL);
23173 set_optab_libfunc (gt_optab, HFmode, NULL);
23174 set_optab_libfunc (unord_optab, HFmode, NULL);
23177 /* Target hook for c_mode_for_suffix. */
23178 static machine_mode
23179 aarch64_c_mode_for_suffix (char suffix)
23181 if (suffix == 'q')
23182 return TFmode;
23184 return VOIDmode;
23187 /* We can only represent floating point constants which will fit in
23188 "quarter-precision" values. These values are characterised by
23189 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
23192 (-1)^s * (n/16) * 2^r
23194 Where:
23195 's' is the sign bit.
23196 'n' is an integer in the range 16 <= n <= 31.
23197 'r' is an integer in the range -3 <= r <= 4. */
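/* Worked example (added for illustration): 0.25 is representable as
(-1)^0 * (16/16) * 2^-2 and 1.5 as (-1)^0 * (24/16) * 2^0, whereas 0.0
and 2^-6 (which would need r == -6) are not. */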
23199 /* Return true iff X can be represented by a quarter-precision
23200 floating point immediate operand. Note, we cannot represent 0.0. */
23201 bool
23202 aarch64_float_const_representable_p (rtx x)
23204 /* This represents our current view of how many bits
23205 make up the mantissa. */
23206 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23207 int exponent;
23208 unsigned HOST_WIDE_INT mantissa, mask;
23209 REAL_VALUE_TYPE r, m;
23210 bool fail;
23212 x = unwrap_const_vec_duplicate (x);
23213 if (!CONST_DOUBLE_P (x))
23214 return false;
23216 if (GET_MODE (x) == VOIDmode
23217 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23218 return false;
23220 r = *CONST_DOUBLE_REAL_VALUE (x);
23222 /* We cannot represent infinities, NaNs or +/-zero. We won't
23223 know if we have +zero until we analyse the mantissa, but we
23224 can reject the other invalid values. */
23225 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23226 || REAL_VALUE_MINUS_ZERO (r))
23227 return false;
23229 /* Extract exponent. */
23230 r = real_value_abs (&r);
23231 exponent = REAL_EXP (&r);
23233 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23234 highest (sign) bit, with a fixed binary point at bit point_pos.
23235 m1 holds the low part of the mantissa, m2 the high part.
23236 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23237 bits for the mantissa, this can fail (low bits will be lost). */
23238 real_ldexp (&m, &r, point_pos - exponent);
23239 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23241 /* If the low part of the mantissa has bits set we cannot represent
23242 the value. */
23243 if (w.ulow () != 0)
23244 return false;
23245 /* We have rejected the lower HOST_WIDE_INT, so update our
23246 understanding of how many bits lie in the mantissa and
23247 look only at the high HOST_WIDE_INT. */
23248 mantissa = w.elt (1);
23249 point_pos -= HOST_BITS_PER_WIDE_INT;
23251 /* We can only represent values with a mantissa of the form 1.xxxx. */
23252 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23253 if ((mantissa & mask) != 0)
23254 return false;
23256 /* Having filtered unrepresentable values, we may now remove all
23257 but the highest 5 bits. */
23258 mantissa >>= point_pos - 5;
23260 /* We cannot represent the value 0.0, so reject it. This is handled
23261 elsewhere. */
23262 if (mantissa == 0)
23263 return false;
23265 /* Then, as bit 4 is always set, we can mask it off, leaving
23266 the mantissa in the range [0, 15]. */
23267 mantissa &= ~(1 << 4);
23268 gcc_assert (mantissa <= 15);
23270 /* GCC internally does not use IEEE754-like encoding (where normalized
23271 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23272 Our mantissa values are shifted 4 places to the left relative to
23273 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23274 by 5 places to correct for GCC's representation. */
23275 exponent = 5 - exponent;
23277 return (exponent >= 0 && exponent <= 7);
23280 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23281 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23282 output MOVI/MVNI, ORR or BIC immediate. */
23283 char*
23284 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23285 enum simd_immediate_check which)
23287 bool is_valid;
23288 static char templ[40];
23289 const char *mnemonic;
23290 const char *shift_op;
23291 unsigned int lane_count = 0;
23292 char element_char;
23294 struct simd_immediate_info info;
23296 /* This will return true to show const_vector is legal for use as either
23297 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23298 It will also update INFO to show how the immediate should be generated.
23299 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23300 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23301 gcc_assert (is_valid);
23303 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23304 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23306 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23308 gcc_assert (info.insn == simd_immediate_info::MOV
23309 && info.u.mov.shift == 0);
23310 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23311 move immediate path. */
23312 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23313 info.u.mov.value = GEN_INT (0);
23314 else
23316 const unsigned int buf_size = 20;
23317 char float_buf[buf_size] = {'\0'};
23318 real_to_decimal_for_mode (float_buf,
23319 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23320 buf_size, buf_size, 1, info.elt_mode);
23322 if (lane_count == 1)
23323 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23324 else
23325 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23326 lane_count, element_char, float_buf);
23327 return templ;
23331 gcc_assert (CONST_INT_P (info.u.mov.value));
23333 if (which == AARCH64_CHECK_MOV)
23335 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23336 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23337 ? "msl" : "lsl");
23338 if (lane_count == 1)
23339 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23340 mnemonic, UINTVAL (info.u.mov.value));
23341 else if (info.u.mov.shift)
23342 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23343 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23344 element_char, UINTVAL (info.u.mov.value), shift_op,
23345 info.u.mov.shift);
23346 else
23347 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23348 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23349 element_char, UINTVAL (info.u.mov.value));
23351 else
23353 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23354 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23355 if (info.u.mov.shift)
23356 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23357 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23358 element_char, UINTVAL (info.u.mov.value), "lsl",
23359 info.u.mov.shift);
23360 else
23361 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23362 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23363 element_char, UINTVAL (info.u.mov.value));
23365 return templ;
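/* Illustrative output templates (assumed, not exhaustive): a V4SImode
constant with 0x20000 in every lane gives "movi\t%0.4s, 0x2, lsl 16",
while the ORR/BIC path produces strings such as
"orr\t%0.4s, #2, lsl #16". */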
23368 char*
23369 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23372 /* If a floating point number was passed and we desire to use it in an
23373 integer mode, do the conversion to integer. */
23374 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23376 unsigned HOST_WIDE_INT ival;
23377 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23378 gcc_unreachable ();
23379 immediate = gen_int_mode (ival, mode);
23382 machine_mode vmode;
23383 /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
23384 a 128-bit vector mode. */
23385 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23387 vmode = aarch64_simd_container_mode (mode, width);
23388 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23389 return aarch64_output_simd_mov_immediate (v_op, width);
23392 /* Return the output string to use for moving immediate CONST_VECTOR
23393 into an SVE register. */
23395 char *
23396 aarch64_output_sve_mov_immediate (rtx const_vector)
23398 static char templ[40];
23399 struct simd_immediate_info info;
23400 char element_char;
23402 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23403 gcc_assert (is_valid);
23405 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23407 machine_mode vec_mode = GET_MODE (const_vector);
23408 if (aarch64_sve_pred_mode_p (vec_mode))
23410 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23411 if (info.insn == simd_immediate_info::MOV)
23413 gcc_assert (info.u.mov.value == const0_rtx);
23414 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23416 else
23418 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23419 unsigned int total_bytes;
23420 if (info.u.pattern == AARCH64_SV_ALL
23421 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23422 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23423 total_bytes / GET_MODE_SIZE (info.elt_mode));
23424 else
23425 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23426 svpattern_token (info.u.pattern));
23428 return buf;
23431 if (info.insn == simd_immediate_info::INDEX)
23433 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23434 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23435 element_char, INTVAL (info.u.index.base),
23436 INTVAL (info.u.index.step));
23437 return templ;
23440 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23442 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23443 info.u.mov.value = GEN_INT (0);
23444 else
23446 const int buf_size = 20;
23447 char float_buf[buf_size] = {};
23448 real_to_decimal_for_mode (float_buf,
23449 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23450 buf_size, buf_size, 1, info.elt_mode);
23452 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23453 element_char, float_buf);
23454 return templ;
23458 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23459 element_char, INTVAL (info.u.mov.value));
23460 return templ;
23463 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
23464 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23465 pattern. */
23467 char *
23468 aarch64_output_sve_ptrues (rtx const_unspec)
23470 static char templ[40];
23472 struct simd_immediate_info info;
23473 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23474 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23476 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23477 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23478 svpattern_token (info.u.pattern));
23479 return templ;
23482 /* Split a combine of op[1] and op[2] into op[0] into individual moves of the two halves. */
23484 void
23485 aarch64_split_combinev16qi (rtx operands[3])
23487 unsigned int dest = REGNO (operands[0]);
23488 unsigned int src1 = REGNO (operands[1]);
23489 unsigned int src2 = REGNO (operands[2]);
23490 machine_mode halfmode = GET_MODE (operands[1]);
23491 unsigned int halfregs = REG_NREGS (operands[1]);
23492 rtx destlo, desthi;
23494 gcc_assert (halfmode == V16QImode);
23496 if (src1 == dest && src2 == dest + halfregs)
23498 /* No-op move. Can't split to nothing; emit something. */
23499 emit_note (NOTE_INSN_DELETED);
23500 return;
23503 /* Preserve register attributes for variable tracking. */
23504 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23505 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23506 GET_MODE_SIZE (halfmode));
23508 /* Special case of reversed high/low parts. */
23509 if (reg_overlap_mentioned_p (operands[2], destlo)
23510 && reg_overlap_mentioned_p (operands[1], desthi))
23512 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23513 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23514 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23516 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23518 /* Try to avoid unnecessary moves if part of the result
23519 is in the right place already. */
23520 if (src1 != dest)
23521 emit_move_insn (destlo, operands[1]);
23522 if (src2 != dest + halfregs)
23523 emit_move_insn (desthi, operands[2]);
23525 else
23527 if (src2 != dest + halfregs)
23528 emit_move_insn (desthi, operands[2]);
23529 if (src1 != dest)
23530 emit_move_insn (destlo, operands[1]);
23534 /* vec_perm support. */
23536 struct expand_vec_perm_d
23538 rtx target, op0, op1;
23539 vec_perm_indices perm;
23540 machine_mode vmode;
23541 machine_mode op_mode;
23542 unsigned int vec_flags;
23543 unsigned int op_vec_flags;
23544 bool one_vector_p;
23545 bool testing_p;
23548 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23550 /* Generate a variable permutation. */
23552 static void
23553 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23555 machine_mode vmode = GET_MODE (target);
23556 bool one_vector_p = rtx_equal_p (op0, op1);
23558 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23559 gcc_checking_assert (GET_MODE (op0) == vmode);
23560 gcc_checking_assert (GET_MODE (op1) == vmode);
23561 gcc_checking_assert (GET_MODE (sel) == vmode);
23562 gcc_checking_assert (TARGET_SIMD);
23564 if (one_vector_p)
23566 if (vmode == V8QImode)
23568 /* Expand the argument to a V16QI mode by duplicating it. */
23569 rtx pair = gen_reg_rtx (V16QImode);
23570 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23571 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23573 else
23575 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23578 else
23580 rtx pair;
23582 if (vmode == V8QImode)
23584 pair = gen_reg_rtx (V16QImode);
23585 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23586 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23588 else
23590 pair = gen_reg_rtx (V2x16QImode);
23591 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23592 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23597 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23598 NELT is the number of elements in the vector. */
23600 void
23601 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23602 unsigned int nelt)
23604 machine_mode vmode = GET_MODE (target);
23605 bool one_vector_p = rtx_equal_p (op0, op1);
23606 rtx mask;
23608 /* The TBL instruction does not use a modulo index, so we must take care
23609 of that ourselves. */
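/* For example, with two V16QImode inputs (nelt == 16) a selector value of 33
must wrap to element 1 of the concatenated 32-byte table; the AND with
2 * nelt - 1 == 31 below achieves that. (Illustrative values only.) */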
23610 mask = aarch64_simd_gen_const_vector_dup (vmode,
23611 one_vector_p ? nelt - 1 : 2 * nelt - 1);
23612 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23614 /* For big-endian, we also need to reverse the index within the vector
23615 (but not which vector). */
23616 if (BYTES_BIG_ENDIAN)
23618 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23619 if (!one_vector_p)
23620 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23621 sel = expand_simple_binop (vmode, XOR, sel, mask,
23622 NULL, 0, OPTAB_LIB_WIDEN);
23624 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
23627 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23629 static void
23630 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23632 emit_insn (gen_rtx_SET (target,
23633 gen_rtx_UNSPEC (GET_MODE (target),
23634 gen_rtvec (2, op0, op1), code)));
23637 /* Expand an SVE vec_perm with the given operands. */
23639 void
23640 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23642 machine_mode data_mode = GET_MODE (target);
23643 machine_mode sel_mode = GET_MODE (sel);
23644 /* Enforced by the pattern condition. */
23645 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23647 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23648 size of the two value vectors, i.e. the upper bits of the indices
23649 are effectively ignored. SVE TBL instead produces 0 for any
23650 out-of-range indices, so we need to modulo all the vec_perm indices
23651 to ensure they are all in range. */
23652 rtx sel_reg = force_reg (sel_mode, sel);
23654 /* Check if the sel only references the first values vector. */
23655 if (CONST_VECTOR_P (sel)
23656 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23658 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23659 return;
23662 /* Check if the two values vectors are the same. */
23663 if (rtx_equal_p (op0, op1))
23665 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23666 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23667 NULL, 0, OPTAB_DIRECT);
23668 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23669 return;
23672 /* Run a TBL on each value vector and combine the results. */
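/* Out-of-range selector values yield zero from each individual TBL (see the
comment above), so combining the two partial results with an OR produces
the full two-input permutation. */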
23674 rtx res0 = gen_reg_rtx (data_mode);
23675 rtx res1 = gen_reg_rtx (data_mode);
23676 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23677 if (!CONST_VECTOR_P (sel)
23678 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23680 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23681 2 * nunits - 1);
23682 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23683 NULL, 0, OPTAB_DIRECT);
23685 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23686 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23687 NULL, 0, OPTAB_DIRECT);
23688 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23689 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23690 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23691 else
23692 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
23695 /* Recognize patterns suitable for the TRN instructions. */
23696 static bool
23697 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23699 HOST_WIDE_INT odd;
23700 poly_uint64 nelt = d->perm.length ();
23701 rtx out, in0, in1;
23702 machine_mode vmode = d->vmode;
23704 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23705 return false;
23707 /* Note that these are little-endian tests.
23708 We correct for big-endian later. */
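/* For example, on V4SImode TRN1 matches the selector {0, 4, 2, 6} and TRN2
matches {1, 5, 3, 7}: TRN1 interleaves the even-numbered elements of the
two inputs and TRN2 the odd-numbered ones. */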
23709 if (!d->perm[0].is_constant (&odd)
23710 || (odd != 0 && odd != 1)
23711 || !d->perm.series_p (0, 2, odd, 2)
23712 || !d->perm.series_p (1, 2, nelt + odd, 2))
23713 return false;
23715 /* Success! */
23716 if (d->testing_p)
23717 return true;
23719 in0 = d->op0;
23720 in1 = d->op1;
23721 /* We don't need a big-endian lane correction for SVE; see the comment
23722 at the head of aarch64-sve.md for details. */
23723 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23725 std::swap (in0, in1);
23726 odd = !odd;
23728 out = d->target;
23730 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23731 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23732 return true;
23735 /* Try to re-encode the PERM constant so it combines odd and even elements.
23736 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23737 We then retry with this new constant against the full suite of patterns. */
23738 static bool
23739 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23741 expand_vec_perm_d newd;
23742 unsigned HOST_WIDE_INT nelt;
23744 if (d->vec_flags != VEC_ADVSIMD)
23745 return false;
23747 /* Get the new mode. Always twice the size of the inner
23748 and half the elements. */
23749 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23750 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23751 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23752 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23754 if (new_mode == word_mode)
23755 return false;
23757 /* to_constant is safe since this routine is specific to Advanced SIMD
23758 vectors. */
23759 nelt = d->perm.length ().to_constant ();
23761 vec_perm_builder newpermconst;
23762 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23764 /* Convert the perm constant if we can. Require even, odd as the pairs. */
23765 for (unsigned int i = 0; i < nelt; i += 2)
23767 poly_int64 elt0 = d->perm[i];
23768 poly_int64 elt1 = d->perm[i + 1];
23769 poly_int64 newelt;
23770 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23771 return false;
23772 newpermconst.quick_push (newelt.to_constant ());
23774 newpermconst.finalize ();
23776 newd.vmode = new_mode;
23777 newd.vec_flags = VEC_ADVSIMD;
23778 newd.op_mode = newd.vmode;
23779 newd.op_vec_flags = newd.vec_flags;
23780 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23781 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23782 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23783 newd.testing_p = d->testing_p;
23784 newd.one_vector_p = d->one_vector_p;
23786 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23787 return aarch64_expand_vec_perm_const_1 (&newd);
23790 /* Recognize patterns suitable for the UZP instructions. */
23791 static bool
23792 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23794 HOST_WIDE_INT odd;
23795 rtx out, in0, in1;
23796 machine_mode vmode = d->vmode;
23798 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23799 return false;
23801 /* Note that these are little-endian tests.
23802 We correct for big-endian later. */
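/* For example, on V4SImode UZP1 matches the selector {0, 2, 4, 6} and UZP2
matches {1, 3, 5, 7}: UZP1 concatenates the even-numbered elements of the
two inputs and UZP2 the odd-numbered ones. */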
23803 if (!d->perm[0].is_constant (&odd)
23804 || (odd != 0 && odd != 1)
23805 || !d->perm.series_p (0, 1, odd, 2))
23806 return false;
23808 /* Success! */
23809 if (d->testing_p)
23810 return true;
23812 in0 = d->op0;
23813 in1 = d->op1;
23814 /* We don't need a big-endian lane correction for SVE; see the comment
23815 at the head of aarch64-sve.md for details. */
23816 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23818 std::swap (in0, in1);
23819 odd = !odd;
23821 out = d->target;
23823 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23824 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23825 return true;
23828 /* Recognize patterns suitable for the ZIP instructions. */
23829 static bool
23830 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23832 unsigned int high;
23833 poly_uint64 nelt = d->perm.length ();
23834 rtx out, in0, in1;
23835 machine_mode vmode = d->vmode;
23837 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23838 return false;
23840 /* Note that these are little-endian tests.
23841 We correct for big-endian later. */
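/* For example, on V4SImode ZIP1 matches the selector {0, 4, 1, 5} and ZIP2
matches {2, 6, 3, 7}: ZIP1 interleaves the low halves of the two inputs
and ZIP2 the high halves. */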
23842 poly_uint64 first = d->perm[0];
23843 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
23844 || !d->perm.series_p (0, 2, first, 1)
23845 || !d->perm.series_p (1, 2, first + nelt, 1))
23846 return false;
23847 high = maybe_ne (first, 0U);
23849 /* Success! */
23850 if (d->testing_p)
23851 return true;
23853 in0 = d->op0;
23854 in1 = d->op1;
23855 /* We don't need a big-endian lane correction for SVE; see the comment
23856 at the head of aarch64-sve.md for details. */
23857 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23859 std::swap (in0, in1);
23860 high = !high;
23862 out = d->target;
23864 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23865 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
23866 return true;
23869 /* Recognize patterns for the EXT insn. */
23871 static bool
23872 aarch64_evpc_ext (struct expand_vec_perm_d *d)
23874 HOST_WIDE_INT location;
23875 rtx offset;
23877 /* The first element always refers to the first vector.
23878 Check if the extracted indices are increasing by one. */
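/* For example, on V4SImode the selector {1, 2, 3, 4} starts at element 1 of
the first vector and continues into the second, which is exactly what EXT
with an offset of one element produces. */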
23879 if (d->vec_flags == VEC_SVE_PRED
23880 || !d->perm[0].is_constant (&location)
23881 || !d->perm.series_p (0, 1, location, 1))
23882 return false;
23884 /* Success! */
23885 if (d->testing_p)
23886 return true;
23888 /* The case where (location == 0) is a no-op for both big- and little-endian,
23889 and is removed by the mid-end at optimization levels -O1 and higher.
23891 We don't need a big-endian lane correction for SVE; see the comment
23892 at the head of aarch64-sve.md for details. */
23893 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
23895 /* After setup, we want the high elements of the first vector (stored
23896 at the LSB end of the register), and the low elements of the second
23897 vector (stored at the MSB end of the register). So swap. */
23898 std::swap (d->op0, d->op1);
23899 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
23900 to_constant () is safe since this is restricted to Advanced SIMD
23901 vectors. */
23902 location = d->perm.length ().to_constant () - location;
23905 offset = GEN_INT (location);
23906 emit_set_insn (d->target,
23907 gen_rtx_UNSPEC (d->vmode,
23908 gen_rtvec (3, d->op0, d->op1, offset),
23909 UNSPEC_EXT));
23910 return true;
23913 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
23914 within each 64-bit, 32-bit or 16-bit granule. */
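/* For example, on V8HImode the selector {3, 2, 1, 0, 7, 6, 5, 4} reverses
the four 16-bit elements within each 64-bit granule and so maps to REV64. */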
23916 static bool
23917 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
23919 HOST_WIDE_INT diff;
23920 unsigned int i, size, unspec;
23921 machine_mode pred_mode;
23923 if (d->vec_flags == VEC_SVE_PRED
23924 || !d->one_vector_p
23925 || !d->perm[0].is_constant (&diff)
23926 || !diff)
23927 return false;
23929 if (d->vec_flags & VEC_SVE_DATA)
23930 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
23931 else
23932 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
23933 if (size == 64)
23935 unspec = UNSPEC_REV64;
23936 pred_mode = VNx2BImode;
23938 else if (size == 32)
23940 unspec = UNSPEC_REV32;
23941 pred_mode = VNx4BImode;
23943 else if (size == 16)
23945 unspec = UNSPEC_REV16;
23946 pred_mode = VNx8BImode;
23948 else
23949 return false;
23951 unsigned int step = diff + 1;
23952 for (i = 0; i < step; ++i)
23953 if (!d->perm.series_p (i, step, diff - i, step))
23954 return false;
23956 /* Success! */
23957 if (d->testing_p)
23958 return true;
23960 if (d->vec_flags & VEC_SVE_DATA)
23962 rtx pred = aarch64_ptrue_reg (pred_mode);
23963 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
23964 d->target, pred, d->op0));
23965 return true;
23967 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
23968 emit_set_insn (d->target, src);
23969 return true;
23972 /* Recognize patterns for the REV insn, which reverses elements within
23973 a full vector. */
23975 static bool
23976 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
23978 poly_uint64 nelt = d->perm.length ();
23980 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
23981 return false;
23983 if (!d->perm.series_p (0, 1, nelt - 1, -1))
23984 return false;
23986 /* Success! */
23987 if (d->testing_p)
23988 return true;
23990 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
23991 emit_set_insn (d->target, src);
23992 return true;
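/* Recognize permutes that broadcast a single element of the input vector,
which can be implemented with a DUP (element) instruction. */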
23995 static bool
23996 aarch64_evpc_dup (struct expand_vec_perm_d *d)
23998 rtx out = d->target;
23999 rtx in0;
24000 HOST_WIDE_INT elt;
24001 machine_mode vmode = d->vmode;
24002 rtx lane;
24004 if (d->vec_flags == VEC_SVE_PRED
24005 || d->perm.encoding ().encoded_nelts () != 1
24006 || !d->perm[0].is_constant (&elt))
24007 return false;
24009 if ((d->vec_flags & VEC_SVE_DATA)
24010 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
24011 return false;
24013 /* Success! */
24014 if (d->testing_p)
24015 return true;
24017 /* The generic preparation in aarch64_expand_vec_perm_const_1
24018 swaps the operand order and the permute indices if it finds
24019 d->perm[0] to be in the second operand. Thus, we can always
24020 use d->op0 and need not do any extra arithmetic to get the
24021 correct lane number. */
24022 in0 = d->op0;
24023 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
24025 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
24026 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
24027 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
24028 return true;
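/* Implement D using an Advanced SIMD TBL instruction with a constant
selector; this is the general fallback when no single-instruction permute
pattern matches. */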
24031 static bool
24032 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
24034 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
24035 machine_mode vmode = d->vmode;
24037 /* Make sure that the indices are constant. */
24038 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
24039 for (unsigned int i = 0; i < encoded_nelts; ++i)
24040 if (!d->perm[i].is_constant ())
24041 return false;
24043 if (d->testing_p)
24044 return true;
24046 /* Generic code will try constant permutation twice. Once with the
24047 original mode and again with the elements lowered to QImode.
24048 So wait and don't do the selector expansion ourselves. */
24049 if (vmode != V8QImode && vmode != V16QImode)
24050 return false;
24052 /* to_constant is safe since this routine is specific to Advanced SIMD
24053 vectors. */
24054 unsigned int nelt = d->perm.length ().to_constant ();
24055 for (unsigned int i = 0; i < nelt; ++i)
24056 /* If big-endian and two vectors we end up with a weird mixed-endian
24057 mode on NEON. Reverse the index within each word but not the word
24058 itself. to_constant is safe because we checked is_constant above. */
24059 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
24060 ? d->perm[i].to_constant () ^ (nelt - 1)
24061 : d->perm[i].to_constant ());
24063 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
24064 sel = force_reg (vmode, sel);
24066 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
24067 return true;
24070 /* Try to implement D using an SVE TBL instruction. */
24072 static bool
24073 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
24075 unsigned HOST_WIDE_INT nelt;
24077 /* Permuting two variable-length vectors could overflow the
24078 index range. */
24079 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
24080 return false;
24082 if (d->testing_p)
24083 return true;
24085 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
24086 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
24087 if (d->one_vector_p)
24088 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
24089 else
24090 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
24091 return true;
24094 /* Try to implement D using SVE dup instruction. */
24096 static bool
24097 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
24099 if (BYTES_BIG_ENDIAN
24100 || !d->one_vector_p
24101 || d->vec_flags != VEC_SVE_DATA
24102 || d->op_vec_flags != VEC_ADVSIMD
24103 || d->perm.encoding ().nelts_per_pattern () != 1
24104 || !known_eq (d->perm.encoding ().npatterns (),
24105 GET_MODE_NUNITS (d->op_mode))
24106 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
24107 return false;
24109 int npatterns = d->perm.encoding ().npatterns ();
24110 for (int i = 0; i < npatterns; i++)
24111 if (!known_eq (d->perm[i], i))
24112 return false;
24114 if (d->testing_p)
24115 return true;
24117 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
24118 return true;
24121 /* Try to implement D using SVE SEL instruction. */
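/* For example, a permute that takes the even-numbered lanes from the first
input and the odd-numbered lanes from the second (each lane keeping its
own index) is a SEL with an alternating predicate. */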
24123 static bool
24124 aarch64_evpc_sel (struct expand_vec_perm_d *d)
24126 machine_mode vmode = d->vmode;
24127 int unit_size = GET_MODE_UNIT_SIZE (vmode);
24129 if (d->vec_flags != VEC_SVE_DATA
24130 || unit_size > 8)
24131 return false;
24133 int n_patterns = d->perm.encoding ().npatterns ();
24134 poly_int64 vec_len = d->perm.length ();
24136 for (int i = 0; i < n_patterns; ++i)
24137 if (!known_eq (d->perm[i], i)
24138 && !known_eq (d->perm[i], vec_len + i))
24139 return false;
24141 for (int i = n_patterns; i < n_patterns * 2; i++)
24142 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
24143 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
24144 return false;
24146 if (d->testing_p)
24147 return true;
24149 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
24151 /* Build a predicate that is true when op0 elements should be used. */
24152 rtx_vector_builder builder (pred_mode, n_patterns, 2);
24153 for (int i = 0; i < n_patterns * 2; i++)
24155 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
24156 : CONST0_RTX (BImode);
24157 builder.quick_push (elem);
24160 rtx const_vec = builder.build ();
24161 rtx pred = force_reg (pred_mode, const_vec);
24162 /* TARGET = PRED ? OP0 : OP1. */
24163 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
24164 return true;
24167 /* Recognize patterns suitable for the INS instructions. */
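/* For example, on V4SImode the selector {0, 1, 6, 3} is the first input with
element 2 replaced by element 2 of the second input, which INS can do in a
single instruction. */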
24168 static bool
24169 aarch64_evpc_ins (struct expand_vec_perm_d *d)
24171 machine_mode mode = d->vmode;
24172 unsigned HOST_WIDE_INT nelt;
24174 if (d->vec_flags != VEC_ADVSIMD)
24175 return false;
24177 /* to_constant is safe since this routine is specific to Advanced SIMD
24178 vectors. */
24179 nelt = d->perm.length ().to_constant ();
24180 rtx insv = d->op0;
24182 HOST_WIDE_INT idx = -1;
24184 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24186 HOST_WIDE_INT elt;
24187 if (!d->perm[i].is_constant (&elt))
24188 return false;
24189 if (elt == (HOST_WIDE_INT) i)
24190 continue;
24191 if (idx != -1)
24193 idx = -1;
24194 break;
24196 idx = i;
24199 if (idx == -1)
24201 insv = d->op1;
24202 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24204 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24205 continue;
24206 if (idx != -1)
24207 return false;
24208 idx = i;
24211 if (idx == -1)
24212 return false;
24215 if (d->testing_p)
24216 return true;
24218 gcc_assert (idx != -1);
24220 unsigned extractindex = d->perm[idx].to_constant ();
24221 rtx extractv = d->op0;
24222 if (extractindex >= nelt)
24224 extractv = d->op1;
24225 extractindex -= nelt;
24227 gcc_assert (extractindex < nelt);
24229 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24230 expand_operand ops[5];
24231 create_output_operand (&ops[0], d->target, mode);
24232 create_input_operand (&ops[1], insv, mode);
24233 create_integer_operand (&ops[2], 1 << idx);
24234 create_input_operand (&ops[3], extractv, mode);
24235 create_integer_operand (&ops[4], extractindex);
24236 expand_insn (icode, 5, ops);
24238 return true;
24241 static bool
24242 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24244 gcc_assert (d->op_mode != E_VOIDmode);
24246 /* The pattern matching functions above are written to look for a small
24247 number to begin the sequence (0, 1, N/2). If we begin with an index
24248 from the second operand, we can swap the operands. */
24249 poly_int64 nelt = d->perm.length ();
24250 if (known_ge (d->perm[0], nelt))
24252 d->perm.rotate_inputs (1);
24253 std::swap (d->op0, d->op1);
24256 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
24257 || d->vec_flags == VEC_SVE_DATA
24258 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24259 || d->vec_flags == VEC_SVE_PRED)
24260 && known_gt (nelt, 1))
24262 if (d->vmode == d->op_mode)
24264 if (aarch64_evpc_rev_local (d))
24265 return true;
24266 else if (aarch64_evpc_rev_global (d))
24267 return true;
24268 else if (aarch64_evpc_ext (d))
24269 return true;
24270 else if (aarch64_evpc_dup (d))
24271 return true;
24272 else if (aarch64_evpc_zip (d))
24273 return true;
24274 else if (aarch64_evpc_uzp (d))
24275 return true;
24276 else if (aarch64_evpc_trn (d))
24277 return true;
24278 else if (aarch64_evpc_sel (d))
24279 return true;
24280 else if (aarch64_evpc_ins (d))
24281 return true;
24282 else if (aarch64_evpc_reencode (d))
24283 return true;
24285 if (d->vec_flags == VEC_SVE_DATA)
24286 return aarch64_evpc_sve_tbl (d);
24287 else if (d->vec_flags == VEC_ADVSIMD)
24288 return aarch64_evpc_tbl (d);
24290 else
24292 if (aarch64_evpc_sve_dup (d))
24293 return true;
24296 return false;
24299 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24301 static bool
24302 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24303 rtx target, rtx op0, rtx op1,
24304 const vec_perm_indices &sel)
24306 struct expand_vec_perm_d d;
24308 /* Check whether the mask can be applied to a single vector. */
24309 if (sel.ninputs () == 1
24310 || (op0 && rtx_equal_p (op0, op1)))
24311 d.one_vector_p = true;
24312 else if (sel.all_from_input_p (0))
24314 d.one_vector_p = true;
24315 op1 = op0;
24317 else if (sel.all_from_input_p (1))
24319 d.one_vector_p = true;
24320 op0 = op1;
24322 else
24323 d.one_vector_p = false;
24325 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24326 sel.nelts_per_input ());
24327 d.vmode = vmode;
24328 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24329 d.op_mode = op_mode;
24330 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
24331 d.target = target;
24332 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
24333 if (op0 == op1)
24334 d.op1 = d.op0;
24335 else
24336 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
24337 d.testing_p = !target;
24339 if (!d.testing_p)
24340 return aarch64_expand_vec_perm_const_1 (&d);
24342 rtx_insn *last = get_last_insn ();
24343 bool ret = aarch64_expand_vec_perm_const_1 (&d);
24344 gcc_assert (last == get_last_insn ());
24346 return ret;
24349 /* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */
24351 bool
24352 aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
24353 tree vectype, wide_int cst,
24354 rtx *output, rtx in0, rtx in1)
24356 if (code != TRUNC_DIV_EXPR
24357 || !TYPE_UNSIGNED (vectype))
24358 return false;
24360 unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
24361 if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
24362 return false;
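/* Only divisors of the form 2^(precision/2) - 1 are handled below, e.g.
dividing 16-bit elements by 0xff or 32-bit elements by 0xffff. */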
24364 int pow = wi::exact_log2 (cst + 1);
24365 auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
24366 /* SVE actually has a div operator, so we may have gotten here through
24367 that route. */
24368 if (pow != (int) (element_precision (vectype) / 2)
24369 || insn_code == CODE_FOR_nothing)
24370 return false;
24372 /* We can use the optimized pattern. */
24373 if (in0 == NULL_RTX && in1 == NULL_RTX)
24374 return true;
24376 if (!VECTOR_TYPE_P (vectype))
24377 return false;
24379 gcc_assert (output);
24381 if (!*output)
24382 *output = gen_reg_rtx (TYPE_MODE (vectype));
24384 emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
24385 return true;
24388 /* Generate a byte permute mask for a register of mode MODE,
24389 which has NUNITS units. */
24391 rtx
24392 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24394 /* We have to reverse each vector because we don't have
24395 a permuted load that can reverse-load according to ABI rules. */
24396 rtx mask;
24397 rtvec v = rtvec_alloc (16);
24398 unsigned int i, j;
24399 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24401 gcc_assert (BYTES_BIG_ENDIAN);
24402 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24404 for (i = 0; i < nunits; i++)
24405 for (j = 0; j < usize; j++)
24406 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24407 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24408 return force_reg (V16QImode, mask);
24411 /* Expand an SVE integer comparison using the SVE equivalent of:
24413 (set TARGET (CODE OP0 OP1)). */
24415 void
24416 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24418 machine_mode pred_mode = GET_MODE (target);
24419 machine_mode data_mode = GET_MODE (op0);
24420 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24421 op0, op1);
24422 if (!rtx_equal_p (target, res))
24423 emit_move_insn (target, res);
24426 /* Return the UNSPEC_COND_* code for comparison CODE. */
24428 static unsigned int
24429 aarch64_unspec_cond_code (rtx_code code)
24431 switch (code)
24433 case NE:
24434 return UNSPEC_COND_FCMNE;
24435 case EQ:
24436 return UNSPEC_COND_FCMEQ;
24437 case LT:
24438 return UNSPEC_COND_FCMLT;
24439 case GT:
24440 return UNSPEC_COND_FCMGT;
24441 case LE:
24442 return UNSPEC_COND_FCMLE;
24443 case GE:
24444 return UNSPEC_COND_FCMGE;
24445 case UNORDERED:
24446 return UNSPEC_COND_FCMUO;
24447 default:
24448 gcc_unreachable ();
24452 /* Emit:
24454 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24456 where <X> is the operation associated with comparison CODE.
24457 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24459 static void
24460 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24461 bool known_ptrue_p, rtx op0, rtx op1)
24463 rtx flag = gen_int_mode (known_ptrue_p, SImode);
24464 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24465 gen_rtvec (4, pred, flag, op0, op1),
24466 aarch64_unspec_cond_code (code));
24467 emit_set_insn (target, unspec);
24470 /* Emit the SVE equivalent of:
24472 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24473 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24474 (set TARGET (ior:PRED_MODE TMP1 TMP2))
24476 where <Xi> is the operation associated with comparison CODEi.
24477 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24479 static void
24480 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24481 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24483 machine_mode pred_mode = GET_MODE (pred);
24484 rtx tmp1 = gen_reg_rtx (pred_mode);
24485 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24486 rtx tmp2 = gen_reg_rtx (pred_mode);
24487 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24488 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24491 /* Emit the SVE equivalent of:
24493 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24494 (set TARGET (not TMP))
24496 where <X> is the operation associated with comparison CODE.
24497 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24499 static void
24500 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24501 bool known_ptrue_p, rtx op0, rtx op1)
24503 machine_mode pred_mode = GET_MODE (pred);
24504 rtx tmp = gen_reg_rtx (pred_mode);
24505 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24506 aarch64_emit_unop (target, one_cmpl_optab, tmp);
24509 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24511 (set TARGET (CODE OP0 OP1))
24513 If CAN_INVERT_P is true, the caller can also handle inverted results;
24514 return true if the result is in fact inverted. */
24516 bool
24517 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24518 rtx op0, rtx op1, bool can_invert_p)
24520 machine_mode pred_mode = GET_MODE (target);
24521 machine_mode data_mode = GET_MODE (op0);
24523 rtx ptrue = aarch64_ptrue_reg (pred_mode);
24524 switch (code)
24526 case UNORDERED:
24527 /* UNORDERED has no immediate form. */
24528 op1 = force_reg (data_mode, op1);
24529 /* fall through */
24530 case LT:
24531 case LE:
24532 case GT:
24533 case GE:
24534 case EQ:
24535 case NE:
24537 /* There is native support for the comparison. */
24538 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24539 return false;
24542 case LTGT:
24543 /* This is a trapping operation (LT or GT). */
24544 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24545 return false;
24547 case UNEQ:
24548 if (!flag_trapping_math)
24550 /* This would trap for signaling NaNs. */
24551 op1 = force_reg (data_mode, op1);
24552 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24553 ptrue, true, op0, op1);
24554 return false;
24556 /* fall through */
24557 case UNLT:
24558 case UNLE:
24559 case UNGT:
24560 case UNGE:
24561 if (flag_trapping_math)
24563 /* Work out which elements are ordered. */
24564 rtx ordered = gen_reg_rtx (pred_mode);
24565 op1 = force_reg (data_mode, op1);
24566 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24567 ptrue, true, op0, op1);
24569 /* Test the opposite condition for the ordered elements,
24570 then invert the result. */
24571 if (code == UNEQ)
24572 code = NE;
24573 else
24574 code = reverse_condition_maybe_unordered (code);
24575 if (can_invert_p)
24577 aarch64_emit_sve_fp_cond (target, code,
24578 ordered, false, op0, op1);
24579 return true;
24581 aarch64_emit_sve_invert_fp_cond (target, code,
24582 ordered, false, op0, op1);
24583 return false;
24585 break;
24587 case ORDERED:
24588 /* ORDERED has no immediate form. */
24589 op1 = force_reg (data_mode, op1);
24590 break;
24592 default:
24593 gcc_unreachable ();
24596 /* There is native support for the inverse comparison. */
24597 code = reverse_condition_maybe_unordered (code);
24598 if (can_invert_p)
24600 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24601 return true;
24603 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24604 return false;
24607 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24608 of the data being selected and CMP_MODE is the mode of the values being
24609 compared. */
24611 void
24612 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24613 rtx *ops)
24615 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24616 rtx pred = gen_reg_rtx (pred_mode);
24617 if (FLOAT_MODE_P (cmp_mode))
24619 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24620 ops[4], ops[5], true))
24621 std::swap (ops[1], ops[2]);
24623 else
24624 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24626 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24627 ops[1] = force_reg (data_mode, ops[1]);
24628 /* The "false" value can only be zero if the "true" value is a constant. */
24629 if (register_operand (ops[1], data_mode)
24630 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24631 ops[2] = force_reg (data_mode, ops[2]);
24633 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24634 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24637 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24638 true. However, due to issues with register allocation it is preferable
24639 to avoid tying integer scalar and FP scalar modes. Executing integer
24640 operations in general registers is better than treating them as scalar
24641 vector operations. This reduces latency and avoids redundant int<->FP
24642 moves. So tie modes if they are either the same class, or vector modes
24643 with other vector modes, vector structs or any scalar mode. */
24645 static bool
24646 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24648 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24649 != aarch64_advsimd_partial_struct_mode_p (mode2))
24650 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24651 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24652 return false;
24654 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24655 return true;
24657 /* We specifically want to allow elements of "structure" modes to
24658 be tieable to the structure. This more general condition allows
24659 other rarer situations too. The reason we don't extend this to
24660 predicate modes is that there are no predicate structure modes
24661 nor any specific instructions for extracting part of a predicate
24662 register. */
24663 if (aarch64_vector_data_mode_p (mode1)
24664 && aarch64_vector_data_mode_p (mode2))
24665 return true;
24667 /* Also allow any scalar modes with vectors. */
24668 if (aarch64_vector_mode_supported_p (mode1)
24669 || aarch64_vector_mode_supported_p (mode2))
24670 return true;
24672 return false;
24675 /* Return a new RTX holding the result of moving POINTER forward by
24676 AMOUNT bytes. */
24678 static rtx
24679 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24681 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24683 return adjust_automodify_address (pointer, GET_MODE (pointer),
24684 next, amount);
24687 /* Return a new RTX holding the result of moving POINTER forward by the
24688 size of the mode it points to. */
24690 static rtx
24691 aarch64_progress_pointer (rtx pointer)
24693 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24696 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24697 MODE bytes. */
24699 static void
24700 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24701 machine_mode mode)
24703 /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
24704 address copies using V4SImode so that we can use Q registers. */
24705 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24707 mode = V4SImode;
24708 rtx reg1 = gen_reg_rtx (mode);
24709 rtx reg2 = gen_reg_rtx (mode);
24710 /* "Cast" the pointers to the correct mode. */
24711 *src = adjust_address (*src, mode, 0);
24712 *dst = adjust_address (*dst, mode, 0);
24713 /* Emit the memcpy. */
24714 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24715 aarch64_progress_pointer (*src)));
24716 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24717 aarch64_progress_pointer (*dst), reg2));
24718 /* Move the pointers forward. */
24719 *src = aarch64_move_pointer (*src, 32);
24720 *dst = aarch64_move_pointer (*dst, 32);
24721 return;
24724 rtx reg = gen_reg_rtx (mode);
24726 /* "Cast" the pointers to the correct mode. */
24727 *src = adjust_address (*src, mode, 0);
24728 *dst = adjust_address (*dst, mode, 0);
24729 /* Emit the memcpy. */
24730 emit_move_insn (reg, *src);
24731 emit_move_insn (*dst, reg);
24732 /* Move the pointers forward. */
24733 *src = aarch64_progress_pointer (*src);
24734 *dst = aarch64_progress_pointer (*dst);
24737 /* Expand a cpymem using the MOPS extension. OPERANDS are taken
24738 from the cpymem pattern. Return true iff we succeeded. */
24739 static bool
24740 aarch64_expand_cpymem_mops (rtx *operands)
24742 if (!TARGET_MOPS)
24743 return false;
24745 /* All three registers are changed by the instruction, so each one
24746 must be a fresh pseudo. */
24747 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24748 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24749 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24750 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24751 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24752 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24754 return true;
24757 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24758 we succeed, otherwise return false, indicating that a libcall to
24759 memcpy should be emitted. */
24761 bool
24762 aarch64_expand_cpymem (rtx *operands)
24764 int mode_bits;
24765 rtx dst = operands[0];
24766 rtx src = operands[1];
24767 rtx base;
24768 machine_mode cur_mode = BLKmode;
24770 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24771 if (!CONST_INT_P (operands[2]))
24772 return aarch64_expand_cpymem_mops (operands);
24774 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24776 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24777 unsigned HOST_WIDE_INT max_copy_size
24778 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24780 bool size_p = optimize_function_for_size_p (cfun);
24782 /* Large constant-sized cpymem should go through MOPS when possible.
24783 It should be a win even for size optimization in the general case.
24784 For speed optimization the choice between MOPS and the SIMD sequence
24785 depends on the size of the copy, rather than number of instructions,
24786 alignment etc. */
24787 if (size > max_copy_size)
24788 return aarch64_expand_cpymem_mops (operands);
24790 int copy_bits = 256;
24792 /* Default to 256-bit LDP/STP on large copies; for small copies, no SIMD
24793 support or slow 256-bit LDP/STP, fall back to 128-bit chunks. */
24794 if (size <= 24
24795 || !TARGET_SIMD
24796 || (aarch64_tune_params.extra_tuning_flags
24797 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24798 copy_bits = 128;
24800 /* Emit an inline load+store sequence and count the number of operations
24801 involved. We use a simple count of just the loads and stores emitted
24802 rather than rtx_insn count as all the pointer adjustments and reg copying
24803 in this function will get optimized away later in the pipeline. */
24804 start_sequence ();
24805 unsigned nops = 0;
24807 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24808 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24810 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24811 src = adjust_automodify_address (src, VOIDmode, base, 0);
24813 /* Convert size to bits to make the rest of the code simpler. */
24814 int n = size * BITS_PER_UNIT;
24816 while (n > 0)
24818 /* Find the largest mode in which to do the copy without over-reading
24819 or over-writing. */
24820 opt_scalar_int_mode mode_iter;
24821 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24822 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24823 cur_mode = mode_iter.require ();
24825 gcc_assert (cur_mode != BLKmode);
24827 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24829 /* Prefer Q-register accesses for the last bytes. */
24830 if (mode_bits == 128 && copy_bits == 256)
24831 cur_mode = V4SImode;
24833 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24834 /* A single block copy is 1 load + 1 store. */
24835 nops += 2;
24836 n -= mode_bits;
24838 /* Emit trailing copies using overlapping unaligned accesses
24839 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
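/* For example, a 23-byte copy becomes a 16-byte copy followed by an 8-byte
copy whose first byte overlaps the last byte already written, rather than
separate 4-, 2- and 1-byte copies. (Illustrative size.) */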
24840 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24842 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
24843 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24844 gcc_assert (n_bits <= mode_bits);
24845 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
24846 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24847 n = n_bits;
24850 rtx_insn *seq = get_insns ();
24851 end_sequence ();
24852 /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
24853 the constant size into a register. */
24854 unsigned mops_cost = 3 + 1;
24856 /* If MOPS is available at this point we don't consider the libcall as it's
24857 not a win even on code size. At this point only consider MOPS if
24858 optimizing for size. For speed optimizations we will have chosen between
24859 the two based on copy size already. */
24860 if (TARGET_MOPS)
24862 if (size_p && mops_cost < nops)
24863 return aarch64_expand_cpymem_mops (operands);
24864 emit_insn (seq);
24865 return true;
24868 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
24869 arguments + 1 for the call. When MOPS is not available and we're
24870 optimizing for size, a libcall may be preferable. */
24871 unsigned libcall_cost = 4;
24872 if (size_p && libcall_cost < nops)
24873 return false;
24875 emit_insn (seq);
24876 return true;
24879 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
24880 SRC is a register we have created with the duplicated value to be set. */
24881 static void
24882 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
24883 machine_mode mode)
24885 /* If we are copying 128 bits or 256 bits, we can do that straight from
24886 the SIMD register we prepared. */
24887 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24889 mode = GET_MODE (src);
24890 /* "Cast" the *dst to the correct mode. */
24891 *dst = adjust_address (*dst, mode, 0);
24892 /* Emit the memset. */
24893 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
24894 aarch64_progress_pointer (*dst), src));
24896 /* Move the pointers forward. */
24897 *dst = aarch64_move_pointer (*dst, 32);
24898 return;
24900 if (known_eq (GET_MODE_BITSIZE (mode), 128))
24902 /* "Cast" the *dst to the correct mode. */
24903 *dst = adjust_address (*dst, GET_MODE (src), 0);
24904 /* Emit the memset. */
24905 emit_move_insn (*dst, src);
24906 /* Move the pointers forward. */
24907 *dst = aarch64_move_pointer (*dst, 16);
24908 return;
24910 /* For copying less than 128 bits, we have to extract the right amount from src. */
24911 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
24913 /* "Cast" the *dst to the correct mode. */
24914 *dst = adjust_address (*dst, mode, 0);
24915 /* Emit the memset. */
24916 emit_move_insn (*dst, reg);
24917 /* Move the pointer forward. */
24918 *dst = aarch64_progress_pointer (*dst);
24921 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
24922 as for the setmem pattern. Return true iff we succeed. */
24923 static bool
24924 aarch64_expand_setmem_mops (rtx *operands)
24926 if (!TARGET_MOPS)
24927 return false;
24929 /* The first two registers are changed by the instruction, so both
24930 of them must be a fresh pseudo. */
24931 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24932 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24933 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
24934 rtx val = operands[2];
24935 if (val != CONST0_RTX (QImode))
24936 val = force_reg (QImode, val);
24937 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
24938 return true;
24941 /* Expand setmem, as if from a __builtin_memset. Return true if
24942 we succeed, otherwise return false. */
24944 bool
24945 aarch64_expand_setmem (rtx *operands)
24947 int n, mode_bits;
24948 unsigned HOST_WIDE_INT len;
24949 rtx dst = operands[0];
24950 rtx val = operands[2], src;
24951 rtx base;
24952 machine_mode cur_mode = BLKmode, next_mode;
24954 /* If we don't have SIMD registers or the size is variable, use the MOPS
24955 inlined sequence if possible. */
24956 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
24957 return aarch64_expand_setmem_mops (operands);
24959 bool size_p = optimize_function_for_size_p (cfun);
24961 /* Default the maximum to 256 bytes when considering only a libcall vs
24962 the SIMD broadcast sequence. */
24963 unsigned max_set_size = 256;
24965 len = INTVAL (operands[1]);
24966 if (len > max_set_size && !TARGET_MOPS)
24967 return false;
24969 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
24970 /* The MOPS sequence takes:
24971 3 instructions for the memory storing
24972 + 1 to move the constant size into a reg
24973 + 1 if VAL is a non-zero constant to move into a reg
24974 (zero constants can use XZR directly). */
24975 unsigned mops_cost = 3 + 1 + cst_val;
24976 /* A libcall to memset in the worst case takes 3 instructions to prepare
24977 the arguments + 1 for the call. */
24978 unsigned libcall_cost = 4;
24980 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
24981 when available. */
24982 if (TARGET_MOPS
24983 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
24984 return aarch64_expand_setmem_mops (operands);
24986 /* Attempt a sequence with a vector broadcast followed by stores.
24987 Count the number of operations involved to see if it's worth it
24988 against the alternatives. A simple counter simd_ops on the
24989 algorithmically-relevant operations is used rather than an rtx_insn count
24990 as all the pointer adjustments and mode reinterprets will be optimized
24991 away later. */
24992 start_sequence ();
24993 unsigned simd_ops = 0;
24995 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24996 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24998 /* Prepare the val using a DUP/MOVI v0.16B, val. */
24999 src = expand_vector_broadcast (V16QImode, val);
25000 src = force_reg (V16QImode, src);
25001 simd_ops++;
25002 /* Convert len to bits to make the rest of the code simpler. */
25003 n = len * BITS_PER_UNIT;
25005 /* Maximum amount to copy in one go. We allow 256-bit chunks unless the
25006 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning flag is set. */
25007 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
25008 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
25009 ? GET_MODE_BITSIZE (TImode) : 256;
25011 while (n > 0)
25013 /* Find the largest mode in which to do the copy without
25014 over-writing. */
25015 opt_scalar_int_mode mode_iter;
25016 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
25017 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
25018 cur_mode = mode_iter.require ();
25020 gcc_assert (cur_mode != BLKmode);
25022 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
25023 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
25024 simd_ops++;
25025 n -= mode_bits;
25027 /* Do certain trailing copies as overlapping if it's going to be
25028 cheaper, i.e. fewer instructions to do so. For instance, for a 15
25029 byte copy it is more efficient to do two overlapping 8 byte copies than
25030 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
25031 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
25033 next_mode = smallest_mode_for_size (n, MODE_INT);
25034 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25035 gcc_assert (n_bits <= mode_bits);
25036 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25037 n = n_bits;
25040 rtx_insn *seq = get_insns ();
25041 end_sequence ();
25043 if (size_p)
25045 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
25046 a call to memset, or the MOPS expansion. */
25047 if (TARGET_MOPS
25048 && mops_cost <= libcall_cost
25049 && mops_cost <= simd_ops)
25050 return aarch64_expand_setmem_mops (operands);
25051 /* If MOPS is not available or not shorter pick a libcall if the SIMD
25052 sequence is too long. */
25053 else if (libcall_cost < simd_ops)
25054 return false;
25055 emit_insn (seq);
25056 return true;
25059 /* At this point the SIMD broadcast sequence is the best choice when
25060 optimizing for speed. */
25061 emit_insn (seq);
25062 return true;
25066 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
25067 SImode stores. Handle the case when the constant has identical
25068 bottom and top halves. This is beneficial when the two stores can be
25069 merged into an STP and we avoid synthesising potentially expensive
25070 immediates twice. Return true if such a split is possible. */
25072 bool
25073 aarch64_split_dimode_const_store (rtx dst, rtx src)
25075 rtx lo = gen_lowpart (SImode, src);
25076 rtx hi = gen_highpart_mode (SImode, DImode, src);
25078 bool size_p = optimize_function_for_size_p (cfun);
25080 if (!rtx_equal_p (lo, hi))
25081 return false;
25083 unsigned int orig_cost
25084 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
25085 unsigned int lo_cost
25086 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
25088 /* We want to transform:
25089 MOV x1, 49370
25090 MOVK x1, 0x140, lsl 16
25091 MOVK x1, 0xc0da, lsl 32
25092 MOVK x1, 0x140, lsl 48
25093 STR x1, [x0]
25094 into:
25095 MOV w1, 49370
25096 MOVK w1, 0x140, lsl 16
25097 STP w1, w1, [x0]
25098 So we want to perform this only when we save two instructions
25099 or more. When optimizing for size, however, accept any code size
25100 savings we can. */
25101 if (size_p && orig_cost <= lo_cost)
25102 return false;
25104 if (!size_p
25105 && (orig_cost <= lo_cost + 1))
25106 return false;
25108 rtx mem_lo = adjust_address (dst, SImode, 0);
25109 if (!aarch64_mem_pair_operand (mem_lo, SImode))
25110 return false;
25112 rtx tmp_reg = gen_reg_rtx (SImode);
25113 aarch64_expand_mov_immediate (tmp_reg, lo);
25114 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
25115 /* Don't emit an explicit store pair as this may not be always profitable.
25116 Let the sched-fusion logic decide whether to merge them. */
25117 emit_move_insn (mem_lo, tmp_reg);
25118 emit_move_insn (mem_hi, tmp_reg);
25120 return true;
25123 /* Generate RTL for a conditional branch with rtx comparison CODE in
25124 mode CC_MODE. The destination of the unlikely conditional branch
25125 is LABEL_REF. */
25127 void
25128 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
25129 rtx label_ref)
25131 rtx x;
25132 x = gen_rtx_fmt_ee (code, VOIDmode,
25133 gen_rtx_REG (cc_mode, CC_REGNUM),
25134 const0_rtx);
25136 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25137 gen_rtx_LABEL_REF (VOIDmode, label_ref),
25138 pc_rtx);
25139 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25142 /* Generate DImode scratch registers for 128-bit (TImode) addition.
25144 OP1 represents the TImode source operand 1
25145 OP2 represents the TImode source operand 2
25146 LOW_DEST represents the low half (DImode) of TImode operand 0
25147 LOW_IN1 represents the low half (DImode) of TImode operand 1
25148 LOW_IN2 represents the low half (DImode) of TImode operand 2
25149 HIGH_DEST represents the high half (DImode) of TImode operand 0
25150 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25151 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25153 void
25154 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25155 rtx *low_in1, rtx *low_in2,
25156 rtx *high_dest, rtx *high_in1,
25157 rtx *high_in2)
25159 *low_dest = gen_reg_rtx (DImode);
25160 *low_in1 = gen_lowpart (DImode, op1);
25161 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25162 subreg_lowpart_offset (DImode, TImode));
25163 *high_dest = gen_reg_rtx (DImode);
25164 *high_in1 = gen_highpart (DImode, op1);
25165 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25166 subreg_highpart_offset (DImode, TImode));
25169 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25171 This function differs from 'aarch64_addti_scratch_regs' in that
25172 OP1 can be an immediate constant (zero). We must call
25173 subreg_highpart_offset with DImode and TImode arguments, otherwise
25174 VOIDmode will be used for the const_int, which generates an internal
25175 error from subreg_size_highpart_offset, which does not expect a size of zero.
25177 OP1 represents the TImode source operand 1
25178 OP2 represents the TImode source operand 2
25179 LOW_DEST represents the low half (DImode) of TImode operand 0
25180 LOW_IN1 represents the low half (DImode) of TImode operand 1
25181 LOW_IN2 represents the low half (DImode) of TImode operand 2
25182 HIGH_DEST represents the high half (DImode) of TImode operand 0
25183 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25184 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25187 void
25188 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25189 rtx *low_in1, rtx *low_in2,
25190 rtx *high_dest, rtx *high_in1,
25191 rtx *high_in2)
25193 *low_dest = gen_reg_rtx (DImode);
25194 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
25195 subreg_lowpart_offset (DImode, TImode));
25197 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25198 subreg_lowpart_offset (DImode, TImode));
25199 *high_dest = gen_reg_rtx (DImode);
25201 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
25202 subreg_highpart_offset (DImode, TImode));
25203 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25204 subreg_highpart_offset (DImode, TImode));
25207 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
25209 OP0 represents the TImode destination operand 0
25210 LOW_DEST represents the low half (DImode) of TImode operand 0
25211 LOW_IN1 represents the low half (DImode) of TImode operand 1
25212 LOW_IN2 represents the low half (DImode) of TImode operand 2
25213 HIGH_DEST represents the high half (DImode) of TImode operand 0
25214 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25215 HIGH_IN2 represents the high half (DImode) of TImode operand 2
25216 UNSIGNED_P is true if the operation is being performed on unsigned
25217 values. */
25218 void
25219 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
25220 rtx low_in2, rtx high_dest, rtx high_in1,
25221 rtx high_in2, bool unsigned_p)
25223 if (low_in2 == const0_rtx)
25225 low_dest = low_in1;
25226 high_in2 = force_reg (DImode, high_in2);
25227 if (unsigned_p)
25228 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
25229 else
25230 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
25232 else
25234 if (aarch64_plus_immediate (low_in2, DImode))
25235 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25236 GEN_INT (-UINTVAL (low_in2))));
25237 else
25239 low_in2 = force_reg (DImode, low_in2);
25240 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25242 high_in2 = force_reg (DImode, high_in2);
25244 if (unsigned_p)
25245 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25246 else
25247 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25250 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25251 emit_move_insn (gen_highpart (DImode, op0), high_dest);
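
/* Illustrative only, not part of GCC: the double-word subtraction that the
   expander above emits as SUBS/SBCS, written out with plain 64-bit
   integers.  The borrow produced by the low halves feeds the high halves.
   The example_* name is ours.  */
static void
example_sub128 (unsigned long long lo1, unsigned long long hi1,
		unsigned long long lo2, unsigned long long hi2,
		unsigned long long *lo_res, unsigned long long *hi_res)
{
  *lo_res = lo1 - lo2;			/* SUBS: also computes the borrow.  */
  unsigned long long borrow = lo1 < lo2;
  *hi_res = hi1 - hi2 - borrow;		/* SBCS: consumes the borrow.  */
}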
25255 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
25257 static unsigned HOST_WIDE_INT
25258 aarch64_asan_shadow_offset (void)
25260 if (TARGET_ILP32)
25261 return (HOST_WIDE_INT_1 << 29);
25262 else
25263 return (HOST_WIDE_INT_1 << 36);
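
/* Illustrative only, not part of GCC: a hypothetical sketch of how a
   sanitizer runtime typically combines the offset returned above with an
   address, assuming the usual shadow scale of 3 (one shadow byte per 8
   application bytes).  The example_* name is ours.  */
static unsigned long long
example_shadow_address (unsigned long long addr,
			unsigned long long shadow_offset)
{
  return (addr >> 3) + shadow_offset;
}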
25266 static rtx
25267 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25268 int code, tree treeop0, tree treeop1)
25270 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25271 rtx op0, op1;
25272 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25273 insn_code icode;
25274 struct expand_operand ops[4];
25276 start_sequence ();
25277 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25279 op_mode = GET_MODE (op0);
25280 if (op_mode == VOIDmode)
25281 op_mode = GET_MODE (op1);
25283 switch (op_mode)
25285 case E_QImode:
25286 case E_HImode:
25287 case E_SImode:
25288 cmp_mode = SImode;
25289 icode = CODE_FOR_cmpsi;
25290 break;
25292 case E_DImode:
25293 cmp_mode = DImode;
25294 icode = CODE_FOR_cmpdi;
25295 break;
25297 case E_SFmode:
25298 cmp_mode = SFmode;
25299 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25300 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25301 break;
25303 case E_DFmode:
25304 cmp_mode = DFmode;
25305 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25306 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25307 break;
25309 default:
25310 end_sequence ();
25311 return NULL_RTX;
25314 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25315 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25316 if (!op0 || !op1)
25318 end_sequence ();
25319 return NULL_RTX;
25321 *prep_seq = get_insns ();
25322 end_sequence ();
25324 create_fixed_operand (&ops[0], op0);
25325 create_fixed_operand (&ops[1], op1);
25327 start_sequence ();
25328 if (!maybe_expand_insn (icode, 2, ops))
25330 end_sequence ();
25331 return NULL_RTX;
25333 *gen_seq = get_insns ();
25334 end_sequence ();
25336 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25337 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
25340 static rtx
25341 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25342 int cmp_code, tree treeop0, tree treeop1, int bit_code)
25344 rtx op0, op1, target;
25345 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25346 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25347 insn_code icode;
25348 struct expand_operand ops[6];
25349 int aarch64_cond;
25351 push_to_sequence (*prep_seq);
25352 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25354 op_mode = GET_MODE (op0);
25355 if (op_mode == VOIDmode)
25356 op_mode = GET_MODE (op1);
25358 switch (op_mode)
25360 case E_QImode:
25361 case E_HImode:
25362 case E_SImode:
25363 cmp_mode = SImode;
25364 break;
25366 case E_DImode:
25367 cmp_mode = DImode;
25368 break;
25370 case E_SFmode:
25371 cmp_mode = SFmode;
25372 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25373 break;
25375 case E_DFmode:
25376 cmp_mode = DFmode;
25377 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25378 break;
25380 default:
25381 end_sequence ();
25382 return NULL_RTX;
25385 icode = code_for_ccmp (cc_mode, cmp_mode);
25387 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25388 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25389 if (!op0 || !op1)
25391 end_sequence ();
25392 return NULL_RTX;
25394 *prep_seq = get_insns ();
25395 end_sequence ();
25397 target = gen_rtx_REG (cc_mode, CC_REGNUM);
25398 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25400 if (bit_code != AND)
25402 /* Treat the ccmp patterns as canonical and use them where possible,
25403 but fall back to ccmp_rev patterns if there's no other option. */
25404 rtx_code prev_code = GET_CODE (prev);
25405 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25406 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25407 && !(prev_code == EQ
25408 || prev_code == NE
25409 || prev_code == ORDERED
25410 || prev_code == UNORDERED))
25411 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25412 else
25414 rtx_code code = reverse_condition (prev_code);
25415 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25417 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25420 create_fixed_operand (&ops[0], XEXP (prev, 0));
25421 create_fixed_operand (&ops[1], target);
25422 create_fixed_operand (&ops[2], op0);
25423 create_fixed_operand (&ops[3], op1);
25424 create_fixed_operand (&ops[4], prev);
25425 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25427 push_to_sequence (*gen_seq);
25428 if (!maybe_expand_insn (icode, 6, ops))
25430 end_sequence ();
25431 return NULL_RTX;
25434 *gen_seq = get_insns ();
25435 end_sequence ();
25437 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
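
/* Illustrative only, not part of GCC: the effect of the CMP/CCMP chain
   built by the two hooks above, modelled with booleans for a condition such
   as "a == b && c < d".  The CCMP performs the second comparison only when
   the first condition held; otherwise it substitutes immediate flags chosen
   so the final test fails.  The example_* name is ours.  */
static bool
example_ccmp_and (long a, long b, long c, long d)
{
  bool first = (a == b);		/* CMP a, b  */
  bool result = first ? (c < d)		/* CCMP c, d, #nzcv, eq  */
		      : false;		/* #nzcv chosen so "lt" fails  */
  return result;			/* csel/branch on "lt"  */
}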
25440 #undef TARGET_GEN_CCMP_FIRST
25441 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25443 #undef TARGET_GEN_CCMP_NEXT
25444 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25446 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25447 instruction fusion of some sort. */
25449 static bool
25450 aarch64_macro_fusion_p (void)
25452 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25456 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25457 should be kept together during scheduling. */
25459 static bool
25460 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25462 rtx set_dest;
25463 rtx prev_set = single_set (prev);
25464 rtx curr_set = single_set (curr);
25465 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
25466 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25468 if (!aarch64_macro_fusion_p ())
25469 return false;
25471 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25473 /* We are trying to match:
25474 prev (mov) == (set (reg r0) (const_int imm16))
25475 curr (movk) == (set (zero_extract (reg r0)
25476 (const_int 16)
25477 (const_int 16))
25478 (const_int imm16_1)) */
25480 set_dest = SET_DEST (curr_set);
25482 if (GET_CODE (set_dest) == ZERO_EXTRACT
25483 && CONST_INT_P (SET_SRC (curr_set))
25484 && CONST_INT_P (SET_SRC (prev_set))
25485 && CONST_INT_P (XEXP (set_dest, 2))
25486 && INTVAL (XEXP (set_dest, 2)) == 16
25487 && REG_P (XEXP (set_dest, 0))
25488 && REG_P (SET_DEST (prev_set))
25489 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25491 return true;
25495 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25498 /* We're trying to match:
25499 prev (adrp) == (set (reg r1)
25500 (high (symbol_ref ("SYM"))))
25501 curr (add) == (set (reg r0)
25502 (lo_sum (reg r1)
25503 (symbol_ref ("SYM"))))
25504 Note that r0 need not necessarily be the same as r1, especially
25505 during pre-regalloc scheduling. */
25507 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25508 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25510 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25511 && REG_P (XEXP (SET_SRC (curr_set), 0))
25512 && REGNO (XEXP (SET_SRC (curr_set), 0))
25513 == REGNO (SET_DEST (prev_set))
25514 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25515 XEXP (SET_SRC (curr_set), 1)))
25516 return true;
25520 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25523 /* We're trying to match:
25524 prev (movk) == (set (zero_extract (reg r0)
25525 (const_int 16)
25526 (const_int 32))
25527 (const_int imm16_1))
25528 curr (movk) == (set (zero_extract (reg r0)
25529 (const_int 16)
25530 (const_int 48))
25531 (const_int imm16_2)) */
25533 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25534 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25535 && REG_P (XEXP (SET_DEST (prev_set), 0))
25536 && REG_P (XEXP (SET_DEST (curr_set), 0))
25537 && REGNO (XEXP (SET_DEST (prev_set), 0))
25538 == REGNO (XEXP (SET_DEST (curr_set), 0))
25539 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25540 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25541 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25542 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25543 && CONST_INT_P (SET_SRC (prev_set))
25544 && CONST_INT_P (SET_SRC (curr_set)))
25545 return true;
25548 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25550 /* We're trying to match:
25551 prev (adrp) == (set (reg r0)
25552 (high (symbol_ref ("SYM"))))
25553 curr (ldr) == (set (reg r1)
25554 (mem (lo_sum (reg r0)
25555 (symbol_ref ("SYM")))))
25557 curr (ldr) == (set (reg r1)
25558 (zero_extend (mem
25559 (lo_sum (reg r0)
25560 (symbol_ref ("SYM")))))) */
25561 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25562 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25564 rtx curr_src = SET_SRC (curr_set);
25566 if (GET_CODE (curr_src) == ZERO_EXTEND)
25567 curr_src = XEXP (curr_src, 0);
25569 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25570 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25571 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25572 == REGNO (SET_DEST (prev_set))
25573 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25574 XEXP (SET_SRC (prev_set), 0)))
25575 return true;
25579 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
25580 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25581 && prev_set && curr_set && any_condjump_p (curr)
25582 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25583 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25584 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25585 return true;
25587 /* Fuse flag-setting ALU instructions and conditional branch. */
25588 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25589 && any_condjump_p (curr))
25591 unsigned int condreg1, condreg2;
25592 rtx cc_reg_1;
25593 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25594 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25596 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25597 && prev
25598 && modified_in_p (cc_reg_1, prev))
25600 enum attr_type prev_type = get_attr_type (prev);
25602 /* FIXME: this misses some instructions that are considered simple
25603 arithmetic instructions for ThunderX. Simple shifts are missed here. */
25604 if (prev_type == TYPE_ALUS_SREG
25605 || prev_type == TYPE_ALUS_IMM
25606 || prev_type == TYPE_LOGICS_REG
25607 || prev_type == TYPE_LOGICS_IMM)
25608 return true;
25612 /* Fuse ALU instructions and CBZ/CBNZ. */
25613 if (prev_set
25614 && curr_set
25615 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25616 && any_condjump_p (curr))
25618 /* We're trying to match:
25619 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25620 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25621 (const_int 0))
25622 (label_ref ("SYM"))
25623 (pc)) */
25624 if (SET_DEST (curr_set) == (pc_rtx)
25625 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25626 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25627 && REG_P (SET_DEST (prev_set))
25628 && REGNO (SET_DEST (prev_set))
25629 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25631 /* Fuse ALU operations followed by conditional branch instruction. */
25632 switch (get_attr_type (prev))
25634 case TYPE_ALU_IMM:
25635 case TYPE_ALU_SREG:
25636 case TYPE_ADC_REG:
25637 case TYPE_ADC_IMM:
25638 case TYPE_ADCS_REG:
25639 case TYPE_ADCS_IMM:
25640 case TYPE_LOGIC_REG:
25641 case TYPE_LOGIC_IMM:
25642 case TYPE_CSEL:
25643 case TYPE_ADR:
25644 case TYPE_MOV_IMM:
25645 case TYPE_SHIFT_REG:
25646 case TYPE_SHIFT_IMM:
25647 case TYPE_BFM:
25648 case TYPE_RBIT:
25649 case TYPE_REV:
25650 case TYPE_EXTEND:
25651 return true;
25653 default:;
25658 /* Fuse A+B+1 and A-B-1 */
25659 if (simple_sets_p
25660 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
25662 /* We're trying to match:
25663 prev == (set (r0) (plus (r0) (r1)))
25664 curr == (set (r0) (plus (r0) (const_int 1)))
25666 prev == (set (r0) (minus (r0) (r1)))
25667 curr == (set (r0) (plus (r0) (const_int -1))) */
25669 rtx prev_src = SET_SRC (prev_set);
25670 rtx curr_src = SET_SRC (curr_set);
25672 int polarity = 1;
25673 if (GET_CODE (prev_src) == MINUS)
25674 polarity = -1;
25676 if (GET_CODE (curr_src) == PLUS
25677 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
25678 && CONST_INT_P (XEXP (curr_src, 1))
25679 && INTVAL (XEXP (curr_src, 1)) == polarity
25680 && REG_P (XEXP (curr_src, 0))
25681 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
25682 return true;
25685 return false;
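
/* Illustrative only, not part of GCC: the shape of the MOV/MOVK fusion test
   above, with each insn reduced to a destination register number and a flag
   saying whether it only writes a 16-bit field of that register.  The
   example_* names are ours.  */
struct example_insn { unsigned dest_regno; bool is_movk; };

static bool
example_mov_movk_fusible_p (const example_insn &prev,
			    const example_insn &curr)
{
  return !prev.is_movk
	 && curr.is_movk
	 && prev.dest_regno == curr.dest_regno;
}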
25688 /* Return true iff the instruction fusion described by OP is enabled. */
25690 bool
25691 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25693 return (aarch64_tune_params.fusible_ops & op) != 0;
25696 /* If MEM is in the form of [base+offset], extract the two parts
25697 of the address and store them in BASE and OFFSET; otherwise return false
25698 after clearing BASE and OFFSET. */
25700 bool
25701 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25703 rtx addr;
25705 gcc_assert (MEM_P (mem));
25707 addr = XEXP (mem, 0);
25709 if (REG_P (addr))
25711 *base = addr;
25712 *offset = const0_rtx;
25713 return true;
25716 if (GET_CODE (addr) == PLUS
25717 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25719 *base = XEXP (addr, 0);
25720 *offset = XEXP (addr, 1);
25721 return true;
25724 *base = NULL_RTX;
25725 *offset = NULL_RTX;
25727 return false;
25730 /* Types for scheduling fusion. */
25731 enum sched_fusion_type
25733 SCHED_FUSION_NONE = 0,
25734 SCHED_FUSION_LD_SIGN_EXTEND,
25735 SCHED_FUSION_LD_ZERO_EXTEND,
25736 SCHED_FUSION_LD,
25737 SCHED_FUSION_ST,
25738 SCHED_FUSION_NUM
25741 /* If INSN is a load or store of an address in the form of [base+offset],
25742 extract the two parts and store them in BASE and OFFSET. Return the
25743 scheduling fusion type of INSN. */
25745 static enum sched_fusion_type
25746 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25748 rtx x, dest, src;
25749 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25751 gcc_assert (INSN_P (insn));
25752 x = PATTERN (insn);
25753 if (GET_CODE (x) != SET)
25754 return SCHED_FUSION_NONE;
25756 src = SET_SRC (x);
25757 dest = SET_DEST (x);
25759 machine_mode dest_mode = GET_MODE (dest);
25761 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25762 return SCHED_FUSION_NONE;
25764 if (GET_CODE (src) == SIGN_EXTEND)
25766 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25767 src = XEXP (src, 0);
25768 if (!MEM_P (src) || GET_MODE (src) != SImode)
25769 return SCHED_FUSION_NONE;
25771 else if (GET_CODE (src) == ZERO_EXTEND)
25773 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25774 src = XEXP (src, 0);
25775 if (!MEM_P (src) || GET_MODE (src) != SImode)
25776 return SCHED_FUSION_NONE;
25779 if (MEM_P (src) && REG_P (dest))
25780 extract_base_offset_in_addr (src, base, offset);
25781 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25783 fusion = SCHED_FUSION_ST;
25784 extract_base_offset_in_addr (dest, base, offset);
25786 else
25787 return SCHED_FUSION_NONE;
25789 if (*base == NULL_RTX || *offset == NULL_RTX)
25790 fusion = SCHED_FUSION_NONE;
25792 return fusion;
25795 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25797 Currently we only support fusing ldr and str instructions, so FUSION_PRI
25798 and PRI are only calculated for these instructions. For other instructions,
25799 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
25800 types of instruction fusion can be added by returning different priorities.
25802 It's important that irrelevant instructions get the largest FUSION_PRI. */
25804 static void
25805 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25806 int *fusion_pri, int *pri)
25808 int tmp, off_val;
25809 rtx base, offset;
25810 enum sched_fusion_type fusion;
25812 gcc_assert (INSN_P (insn));
25814 tmp = max_pri - 1;
25815 fusion = fusion_load_store (insn, &base, &offset);
25816 if (fusion == SCHED_FUSION_NONE)
25818 *pri = tmp;
25819 *fusion_pri = tmp;
25820 return;
25823 /* Set FUSION_PRI according to fusion type and base register. */
25824 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25826 /* Calculate PRI. */
25827 tmp /= 2;
25829 /* INSN with smaller offset goes first. */
25830 off_val = (int)(INTVAL (offset));
25831 if (off_val >= 0)
25832 tmp -= (off_val & 0xfffff);
25833 else
25834 tmp += ((- off_val) & 0xfffff);
25836 *pri = tmp;
25837 return;
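
/* Illustrative only, not part of GCC: the priority arithmetic above on
   plain integers, with NUM_HARD_REGS standing in for
   FIRST_PSEUDO_REGISTER.  The example_* name is ours.  */
static void
example_fusion_priority (int max_pri, int fusion_type, int base_regno,
			 long long offset, int num_hard_regs,
			 int *fusion_pri, int *pri)
{
  int tmp = max_pri - 1;
  /* Accesses with the same fusion type and base register get adjacent
     FUSION_PRI values.  */
  *fusion_pri = tmp - fusion_type * num_hard_regs - base_regno;
  /* Within such a group, the access with the smaller offset goes first.  */
  tmp /= 2;
  if (offset >= 0)
    tmp -= (int) (offset & 0xfffff);
  else
    tmp += (int) ((-offset) & 0xfffff);
  *pri = tmp;
}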
25840 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25841 Adjust priority of sha1h instructions so they are scheduled before
25842 other SHA1 instructions. */
25844 static int
25845 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
25847 rtx x = PATTERN (insn);
25849 if (GET_CODE (x) == SET)
25851 x = SET_SRC (x);
25853 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
25854 return priority + 10;
25857 return priority;
25860 /* If REVERSED is null, return true if memory reference *MEM2 comes
25861 immediately after memory reference *MEM1. Do not change the references
25862 in this case.
25864 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
25865 if they are, try to make them use constant offsets from the same base
25866 register. Return true on success. When returning true, set *REVERSED
25867 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
25868 static bool
25869 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
25871 if (reversed)
25872 *reversed = false;
25874 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
25875 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
25876 return false;
25878 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
25879 return false;
25881 auto size1 = MEM_SIZE (*mem1);
25882 auto size2 = MEM_SIZE (*mem2);
25884 rtx base1, base2, offset1, offset2;
25885 extract_base_offset_in_addr (*mem1, &base1, &offset1);
25886 extract_base_offset_in_addr (*mem2, &base2, &offset2);
25888 /* Make sure at least one memory is in base+offset form. */
25889 if (!(base1 && offset1) && !(base2 && offset2))
25890 return false;
25892 /* If both mems already use the same base register, just check the
25893 offsets. */
25894 if (base1 && base2 && rtx_equal_p (base1, base2))
25896 if (!offset1 || !offset2)
25897 return false;
25899 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
25900 return true;
25902 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
25904 *reversed = true;
25905 return true;
25908 return false;
25911 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
25912 guarantee that the values are consecutive. */
25913 if (MEM_EXPR (*mem1)
25914 && MEM_EXPR (*mem2)
25915 && MEM_OFFSET_KNOWN_P (*mem1)
25916 && MEM_OFFSET_KNOWN_P (*mem2))
25918 poly_int64 expr_offset1;
25919 poly_int64 expr_offset2;
25920 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
25921 &expr_offset1);
25922 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
25923 &expr_offset2);
25924 if (!expr_base1
25925 || !expr_base2
25926 || !DECL_P (expr_base1)
25927 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
25928 return false;
25930 expr_offset1 += MEM_OFFSET (*mem1);
25931 expr_offset2 += MEM_OFFSET (*mem2);
25933 if (known_eq (expr_offset1 + size1, expr_offset2))
25935 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
25936 *reversed = true;
25937 else
25938 return false;
25940 if (reversed)
25942 if (base2)
25944 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
25945 expr_offset1 - expr_offset2);
25946 *mem1 = replace_equiv_address_nv (*mem1, addr1);
25948 else
25950 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
25951 expr_offset2 - expr_offset1);
25952 *mem2 = replace_equiv_address_nv (*mem2, addr2);
25955 return true;
25958 return false;
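
/* Illustrative only, not part of GCC: the consecutiveness test above
   reduced to known offsets and sizes measured from a common base.  The
   example_* name is ours.  */
static bool
example_consecutive_p (long long offset1, long long size1,
		       long long offset2, long long size2,
		       bool *reversed)
{
  if (offset1 + size1 == offset2)
    {
      *reversed = false;	/* MEM2 immediately follows MEM1.  */
      return true;
    }
  if (offset2 + size2 == offset1)
    {
      *reversed = true;		/* MEM1 immediately follows MEM2.  */
      return true;
    }
  return false;
}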
25961 /* Return true if MEM1 and MEM2 can be combined into a single access
25962 of mode MODE, with the combined access having the same address as MEM1. */
25964 bool
25965 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
25967 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
25968 return false;
25969 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
25972 /* Given OPERANDS of consecutive load/store, check if we can merge
25973 them into ldp/stp. LOAD is true if they are load instructions.
25974 MODE is the mode of memory operands. */
25976 bool
25977 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
25978 machine_mode mode)
25980 enum reg_class rclass_1, rclass_2;
25981 rtx mem_1, mem_2, reg_1, reg_2;
25983 if (load)
25985 mem_1 = operands[1];
25986 mem_2 = operands[3];
25987 reg_1 = operands[0];
25988 reg_2 = operands[2];
25989 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
25990 if (REGNO (reg_1) == REGNO (reg_2))
25991 return false;
25992 if (reg_overlap_mentioned_p (reg_1, mem_2))
25993 return false;
25995 else
25997 mem_1 = operands[0];
25998 mem_2 = operands[2];
25999 reg_1 = operands[1];
26000 reg_2 = operands[3];
26003 /* The mems cannot be volatile. */
26004 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
26005 return false;
26007 /* If we have SImode and slow unaligned ldp,
26008 check that the alignment is at least 8 bytes. */
26009 if (mode == SImode
26010 && (aarch64_tune_params.extra_tuning_flags
26011 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26012 && !optimize_size
26013 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
26014 return false;
26016 /* Check if the addresses are in the form of [base+offset]. */
26017 bool reversed = false;
26018 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
26019 return false;
26021 /* The operands must be of the same size. */
26022 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
26023 GET_MODE_SIZE (GET_MODE (mem_2))));
26025 /* One of the memory accesses must be a mempair operand.
26026 If it is not the first one, they need to be swapped by the
26027 peephole. */
26028 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
26029 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
26030 return false;
26032 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
26033 rclass_1 = FP_REGS;
26034 else
26035 rclass_1 = GENERAL_REGS;
26037 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
26038 rclass_2 = FP_REGS;
26039 else
26040 rclass_2 = GENERAL_REGS;
26042 /* Check if the registers are of same class. */
26043 if (rclass_1 != rclass_2)
26044 return false;
26046 return true;
26049 /* Given OPERANDS of consecutive load/store that can be merged,
26050 swap them if they are not in ascending order. */
26051 void
26052 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
26054 int mem_op = load ? 1 : 0;
26055 bool reversed = false;
26056 if (!aarch64_check_consecutive_mems (operands + mem_op,
26057 operands + mem_op + 2, &reversed))
26058 gcc_unreachable ();
26060 if (reversed)
26062 /* Irrespective of whether this is a load or a store,
26063 we do the same swap. */
26064 std::swap (operands[0], operands[2]);
26065 std::swap (operands[1], operands[3]);
26069 /* Taking X and Y to be pointers to HOST_WIDE_INT, return the result of
26070 comparing the two. */
26072 aarch64_host_wide_int_compare (const void *x, const void *y)
26074 return wi::cmps (* ((const HOST_WIDE_INT *) x),
26075 * ((const HOST_WIDE_INT *) y));
26078 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
26079 other pointing to a REG rtx containing an offset, compare the offsets
26080 of the two pairs.
26082 Return:
26084 1 iff offset (X) > offset (Y)
26085 0 iff offset (X) == offset (Y)
26086 -1 iff offset (X) < offset (Y) */
26088 aarch64_ldrstr_offset_compare (const void *x, const void *y)
26090 const rtx * operands_1 = (const rtx *) x;
26091 const rtx * operands_2 = (const rtx *) y;
26092 rtx mem_1, mem_2, base, offset_1, offset_2;
26094 if (MEM_P (operands_1[0]))
26095 mem_1 = operands_1[0];
26096 else
26097 mem_1 = operands_1[1];
26099 if (MEM_P (operands_2[0]))
26100 mem_2 = operands_2[0];
26101 else
26102 mem_2 = operands_2[1];
26104 /* Extract the offsets. */
26105 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26106 extract_base_offset_in_addr (mem_2, &base, &offset_2);
26108 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
26110 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
26113 /* Given OPERANDS of consecutive load/store, check if we can merge
26114 them into ldp/stp by adjusting the offset. LOAD is true if they
26115 are load instructions. MODE is the mode of memory operands.
26117 Given the following consecutive stores:
26119 str w1, [xb, 0x100]
26120 str w1, [xb, 0x104]
26121 str w1, [xb, 0x108]
26122 str w1, [xb, 0x10c]
26124 Though the offsets are out of the range supported by stp, we can
26125 still pair them after adjusting the offset, like:
26127 add scratch, xb, 0x100
26128 stp w1, w1, [scratch]
26129 stp w1, w1, [scratch, 0x8]
26131 The peephole patterns detecting this opportunity should guarantee
26132 the scratch register is available. */
26134 bool
26135 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
26136 machine_mode mode)
26138 const int num_insns = 4;
26139 enum reg_class rclass;
26140 HOST_WIDE_INT offvals[num_insns], msize;
26141 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
26143 if (load)
26145 for (int i = 0; i < num_insns; i++)
26147 reg[i] = operands[2 * i];
26148 mem[i] = operands[2 * i + 1];
26150 gcc_assert (REG_P (reg[i]));
26153 /* Do not attempt to merge the loads if the loads clobber each other. */
26154 for (int i = 0; i < 8; i += 2)
26155 for (int j = i + 2; j < 8; j += 2)
26156 if (reg_overlap_mentioned_p (operands[i], operands[j]))
26157 return false;
26159 else
26160 for (int i = 0; i < num_insns; i++)
26162 mem[i] = operands[2 * i];
26163 reg[i] = operands[2 * i + 1];
26166 /* Skip if memory operand is by itself valid for ldp/stp. */
26167 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
26168 return false;
26170 for (int i = 0; i < num_insns; i++)
26172 /* The mems cannot be volatile. */
26173 if (MEM_VOLATILE_P (mem[i]))
26174 return false;
26176 /* Check if the addresses are in the form of [base+offset]. */
26177 extract_base_offset_in_addr (mem[i], base + i, offset + i);
26178 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
26179 return false;
26182 /* Check if the registers are of same class. */
26183 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
26184 ? FP_REGS : GENERAL_REGS;
26186 for (int i = 1; i < num_insns; i++)
26187 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
26189 if (rclass != FP_REGS)
26190 return false;
26192 else
26194 if (rclass != GENERAL_REGS)
26195 return false;
26198 /* Only the last register in the order in which they occur
26199 may be clobbered by the load. */
26200 if (rclass == GENERAL_REGS && load)
26201 for (int i = 0; i < num_insns - 1; i++)
26202 if (reg_mentioned_p (reg[i], mem[i]))
26203 return false;
26205 /* Check if the bases are same. */
26206 for (int i = 0; i < num_insns - 1; i++)
26207 if (!rtx_equal_p (base[i], base[i + 1]))
26208 return false;
26210 for (int i = 0; i < num_insns; i++)
26211 offvals[i] = INTVAL (offset[i]);
26213 msize = GET_MODE_SIZE (mode).to_constant ();
26215 /* Check if the offsets can be put in the right order to do a ldp/stp. */
26216 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
26217 aarch64_host_wide_int_compare);
26219 if (!(offvals[1] == offvals[0] + msize
26220 && offvals[3] == offvals[2] + msize))
26221 return false;
26223 /* Check that offsets are within range of each other. The ldp/stp
26224 instructions have 7 bit immediate offsets, so use 0x80. */
26225 if (offvals[2] - offvals[0] >= msize * 0x80)
26226 return false;
26228 /* The offsets must be aligned with respect to each other. */
26229 if (offvals[0] % msize != offvals[2] % msize)
26230 return false;
26232 /* If we have SImode and slow unaligned ldp,
26233 check that the alignment is at least 8 bytes. */
26234 if (mode == SImode
26235 && (aarch64_tune_params.extra_tuning_flags
26236 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26237 && !optimize_size
26238 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
26239 return false;
26241 return true;
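
/* Illustrative only, not part of GCC: the offset checks above, applied to
   four already-sorted offsets OFF[0..3] and an access size MSIZE.  The
   example_* name is ours.  */
static bool
example_offsets_ok_for_two_pairs (const long long off[4], long long msize)
{
  /* Each ldp/stp covers two adjacent accesses.  */
  if (off[1] != off[0] + msize || off[3] != off[2] + msize)
    return false;
  /* ldp/stp immediates are 7 bits, scaled by the access size.  */
  if (off[2] - off[0] >= msize * 0x80)
    return false;
  /* Both pairs must share the same alignment within MSIZE.  */
  return off[0] % msize == off[2] % msize;
}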
26244 /* Given OPERANDS of consecutive load/store, this function pairs them
26245 into LDP/STP after adjusting the offset. It depends on the fact
26246 that the operands can be sorted so the offsets are correct for STP.
26247 MODE is the mode of memory operands. CODE is the rtl operator
26248 which should be applied to all memory operands, it's SIGN_EXTEND,
26249 ZERO_EXTEND or UNKNOWN. */
26251 bool
26252 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
26253 machine_mode mode, RTX_CODE code)
26255 rtx base, offset_1, offset_3, t1, t2;
26256 rtx mem_1, mem_2, mem_3, mem_4;
26257 rtx temp_operands[8];
26258 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
26259 stp_off_upper_limit, stp_off_lower_limit, msize;
26261 /* We make changes on a copy as we may still bail out. */
26262 for (int i = 0; i < 8; i ++)
26263 temp_operands[i] = operands[i];
26265 /* Sort the operands. Note for cases as below:
26266 [base + 0x310] = A
26267 [base + 0x320] = B
26268 [base + 0x330] = C
26269 [base + 0x320] = D
26270 We need a stable sort, otherwise wrong data may be stored to offset 0x320.
26271 Also note that the dead store in the above case should be optimized away,
26272 but there are no guarantees here. */
26273 gcc_stablesort(temp_operands, 4, 2 * sizeof (rtx *),
26274 aarch64_ldrstr_offset_compare);
26276 /* Copy the memory operands so that if we have to bail for some
26277 reason the original addresses are unchanged. */
26278 if (load)
26280 mem_1 = copy_rtx (temp_operands[1]);
26281 mem_2 = copy_rtx (temp_operands[3]);
26282 mem_3 = copy_rtx (temp_operands[5]);
26283 mem_4 = copy_rtx (temp_operands[7]);
26285 else
26287 mem_1 = copy_rtx (temp_operands[0]);
26288 mem_2 = copy_rtx (temp_operands[2]);
26289 mem_3 = copy_rtx (temp_operands[4]);
26290 mem_4 = copy_rtx (temp_operands[6]);
26291 gcc_assert (code == UNKNOWN);
26294 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26295 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26296 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26297 && offset_3 != NULL_RTX);
26299 /* Adjust offset so it can fit in LDP/STP instruction. */
26300 msize = GET_MODE_SIZE (mode).to_constant();
26301 stp_off_upper_limit = msize * (0x40 - 1);
26302 stp_off_lower_limit = - msize * 0x40;
26304 off_val_1 = INTVAL (offset_1);
26305 off_val_3 = INTVAL (offset_3);
26307 /* The base offset is optimally half way between the two STP/LDP offsets. */
26308 if (msize <= 4)
26309 base_off = (off_val_1 + off_val_3) / 2;
26310 else
26311 /* However, due to issues with negative LDP/STP offset generation for
26312 larger modes (DF, DD, DI and vector modes), we must not use negative
26313 addresses smaller than 9 signed unadjusted bits can store. This
26314 provides the most range in this case. */
26315 base_off = off_val_1;
26317 /* Adjust the base so that it is aligned with the addresses but still
26318 optimal. */
26319 if (base_off % msize != off_val_1 % msize)
26320 /* Fix the offset, bearing in mind we want to make it bigger not
26321 smaller. */
26322 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26323 else if (msize <= 4)
26324 /* The negative range of LDP/STP is one larger than the positive range. */
26325 base_off += msize;
26327 /* Check if base offset is too big or too small. We can attempt to resolve
26328 this issue by setting it to the maximum value and seeing if the offsets
26329 still fit. */
26330 if (base_off >= 0x1000)
26332 base_off = 0x1000 - 1;
26333 /* We must still make sure that the base offset is aligned with respect
26334 to the address. But it may not be made any bigger. */
26335 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26338 /* Likewise for the case where the base is too small. */
26339 if (base_off <= -0x1000)
26341 base_off = -0x1000 + 1;
26342 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26345 /* Offset of the first STP/LDP. */
26346 new_off_1 = off_val_1 - base_off;
26348 /* Offset of the second STP/LDP. */
26349 new_off_3 = off_val_3 - base_off;
26351 /* The offsets must be within the range of the LDP/STP instructions. */
26352 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26353 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26354 return false;
26356 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26357 new_off_1), true);
26358 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26359 new_off_1 + msize), true);
26360 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26361 new_off_3), true);
26362 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26363 new_off_3 + msize), true);
26365 if (!aarch64_mem_pair_operand (mem_1, mode)
26366 || !aarch64_mem_pair_operand (mem_3, mode))
26367 return false;
26369 if (code == ZERO_EXTEND)
26371 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26372 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26373 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26374 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26376 else if (code == SIGN_EXTEND)
26378 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26379 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26380 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26381 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26384 if (load)
26386 operands[0] = temp_operands[0];
26387 operands[1] = mem_1;
26388 operands[2] = temp_operands[2];
26389 operands[3] = mem_2;
26390 operands[4] = temp_operands[4];
26391 operands[5] = mem_3;
26392 operands[6] = temp_operands[6];
26393 operands[7] = mem_4;
26395 else
26397 operands[0] = mem_1;
26398 operands[1] = temp_operands[1];
26399 operands[2] = mem_2;
26400 operands[3] = temp_operands[3];
26401 operands[4] = mem_3;
26402 operands[5] = temp_operands[5];
26403 operands[6] = mem_4;
26404 operands[7] = temp_operands[7];
26407 /* Emit adjusting instruction. */
26408 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26409 /* Emit ldp/stp instructions. */
26410 t1 = gen_rtx_SET (operands[0], operands[1]);
26411 t2 = gen_rtx_SET (operands[2], operands[3]);
26412 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26413 t1 = gen_rtx_SET (operands[4], operands[5]);
26414 t2 = gen_rtx_SET (operands[6], operands[7]);
26415 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26416 return true;
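
/* Illustrative only, not part of GCC: the re-basing arithmetic above on
   plain integers.  Returns the chosen base offset, or -1 when the rebased
   offsets cannot fit the signed, size-scaled 7-bit ldp/stp range (the real
   code also re-aligns and clamps the base before giving up).  The
   example_* name is ours.  */
static long long
example_rebase_for_ldpstp (long long off1, long long off3, long long msize)
{
  long long upper = msize * (0x40 - 1);
  long long lower = -msize * 0x40;

  /* Half way between the two pairs for small elements, otherwise anchor
     on the first pair to keep the new offsets non-negative.  */
  long long base_off = msize <= 4 ? (off1 + off3) / 2 : off1;

  long long new_off1 = off1 - base_off;
  long long new_off3 = off3 - base_off;
  if (new_off1 > upper || new_off1 < lower
      || new_off3 > upper || new_off3 < lower)
    return -1;
  return base_off;
}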
26419 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26420 it isn't worth branching around empty masked ops (including masked
26421 stores). */
26423 static bool
26424 aarch64_empty_mask_is_expensive (unsigned)
26426 return false;
26429 /* Return 1 if pseudo register should be created and used to hold
26430 GOT address for PIC code. */
26432 bool
26433 aarch64_use_pseudo_pic_reg (void)
26435 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26438 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26440 static int
26441 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26443 switch (XINT (x, 1))
26445 case UNSPEC_GOTSMALLPIC:
26446 case UNSPEC_GOTSMALLPIC28K:
26447 case UNSPEC_GOTTINYPIC:
26448 return 0;
26449 default:
26450 break;
26453 return default_unspec_may_trap_p (x, flags);
26457 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
26458 return the log2 of that value. Otherwise return -1. */
26461 aarch64_fpconst_pow_of_2 (rtx x)
26463 const REAL_VALUE_TYPE *r;
26465 if (!CONST_DOUBLE_P (x))
26466 return -1;
26468 r = CONST_DOUBLE_REAL_VALUE (x);
26470 if (REAL_VALUE_NEGATIVE (*r)
26471 || REAL_VALUE_ISNAN (*r)
26472 || REAL_VALUE_ISINF (*r)
26473 || !real_isinteger (r, DFmode))
26474 return -1;
26476 return exact_log2 (real_to_integer (r));
26479 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26480 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x==(1/2^n)
26481 return n. Otherwise return -1. */
26484 aarch64_fpconst_pow2_recip (rtx x)
26486 REAL_VALUE_TYPE r0;
26488 if (!CONST_DOUBLE_P (x))
26489 return -1;
26491 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26492 if (exact_real_inverse (DFmode, &r0)
26493 && !REAL_VALUE_NEGATIVE (r0))
26495 int ret = exact_log2 (real_to_integer (&r0));
26496 if (ret >= 1 && ret <= 32)
26497 return ret;
26499 return -1;
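
/* Illustrative only, not part of GCC: the reciprocal-power-of-two test on a
   plain double.  Returns N when X == 1/2^N for 1 <= N <= 32, else -1.
   Multiplying a double by a power of two only changes its exponent, so the
   equality check is exact.  The example_* name is ours.  */
static int
example_recip_pow2 (double x)
{
  double pow2 = 2.0;
  for (int n = 1; n <= 32; n++, pow2 *= 2.0)
    if (x * pow2 == 1.0)
      return n;
  return -1;
}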
26502 /* If X is a vector of equal CONST_DOUBLE values and that value is
26503 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26506 aarch64_vec_fpconst_pow_of_2 (rtx x)
26508 int nelts;
26509 if (!CONST_VECTOR_P (x)
26510 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26511 return -1;
26513 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26514 return -1;
26516 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26517 if (firstval <= 0)
26518 return -1;
26520 for (int i = 1; i < nelts; i++)
26521 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26522 return -1;
26524 return firstval;
26527 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26528 to float.
26530 __fp16 always promotes through this hook.
26531 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26532 through the generic excess precision logic rather than here. */
26534 static tree
26535 aarch64_promoted_type (const_tree t)
26537 if (SCALAR_FLOAT_TYPE_P (t)
26538 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26539 return float_type_node;
26541 return NULL_TREE;
26544 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26546 static bool
26547 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26548 optimization_type opt_type)
26550 switch (op)
26552 case rsqrt_optab:
26553 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26555 default:
26556 return true;
26560 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26562 static unsigned int
26563 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26564 int *offset)
26566 /* Polynomial invariant 1 == (VG / 2) - 1. */
26567 gcc_assert (i == 1);
26568 *factor = 2;
26569 *offset = 1;
26570 return AARCH64_DWARF_VG;
26573 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
26574 if MODE is HFmode, and punt to the generic implementation otherwise. */
26576 static bool
26577 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26579 return (mode == HFmode
26580 ? true
26581 : default_libgcc_floating_mode_supported_p (mode));
26584 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26585 if MODE is HFmode, and punt to the generic implementation otherwise. */
26587 static bool
26588 aarch64_scalar_mode_supported_p (scalar_mode mode)
26590 if (DECIMAL_FLOAT_MODE_P (mode))
26591 return default_decimal_float_supported_p ();
26593 return (mode == HFmode
26594 ? true
26595 : default_scalar_mode_supported_p (mode));
26598 /* Set the value of FLT_EVAL_METHOD.
26599 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26601 0: evaluate all operations and constants, whose semantic type has at
26602 most the range and precision of type float, to the range and
26603 precision of float; evaluate all other operations and constants to
26604 the range and precision of the semantic type;
26606 N, where _FloatN is a supported interchange floating type
26607 evaluate all operations and constants, whose semantic type has at
26608 most the range and precision of _FloatN type, to the range and
26609 precision of the _FloatN type; evaluate all other operations and
26610 constants to the range and precision of the semantic type;
26612 If we have the ARMv8.2-A extensions then we support _Float16 in native
26613 precision, so we should set this to 16. Otherwise, we support the type,
26614 but want to evaluate expressions in float precision, so set this to
26615 0. */
26617 static enum flt_eval_method
26618 aarch64_excess_precision (enum excess_precision_type type)
26620 switch (type)
26622 case EXCESS_PRECISION_TYPE_FAST:
26623 case EXCESS_PRECISION_TYPE_STANDARD:
26624 /* We can calculate either in 16-bit range and precision or
26625 32-bit range and precision. Make that decision based on whether
26626 we have native support for the ARMv8.2-A 16-bit floating-point
26627 instructions or not. */
26628 return (TARGET_FP_F16INST
26629 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26630 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26631 case EXCESS_PRECISION_TYPE_IMPLICIT:
26632 case EXCESS_PRECISION_TYPE_FLOAT16:
26633 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26634 default:
26635 gcc_unreachable ();
26637 return FLT_EVAL_METHOD_UNPREDICTABLE;
26640 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26641 scheduled for speculative execution. Reject the long-running division
26642 and square-root instructions. */
26644 static bool
26645 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26647 switch (get_attr_type (insn))
26649 case TYPE_SDIV:
26650 case TYPE_UDIV:
26651 case TYPE_FDIVS:
26652 case TYPE_FDIVD:
26653 case TYPE_FSQRTS:
26654 case TYPE_FSQRTD:
26655 case TYPE_NEON_FP_SQRT_S:
26656 case TYPE_NEON_FP_SQRT_D:
26657 case TYPE_NEON_FP_SQRT_S_Q:
26658 case TYPE_NEON_FP_SQRT_D_Q:
26659 case TYPE_NEON_FP_DIV_S:
26660 case TYPE_NEON_FP_DIV_D:
26661 case TYPE_NEON_FP_DIV_S_Q:
26662 case TYPE_NEON_FP_DIV_D_Q:
26663 return false;
26664 default:
26665 return true;
26669 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26671 static int
26672 aarch64_compute_pressure_classes (reg_class *classes)
26674 int i = 0;
26675 classes[i++] = GENERAL_REGS;
26676 classes[i++] = FP_REGS;
26677 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26678 registers need to go in PR_LO_REGS at some point during their
26679 lifetime. Splitting it into two halves has the effect of making
26680 all predicates count against PR_LO_REGS, so that we try whenever
26681 possible to restrict the number of live predicates to 8. This
26682 greatly reduces the amount of spilling in certain loops. */
26683 classes[i++] = PR_LO_REGS;
26684 classes[i++] = PR_HI_REGS;
26685 return i;
26688 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26690 static bool
26691 aarch64_can_change_mode_class (machine_mode from,
26692 machine_mode to, reg_class_t)
26694 unsigned int from_flags = aarch64_classify_vector_mode (from);
26695 unsigned int to_flags = aarch64_classify_vector_mode (to);
26697 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26698 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26700 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26701 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26703 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26704 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26706 bool from_full_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT));
26707 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26708 | VEC_PARTIAL));
26710 /* Don't allow changes between predicate modes and other modes.
26711 Only predicate registers can hold predicate modes and only
26712 non-predicate registers can hold non-predicate modes, so any
26713 attempt to mix them would require a round trip through memory. */
26714 if (from_pred_p != to_pred_p)
26715 return false;
26717 /* Don't allow changes between partial SVE modes and other modes.
26718 The contents of partial SVE modes are distributed evenly across
26719 the register, whereas GCC expects them to be clustered together. */
26720 if (from_partial_sve_p != to_partial_sve_p)
26721 return false;
26723 /* Similarly reject changes between partial SVE modes that have
26724 different patterns of significant and insignificant bits. */
26725 if (from_partial_sve_p
26726 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26727 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26728 return false;
26730 /* Don't allow changes between partial and full Advanced SIMD structure
26731 modes. */
26732 if (from_full_advsimd_struct_p && to_partial_advsimd_struct_p)
26733 return false;
26735 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26737 /* Don't allow changes between SVE modes and other modes that might
26738 be bigger than 128 bits. In particular, OImode, CImode and XImode
26739 divide into 128-bit quantities while SVE modes divide into
26740 BITS_PER_SVE_VECTOR quantities. */
26741 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26742 return false;
26743 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26744 return false;
26747 if (BYTES_BIG_ENDIAN)
26749 /* Don't allow changes between SVE data modes and non-SVE modes.
26750 See the comment at the head of aarch64-sve.md for details. */
26751 if (from_sve_p != to_sve_p)
26752 return false;
26754 /* Don't allow changes in element size: lane 0 of the new vector
26755 would not then be lane 0 of the old vector. See the comment
26756 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26757 description.
26759 In the worst case, this forces a register to be spilled in
26760 one mode and reloaded in the other, which handles the
26761 endianness correctly. */
26762 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26763 return false;
26765 return true;
26768 /* Implement TARGET_EARLY_REMAT_MODES. */
26770 static void
26771 aarch64_select_early_remat_modes (sbitmap modes)
26773 /* SVE values are not normally live across a call, so it should be
26774 worth doing early rematerialization even in VL-specific mode. */
26775 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26776 if (aarch64_sve_mode_p ((machine_mode) i))
26777 bitmap_set_bit (modes, i);
26780 /* Override the default target speculation_safe_value. */
26781 static rtx
26782 aarch64_speculation_safe_value (machine_mode mode,
26783 rtx result, rtx val, rtx failval)
26785 /* Maybe we should warn if falling back to hard barriers. They are
26786 likely to be noticeably more expensive than the alternative below. */
26787 if (!aarch64_track_speculation)
26788 return default_speculation_safe_value (mode, result, val, failval);
26790 if (!REG_P (val))
26791 val = copy_to_mode_reg (mode, val);
26793 if (!aarch64_reg_or_zero (failval, mode))
26794 failval = copy_to_mode_reg (mode, failval);
26796 emit_insn (gen_despeculate_copy (mode, result, val, failval));
26797 return result;
26800 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26801 Look into the tuning structure for an estimate.
26802 KIND specifies the type of requested estimate: min, max or likely.
26803 For cores with a known SVE width all three estimates are the same.
26804 For generic SVE tuning we want to distinguish the maximum estimate from
26805 the minimum and likely ones.
26806 In that case the likely estimate is the same as the minimum, giving the
26807 conservative behavior of auto-vectorizing with SVE only when it is a win
26808 even for 128-bit SVE.
26809 When SVE width information is available VAL.coeffs[1] is multiplied by
26810 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
26812 static HOST_WIDE_INT
26813 aarch64_estimated_poly_value (poly_int64 val,
26814 poly_value_estimate_kind kind
26815 = POLY_VALUE_LIKELY)
26817 unsigned int width_source = aarch64_tune_params.sve_width;
26819 /* If there is no core-specific information then the minimum and likely
26820 values are based on 128-bit vectors and the maximum is based on
26821 the architectural maximum of 2048 bits. */
26822 if (width_source == SVE_SCALABLE)
26823 switch (kind)
26825 case POLY_VALUE_MIN:
26826 case POLY_VALUE_LIKELY:
26827 return val.coeffs[0];
26828 case POLY_VALUE_MAX:
26829 return val.coeffs[0] + val.coeffs[1] * 15;
26832 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26833 as likely. This could be made more general if future -mtune options
26834 need it to be. */
26835 if (kind == POLY_VALUE_MAX)
26836 width_source = 1 << floor_log2 (width_source);
26837 else
26838 width_source = least_bit_hwi (width_source);
26840 /* If the core provides width information, use that. */
26841 HOST_WIDE_INT over_128 = width_source - 128;
26842 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
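
/* Illustrative only, not part of GCC: the estimate above for a poly value
   C0 + C1 * x, where x counts the 128-bit quanta beyond the first and the
   SVE register width in bits is known.  The example_* name is ours.  */
static long long
example_estimate_poly (long long c0, long long c1, int sve_width_bits)
{
  return c0 + c1 * ((sve_width_bits - 128) / 128);
}

/* E.g. the number of bytes in an SVE vector is 16 + 16x, so a 256-bit
   implementation gives example_estimate_poly (16, 16, 256) == 32.  */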
26846 /* Return true for types that could be supported as SIMD return or
26847 argument types. */
26849 static bool
26850 supported_simd_type (tree t)
26852 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
26854 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
26855 return s == 1 || s == 2 || s == 4 || s == 8;
26857 return false;
26860 /* Return true for types that currently are supported as SIMD return
26861 or argument types. */
26863 static bool
26864 currently_supported_simd_type (tree t, tree b)
26866 if (COMPLEX_FLOAT_TYPE_P (t))
26867 return false;
26869 if (TYPE_SIZE (t) != TYPE_SIZE (b))
26870 return false;
26872 return supported_simd_type (t);
26875 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
26877 static int
26878 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
26879 struct cgraph_simd_clone *clonei,
26880 tree base_type, int num,
26881 bool explicit_p)
26883 tree t, ret_type;
26884 unsigned int elt_bits, count;
26885 unsigned HOST_WIDE_INT const_simdlen;
26886 poly_uint64 vec_bits;
26888 if (!TARGET_SIMD)
26889 return 0;
26891 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26892 constant simdlens here. */
26893 if (maybe_ne (clonei->simdlen, 0U)
26894 && clonei->simdlen.is_constant (&const_simdlen)
26895 && (const_simdlen < 2
26896 || const_simdlen > 1024
26897 || (const_simdlen & (const_simdlen - 1)) != 0))
26899 if (explicit_p)
26900 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26901 "unsupported simdlen %wd", const_simdlen);
26902 return 0;
26905 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
26906 if (TREE_CODE (ret_type) != VOID_TYPE
26907 && !currently_supported_simd_type (ret_type, base_type))
26909 if (!explicit_p)
26911 else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
26912 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26913 "GCC does not currently support mixed size types "
26914 "for %<simd%> functions");
26915 else if (supported_simd_type (ret_type))
26916 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26917 "GCC does not currently support return type %qT "
26918 "for %<simd%> functions", ret_type);
26919 else
26920 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26921 "unsupported return type %qT for %<simd%> functions",
26922 ret_type);
26923 return 0;
26926 int i;
26927 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
26928 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
26930 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
26931 t && t != void_list_node; t = TREE_CHAIN (t), i++)
26933 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
26935 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
26936 && !currently_supported_simd_type (arg_type, base_type))
26938 if (!explicit_p)
26940 else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
26941 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26942 "GCC does not currently support mixed size types "
26943 "for %<simd%> functions");
26944 else
26945 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26946 "GCC does not currently support argument type %qT "
26947 "for %<simd%> functions", arg_type);
26948 return 0;
26952 clonei->vecsize_mangle = 'n';
26953 clonei->mask_mode = VOIDmode;
26954 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
26955 if (known_eq (clonei->simdlen, 0U))
26957 count = 2;
26958 vec_bits = (num == 0 ? 64 : 128);
26959 clonei->simdlen = exact_div (vec_bits, elt_bits);
26961 else
26963 count = 1;
26964 vec_bits = clonei->simdlen * elt_bits;
26965 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26966 constant simdlens here. */
26967 if (clonei->simdlen.is_constant (&const_simdlen)
26968 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
26970 if (explicit_p)
26971 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26972 "GCC does not currently support simdlen %wd for "
26973 "type %qT",
26974 const_simdlen, base_type);
26975 return 0;
26978 clonei->vecsize_int = vec_bits;
26979 clonei->vecsize_float = vec_bits;
26980 return count;
26983 /* Implement TARGET_SIMD_CLONE_ADJUST. */
26985 static void
26986 aarch64_simd_clone_adjust (struct cgraph_node *node)
26988 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
26989 use the correct ABI. */
26991 tree t = TREE_TYPE (node->decl);
26992 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
26993 TYPE_ATTRIBUTES (t));
26996 /* Implement TARGET_SIMD_CLONE_USABLE. */
26998 static int
26999 aarch64_simd_clone_usable (struct cgraph_node *node)
27001 switch (node->simdclone->vecsize_mangle)
27003 case 'n':
27004 if (!TARGET_SIMD)
27005 return -1;
27006 return 0;
27007 default:
27008 gcc_unreachable ();
27012 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
27014 static int
27015 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
27017 auto check_attr = [&](const char *name) {
27018 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
27019 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
27020 if (!attr1 && !attr2)
27021 return true;
27023 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
27026 if (!check_attr ("aarch64_vector_pcs"))
27027 return 0;
27028 if (!check_attr ("Advanced SIMD type"))
27029 return 0;
27030 if (!check_attr ("SVE type"))
27031 return 0;
27032 if (!check_attr ("SVE sizeless type"))
27033 return 0;
27034 return 1;
27037 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
27039 static const char *
27040 aarch64_get_multilib_abi_name (void)
27042 if (TARGET_BIG_END)
27043 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
27044 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
27047 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
27048 global variable based guard use the default else
27049 return a null tree. */
27050 static tree
27051 aarch64_stack_protect_guard (void)
27053 if (aarch64_stack_protector_guard == SSP_GLOBAL)
27054 return default_stack_protect_guard ();
27056 return NULL_TREE;
27059 /* Return the diagnostic message string if conversion from FROMTYPE to
27060 TOTYPE is not allowed, NULL otherwise. */
27062 static const char *
27063 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
27065 if (element_mode (fromtype) != element_mode (totype))
27067 /* Do not allow conversions to/from BFmode scalar types. */
27068 if (TYPE_MODE (fromtype) == BFmode)
27069 return N_("invalid conversion from type %<bfloat16_t%>");
27070 if (TYPE_MODE (totype) == BFmode)
27071 return N_("invalid conversion to type %<bfloat16_t%>");
27074 /* Conversion allowed. */
27075 return NULL;
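/* For example, a conversion between __bf16 (bfloat16_t) and float in either
   direction is rejected with one of the messages above; conversions where
   neither side has BFmode, or where the element modes already match, are
   left alone.  */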
27078 /* Return the diagnostic message string if the unary operation OP is
27079 not permitted on TYPE, NULL otherwise. */
27081 static const char *
27082 aarch64_invalid_unary_op (int op, const_tree type)
27084 /* Reject all single-operand operations on BFmode except for &. */
27085 if (element_mode (type) == BFmode && op != ADDR_EXPR)
27086 return N_("operation not permitted on type %<bfloat16_t%>");
27088 /* Operation allowed. */
27089 return NULL;
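/* For example, taking the address of a bfloat16_t object is allowed
   (ADDR_EXPR), while unary minus on it is rejected.  */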
27092 /* Return the diagnostic message string if the binary operation OP is
27093 not permitted on TYPE1 and TYPE2, NULL otherwise. */
27095 static const char *
27096 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
27097 const_tree type2)
27099 /* Reject all 2-operand operations on BFmode. */
27100 if (element_mode (type1) == BFmode
27101 || element_mode (type2) == BFmode)
27102 return N_("operation not permitted on type %<bfloat16_t%>");
27104 if (VECTOR_TYPE_P (type1)
27105 && VECTOR_TYPE_P (type2)
27106 && !TYPE_INDIVISIBLE_P (type1)
27107 && !TYPE_INDIVISIBLE_P (type2)
27108 && (aarch64_sve::builtin_type_p (type1)
27109 != aarch64_sve::builtin_type_p (type2)))
27110 return N_("cannot combine GNU and SVE vectors in a binary operation");
27112 /* Operation allowed. */
27113 return NULL;
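/* For example, any binary arithmetic on bfloat16_t operands is rejected with
   the first message above, and an expression that mixes a GNU vector
   (declared with __attribute__((vector_size))) with an SVE ACLE vector such
   as svint32_t (e.g. under -msve-vector-bits=) is rejected with the
   second.  */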
27116 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
27117 compiler that we automatically ignore the top byte of our pointers, which
27118 allows using -fsanitize=hwaddress. */
27119 bool
27120 aarch64_can_tag_addresses ()
27122 return !TARGET_ILP32;
27125 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
27126 section at the end if needed. */
27127 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
27128 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
27129 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
27130 void
27131 aarch64_file_end_indicate_exec_stack ()
27133 file_end_indicate_exec_stack ();
27135 unsigned feature_1_and = 0;
27136 if (aarch64_bti_enabled ())
27137 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
27139 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
27140 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
27142 if (feature_1_and)
27144 /* Generate .note.gnu.property section. */
27145 switch_to_section (get_section (".note.gnu.property",
27146 SECTION_NOTYPE, NULL));
27148 /* PT_NOTE header: namesz, descsz, type.
27149 namesz = 4 ("GNU\0")
27150 descsz = 16 (Size of the program property array)
27151 [(12 + padding) * Number of array elements]
27152 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
27153 assemble_align (POINTER_SIZE);
27154 assemble_integer (GEN_INT (4), 4, 32, 1);
27155 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
27156 assemble_integer (GEN_INT (5), 4, 32, 1);
27158 /* PT_NOTE name. */
27159 assemble_string ("GNU", 4);
27161 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
27162 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
27163 datasz = 4
27164 data = feature_1_and. */
27165 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
27166 assemble_integer (GEN_INT (4), 4, 32, 1);
27167 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
27169 /* Pad the size of the note to the required alignment. */
27170 assemble_align (POINTER_SIZE);
27173 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
27174 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
27175 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
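/* With both BTI and PAC-RET enabled, the note emitted by
   aarch64_file_end_indicate_exec_stack above looks roughly like:

	.section .note.gnu.property
	.p2align 3
	.word 4			// namesz ("GNU\0")
	.word 16		// descsz = ROUND_UP (12, POINTER_BYTES)
	.word 5			// NT_GNU_PROPERTY_TYPE_0
	.string "GNU"
	.word 0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word 4			// datasz
	.word 3			// BTI | PAC
	.p2align 3  */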
27177 /* Helper function for straight line speculation.
27178 Return what barrier should be emitted for straight line speculation
27179 mitigation.
27180 When not mitigating against straight line speculation this function returns
27181 an empty string.
27182 When mitigating against straight line speculation, use:
27183 * SB when the v8.5-A SB extension is enabled.
27184 * DSB+ISB otherwise. */
27185 const char *
27186 aarch64_sls_barrier (int mitigation_required)
27188 return mitigation_required
27189 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
27190 : "";
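/* For example, under -mharden-sls=retbr a "ret" is followed by "sb" when the
   SB extension is available, or by "dsb sy; isb" otherwise; when no
   mitigation is requested nothing extra is emitted.  */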
27193 static GTY (()) tree aarch64_sls_shared_thunks[30];
27194 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
27195 const char *indirect_symbol_names[30] = {
27196 "__call_indirect_x0",
27197 "__call_indirect_x1",
27198 "__call_indirect_x2",
27199 "__call_indirect_x3",
27200 "__call_indirect_x4",
27201 "__call_indirect_x5",
27202 "__call_indirect_x6",
27203 "__call_indirect_x7",
27204 "__call_indirect_x8",
27205 "__call_indirect_x9",
27206 "__call_indirect_x10",
27207 "__call_indirect_x11",
27208 "__call_indirect_x12",
27209 "__call_indirect_x13",
27210 "__call_indirect_x14",
27211 "__call_indirect_x15",
27212 "", /* "__call_indirect_x16", */
27213 "", /* "__call_indirect_x17", */
27214 "__call_indirect_x18",
27215 "__call_indirect_x19",
27216 "__call_indirect_x20",
27217 "__call_indirect_x21",
27218 "__call_indirect_x22",
27219 "__call_indirect_x23",
27220 "__call_indirect_x24",
27221 "__call_indirect_x25",
27222 "__call_indirect_x26",
27223 "__call_indirect_x27",
27224 "__call_indirect_x28",
27225 "__call_indirect_x29",
27228 /* Function to create a BLR thunk. This thunk is used to mitigate straight
27229 line speculation. Instead of a simple BLR that can be speculated past,
27230 we emit a BL to this thunk, and this thunk contains a BR to the relevant
27231 register. These thunks have the relevant speculation barriers put after
27232 their indirect branch so that speculation is blocked.
27234 We use such a thunk so the speculation barriers are kept off the
27235 architecturally executed path in order to reduce the performance overhead.
27237 When optimizing for size we use stubs shared by the linked object.
27238 When optimizing for performance we emit stubs for each function in the hope
27239 that the branch predictor can better train on jumps specific for a given
27240 function. */
27242 aarch64_sls_create_blr_label (int regnum)
27244 gcc_assert (STUB_REGNUM_P (regnum));
27245 if (optimize_function_for_size_p (cfun))
27247 /* For the thunks shared between different functions in this compilation
27248 unit we use a named symbol -- this is just for users to more easily
27249 understand the generated assembly. */
27250 aarch64_sls_shared_thunks_needed = true;
27251 const char *thunk_name = indirect_symbol_names[regnum];
27252 if (aarch64_sls_shared_thunks[regnum] == NULL)
27254 /* Build a decl representing this function stub and record it for
27255 later. We build a decl here so we can use the GCC machinery for
27256 handling sections automatically (through `get_named_section` and
27257 `make_decl_one_only`). That saves us a lot of trouble handling
27258 the specifics of different output file formats. */
27259 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
27260 get_identifier (thunk_name),
27261 build_function_type_list (void_type_node,
27262 NULL_TREE));
27263 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
27264 NULL_TREE, void_type_node);
27265 TREE_PUBLIC (decl) = 1;
27266 TREE_STATIC (decl) = 1;
27267 DECL_IGNORED_P (decl) = 1;
27268 DECL_ARTIFICIAL (decl) = 1;
27269 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27270 resolve_unique_section (decl, 0, false);
27271 aarch64_sls_shared_thunks[regnum] = decl;
27274 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27277 if (cfun->machine->call_via[regnum] == NULL)
27278 cfun->machine->call_via[regnum]
27279 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27280 return cfun->machine->call_via[regnum];
27283 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27284 aarch64_sls_emit_shared_blr_thunks below. */
27285 static void
27286 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27288 /* Move the target address into x16 and branch through x16, so this
27289 transformation does not prevent jumping to `BTI c` instructions. */
27290 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27291 asm_fprintf (out_file, "\tbr\tx16\n");
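/* The stub emitted for, say, x1 is therefore just:
	mov	x16, x1
	br	x16
   with a speculation barrier added after it by the callers below.  */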
27294 /* Emit all BLR stubs for this particular function.
27295 Here we emit all the BLR stubs needed for the current function. Since we
27296 emit these stubs in a consecutive block we know there will be no speculation
27297 gadgets between each stub, and hence we only emit a speculation barrier at
27298 the end of the stub sequences.
27300 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27301 void
27302 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27304 if (! aarch64_harden_sls_blr_p ())
27305 return;
27307 bool any_functions_emitted = false;
27308 /* We must save and restore the current function section since this assembly
27309 is emitted at the end of the function. This means it can be emitted *just
27310 after* the cold section of a function. That cold part would be emitted in
27311 a different section. That switch would trigger a `.cfi_endproc` directive
27312 to be emitted in the original section and a `.cfi_startproc` directive to
27313 be emitted in the new section. Switching to the original section without
27314 restoring would mean that the `.cfi_endproc` emitted as a function ends
27315 would happen in a different section -- leaving an unmatched
27316 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27317 in the standard text section. */
27318 section *save_text_section = in_section;
27319 switch_to_section (function_section (current_function_decl));
27320 for (int regnum = 0; regnum < 30; ++regnum)
27322 rtx specu_label = cfun->machine->call_via[regnum];
27323 if (specu_label == NULL)
27324 continue;
27326 targetm.asm_out.print_operand (out_file, specu_label, 0);
27327 asm_fprintf (out_file, ":\n");
27328 aarch64_sls_emit_function_stub (out_file, regnum);
27329 any_functions_emitted = true;
27331 if (any_functions_emitted)
27332 /* The SB barrier can be used here if need be, since this stub will only
27333 be used by the current function, and hence for the current target. */
27334 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27335 switch_to_section (save_text_section);
27338 /* Emit shared BLR stubs for the current compilation unit.
27339 Over the course of compiling this unit we may have converted some BLR
27340 instructions to a BL to a shared stub function. This is where we emit those
27341 stub functions.
27342 This function is for the stubs shared between different functions in this
27343 compilation unit. We share when optimizing for size instead of speed.
27345 This function is called through the TARGET_ASM_FILE_END hook. */
27346 void
27347 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27349 if (! aarch64_sls_shared_thunks_needed)
27350 return;
27352 for (int regnum = 0; regnum < 30; ++regnum)
27354 tree decl = aarch64_sls_shared_thunks[regnum];
27355 if (!decl)
27356 continue;
27358 const char *name = indirect_symbol_names[regnum];
27359 switch_to_section (get_named_section (decl, NULL, 0));
27360 ASM_OUTPUT_ALIGN (out_file, 2);
27361 targetm.asm_out.globalize_label (out_file, name);
27362 /* Only emits if the compiler is configured for an assembler that can
27363 handle visibility directives. */
27364 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27365 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27366 ASM_OUTPUT_LABEL (out_file, name);
27367 aarch64_sls_emit_function_stub (out_file, regnum);
27368 /* Use the most conservative target to ensure it can always be used by any
27369 function in the translation unit. */
27370 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27371 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27375 /* Implement TARGET_ASM_FILE_END. */
27376 void
27377 aarch64_asm_file_end ()
27379 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27380 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27381 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27382 for FreeBSD) still gets called. */
27383 #ifdef TARGET_ASM_FILE_END
27384 TARGET_ASM_FILE_END ();
27385 #endif
27388 const char *
27389 aarch64_indirect_call_asm (rtx addr)
27391 gcc_assert (REG_P (addr));
27392 if (aarch64_harden_sls_blr_p ())
27394 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27395 output_asm_insn ("bl\t%0", &stub_label);
27397 else
27398 output_asm_insn ("blr\t%0", &addr);
27399 return "";
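/* For example, with -mharden-sls=blr (or =all) a call through x1 is emitted
   as "bl __call_indirect_x1" when optimizing for size (or as a BL to a
   per-function local label otherwise), instead of a plain "blr x1".  */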
27402 /* Target-specific selftests. */
27404 #if CHECKING_P
27406 namespace selftest {
27408 /* Selftest for the RTL loader.
27409 Verify that the RTL loader copes with a dump from
27410 print_rtx_function. This is essentially just a test that class
27411 function_reader can handle a real dump, but it also verifies
27412 that lookup_reg_by_dump_name correctly handles hard regs.
27413 The presence of hard reg names in the dump means that the test is
27414 target-specific, hence it is in this file. */
27416 static void
27417 aarch64_test_loading_full_dump ()
27419 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27421 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27423 rtx_insn *insn_1 = get_insn_by_uid (1);
27424 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27426 rtx_insn *insn_15 = get_insn_by_uid (15);
27427 ASSERT_EQ (INSN, GET_CODE (insn_15));
27428 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27430 /* Verify crtl->return_rtx. */
27431 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27432 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27433 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27436 /* Test the fractional_cost class. */
27438 static void
27439 aarch64_test_fractional_cost ()
27441 using cf = fractional_cost;
27443 ASSERT_EQ (cf (0, 20), 0);
27445 ASSERT_EQ (cf (4, 2), 2);
27446 ASSERT_EQ (3, cf (9, 3));
27448 ASSERT_NE (cf (5, 2), 2);
27449 ASSERT_NE (3, cf (8, 3));
27451 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27452 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27453 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27455 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27456 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27457 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27458 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27459 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27460 ASSERT_EQ (3 - cf (10, 3), 0);
27462 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27463 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27465 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27466 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27467 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27468 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27469 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27470 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27471 ASSERT_TRUE (cf (239, 240) < 1);
27472 ASSERT_FALSE (cf (240, 240) < 1);
27473 ASSERT_FALSE (cf (241, 240) < 1);
27474 ASSERT_FALSE (2 < cf (207, 104));
27475 ASSERT_FALSE (2 < cf (208, 104));
27476 ASSERT_TRUE (2 < cf (209, 104));
27478 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27479 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27480 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27481 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27482 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27483 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27484 ASSERT_TRUE (cf (239, 240) <= 1);
27485 ASSERT_TRUE (cf (240, 240) <= 1);
27486 ASSERT_FALSE (cf (241, 240) <= 1);
27487 ASSERT_FALSE (2 <= cf (207, 104));
27488 ASSERT_TRUE (2 <= cf (208, 104));
27489 ASSERT_TRUE (2 <= cf (209, 104));
27491 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27492 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27493 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27494 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27495 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27496 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27497 ASSERT_FALSE (cf (239, 240) >= 1);
27498 ASSERT_TRUE (cf (240, 240) >= 1);
27499 ASSERT_TRUE (cf (241, 240) >= 1);
27500 ASSERT_TRUE (2 >= cf (207, 104));
27501 ASSERT_TRUE (2 >= cf (208, 104));
27502 ASSERT_FALSE (2 >= cf (209, 104));
27504 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27505 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27506 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27507 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27508 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27509 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27510 ASSERT_FALSE (cf (239, 240) > 1);
27511 ASSERT_FALSE (cf (240, 240) > 1);
27512 ASSERT_TRUE (cf (241, 240) > 1);
27513 ASSERT_TRUE (2 > cf (207, 104));
27514 ASSERT_FALSE (2 > cf (208, 104));
27515 ASSERT_FALSE (2 > cf (209, 104));
27517 ASSERT_EQ (cf (1, 2).ceil (), 1);
27518 ASSERT_EQ (cf (11, 7).ceil (), 2);
27519 ASSERT_EQ (cf (20, 1).ceil (), 20);
27520 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27521 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27522 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27523 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27524 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27526 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27529 /* Run all target-specific selftests. */
27531 static void
27532 aarch64_run_selftests (void)
27534 aarch64_test_loading_full_dump ();
27535 aarch64_test_fractional_cost ();
27538 } // namespace selftest
27540 #endif /* #if CHECKING_P */
27542 #undef TARGET_STACK_PROTECT_GUARD
27543 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27545 #undef TARGET_ADDRESS_COST
27546 #define TARGET_ADDRESS_COST aarch64_address_cost
27548 /* This hook determines whether unnamed bitfields affect the alignment
27549 of the containing structure. The hook returns true if the structure
27550 should inherit the alignment requirements of an unnamed bitfield's
27551 type. */
27552 #undef TARGET_ALIGN_ANON_BITFIELD
27553 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27555 #undef TARGET_ASM_ALIGNED_DI_OP
27556 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27558 #undef TARGET_ASM_ALIGNED_HI_OP
27559 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27561 #undef TARGET_ASM_ALIGNED_SI_OP
27562 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27564 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27565 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27566 hook_bool_const_tree_hwi_hwi_const_tree_true
27568 #undef TARGET_ASM_FILE_START
27569 #define TARGET_ASM_FILE_START aarch64_start_file
27571 #undef TARGET_ASM_OUTPUT_MI_THUNK
27572 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27574 #undef TARGET_ASM_SELECT_RTX_SECTION
27575 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27577 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27578 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27580 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27581 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27583 #undef TARGET_BUILD_BUILTIN_VA_LIST
27584 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27586 #undef TARGET_CALLEE_COPIES
27587 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27589 #undef TARGET_CAN_ELIMINATE
27590 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27592 #undef TARGET_CAN_INLINE_P
27593 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27595 #undef TARGET_CANNOT_FORCE_CONST_MEM
27596 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27598 #undef TARGET_CASE_VALUES_THRESHOLD
27599 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27601 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27602 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27604 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27605 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27607 /* Only the least significant bit is used for initialization guard
27608 variables. */
27609 #undef TARGET_CXX_GUARD_MASK_BIT
27610 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27612 #undef TARGET_C_MODE_FOR_SUFFIX
27613 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27615 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27616 #undef TARGET_DEFAULT_TARGET_FLAGS
27617 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27618 #endif
27620 #undef TARGET_CLASS_MAX_NREGS
27621 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27623 #undef TARGET_BUILTIN_DECL
27624 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27626 #undef TARGET_BUILTIN_RECIPROCAL
27627 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27629 #undef TARGET_C_EXCESS_PRECISION
27630 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27632 #undef TARGET_EXPAND_BUILTIN
27633 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27635 #undef TARGET_EXPAND_BUILTIN_VA_START
27636 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27638 #undef TARGET_FOLD_BUILTIN
27639 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27641 #undef TARGET_FUNCTION_ARG
27642 #define TARGET_FUNCTION_ARG aarch64_function_arg
27644 #undef TARGET_FUNCTION_ARG_ADVANCE
27645 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27647 #undef TARGET_FUNCTION_ARG_BOUNDARY
27648 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27650 #undef TARGET_FUNCTION_ARG_PADDING
27651 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27653 #undef TARGET_GET_RAW_RESULT_MODE
27654 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27655 #undef TARGET_GET_RAW_ARG_MODE
27656 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27658 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27659 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27661 #undef TARGET_FUNCTION_VALUE
27662 #define TARGET_FUNCTION_VALUE aarch64_function_value
27664 #undef TARGET_FUNCTION_VALUE_REGNO_P
27665 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27667 #undef TARGET_GIMPLE_FOLD_BUILTIN
27668 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27670 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27671 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27673 #undef TARGET_INIT_BUILTINS
27674 #define TARGET_INIT_BUILTINS aarch64_init_builtins
27676 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27677 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27678 aarch64_ira_change_pseudo_allocno_class
27680 #undef TARGET_LEGITIMATE_ADDRESS_P
27681 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27683 #undef TARGET_LEGITIMATE_CONSTANT_P
27684 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27686 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27687 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27688 aarch64_legitimize_address_displacement
27690 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27691 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27693 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27694 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27695 aarch64_libgcc_floating_mode_supported_p
27697 #undef TARGET_MANGLE_TYPE
27698 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27700 #undef TARGET_INVALID_CONVERSION
27701 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
27703 #undef TARGET_INVALID_UNARY_OP
27704 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
27706 #undef TARGET_INVALID_BINARY_OP
27707 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27709 #undef TARGET_VERIFY_TYPE_CONTEXT
27710 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27712 #undef TARGET_MEMORY_MOVE_COST
27713 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27715 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27716 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27718 #undef TARGET_MUST_PASS_IN_STACK
27719 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27721 /* This target hook should return true if accesses to volatile bitfields
27722 should use the narrowest mode possible. It should return false if these
27723 accesses should use the bitfield container type. */
27724 #undef TARGET_NARROW_VOLATILE_BITFIELD
27725 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27727 #undef TARGET_OPTION_OVERRIDE
27728 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27730 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27731 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27732 aarch64_override_options_after_change
27734 #undef TARGET_OFFLOAD_OPTIONS
27735 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27737 #undef TARGET_OPTION_RESTORE
27738 #define TARGET_OPTION_RESTORE aarch64_option_restore
27740 #undef TARGET_OPTION_PRINT
27741 #define TARGET_OPTION_PRINT aarch64_option_print
27743 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27744 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27746 #undef TARGET_SET_CURRENT_FUNCTION
27747 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27749 #undef TARGET_PASS_BY_REFERENCE
27750 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27752 #undef TARGET_PREFERRED_RELOAD_CLASS
27753 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27755 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27756 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27758 #undef TARGET_PROMOTED_TYPE
27759 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27761 #undef TARGET_SECONDARY_RELOAD
27762 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27764 #undef TARGET_SECONDARY_MEMORY_NEEDED
27765 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
27767 #undef TARGET_SHIFT_TRUNCATION_MASK
27768 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27770 #undef TARGET_SETUP_INCOMING_VARARGS
27771 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27773 #undef TARGET_STRUCT_VALUE_RTX
27774 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27776 #undef TARGET_REGISTER_MOVE_COST
27777 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27779 #undef TARGET_RETURN_IN_MEMORY
27780 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27782 #undef TARGET_RETURN_IN_MSB
27783 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27785 #undef TARGET_RTX_COSTS
27786 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27788 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27789 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27791 #undef TARGET_SCHED_ISSUE_RATE
27792 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27794 #undef TARGET_SCHED_VARIABLE_ISSUE
27795 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27797 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27798 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27799 aarch64_sched_first_cycle_multipass_dfa_lookahead
27801 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27802 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27803 aarch64_first_cycle_multipass_dfa_lookahead_guard
27805 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27806 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27807 aarch64_get_separate_components
27809 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27810 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27811 aarch64_components_for_bb
27813 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27814 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27815 aarch64_disqualify_components
27817 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27818 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27819 aarch64_emit_prologue_components
27821 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27822 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27823 aarch64_emit_epilogue_components
27825 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27826 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27827 aarch64_set_handled_components
27829 #undef TARGET_TRAMPOLINE_INIT
27830 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27832 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27833 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27835 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27836 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27838 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
27839 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
27841 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
27842 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
27843 aarch64_builtin_support_vector_misalignment
27845 #undef TARGET_ARRAY_MODE
27846 #define TARGET_ARRAY_MODE aarch64_array_mode
27848 #undef TARGET_ARRAY_MODE_SUPPORTED_P
27849 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
27851 #undef TARGET_VECTORIZE_CREATE_COSTS
27852 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
27854 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
27855 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
27856 aarch64_builtin_vectorization_cost
27858 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
27859 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
27861 #undef TARGET_VECTORIZE_BUILTINS
27862 #define TARGET_VECTORIZE_BUILTINS
27864 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
27865 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
27866 aarch64_autovectorize_vector_modes
27868 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
27869 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
27870 aarch64_atomic_assign_expand_fenv
27872 /* Section anchor support. */
27874 #undef TARGET_MIN_ANCHOR_OFFSET
27875 #define TARGET_MIN_ANCHOR_OFFSET -256
27877 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
27878 byte offset; we can do much more for larger data types, but have no way
27879 to determine the size of the access. We assume accesses are aligned. */
27880 #undef TARGET_MAX_ANCHOR_OFFSET
27881 #define TARGET_MAX_ANCHOR_OFFSET 4095
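/* (I.e. the 12-bit unsigned immediate used by byte loads and stores such as
   LDRB/STRB, giving offsets 0..4095.)  */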
27883 #undef TARGET_VECTOR_ALIGNMENT
27884 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
27886 #undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
27887 #define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
27888 aarch64_vectorize_can_special_div_by_constant
27890 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
27891 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
27892 aarch64_vectorize_preferred_vector_alignment
27893 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
27894 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
27895 aarch64_simd_vector_alignment_reachable
27897 /* vec_perm support. */
27899 #undef TARGET_VECTORIZE_VEC_PERM_CONST
27900 #define TARGET_VECTORIZE_VEC_PERM_CONST \
27901 aarch64_vectorize_vec_perm_const
27903 #undef TARGET_VECTORIZE_RELATED_MODE
27904 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
27905 #undef TARGET_VECTORIZE_GET_MASK_MODE
27906 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
27907 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
27908 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
27909 aarch64_empty_mask_is_expensive
27910 #undef TARGET_PREFERRED_ELSE_VALUE
27911 #define TARGET_PREFERRED_ELSE_VALUE \
27912 aarch64_preferred_else_value
27914 #undef TARGET_INIT_LIBFUNCS
27915 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
27917 #undef TARGET_FIXED_CONDITION_CODE_REGS
27918 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
27920 #undef TARGET_FLAGS_REGNUM
27921 #define TARGET_FLAGS_REGNUM CC_REGNUM
27923 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
27924 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
27926 #undef TARGET_ASAN_SHADOW_OFFSET
27927 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
27929 #undef TARGET_LEGITIMIZE_ADDRESS
27930 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
27932 #undef TARGET_SCHED_CAN_SPECULATE_INSN
27933 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
27935 #undef TARGET_CAN_USE_DOLOOP_P
27936 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
27938 #undef TARGET_SCHED_ADJUST_PRIORITY
27939 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
27941 #undef TARGET_SCHED_MACRO_FUSION_P
27942 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
27944 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
27945 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
27947 #undef TARGET_SCHED_FUSION_PRIORITY
27948 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
27950 #undef TARGET_UNSPEC_MAY_TRAP_P
27951 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
27953 #undef TARGET_USE_PSEUDO_PIC_REG
27954 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
27956 #undef TARGET_PRINT_OPERAND
27957 #define TARGET_PRINT_OPERAND aarch64_print_operand
27959 #undef TARGET_PRINT_OPERAND_ADDRESS
27960 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
27962 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
27963 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
27965 #undef TARGET_OPTAB_SUPPORTED_P
27966 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
27968 #undef TARGET_OMIT_STRUCT_RETURN_REG
27969 #define TARGET_OMIT_STRUCT_RETURN_REG true
27971 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
27972 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
27973 aarch64_dwarf_poly_indeterminate_value
27975 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
27976 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
27977 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
27979 #undef TARGET_HARD_REGNO_NREGS
27980 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
27981 #undef TARGET_HARD_REGNO_MODE_OK
27982 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
27984 #undef TARGET_MODES_TIEABLE_P
27985 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
27987 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
27988 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
27989 aarch64_hard_regno_call_part_clobbered
27991 #undef TARGET_INSN_CALLEE_ABI
27992 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
27994 #undef TARGET_CONSTANT_ALIGNMENT
27995 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
27997 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
27998 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
27999 aarch64_stack_clash_protection_alloca_probe_range
28001 #undef TARGET_COMPUTE_PRESSURE_CLASSES
28002 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
28004 #undef TARGET_CAN_CHANGE_MODE_CLASS
28005 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
28007 #undef TARGET_SELECT_EARLY_REMAT_MODES
28008 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
28010 #undef TARGET_SPECULATION_SAFE_VALUE
28011 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
28013 #undef TARGET_ESTIMATED_POLY_VALUE
28014 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
28016 #undef TARGET_ATTRIBUTE_TABLE
28017 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
28019 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
28020 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
28021 aarch64_simd_clone_compute_vecsize_and_simdlen
28023 #undef TARGET_SIMD_CLONE_ADJUST
28024 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
28026 #undef TARGET_SIMD_CLONE_USABLE
28027 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
28029 #undef TARGET_COMP_TYPE_ATTRIBUTES
28030 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
28032 #undef TARGET_GET_MULTILIB_ABI_NAME
28033 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
28035 #undef TARGET_FNTYPE_ABI
28036 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
28038 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
28039 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
28041 #if CHECKING_P
28042 #undef TARGET_RUN_TARGET_SELFTESTS
28043 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
28044 #endif /* #if CHECKING_P */
28046 #undef TARGET_ASM_POST_CFI_STARTPROC
28047 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
28049 #undef TARGET_STRICT_ARGUMENT_NAMING
28050 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
28052 #undef TARGET_MD_ASM_ADJUST
28053 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
28055 #undef TARGET_ASM_FILE_END
28056 #define TARGET_ASM_FILE_END aarch64_asm_file_end
28058 #undef TARGET_ASM_FUNCTION_EPILOGUE
28059 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
28061 #undef TARGET_HAVE_SHADOW_CALL_STACK
28062 #define TARGET_HAVE_SHADOW_CALL_STACK true
28064 struct gcc_target targetm = TARGET_INITIALIZER;
28066 #include "gt-aarch64.h"