[aarch64] Use force_reg instead of copy_to_mode_reg.
[official-gcc.git] / gcc / config / aarch64 / aarch64.cc
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2023 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
84 #include "aarch64-feature-deps.h"
85 #include "config/arm/aarch-common.h"
86 #include "config/arm/aarch-common-protos.h"
88 /* This file should be included last. */
89 #include "target-def.h"
91 /* Defined for convenience. */
92 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
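/* Editorial note (not in the original source): with the default LP64 ABI,
   POINTER_SIZE is 64 and BITS_PER_UNIT is 8, so POINTER_BYTES evaluates
   to 8; under -mabi=ilp32, where POINTER_SIZE is 32, it evaluates to 4.  */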
94 /* Information about a legitimate vector immediate operand. */
95 struct simd_immediate_info
97 enum insn_type { MOV, MVN, INDEX, PTRUE };
98 enum modifier_type { LSL, MSL };
100 simd_immediate_info () {}
101 simd_immediate_info (scalar_float_mode, rtx);
102 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
103 insn_type = MOV, modifier_type = LSL,
104 unsigned int = 0);
105 simd_immediate_info (scalar_mode, rtx, rtx);
106 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
108 /* The mode of the elements. */
109 scalar_mode elt_mode;
111 /* The instruction to use to move the immediate into a vector. */
112 insn_type insn;
114 union
116 /* For MOV and MVN. */
117 struct
119 /* The value of each element. */
120 rtx value;
122 /* The kind of shift modifier to use, and the number of bits to shift.
123 This is (LSL, 0) if no shift is needed. */
124 modifier_type modifier;
125 unsigned int shift;
126 } mov;
128 /* For INDEX. */
129 struct
131 /* The value of the first element and the step to be added for each
132 subsequent element. */
133 rtx base, step;
134 } index;
136 /* For PTRUE. */
137 aarch64_svpattern pattern;
138 } u;
141 /* Construct a floating-point immediate in which each element has mode
142 ELT_MODE_IN and value VALUE_IN. */
143 inline simd_immediate_info
144 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
145 : elt_mode (elt_mode_in), insn (MOV)
147 u.mov.value = value_in;
148 u.mov.modifier = LSL;
149 u.mov.shift = 0;
152 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
153 and value VALUE_IN. The other parameters are as for the structure
154 fields. */
155 inline simd_immediate_info
156 ::simd_immediate_info (scalar_int_mode elt_mode_in,
157 unsigned HOST_WIDE_INT value_in,
158 insn_type insn_in, modifier_type modifier_in,
159 unsigned int shift_in)
160 : elt_mode (elt_mode_in), insn (insn_in)
162 u.mov.value = gen_int_mode (value_in, elt_mode_in);
163 u.mov.modifier = modifier_in;
164 u.mov.shift = shift_in;
167 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
168 and where element I is equal to BASE_IN + I * STEP_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
171 : elt_mode (elt_mode_in), insn (INDEX)
173 u.index.base = base_in;
174 u.index.step = step_in;
177 /* Construct a predicate that controls elements of mode ELT_MODE_IN
178 and has PTRUE pattern PATTERN_IN. */
179 inline simd_immediate_info
180 ::simd_immediate_info (scalar_int_mode elt_mode_in,
181 aarch64_svpattern pattern_in)
182 : elt_mode (elt_mode_in), insn (PTRUE)
184 u.pattern = pattern_in;
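/* Editorial illustration (not part of the original source): routines such
   as aarch64_simd_valid_immediate fill in one of these structures to
   describe how a constant can be synthesized.  For example,

     simd_immediate_info info (SImode, 0xff, simd_immediate_info::MOV,
			       simd_immediate_info::LSL, 8);

   would describe moving 0xff, shifted left by 8, into each 32-bit
   element (a MOVI with an LSL #8 modifier).  */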
187 namespace {
189 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
190 class pure_scalable_type_info
192 public:
193 /* Represents the result of analyzing a type. All values are nonzero,
194 in the possibly forlorn hope that accidental conversions to bool
195 trigger a warning. */
196 enum analysis_result
198 /* The type does not have an ABI identity; i.e. it doesn't contain
199 at least one object whose type is a Fundamental Data Type. */
200 NO_ABI_IDENTITY = 1,
202 /* The type is definitely a Pure Scalable Type. */
203 IS_PST,
205 /* The type is definitely not a Pure Scalable Type. */
206 ISNT_PST,
208 /* It doesn't matter for PCS purposes whether the type is a Pure
209 Scalable Type or not, since the type will be handled the same
210 way regardless.
212 Specifically, this means that if the type is a Pure Scalable Type,
213 there aren't enough argument registers to hold it, and so it will
214 need to be passed or returned in memory. If the type isn't a
215 Pure Scalable Type, it's too big to be passed or returned in core
216 or SIMD&FP registers, and so again will need to go in memory. */
217 DOESNT_MATTER
220 /* Aggregates of 17 bytes or more are normally passed and returned
221 in memory, so aggregates of that size can safely be analyzed as
222 DOESNT_MATTER. We need to be able to collect enough pieces to
223 represent a PST that is smaller than that. Since predicates are
224 2 bytes in size for -msve-vector-bits=128, that means we need to be
225 able to store at least 8 pieces.
227 We also need to be able to store enough pieces to represent
228 a single vector in each vector argument register and a single
229 predicate in each predicate argument register. This means that
230 we need at least 12 pieces. */
231 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
232 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
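/* Editorial note: NUM_FP_ARG_REGS is the eight SIMD&FP argument registers
   (V0-V7, which alias Z0-Z7) and NUM_PR_ARG_REGS the four predicate
   argument registers (P0-P3), so MAX_PIECES is 12.  As a worked example of
   the 8-piece requirement above: with -msve-vector-bits=128, a PST built
   from eight svbool_t members occupies 8 * 2 = 16 bytes, which is below
   the 17-byte memory threshold and needs one piece per predicate.  */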
234 /* Describes one piece of a PST. Each piece is one of:
236 - a single Scalable Vector Type (SVT)
237 - a single Scalable Predicate Type (SPT)
238 - a PST containing 2, 3 or 4 SVTs, with no padding
240 It either represents a single built-in type or a PST formed from
241 multiple homogeneous built-in types. */
242 struct piece
244 rtx get_rtx (unsigned int, unsigned int) const;
246 /* The number of vector and predicate registers that the piece
247 occupies. One of the two is always zero. */
248 unsigned int num_zr;
249 unsigned int num_pr;
251 /* The mode of the registers described above. */
252 machine_mode mode;
254 /* If this piece is formed from multiple homogeneous built-in types,
255 this is the mode of the built-in types, otherwise it is MODE. */
256 machine_mode orig_mode;
258 /* The offset in bytes of the piece from the start of the type. */
259 poly_uint64_pod offset;
262 /* Divides types analyzed as IS_PST into individual pieces. The pieces
263 are in memory order. */
264 auto_vec<piece, MAX_PIECES> pieces;
266 unsigned int num_zr () const;
267 unsigned int num_pr () const;
269 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
271 analysis_result analyze (const_tree);
272 bool analyze_registers (const_tree);
274 private:
275 analysis_result analyze_array (const_tree);
276 analysis_result analyze_record (const_tree);
277 void add_piece (const piece &);
281 /* The current code model. */
282 enum aarch64_code_model aarch64_cmodel;
284 /* The number of 64-bit elements in an SVE vector. */
285 poly_uint16 aarch64_sve_vg;
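/* Editorial note: "vg" counts 64-bit granules, following the SVE naming
   convention.  With -msve-vector-bits=256 this is the constant 4; for
   vector-length-agnostic code it is the runtime value 2 + 2 * x,
   represented as poly_uint16 (2, 2).  */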
287 #ifdef HAVE_AS_TLS
288 #undef TARGET_HAVE_TLS
289 #define TARGET_HAVE_TLS 1
290 #endif
292 static bool aarch64_composite_type_p (const_tree, machine_mode);
293 static bool aarch64_return_in_memory_1 (const_tree);
294 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
295 const_tree,
296 machine_mode *, int *,
297 bool *, bool);
298 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
299 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
300 static void aarch64_override_options_after_change (void);
301 static bool aarch64_vector_mode_supported_p (machine_mode);
302 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
303 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
304 const_tree type,
305 int misalignment,
306 bool is_packed);
307 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
308 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
309 aarch64_addr_query_type);
311 /* The processor for which instructions should be scheduled. */
312 enum aarch64_processor aarch64_tune = cortexa53;
314 /* Mask to specify which instruction scheduling options should be used. */
315 uint64_t aarch64_tune_flags = 0;
317 /* Global flag for PC relative loads. */
318 bool aarch64_pcrelative_literal_loads;
320 /* Global flag for whether frame pointer is enabled. */
321 bool aarch64_use_frame_pointer;
323 char *accepted_branch_protection_string = NULL;
325 /* Support for command line parsing of boolean flags in the tuning
326 structures. */
327 struct aarch64_flag_desc
329 const char* name;
330 unsigned int flag;
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334 { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
337 { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339 { "all", AARCH64_FUSE_ALL },
340 { NULL, AARCH64_FUSE_NOTHING }
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344 { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
347 { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349 { "all", AARCH64_EXTRA_TUNE_ALL },
350 { NULL, AARCH64_EXTRA_TUNE_NONE }
353 /* Tuning parameters. */
355 static const struct cpu_addrcost_table generic_addrcost_table =
358 1, /* hi */
359 0, /* si */
360 0, /* di */
361 1, /* ti */
363 0, /* pre_modify */
364 0, /* post_modify */
365 0, /* post_modify_ld3_st3 */
366 0, /* post_modify_ld4_st4 */
367 0, /* register_offset */
368 0, /* register_sextend */
369 0, /* register_zextend */
370 0 /* imm_offset */
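/* Editorial note on reading this and the following address-cost tables:
   the hi/si/di/ti block gives the extra cost of a scaled register offset
   for 16/32/64/128-bit accesses; pre_modify and the post_modify* entries
   cover writeback addressing; register_offset and register_[sz]extend
   cover plain and extended register offsets; imm_offset covers immediate
   offsets.  A value of 0 means no extra cost over the base address cost.  */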
373 static const struct cpu_addrcost_table exynosm1_addrcost_table =
376 0, /* hi */
377 0, /* si */
378 0, /* di */
379 2, /* ti */
381 0, /* pre_modify */
382 0, /* post_modify */
383 0, /* post_modify_ld3_st3 */
384 0, /* post_modify_ld4_st4 */
385 1, /* register_offset */
386 1, /* register_sextend */
387 2, /* register_zextend */
388 0, /* imm_offset */
391 static const struct cpu_addrcost_table xgene1_addrcost_table =
394 1, /* hi */
395 0, /* si */
396 0, /* di */
397 1, /* ti */
399 1, /* pre_modify */
400 1, /* post_modify */
401 1, /* post_modify_ld3_st3 */
402 1, /* post_modify_ld4_st4 */
403 0, /* register_offset */
404 1, /* register_sextend */
405 1, /* register_zextend */
406 0, /* imm_offset */
409 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
412 1, /* hi */
413 1, /* si */
414 1, /* di */
415 2, /* ti */
417 0, /* pre_modify */
418 0, /* post_modify */
419 0, /* post_modify_ld3_st3 */
420 0, /* post_modify_ld4_st4 */
421 2, /* register_offset */
422 3, /* register_sextend */
423 3, /* register_zextend */
424 0, /* imm_offset */
427 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
430 1, /* hi */
431 1, /* si */
432 1, /* di */
433 2, /* ti */
435 0, /* pre_modify */
436 0, /* post_modify */
437 0, /* post_modify_ld3_st3 */
438 0, /* post_modify_ld4_st4 */
439 2, /* register_offset */
440 3, /* register_sextend */
441 3, /* register_zextend */
442 0, /* imm_offset */
445 static const struct cpu_addrcost_table tsv110_addrcost_table =
448 1, /* hi */
449 0, /* si */
450 0, /* di */
451 1, /* ti */
453 0, /* pre_modify */
454 0, /* post_modify */
455 0, /* post_modify_ld3_st3 */
456 0, /* post_modify_ld4_st4 */
457 0, /* register_offset */
458 1, /* register_sextend */
459 1, /* register_zextend */
460 0, /* imm_offset */
463 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
466 1, /* hi */
467 1, /* si */
468 1, /* di */
469 2, /* ti */
471 1, /* pre_modify */
472 1, /* post_modify */
473 1, /* post_modify_ld3_st3 */
474 1, /* post_modify_ld4_st4 */
475 3, /* register_offset */
476 3, /* register_sextend */
477 3, /* register_zextend */
478 2, /* imm_offset */
481 static const struct cpu_addrcost_table a64fx_addrcost_table =
484 1, /* hi */
485 1, /* si */
486 1, /* di */
487 2, /* ti */
489 0, /* pre_modify */
490 0, /* post_modify */
491 0, /* post_modify_ld3_st3 */
492 0, /* post_modify_ld4_st4 */
493 2, /* register_offset */
494 3, /* register_sextend */
495 3, /* register_zextend */
496 0, /* imm_offset */
499 static const struct cpu_addrcost_table neoversev1_addrcost_table =
502 1, /* hi */
503 0, /* si */
504 0, /* di */
505 1, /* ti */
507 0, /* pre_modify */
508 0, /* post_modify */
509 3, /* post_modify_ld3_st3 */
510 3, /* post_modify_ld4_st4 */
511 0, /* register_offset */
512 0, /* register_sextend */
513 0, /* register_zextend */
514 0 /* imm_offset */
517 static const struct cpu_addrcost_table neoversen2_addrcost_table =
520 1, /* hi */
521 0, /* si */
522 0, /* di */
523 1, /* ti */
525 0, /* pre_modify */
526 0, /* post_modify */
527 2, /* post_modify_ld3_st3 */
528 2, /* post_modify_ld4_st4 */
529 0, /* register_offset */
530 0, /* register_sextend */
531 0, /* register_zextend */
532 0 /* imm_offset */
535 static const struct cpu_addrcost_table neoversev2_addrcost_table =
538 1, /* hi */
539 0, /* si */
540 0, /* di */
541 1, /* ti */
543 0, /* pre_modify */
544 0, /* post_modify */
545 2, /* post_modify_ld3_st3 */
546 2, /* post_modify_ld4_st4 */
547 0, /* register_offset */
548 0, /* register_sextend */
549 0, /* register_zextend */
550 0 /* imm_offset */
553 static const struct cpu_regmove_cost generic_regmove_cost =
555 1, /* GP2GP */
556 /* Avoid the use of slow int<->fp moves for spilling by setting
557 their cost higher than memmov_cost. */
558 5, /* GP2FP */
559 5, /* FP2GP */
560 2 /* FP2FP */
563 static const struct cpu_regmove_cost cortexa57_regmove_cost =
565 1, /* GP2GP */
566 /* Avoid the use of slow int<->fp moves for spilling by setting
567 their cost higher than memmov_cost. */
568 5, /* GP2FP */
569 5, /* FP2GP */
570 2 /* FP2FP */
573 static const struct cpu_regmove_cost cortexa53_regmove_cost =
575 1, /* GP2GP */
576 /* Avoid the use of slow int<->fp moves for spilling by setting
577 their cost higher than memmov_cost. */
578 5, /* GP2FP */
579 5, /* FP2GP */
580 2 /* FP2FP */
583 static const struct cpu_regmove_cost exynosm1_regmove_cost =
585 1, /* GP2GP */
586 /* Avoid the use of slow int<->fp moves for spilling by setting
587 their cost higher than memmov_cost (actual, 4 and 9). */
588 9, /* GP2FP */
589 9, /* FP2GP */
590 1 /* FP2FP */
593 static const struct cpu_regmove_cost thunderx_regmove_cost =
595 2, /* GP2GP */
596 2, /* GP2FP */
597 6, /* FP2GP */
598 4 /* FP2FP */
601 static const struct cpu_regmove_cost xgene1_regmove_cost =
603 1, /* GP2GP */
604 /* Avoid the use of slow int<->fp moves for spilling by setting
605 their cost higher than memmov_cost. */
606 8, /* GP2FP */
607 8, /* FP2GP */
608 2 /* FP2FP */
611 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
613 2, /* GP2GP */
614 /* Avoid the use of int<->fp moves for spilling. */
615 6, /* GP2FP */
616 6, /* FP2GP */
617 4 /* FP2FP */
620 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
622 1, /* GP2GP */
623 /* Avoid the use of int<->fp moves for spilling. */
624 5, /* GP2FP */
625 6, /* FP2GP */
626 3, /* FP2FP */
629 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
631 1, /* GP2GP */
632 /* Avoid the use of int<->fp moves for spilling. */
633 4, /* GP2FP */
634 5, /* FP2GP */
635 4 /* FP2FP */
638 static const struct cpu_regmove_cost tsv110_regmove_cost =
640 1, /* GP2GP */
641 /* Avoid the use of slow int<->fp moves for spilling by setting
642 their cost higher than memmov_cost. */
643 2, /* GP2FP */
644 3, /* FP2GP */
645 2 /* FP2FP */
648 static const struct cpu_regmove_cost a64fx_regmove_cost =
650 1, /* GP2GP */
651 /* Avoid the use of slow int<->fp moves for spilling by setting
652 their cost higher than memmov_cost. */
653 5, /* GP2FP */
654 7, /* FP2GP */
655 2 /* FP2FP */
658 static const struct cpu_regmove_cost neoversen2_regmove_cost =
660 1, /* GP2GP */
661 /* Spilling to int<->fp instead of memory is recommended so set
662 realistic costs compared to memmov_cost. */
663 3, /* GP2FP */
664 2, /* FP2GP */
665 2 /* FP2FP */
668 static const struct cpu_regmove_cost neoversev1_regmove_cost =
670 1, /* GP2GP */
671 /* Spilling to int<->fp instead of memory is recommended so set
672 realistic costs compared to memmov_cost. */
673 3, /* GP2FP */
674 2, /* FP2GP */
675 2 /* FP2FP */
678 static const struct cpu_regmove_cost neoversev2_regmove_cost =
680 1, /* GP2GP */
681 /* Spilling to int<->fp instead of memory is recommended so set
682 realistic costs compared to memmov_cost. */
683 3, /* GP2FP */
684 2, /* FP2GP */
685 2 /* FP2FP */
688 /* Generic costs for Advanced SIMD vector operations. */
689 static const advsimd_vec_cost generic_advsimd_vector_cost =
691 1, /* int_stmt_cost */
692 1, /* fp_stmt_cost */
693 0, /* ld2_st2_permute_cost */
694 0, /* ld3_st3_permute_cost */
695 0, /* ld4_st4_permute_cost */
696 2, /* permute_cost */
697 2, /* reduc_i8_cost */
698 2, /* reduc_i16_cost */
699 2, /* reduc_i32_cost */
700 2, /* reduc_i64_cost */
701 2, /* reduc_f16_cost */
702 2, /* reduc_f32_cost */
703 2, /* reduc_f64_cost */
704 2, /* store_elt_extra_cost */
705 2, /* vec_to_scalar_cost */
706 1, /* scalar_to_vec_cost */
707 1, /* align_load_cost */
708 1, /* unalign_load_cost */
709 1, /* unalign_store_cost */
710 1 /* store_cost */
713 /* Generic costs for SVE vector operations. */
714 static const sve_vec_cost generic_sve_vector_cost =
717 1, /* int_stmt_cost */
718 1, /* fp_stmt_cost */
719 0, /* ld2_st2_permute_cost */
720 0, /* ld3_st3_permute_cost */
721 0, /* ld4_st4_permute_cost */
722 2, /* permute_cost */
723 2, /* reduc_i8_cost */
724 2, /* reduc_i16_cost */
725 2, /* reduc_i32_cost */
726 2, /* reduc_i64_cost */
727 2, /* reduc_f16_cost */
728 2, /* reduc_f32_cost */
729 2, /* reduc_f64_cost */
730 2, /* store_elt_extra_cost */
731 2, /* vec_to_scalar_cost */
732 1, /* scalar_to_vec_cost */
733 1, /* align_load_cost */
734 1, /* unalign_load_cost */
735 1, /* unalign_store_cost */
736 1 /* store_cost */
738 2, /* clast_cost */
739 2, /* fadda_f16_cost */
740 2, /* fadda_f32_cost */
741 2, /* fadda_f64_cost */
742 4, /* gather_load_x32_cost */
743 2, /* gather_load_x64_cost */
744 1 /* scatter_store_elt_cost */
747 /* Generic costs for vector insn classes. */
748 static const struct cpu_vector_cost generic_vector_cost =
750 1, /* scalar_int_stmt_cost */
751 1, /* scalar_fp_stmt_cost */
752 1, /* scalar_load_cost */
753 1, /* scalar_store_cost */
754 3, /* cond_taken_branch_cost */
755 1, /* cond_not_taken_branch_cost */
756 &generic_advsimd_vector_cost, /* advsimd */
757 &generic_sve_vector_cost, /* sve */
758 nullptr /* issue_info */
761 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
763 2, /* int_stmt_cost */
764 5, /* fp_stmt_cost */
765 0, /* ld2_st2_permute_cost */
766 0, /* ld3_st3_permute_cost */
767 0, /* ld4_st4_permute_cost */
768 3, /* permute_cost */
769 13, /* reduc_i8_cost */
770 13, /* reduc_i16_cost */
771 13, /* reduc_i32_cost */
772 13, /* reduc_i64_cost */
773 13, /* reduc_f16_cost */
774 13, /* reduc_f32_cost */
775 13, /* reduc_f64_cost */
776 13, /* store_elt_extra_cost */
777 13, /* vec_to_scalar_cost */
778 4, /* scalar_to_vec_cost */
779 6, /* align_load_cost */
780 6, /* unalign_load_cost */
781 1, /* unalign_store_cost */
782 1 /* store_cost */
785 static const sve_vec_cost a64fx_sve_vector_cost =
788 2, /* int_stmt_cost */
789 5, /* fp_stmt_cost */
790 0, /* ld2_st2_permute_cost */
791 0, /* ld3_st3_permute_cost */
792 0, /* ld4_st4_permute_cost */
793 3, /* permute_cost */
794 13, /* reduc_i8_cost */
795 13, /* reduc_i16_cost */
796 13, /* reduc_i32_cost */
797 13, /* reduc_i64_cost */
798 13, /* reduc_f16_cost */
799 13, /* reduc_f32_cost */
800 13, /* reduc_f64_cost */
801 13, /* store_elt_extra_cost */
802 13, /* vec_to_scalar_cost */
803 4, /* scalar_to_vec_cost */
804 6, /* align_load_cost */
805 6, /* unalign_load_cost */
806 1, /* unalign_store_cost */
807 1 /* store_cost */
809 13, /* clast_cost */
810 13, /* fadda_f16_cost */
811 13, /* fadda_f32_cost */
812 13, /* fadda_f64_cost */
813 64, /* gather_load_x32_cost */
814 32, /* gather_load_x64_cost */
815 1 /* scatter_store_elt_cost */
818 static const struct cpu_vector_cost a64fx_vector_cost =
820 1, /* scalar_int_stmt_cost */
821 5, /* scalar_fp_stmt_cost */
822 4, /* scalar_load_cost */
823 1, /* scalar_store_cost */
824 3, /* cond_taken_branch_cost */
825 1, /* cond_not_taken_branch_cost */
826 &a64fx_advsimd_vector_cost, /* advsimd */
827 &a64fx_sve_vector_cost, /* sve */
828 nullptr /* issue_info */
831 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
833 1, /* int_stmt_cost */
834 3, /* fp_stmt_cost */
835 0, /* ld2_st2_permute_cost */
836 0, /* ld3_st3_permute_cost */
837 0, /* ld4_st4_permute_cost */
838 2, /* permute_cost */
839 1, /* reduc_i8_cost */
840 1, /* reduc_i16_cost */
841 1, /* reduc_i32_cost */
842 1, /* reduc_i64_cost */
843 1, /* reduc_f16_cost */
844 1, /* reduc_f32_cost */
845 1, /* reduc_f64_cost */
846 1, /* store_elt_extra_cost */
847 1, /* vec_to_scalar_cost */
848 1, /* scalar_to_vec_cost */
849 1, /* align_load_cost */
850 1, /* unalign_load_cost */
851 1, /* unalign_store_cost */
852 1 /* store_cost */
855 /* QDF24XX costs for vector insn classes. */
856 static const struct cpu_vector_cost qdf24xx_vector_cost =
858 1, /* scalar_int_stmt_cost */
859 1, /* scalar_fp_stmt_cost */
860 1, /* scalar_load_cost */
861 1, /* scalar_store_cost */
862 3, /* cond_taken_branch_cost */
863 1, /* cond_not_taken_branch_cost */
864 &qdf24xx_advsimd_vector_cost, /* advsimd */
865 nullptr, /* sve */
866 nullptr /* issue_info */
870 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
872 4, /* int_stmt_cost */
873 1, /* fp_stmt_cost */
874 0, /* ld2_st2_permute_cost */
875 0, /* ld3_st3_permute_cost */
876 0, /* ld4_st4_permute_cost */
877 4, /* permute_cost */
878 2, /* reduc_i8_cost */
879 2, /* reduc_i16_cost */
880 2, /* reduc_i32_cost */
881 2, /* reduc_i64_cost */
882 2, /* reduc_f16_cost */
883 2, /* reduc_f32_cost */
884 2, /* reduc_f64_cost */
885 2, /* store_elt_extra_cost */
886 2, /* vec_to_scalar_cost */
887 2, /* scalar_to_vec_cost */
888 3, /* align_load_cost */
889 5, /* unalign_load_cost */
890 5, /* unalign_store_cost */
891 1 /* store_cost */
894 /* ThunderX costs for vector insn classes. */
895 static const struct cpu_vector_cost thunderx_vector_cost =
897 1, /* scalar_int_stmt_cost */
898 1, /* scalar_fp_stmt_cost */
899 3, /* scalar_load_cost */
900 1, /* scalar_store_cost */
901 3, /* cond_taken_branch_cost */
902 3, /* cond_not_taken_branch_cost */
903 &thunderx_advsimd_vector_cost, /* advsimd */
904 nullptr, /* sve */
905 nullptr /* issue_info */
908 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
910 2, /* int_stmt_cost */
911 2, /* fp_stmt_cost */
912 0, /* ld2_st2_permute_cost */
913 0, /* ld3_st3_permute_cost */
914 0, /* ld4_st4_permute_cost */
915 2, /* permute_cost */
916 3, /* reduc_i8_cost */
917 3, /* reduc_i16_cost */
918 3, /* reduc_i32_cost */
919 3, /* reduc_i64_cost */
920 3, /* reduc_f16_cost */
921 3, /* reduc_f32_cost */
922 3, /* reduc_f64_cost */
923 3, /* store_elt_extra_cost */
924 3, /* vec_to_scalar_cost */
925 2, /* scalar_to_vec_cost */
926 5, /* align_load_cost */
927 5, /* unalign_load_cost */
928 1, /* unalign_store_cost */
929 1 /* store_cost */
932 static const struct cpu_vector_cost tsv110_vector_cost =
934 1, /* scalar_int_stmt_cost */
935 1, /* scalar_fp_stmt_cost */
936 5, /* scalar_load_cost */
937 1, /* scalar_store_cost */
938 1, /* cond_taken_branch_cost */
939 1, /* cond_not_taken_branch_cost */
940 &tsv110_advsimd_vector_cost, /* advsimd */
941 nullptr, /* sve */
942 nullptr /* issue_info */
945 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
947 2, /* int_stmt_cost */
948 2, /* fp_stmt_cost */
949 0, /* ld2_st2_permute_cost */
950 0, /* ld3_st3_permute_cost */
951 0, /* ld4_st4_permute_cost */
952 3, /* permute_cost */
953 8, /* reduc_i8_cost */
954 8, /* reduc_i16_cost */
955 8, /* reduc_i32_cost */
956 8, /* reduc_i64_cost */
957 8, /* reduc_f16_cost */
958 8, /* reduc_f32_cost */
959 8, /* reduc_f64_cost */
960 8, /* store_elt_extra_cost */
961 8, /* vec_to_scalar_cost */
962 8, /* scalar_to_vec_cost */
963 4, /* align_load_cost */
964 4, /* unalign_load_cost */
965 1, /* unalign_store_cost */
966 1 /* store_cost */
969 /* Cortex-A57 costs for vector insn classes. */
970 static const struct cpu_vector_cost cortexa57_vector_cost =
972 1, /* scalar_int_stmt_cost */
973 1, /* scalar_fp_stmt_cost */
974 4, /* scalar_load_cost */
975 1, /* scalar_store_cost */
976 1, /* cond_taken_branch_cost */
977 1, /* cond_not_taken_branch_cost */
978 &cortexa57_advsimd_vector_cost, /* advsimd */
979 nullptr, /* sve */
980 nullptr /* issue_info */
983 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
985 3, /* int_stmt_cost */
986 3, /* fp_stmt_cost */
987 0, /* ld2_st2_permute_cost */
988 0, /* ld3_st3_permute_cost */
989 0, /* ld4_st4_permute_cost */
990 3, /* permute_cost */
991 3, /* reduc_i8_cost */
992 3, /* reduc_i16_cost */
993 3, /* reduc_i32_cost */
994 3, /* reduc_i64_cost */
995 3, /* reduc_f16_cost */
996 3, /* reduc_f32_cost */
997 3, /* reduc_f64_cost */
998 3, /* store_elt_extra_cost */
999 3, /* vec_to_scalar_cost */
1000 3, /* scalar_to_vec_cost */
1001 5, /* align_load_cost */
1002 5, /* unalign_load_cost */
1003 1, /* unalign_store_cost */
1004 1 /* store_cost */
1007 static const struct cpu_vector_cost exynosm1_vector_cost =
1009 1, /* scalar_int_stmt_cost */
1010 1, /* scalar_fp_stmt_cost */
1011 5, /* scalar_load_cost */
1012 1, /* scalar_store_cost */
1013 1, /* cond_taken_branch_cost */
1014 1, /* cond_not_taken_branch_cost */
1015 &exynosm1_advsimd_vector_cost, /* advsimd */
1016 nullptr, /* sve */
1017 nullptr /* issue_info */
1020 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1022 2, /* int_stmt_cost */
1023 2, /* fp_stmt_cost */
1024 0, /* ld2_st2_permute_cost */
1025 0, /* ld3_st3_permute_cost */
1026 0, /* ld4_st4_permute_cost */
1027 2, /* permute_cost */
1028 4, /* reduc_i8_cost */
1029 4, /* reduc_i16_cost */
1030 4, /* reduc_i32_cost */
1031 4, /* reduc_i64_cost */
1032 4, /* reduc_f16_cost */
1033 4, /* reduc_f32_cost */
1034 4, /* reduc_f64_cost */
1035 4, /* store_elt_extra_cost */
1036 4, /* vec_to_scalar_cost */
1037 4, /* scalar_to_vec_cost */
1038 10, /* align_load_cost */
1039 10, /* unalign_load_cost */
1040 2, /* unalign_store_cost */
1041 2 /* store_cost */
1044 /* Generic costs for vector insn classes. */
1045 static const struct cpu_vector_cost xgene1_vector_cost =
1047 1, /* scalar_int_stmt_cost */
1048 1, /* scalar_fp_stmt_cost */
1049 5, /* scalar_load_cost */
1050 1, /* scalar_store_cost */
1051 2, /* cond_taken_branch_cost */
1052 1, /* cond_not_taken_branch_cost */
1053 &xgene1_advsimd_vector_cost, /* advsimd */
1054 nullptr, /* sve */
1055 nullptr /* issue_info */
1058 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1060 4, /* int_stmt_cost */
1061 5, /* fp_stmt_cost */
1062 0, /* ld2_st2_permute_cost */
1063 0, /* ld3_st3_permute_cost */
1064 0, /* ld4_st4_permute_cost */
1065 10, /* permute_cost */
1066 6, /* reduc_i8_cost */
1067 6, /* reduc_i16_cost */
1068 6, /* reduc_i32_cost */
1069 6, /* reduc_i64_cost */
1070 6, /* reduc_f16_cost */
1071 6, /* reduc_f32_cost */
1072 6, /* reduc_f64_cost */
1073 6, /* store_elt_extra_cost */
1074 6, /* vec_to_scalar_cost */
1075 5, /* scalar_to_vec_cost */
1076 4, /* align_load_cost */
1077 4, /* unalign_load_cost */
1078 1, /* unalign_store_cost */
1079 1 /* store_cost */
1082 /* Costs for vector insn classes for Vulcan. */
1083 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1085 1, /* scalar_int_stmt_cost */
1086 6, /* scalar_fp_stmt_cost */
1087 4, /* scalar_load_cost */
1088 1, /* scalar_store_cost */
1089 2, /* cond_taken_branch_cost */
1090 1, /* cond_not_taken_branch_cost */
1091 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1092 nullptr, /* sve */
1093 nullptr /* issue_info */
1096 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1098 5, /* int_stmt_cost */
1099 5, /* fp_stmt_cost */
1100 0, /* ld2_st2_permute_cost */
1101 0, /* ld3_st3_permute_cost */
1102 0, /* ld4_st4_permute_cost */
1103 10, /* permute_cost */
1104 5, /* reduc_i8_cost */
1105 5, /* reduc_i16_cost */
1106 5, /* reduc_i32_cost */
1107 5, /* reduc_i64_cost */
1108 5, /* reduc_f16_cost */
1109 5, /* reduc_f32_cost */
1110 5, /* reduc_f64_cost */
1111 5, /* store_elt_extra_cost */
1112 5, /* vec_to_scalar_cost */
1113 5, /* scalar_to_vec_cost */
1114 4, /* align_load_cost */
1115 4, /* unalign_load_cost */
1116 4, /* unalign_store_cost */
1117 4 /* store_cost */
1120 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1122 1, /* scalar_int_stmt_cost */
1123 5, /* scalar_fp_stmt_cost */
1124 4, /* scalar_load_cost */
1125 1, /* scalar_store_cost */
1126 2, /* cond_taken_branch_cost */
1127 1, /* cond_not_taken_branch_cost */
1128 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1129 nullptr, /* sve */
1130 nullptr /* issue_info */
1133 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1135 1, /* int_stmt_cost */
1136 3, /* fp_stmt_cost */
1137 0, /* ld2_st2_permute_cost */
1138 0, /* ld3_st3_permute_cost */
1139 0, /* ld4_st4_permute_cost */
1140 2, /* permute_cost */
1141 12, /* reduc_i8_cost */
1142 9, /* reduc_i16_cost */
1143 6, /* reduc_i32_cost */
1144 5, /* reduc_i64_cost */
1145 9, /* reduc_f16_cost */
1146 6, /* reduc_f32_cost */
1147 5, /* reduc_f64_cost */
1148 8, /* store_elt_extra_cost */
1149 6, /* vec_to_scalar_cost */
1150 7, /* scalar_to_vec_cost */
1151 4, /* align_load_cost */
1152 4, /* unalign_load_cost */
1153 1, /* unalign_store_cost */
1154 1 /* store_cost */
1157 /* Ampere-1 costs for vector insn classes. */
1158 static const struct cpu_vector_cost ampere1_vector_cost =
1160 1, /* scalar_int_stmt_cost */
1161 3, /* scalar_fp_stmt_cost */
1162 4, /* scalar_load_cost */
1163 1, /* scalar_store_cost */
1164 1, /* cond_taken_branch_cost */
1165 1, /* cond_not_taken_branch_cost */
1166 &ampere1_advsimd_vector_cost, /* advsimd */
1167 nullptr, /* sve */
1168 nullptr /* issue_info */
1171 /* Generic costs for branch instructions. */
1172 static const struct cpu_branch_cost generic_branch_cost =
1174 1, /* Predictable. */
1175 3 /* Unpredictable. */
1178 /* Generic approximation modes. */
1179 static const cpu_approx_modes generic_approx_modes =
1181 AARCH64_APPROX_NONE, /* division */
1182 AARCH64_APPROX_NONE, /* sqrt */
1183 AARCH64_APPROX_NONE /* recip_sqrt */
1186 /* Approximation modes for Exynos M1. */
1187 static const cpu_approx_modes exynosm1_approx_modes =
1189 AARCH64_APPROX_NONE, /* division */
1190 AARCH64_APPROX_ALL, /* sqrt */
1191 AARCH64_APPROX_ALL /* recip_sqrt */
1194 /* Approximation modes for X-Gene 1. */
1195 static const cpu_approx_modes xgene1_approx_modes =
1197 AARCH64_APPROX_NONE, /* division */
1198 AARCH64_APPROX_NONE, /* sqrt */
1199 AARCH64_APPROX_ALL /* recip_sqrt */
1202 /* Generic prefetch settings (which disable prefetch). */
1203 static const cpu_prefetch_tune generic_prefetch_tune =
1205 0, /* num_slots */
1206 -1, /* l1_cache_size */
1207 -1, /* l1_cache_line_size */
1208 -1, /* l2_cache_size */
1209 true, /* prefetch_dynamic_strides */
1210 -1, /* minimum_stride */
1211 -1 /* default_opt_level */
1214 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1216 0, /* num_slots */
1217 -1, /* l1_cache_size */
1218 64, /* l1_cache_line_size */
1219 -1, /* l2_cache_size */
1220 true, /* prefetch_dynamic_strides */
1221 -1, /* minimum_stride */
1222 -1 /* default_opt_level */
1225 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1227 4, /* num_slots */
1228 32, /* l1_cache_size */
1229 64, /* l1_cache_line_size */
1230 512, /* l2_cache_size */
1231 false, /* prefetch_dynamic_strides */
1232 2048, /* minimum_stride */
1233 3 /* default_opt_level */
1236 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1238 8, /* num_slots */
1239 32, /* l1_cache_size */
1240 128, /* l1_cache_line_size */
1241 16*1024, /* l2_cache_size */
1242 true, /* prefetch_dynamic_strides */
1243 -1, /* minimum_stride */
1244 3 /* default_opt_level */
1247 static const cpu_prefetch_tune thunderx_prefetch_tune =
1249 8, /* num_slots */
1250 32, /* l1_cache_size */
1251 128, /* l1_cache_line_size */
1252 -1, /* l2_cache_size */
1253 true, /* prefetch_dynamic_strides */
1254 -1, /* minimum_stride */
1255 -1 /* default_opt_level */
1258 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1260 8, /* num_slots */
1261 32, /* l1_cache_size */
1262 64, /* l1_cache_line_size */
1263 256, /* l2_cache_size */
1264 true, /* prefetch_dynamic_strides */
1265 -1, /* minimum_stride */
1266 -1 /* default_opt_level */
1269 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1271 8, /* num_slots */
1272 32, /* l1_cache_size */
1273 64, /* l1_cache_line_size */
1274 256, /* l2_cache_size */
1275 true, /* prefetch_dynamic_strides */
1276 -1, /* minimum_stride */
1277 -1 /* default_opt_level */
1280 static const cpu_prefetch_tune tsv110_prefetch_tune =
1282 0, /* num_slots */
1283 64, /* l1_cache_size */
1284 64, /* l1_cache_line_size */
1285 512, /* l2_cache_size */
1286 true, /* prefetch_dynamic_strides */
1287 -1, /* minimum_stride */
1288 -1 /* default_opt_level */
1291 static const cpu_prefetch_tune xgene1_prefetch_tune =
1293 8, /* num_slots */
1294 32, /* l1_cache_size */
1295 64, /* l1_cache_line_size */
1296 256, /* l2_cache_size */
1297 true, /* prefetch_dynamic_strides */
1298 -1, /* minimum_stride */
1299 -1 /* default_opt_level */
1302 static const cpu_prefetch_tune a64fx_prefetch_tune =
1304 8, /* num_slots */
1305 64, /* l1_cache_size */
1306 256, /* l1_cache_line_size */
1307 32768, /* l2_cache_size */
1308 true, /* prefetch_dynamic_strides */
1309 -1, /* minimum_stride */
1310 -1 /* default_opt_level */
1313 static const cpu_prefetch_tune ampere1_prefetch_tune =
1315 0, /* num_slots */
1316 64, /* l1_cache_size */
1317 64, /* l1_cache_line_size */
1318 2048, /* l2_cache_size */
1319 true, /* prefetch_dynamic_strides */
1320 -1, /* minimum_stride */
1321 -1 /* default_opt_level */
1324 static const struct tune_params generic_tunings =
1326 &cortexa57_extra_costs,
1327 &generic_addrcost_table,
1328 &generic_regmove_cost,
1329 &generic_vector_cost,
1330 &generic_branch_cost,
1331 &generic_approx_modes,
1332 SVE_NOT_IMPLEMENTED, /* sve_width */
1333 { 4, /* load_int. */
1334 4, /* store_int. */
1335 4, /* load_fp. */
1336 4, /* store_fp. */
1337 4, /* load_pred. */
1338 4 /* store_pred. */
1339 }, /* memmov_cost. */
1340 2, /* issue_rate */
1341 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1342 "16:12", /* function_align. */
1343 "4", /* jump_align. */
1344 "8", /* loop_align. */
1345 2, /* int_reassoc_width. */
1346 4, /* fp_reassoc_width. */
1347 1, /* fma_reassoc_width. */
1348 1, /* vec_reassoc_width. */
1349 2, /* min_div_recip_mul_sf. */
1350 2, /* min_div_recip_mul_df. */
1351 0, /* max_case_values. */
1352 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1353 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1354 Neoverse V1. It does not have a noticeable effect on A64FX and should
1355 have at most a very minor effect on SVE2 cores. */
1356 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1357 &generic_prefetch_tune
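/* Editorial note: the function_align/jump_align/loop_align strings in these
   tuning tables use the same N:M syntax as -falign-functions and friends:
   N is the requested byte alignment and M caps how much padding may be
   inserted to reach it, so "16:12" above asks for 16-byte alignment with a
   modest padding limit, while several later tables use "32:16".  */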
1360 static const struct tune_params cortexa35_tunings =
1362 &cortexa53_extra_costs,
1363 &generic_addrcost_table,
1364 &cortexa53_regmove_cost,
1365 &generic_vector_cost,
1366 &generic_branch_cost,
1367 &generic_approx_modes,
1368 SVE_NOT_IMPLEMENTED, /* sve_width */
1369 { 4, /* load_int. */
1370 4, /* store_int. */
1371 4, /* load_fp. */
1372 4, /* store_fp. */
1373 4, /* load_pred. */
1374 4 /* store_pred. */
1375 }, /* memmov_cost. */
1376 1, /* issue_rate */
1377 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1378 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1379 "16", /* function_align. */
1380 "4", /* jump_align. */
1381 "8", /* loop_align. */
1382 2, /* int_reassoc_width. */
1383 4, /* fp_reassoc_width. */
1384 1, /* fma_reassoc_width. */
1385 1, /* vec_reassoc_width. */
1386 2, /* min_div_recip_mul_sf. */
1387 2, /* min_div_recip_mul_df. */
1388 0, /* max_case_values. */
1389 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1390 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1391 &generic_prefetch_tune
1394 static const struct tune_params cortexa53_tunings =
1396 &cortexa53_extra_costs,
1397 &generic_addrcost_table,
1398 &cortexa53_regmove_cost,
1399 &generic_vector_cost,
1400 &generic_branch_cost,
1401 &generic_approx_modes,
1402 SVE_NOT_IMPLEMENTED, /* sve_width */
1403 { 4, /* load_int. */
1404 4, /* store_int. */
1405 4, /* load_fp. */
1406 4, /* store_fp. */
1407 4, /* load_pred. */
1408 4 /* store_pred. */
1409 }, /* memmov_cost. */
1410 2, /* issue_rate */
1411 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1412 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1413 "16", /* function_align. */
1414 "4", /* jump_align. */
1415 "8", /* loop_align. */
1416 2, /* int_reassoc_width. */
1417 4, /* fp_reassoc_width. */
1418 1, /* fma_reassoc_width. */
1419 1, /* vec_reassoc_width. */
1420 2, /* min_div_recip_mul_sf. */
1421 2, /* min_div_recip_mul_df. */
1422 0, /* max_case_values. */
1423 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1424 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1425 &generic_prefetch_tune
1428 static const struct tune_params cortexa57_tunings =
1430 &cortexa57_extra_costs,
1431 &generic_addrcost_table,
1432 &cortexa57_regmove_cost,
1433 &cortexa57_vector_cost,
1434 &generic_branch_cost,
1435 &generic_approx_modes,
1436 SVE_NOT_IMPLEMENTED, /* sve_width */
1437 { 4, /* load_int. */
1438 4, /* store_int. */
1439 4, /* load_fp. */
1440 4, /* store_fp. */
1441 4, /* load_pred. */
1442 4 /* store_pred. */
1443 }, /* memmov_cost. */
1444 3, /* issue_rate */
1445 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1446 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1447 "16", /* function_align. */
1448 "4", /* jump_align. */
1449 "8", /* loop_align. */
1450 2, /* int_reassoc_width. */
1451 4, /* fp_reassoc_width. */
1452 1, /* fma_reassoc_width. */
1453 1, /* vec_reassoc_width. */
1454 2, /* min_div_recip_mul_sf. */
1455 2, /* min_div_recip_mul_df. */
1456 0, /* max_case_values. */
1457 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1458 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1459 &generic_prefetch_tune
1462 static const struct tune_params cortexa72_tunings =
1464 &cortexa57_extra_costs,
1465 &generic_addrcost_table,
1466 &cortexa57_regmove_cost,
1467 &cortexa57_vector_cost,
1468 &generic_branch_cost,
1469 &generic_approx_modes,
1470 SVE_NOT_IMPLEMENTED, /* sve_width */
1471 { 4, /* load_int. */
1472 4, /* store_int. */
1473 4, /* load_fp. */
1474 4, /* store_fp. */
1475 4, /* load_pred. */
1476 4 /* store_pred. */
1477 }, /* memmov_cost. */
1478 3, /* issue_rate */
1479 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1480 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1481 "16", /* function_align. */
1482 "4", /* jump_align. */
1483 "8", /* loop_align. */
1484 2, /* int_reassoc_width. */
1485 4, /* fp_reassoc_width. */
1486 1, /* fma_reassoc_width. */
1487 1, /* vec_reassoc_width. */
1488 2, /* min_div_recip_mul_sf. */
1489 2, /* min_div_recip_mul_df. */
1490 0, /* max_case_values. */
1491 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1492 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1493 &generic_prefetch_tune
1496 static const struct tune_params cortexa73_tunings =
1498 &cortexa57_extra_costs,
1499 &generic_addrcost_table,
1500 &cortexa57_regmove_cost,
1501 &cortexa57_vector_cost,
1502 &generic_branch_cost,
1503 &generic_approx_modes,
1504 SVE_NOT_IMPLEMENTED, /* sve_width */
1505 { 4, /* load_int. */
1506 4, /* store_int. */
1507 4, /* load_fp. */
1508 4, /* store_fp. */
1509 4, /* load_pred. */
1510 4 /* store_pred. */
1511 }, /* memmov_cost. */
1512 2, /* issue_rate. */
1513 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1514 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1515 "16", /* function_align. */
1516 "4", /* jump_align. */
1517 "8", /* loop_align. */
1518 2, /* int_reassoc_width. */
1519 4, /* fp_reassoc_width. */
1520 1, /* fma_reassoc_width. */
1521 1, /* vec_reassoc_width. */
1522 2, /* min_div_recip_mul_sf. */
1523 2, /* min_div_recip_mul_df. */
1524 0, /* max_case_values. */
1525 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1526 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1527 &generic_prefetch_tune
1532 static const struct tune_params exynosm1_tunings =
1534 &exynosm1_extra_costs,
1535 &exynosm1_addrcost_table,
1536 &exynosm1_regmove_cost,
1537 &exynosm1_vector_cost,
1538 &generic_branch_cost,
1539 &exynosm1_approx_modes,
1540 SVE_NOT_IMPLEMENTED, /* sve_width */
1541 { 4, /* load_int. */
1542 4, /* store_int. */
1543 4, /* load_fp. */
1544 4, /* store_fp. */
1545 4, /* load_pred. */
1546 4 /* store_pred. */
1547 }, /* memmov_cost. */
1548 3, /* issue_rate */
1549 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1550 "4", /* function_align. */
1551 "4", /* jump_align. */
1552 "4", /* loop_align. */
1553 2, /* int_reassoc_width. */
1554 4, /* fp_reassoc_width. */
1555 1, /* fma_reassoc_width. */
1556 1, /* vec_reassoc_width. */
1557 2, /* min_div_recip_mul_sf. */
1558 2, /* min_div_recip_mul_df. */
1559 48, /* max_case_values. */
1560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1561 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1562 &exynosm1_prefetch_tune
1565 static const struct tune_params thunderxt88_tunings =
1567 &thunderx_extra_costs,
1568 &generic_addrcost_table,
1569 &thunderx_regmove_cost,
1570 &thunderx_vector_cost,
1571 &generic_branch_cost,
1572 &generic_approx_modes,
1573 SVE_NOT_IMPLEMENTED, /* sve_width */
1574 { 6, /* load_int. */
1575 6, /* store_int. */
1576 6, /* load_fp. */
1577 6, /* store_fp. */
1578 6, /* load_pred. */
1579 6 /* store_pred. */
1580 }, /* memmov_cost. */
1581 2, /* issue_rate */
1582 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1583 "8", /* function_align. */
1584 "8", /* jump_align. */
1585 "8", /* loop_align. */
1586 2, /* int_reassoc_width. */
1587 4, /* fp_reassoc_width. */
1588 1, /* fma_reassoc_width. */
1589 1, /* vec_reassoc_width. */
1590 2, /* min_div_recip_mul_sf. */
1591 2, /* min_div_recip_mul_df. */
1592 0, /* max_case_values. */
1593 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1594 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1595 &thunderxt88_prefetch_tune
1598 static const struct tune_params thunderx_tunings =
1600 &thunderx_extra_costs,
1601 &generic_addrcost_table,
1602 &thunderx_regmove_cost,
1603 &thunderx_vector_cost,
1604 &generic_branch_cost,
1605 &generic_approx_modes,
1606 SVE_NOT_IMPLEMENTED, /* sve_width */
1607 { 6, /* load_int. */
1608 6, /* store_int. */
1609 6, /* load_fp. */
1610 6, /* store_fp. */
1611 6, /* load_pred. */
1612 6 /* store_pred. */
1613 }, /* memmov_cost. */
1614 2, /* issue_rate */
1615 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1616 "8", /* function_align. */
1617 "8", /* jump_align. */
1618 "8", /* loop_align. */
1619 2, /* int_reassoc_width. */
1620 4, /* fp_reassoc_width. */
1621 1, /* fma_reassoc_width. */
1622 1, /* vec_reassoc_width. */
1623 2, /* min_div_recip_mul_sf. */
1624 2, /* min_div_recip_mul_df. */
1625 0, /* max_case_values. */
1626 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1627 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1628 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1629 &thunderx_prefetch_tune
1632 static const struct tune_params tsv110_tunings =
1634 &tsv110_extra_costs,
1635 &tsv110_addrcost_table,
1636 &tsv110_regmove_cost,
1637 &tsv110_vector_cost,
1638 &generic_branch_cost,
1639 &generic_approx_modes,
1640 SVE_NOT_IMPLEMENTED, /* sve_width */
1641 { 4, /* load_int. */
1642 4, /* store_int. */
1643 4, /* load_fp. */
1644 4, /* store_fp. */
1645 4, /* load_pred. */
1646 4 /* store_pred. */
1647 }, /* memmov_cost. */
1648 4, /* issue_rate */
1649 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1650 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1651 "16", /* function_align. */
1652 "4", /* jump_align. */
1653 "8", /* loop_align. */
1654 2, /* int_reassoc_width. */
1655 4, /* fp_reassoc_width. */
1656 1, /* fma_reassoc_width. */
1657 1, /* vec_reassoc_width. */
1658 2, /* min_div_recip_mul_sf. */
1659 2, /* min_div_recip_mul_df. */
1660 0, /* max_case_values. */
1661 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1662 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1663 &tsv110_prefetch_tune
1666 static const struct tune_params xgene1_tunings =
1668 &xgene1_extra_costs,
1669 &xgene1_addrcost_table,
1670 &xgene1_regmove_cost,
1671 &xgene1_vector_cost,
1672 &generic_branch_cost,
1673 &xgene1_approx_modes,
1674 SVE_NOT_IMPLEMENTED, /* sve_width */
1675 { 6, /* load_int. */
1676 6, /* store_int. */
1677 6, /* load_fp. */
1678 6, /* store_fp. */
1679 6, /* load_pred. */
1680 6 /* store_pred. */
1681 }, /* memmov_cost. */
1682 4, /* issue_rate */
1683 AARCH64_FUSE_NOTHING, /* fusible_ops */
1684 "16", /* function_align. */
1685 "16", /* jump_align. */
1686 "16", /* loop_align. */
1687 2, /* int_reassoc_width. */
1688 4, /* fp_reassoc_width. */
1689 1, /* fma_reassoc_width. */
1690 1, /* vec_reassoc_width. */
1691 2, /* min_div_recip_mul_sf. */
1692 2, /* min_div_recip_mul_df. */
1693 17, /* max_case_values. */
1694 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1695 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1696 &xgene1_prefetch_tune
1699 static const struct tune_params emag_tunings =
1701 &xgene1_extra_costs,
1702 &xgene1_addrcost_table,
1703 &xgene1_regmove_cost,
1704 &xgene1_vector_cost,
1705 &generic_branch_cost,
1706 &xgene1_approx_modes,
1707 SVE_NOT_IMPLEMENTED,
1708 { 6, /* load_int. */
1709 6, /* store_int. */
1710 6, /* load_fp. */
1711 6, /* store_fp. */
1712 6, /* load_pred. */
1713 6 /* store_pred. */
1714 }, /* memmov_cost. */
1715 4, /* issue_rate */
1716 AARCH64_FUSE_NOTHING, /* fusible_ops */
1717 "16", /* function_align. */
1718 "16", /* jump_align. */
1719 "16", /* loop_align. */
1720 2, /* int_reassoc_width. */
1721 4, /* fp_reassoc_width. */
1722 1, /* fma_reassoc_width. */
1723 1, /* vec_reassoc_width. */
1724 2, /* min_div_recip_mul_sf. */
1725 2, /* min_div_recip_mul_df. */
1726 17, /* max_case_values. */
1727 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1728 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1729 &xgene1_prefetch_tune
1732 static const struct tune_params qdf24xx_tunings =
1734 &qdf24xx_extra_costs,
1735 &qdf24xx_addrcost_table,
1736 &qdf24xx_regmove_cost,
1737 &qdf24xx_vector_cost,
1738 &generic_branch_cost,
1739 &generic_approx_modes,
1740 SVE_NOT_IMPLEMENTED, /* sve_width */
1741 { 4, /* load_int. */
1742 4, /* store_int. */
1743 4, /* load_fp. */
1744 4, /* store_fp. */
1745 4, /* load_pred. */
1746 4 /* store_pred. */
1747 }, /* memmov_cost. */
1748 4, /* issue_rate */
1749 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1750 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1751 "16", /* function_align. */
1752 "8", /* jump_align. */
1753 "16", /* loop_align. */
1754 2, /* int_reassoc_width. */
1755 4, /* fp_reassoc_width. */
1756 1, /* fma_reassoc_width. */
1757 1, /* vec_reassoc_width. */
1758 2, /* min_div_recip_mul_sf. */
1759 2, /* min_div_recip_mul_df. */
1760 0, /* max_case_values. */
1761 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1762 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1763 &qdf24xx_prefetch_tune
1766 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1767 for now. */
1768 static const struct tune_params saphira_tunings =
1770 &generic_extra_costs,
1771 &generic_addrcost_table,
1772 &generic_regmove_cost,
1773 &generic_vector_cost,
1774 &generic_branch_cost,
1775 &generic_approx_modes,
1776 SVE_NOT_IMPLEMENTED, /* sve_width */
1777 { 4, /* load_int. */
1778 4, /* store_int. */
1779 4, /* load_fp. */
1780 4, /* store_fp. */
1781 4, /* load_pred. */
1782 4 /* store_pred. */
1783 }, /* memmov_cost. */
1784 4, /* issue_rate */
1785 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1786 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1787 "16", /* function_align. */
1788 "8", /* jump_align. */
1789 "16", /* loop_align. */
1790 2, /* int_reassoc_width. */
1791 4, /* fp_reassoc_width. */
1792 1, /* fma_reassoc_width. */
1793 1, /* vec_reassoc_width. */
1794 2, /* min_div_recip_mul_sf. */
1795 2, /* min_div_recip_mul_df. */
1796 0, /* max_case_values. */
1797 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1798 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1799 &generic_prefetch_tune
1802 static const struct tune_params thunderx2t99_tunings =
1804 &thunderx2t99_extra_costs,
1805 &thunderx2t99_addrcost_table,
1806 &thunderx2t99_regmove_cost,
1807 &thunderx2t99_vector_cost,
1808 &generic_branch_cost,
1809 &generic_approx_modes,
1810 SVE_NOT_IMPLEMENTED, /* sve_width */
1811 { 4, /* load_int. */
1812 4, /* store_int. */
1813 4, /* load_fp. */
1814 4, /* store_fp. */
1815 4, /* load_pred. */
1816 4 /* store_pred. */
1817 }, /* memmov_cost. */
1818 4, /* issue_rate. */
1819 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1820 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1821 "16", /* function_align. */
1822 "8", /* jump_align. */
1823 "16", /* loop_align. */
1824 3, /* int_reassoc_width. */
1825 2, /* fp_reassoc_width. */
1826 1, /* fma_reassoc_width. */
1827 2, /* vec_reassoc_width. */
1828 2, /* min_div_recip_mul_sf. */
1829 2, /* min_div_recip_mul_df. */
1830 0, /* max_case_values. */
1831 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1832 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1833 &thunderx2t99_prefetch_tune
1836 static const struct tune_params thunderx3t110_tunings =
1838 &thunderx3t110_extra_costs,
1839 &thunderx3t110_addrcost_table,
1840 &thunderx3t110_regmove_cost,
1841 &thunderx3t110_vector_cost,
1842 &generic_branch_cost,
1843 &generic_approx_modes,
1844 SVE_NOT_IMPLEMENTED, /* sve_width */
1845 { 4, /* load_int. */
1846 4, /* store_int. */
1847 4, /* load_fp. */
1848 4, /* store_fp. */
1849 4, /* load_pred. */
1850 4 /* store_pred. */
1851 }, /* memmov_cost. */
1852 6, /* issue_rate. */
1853 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1854 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1855 "16", /* function_align. */
1856 "8", /* jump_align. */
1857 "16", /* loop_align. */
1858 3, /* int_reassoc_width. */
1859 2, /* fp_reassoc_width. */
1860 1, /* fma_reassoc_width. */
1861 2, /* vec_reassoc_width. */
1862 2, /* min_div_recip_mul_sf. */
1863 2, /* min_div_recip_mul_df. */
1864 0, /* max_case_values. */
1865 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1866 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1867 &thunderx3t110_prefetch_tune
1870 static const struct tune_params neoversen1_tunings =
1872 &cortexa76_extra_costs,
1873 &generic_addrcost_table,
1874 &generic_regmove_cost,
1875 &cortexa57_vector_cost,
1876 &generic_branch_cost,
1877 &generic_approx_modes,
1878 SVE_NOT_IMPLEMENTED, /* sve_width */
1879 { 4, /* load_int. */
1880 2, /* store_int. */
1881 5, /* load_fp. */
1882 2, /* store_fp. */
1883 4, /* load_pred. */
1884 4 /* store_pred. */
1885 }, /* memmov_cost. */
1886 3, /* issue_rate */
1887 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1888 "32:16", /* function_align. */
1889 "4", /* jump_align. */
1890 "32:16", /* loop_align. */
1891 2, /* int_reassoc_width. */
1892 4, /* fp_reassoc_width. */
1893 1, /* fma_reassoc_width. */
1894 2, /* vec_reassoc_width. */
1895 2, /* min_div_recip_mul_sf. */
1896 2, /* min_div_recip_mul_df. */
1897 0, /* max_case_values. */
1898 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1899 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1900 &generic_prefetch_tune
1903 static const struct tune_params ampere1_tunings =
1905 &ampere1_extra_costs,
1906 &generic_addrcost_table,
1907 &generic_regmove_cost,
1908 &ampere1_vector_cost,
1909 &generic_branch_cost,
1910 &generic_approx_modes,
1911 SVE_NOT_IMPLEMENTED, /* sve_width */
1912 { 4, /* load_int. */
1913 4, /* store_int. */
1914 4, /* load_fp. */
1915 4, /* store_fp. */
1916 4, /* load_pred. */
1917 4 /* store_pred. */
1918 }, /* memmov_cost. */
1919 4, /* issue_rate */
1920 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1921 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1922 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1923 AARCH64_FUSE_CMP_BRANCH),
1924 /* fusible_ops */
1925 "32", /* function_align. */
1926 "4", /* jump_align. */
1927 "32:16", /* loop_align. */
1928 2, /* int_reassoc_width. */
1929 4, /* fp_reassoc_width. */
1930 1, /* fma_reassoc_width. */
1931 2, /* vec_reassoc_width. */
1932 2, /* min_div_recip_mul_sf. */
1933 2, /* min_div_recip_mul_df. */
1934 0, /* max_case_values. */
1935 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1936 (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */
1937 &ampere1_prefetch_tune
1940 static const struct tune_params ampere1a_tunings =
1942 &ampere1a_extra_costs,
1943 &generic_addrcost_table,
1944 &generic_regmove_cost,
1945 &ampere1_vector_cost,
1946 &generic_branch_cost,
1947 &generic_approx_modes,
1948 SVE_NOT_IMPLEMENTED, /* sve_width */
1949 { 4, /* load_int. */
1950 4, /* store_int. */
1951 4, /* load_fp. */
1952 4, /* store_fp. */
1953 4, /* load_pred. */
1954 4 /* store_pred. */
1955 }, /* memmov_cost. */
1956 4, /* issue_rate */
1957 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1958 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1959 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1960 AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1961 AARCH64_FUSE_ADDSUB_2REG_CONST1),
1962 /* fusible_ops */
1963 "32", /* function_align. */
1964 "4", /* jump_align. */
1965 "32:16", /* loop_align. */
1966 2, /* int_reassoc_width. */
1967 4, /* fp_reassoc_width. */
1968 1, /* fma_reassoc_width. */
1969 2, /* vec_reassoc_width. */
1970 2, /* min_div_recip_mul_sf. */
1971 2, /* min_div_recip_mul_df. */
1972 0, /* max_case_values. */
1973 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1974 (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */
1975 &ampere1_prefetch_tune
1978 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1980 2, /* int_stmt_cost */
1981 2, /* fp_stmt_cost */
1982 4, /* ld2_st2_permute_cost */
1983 4, /* ld3_st3_permute_cost */
1984 5, /* ld4_st4_permute_cost */
1985 3, /* permute_cost */
1986 4, /* reduc_i8_cost */
1987 4, /* reduc_i16_cost */
1988 2, /* reduc_i32_cost */
1989 2, /* reduc_i64_cost */
1990 6, /* reduc_f16_cost */
1991 3, /* reduc_f32_cost */
1992 2, /* reduc_f64_cost */
1993 2, /* store_elt_extra_cost */
1994 /* This value is just inherited from the Cortex-A57 table. */
1995 8, /* vec_to_scalar_cost */
1996 /* This depends very much on what the scalar value is and
1997 where it comes from. E.g. some constants take two dependent
1998 instructions or a load, while others might be moved from a GPR.
1999 4 seems to be a reasonable compromise in practice. */
2000 4, /* scalar_to_vec_cost */
2001 4, /* align_load_cost */
2002 4, /* unalign_load_cost */
2003 /* Although stores have a latency of 2 and compete for the
2004 vector pipes, in practice it's better not to model that. */
2005 1, /* unalign_store_cost */
2006 1 /* store_cost */
2009 static const sve_vec_cost neoversev1_sve_vector_cost =
2012 2, /* int_stmt_cost */
2013 2, /* fp_stmt_cost */
2014 4, /* ld2_st2_permute_cost */
2015 7, /* ld3_st3_permute_cost */
2016 8, /* ld4_st4_permute_cost */
2017 3, /* permute_cost */
2018 /* Theoretically, a reduction involving 31 scalar ADDs could
2019 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
2020 completes in 14 cycles, so give it a cost of 31 + 5. */
2021 36, /* reduc_i8_cost */
2022 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
2023 22, /* reduc_i16_cost */
2024 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
2025 14, /* reduc_i32_cost */
2026 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
2027 11, /* reduc_i64_cost */
2028 /* Theoretically, a reduction involving 15 scalar FADDs could
2029 complete in ~9 cycles and would have a cost of 30. FADDV
2030 completes in 13 cycles, so give it a cost of 30 + 4. */
2031 34, /* reduc_f16_cost */
2032 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
2033 19, /* reduc_f32_cost */
2034 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
2035 11, /* reduc_f64_cost */
2036 2, /* store_elt_extra_cost */
2037 /* This value is just inherited from the Cortex-A57 table. */
2038 8, /* vec_to_scalar_cost */
2039 /* See the comment above the Advanced SIMD versions. */
2040 4, /* scalar_to_vec_cost */
2041 4, /* align_load_cost */
2042 4, /* unalign_load_cost */
2043 /* Although stores have a latency of 2 and compete for the
2044 vector pipes, in practice it's better not to model that. */
2045 1, /* unalign_store_cost */
2046 1 /* store_cost */
2048 3, /* clast_cost */
2049 19, /* fadda_f16_cost */
2050 11, /* fadda_f32_cost */
2051 8, /* fadda_f64_cost */
2052 32, /* gather_load_x32_cost */
2053 16, /* gather_load_x64_cost */
2054 3 /* scatter_store_elt_cost */
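/* Worked example of the reduction-cost arithmetic used in the comments
   above (a restatement of those comments, not new tuning data): for
   reduc_i8_cost, a chain of 31 scalar ADDs would cost 31 and finish in
   about 9 cycles, while [SU]ADDV takes 14 cycles, i.e. 5 cycles longer,
   so the table charges 31 + 5 = 36.  The i16/i32/i64 and FP entries
   follow the same pattern of "equivalent scalar cost plus the extra
   latency of the reduction instruction".  */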
2057 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2059 3, /* loads_stores_per_cycle */
2060 2, /* stores_per_cycle */
2061 4, /* general_ops_per_cycle */
2062 0, /* fp_simd_load_general_ops */
2063 1 /* fp_simd_store_general_ops */
2066 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2069 3, /* loads_stores_per_cycle */
2070 2, /* stores_per_cycle */
2071 4, /* general_ops_per_cycle */
2072 0, /* fp_simd_load_general_ops */
2073 1 /* fp_simd_store_general_ops */
2075 2, /* ld2_st2_general_ops */
2076 2, /* ld3_st3_general_ops */
2077 3 /* ld4_st4_general_ops */
2080 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2084 2, /* loads_per_cycle */
2085 2, /* stores_per_cycle */
2086 2, /* general_ops_per_cycle */
2087 0, /* fp_simd_load_general_ops */
2088 1 /* fp_simd_store_general_ops */
2090 2, /* ld2_st2_general_ops */
2091 2, /* ld3_st3_general_ops */
2092 3 /* ld4_st4_general_ops */
2094 1, /* pred_ops_per_cycle */
2095 2, /* while_pred_ops */
2096 2, /* int_cmp_pred_ops */
2097 1, /* fp_cmp_pred_ops */
2098 1, /* gather_scatter_pair_general_ops */
2099 1 /* gather_scatter_pair_pred_ops */
2102 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2104 &neoversev1_scalar_issue_info,
2105 &neoversev1_advsimd_issue_info,
2106 &neoversev1_sve_issue_info
2109 /* Neoverse V1 costs for vector insn classes. */
2110 static const struct cpu_vector_cost neoversev1_vector_cost =
2112 1, /* scalar_int_stmt_cost */
2113 2, /* scalar_fp_stmt_cost */
2114 4, /* scalar_load_cost */
2115 1, /* scalar_store_cost */
2116 1, /* cond_taken_branch_cost */
2117 1, /* cond_not_taken_branch_cost */
2118 &neoversev1_advsimd_vector_cost, /* advsimd */
2119 &neoversev1_sve_vector_cost, /* sve */
2120 &neoversev1_vec_issue_info /* issue_info */
2123 static const struct tune_params neoversev1_tunings =
2125 &cortexa76_extra_costs,
2126 &neoversev1_addrcost_table,
2127 &neoversev1_regmove_cost,
2128 &neoversev1_vector_cost,
2129 &generic_branch_cost,
2130 &generic_approx_modes,
2131 SVE_256, /* sve_width */
2132 { 4, /* load_int. */
2133 2, /* store_int. */
2134 6, /* load_fp. */
2135 2, /* store_fp. */
2136 6, /* load_pred. */
2137 1 /* store_pred. */
2138 }, /* memmov_cost. */
2139 3, /* issue_rate */
2140 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2141 "32:16", /* function_align. */
2142 "4", /* jump_align. */
2143 "32:16", /* loop_align. */
2144 2, /* int_reassoc_width. */
2145 4, /* fp_reassoc_width. */
2146 4, /* fma_reassoc_width. */
2147 2, /* vec_reassoc_width. */
2148 2, /* min_div_recip_mul_sf. */
2149 2, /* min_div_recip_mul_df. */
2150 0, /* max_case_values. */
2151 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2152 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2153 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2154 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2155 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
2156 &generic_prefetch_tune
2159 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2162 2, /* int_stmt_cost */
2163 2, /* fp_stmt_cost */
2164 4, /* ld2_st2_permute_cost */
2165 5, /* ld3_st3_permute_cost */
2166 5, /* ld4_st4_permute_cost */
2167 3, /* permute_cost */
2168 /* Theoretically, a reduction involving 15 scalar ADDs could
2169 complete in ~5 cycles and would have a cost of 15. Assume that
2170 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2171 21, /* reduc_i8_cost */
2172 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2173 13, /* reduc_i16_cost */
2174 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2175 9, /* reduc_i32_cost */
2176 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2177 8, /* reduc_i64_cost */
2178 /* Theoretically, a reduction involving 7 scalar FADDs could
2179 complete in ~6 cycles and would have a cost of 14. Assume that
2180 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2181 16, /* reduc_f16_cost */
2182 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2183 8, /* reduc_f32_cost */
2184 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2185 4, /* reduc_f64_cost */
2186 2, /* store_elt_extra_cost */
2187 /* This value is just inherited from the Cortex-A57 table. */
2188 8, /* vec_to_scalar_cost */
2189 /* This depends very much on what the scalar value is and
2190 where it comes from. E.g. some constants take two dependent
2191 instructions or a load, while others might be moved from a GPR.
2192 4 seems to be a reasonable compromise in practice. */
2193 4, /* scalar_to_vec_cost */
2194 4, /* align_load_cost */
2195 4, /* unalign_load_cost */
2196 /* Although stores generally have a latency of 2 and compete for the
2197 vector pipes, in practice it's better not to model that. */
2198 1, /* unalign_store_cost */
2199 1 /* store_cost */
2201 3, /* clast_cost */
2202 10, /* fadda_f16_cost */
2203 6, /* fadda_f32_cost */
2204 4, /* fadda_f64_cost */
2205 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2206 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2207 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2208 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2209 (cost 2) to that, to avoid the difference being lost in rounding.
2211 There is no easy comparison between a strided Advanced SIMD x32 load
2212 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2213 operation more than a 64-bit gather. */
2214 14, /* gather_load_x32_cost */
2215 12, /* gather_load_x64_cost */
2216 3 /* scatter_store_elt_cost */
2219 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2223 3, /* loads_per_cycle */
2224 2, /* stores_per_cycle */
2225 4, /* general_ops_per_cycle */
2226 0, /* fp_simd_load_general_ops */
2227 1 /* fp_simd_store_general_ops */
2229 2, /* ld2_st2_general_ops */
2230 2, /* ld3_st3_general_ops */
2231 3 /* ld4_st4_general_ops */
2233 2, /* pred_ops_per_cycle */
2234 2, /* while_pred_ops */
2235 2, /* int_cmp_pred_ops */
2236 1, /* fp_cmp_pred_ops */
2237 1, /* gather_scatter_pair_general_ops */
2238 1 /* gather_scatter_pair_pred_ops */
2241 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2243 &neoversev1_scalar_issue_info,
2244 &neoversev1_advsimd_issue_info,
2245 &neoverse512tvb_sve_issue_info
2248 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2250 1, /* scalar_int_stmt_cost */
2251 2, /* scalar_fp_stmt_cost */
2252 4, /* scalar_load_cost */
2253 1, /* scalar_store_cost */
2254 1, /* cond_taken_branch_cost */
2255 1, /* cond_not_taken_branch_cost */
2256 &neoversev1_advsimd_vector_cost, /* advsimd */
2257 &neoverse512tvb_sve_vector_cost, /* sve */
2258 &neoverse512tvb_vec_issue_info /* issue_info */
2261 static const struct tune_params neoverse512tvb_tunings =
2263 &cortexa76_extra_costs,
2264 &neoversev1_addrcost_table,
2265 &neoversev1_regmove_cost,
2266 &neoverse512tvb_vector_cost,
2267 &generic_branch_cost,
2268 &generic_approx_modes,
2269 SVE_128 | SVE_256, /* sve_width */
2270 { 4, /* load_int. */
2271 2, /* store_int. */
2272 6, /* load_fp. */
2273 2, /* store_fp. */
2274 6, /* load_pred. */
2275 1 /* store_pred. */
2276 }, /* memmov_cost. */
2277 3, /* issue_rate */
2278 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2279 "32:16", /* function_align. */
2280 "4", /* jump_align. */
2281 "32:16", /* loop_align. */
2282 2, /* int_reassoc_width. */
2283 4, /* fp_reassoc_width. */
2284 4, /* fma_reassoc_width. */
2285 2, /* vec_reassoc_width. */
2286 2, /* min_div_recip_mul_sf. */
2287 2, /* min_div_recip_mul_df. */
2288 0, /* max_case_values. */
2289 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2290 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2291 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2292 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2293 &generic_prefetch_tune
2296 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2298 2, /* int_stmt_cost */
2299 2, /* fp_stmt_cost */
2300 2, /* ld2_st2_permute_cost */
2301 2, /* ld3_st3_permute_cost */
2302 3, /* ld4_st4_permute_cost */
2303 3, /* permute_cost */
2304 4, /* reduc_i8_cost */
2305 4, /* reduc_i16_cost */
2306 2, /* reduc_i32_cost */
2307 2, /* reduc_i64_cost */
2308 6, /* reduc_f16_cost */
2309 4, /* reduc_f32_cost */
2310 2, /* reduc_f64_cost */
2311 2, /* store_elt_extra_cost */
2312 /* This value is just inherited from the Cortex-A57 table. */
2313 8, /* vec_to_scalar_cost */
2314 /* This depends very much on what the scalar value is and
2315 where it comes from. E.g. some constants take two dependent
2316 instructions or a load, while others might be moved from a GPR.
2317 4 seems to be a reasonable compromise in practice. */
2318 4, /* scalar_to_vec_cost */
2319 4, /* align_load_cost */
2320 4, /* unalign_load_cost */
2321 /* Although stores have a latency of 2 and compete for the
2322 vector pipes, in practice it's better not to model that. */
2323 1, /* unalign_store_cost */
2324 1 /* store_cost */
2327 static const sve_vec_cost neoversen2_sve_vector_cost =
2330 2, /* int_stmt_cost */
2331 2, /* fp_stmt_cost */
2332 3, /* ld2_st2_permute_cost */
2333 4, /* ld3_st3_permute_cost */
2334 4, /* ld4_st4_permute_cost */
2335 3, /* permute_cost */
2336 /* Theoretically, a reduction involving 15 scalar ADDs could
2337 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2338 completes in 11 cycles, so give it a cost of 15 + 6. */
2339 21, /* reduc_i8_cost */
2340 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2341 13, /* reduc_i16_cost */
2342 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2343 9, /* reduc_i32_cost */
2344 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2345 2, /* reduc_i64_cost */
2346 /* Theoretically, a reduction involving 7 scalar FADDs could
2347 complete in ~8 cycles and would have a cost of 14. FADDV
2348 completes in 6 cycles, so give it a cost of 14 - 2. */
2349 12, /* reduc_f16_cost */
2350 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2351 6, /* reduc_f32_cost */
2352 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2353 2, /* reduc_f64_cost */
2354 2, /* store_elt_extra_cost */
2355 /* This value is just inherited from the Cortex-A57 table. */
2356 8, /* vec_to_scalar_cost */
2357 /* See the comment above the Advanced SIMD versions. */
2358 4, /* scalar_to_vec_cost */
2359 4, /* align_load_cost */
2360 4, /* unalign_load_cost */
2361 /* Although stores have a latency of 2 and compete for the
2362 vector pipes, in practice it's better not to model that. */
2363 1, /* unalign_store_cost */
2364 1 /* store_cost */
2366 3, /* clast_cost */
2367 10, /* fadda_f16_cost */
2368 6, /* fadda_f32_cost */
2369 4, /* fadda_f64_cost */
2370 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2371 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2372 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2373 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2374 (cost 2) to that, to avoid the difference being lost in rounding.
2376 There is no easy comparison between a strided Advanced SIMD x32 load
2377 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2378 operation more than a 64-bit gather. */
2379 14, /* gather_load_x32_cost */
2380 12, /* gather_load_x64_cost */
2381 3 /* scatter_store_elt_cost */
2384 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2386 3, /* loads_stores_per_cycle */
2387 2, /* stores_per_cycle */
2388 4, /* general_ops_per_cycle */
2389 0, /* fp_simd_load_general_ops */
2390 1 /* fp_simd_store_general_ops */
2393 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2396 3, /* loads_stores_per_cycle */
2397 2, /* stores_per_cycle */
2398 2, /* general_ops_per_cycle */
2399 0, /* fp_simd_load_general_ops */
2400 1 /* fp_simd_store_general_ops */
2402 2, /* ld2_st2_general_ops */
2403 2, /* ld3_st3_general_ops */
2404 3 /* ld4_st4_general_ops */
2407 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2411 3, /* loads_per_cycle */
2412 2, /* stores_per_cycle */
2413 2, /* general_ops_per_cycle */
2414 0, /* fp_simd_load_general_ops */
2415 1 /* fp_simd_store_general_ops */
2417 2, /* ld2_st2_general_ops */
2418 3, /* ld3_st3_general_ops */
2419 3 /* ld4_st4_general_ops */
2421 2, /* pred_ops_per_cycle */
2422 2, /* while_pred_ops */
2423 2, /* int_cmp_pred_ops */
2424 1, /* fp_cmp_pred_ops */
2425 1, /* gather_scatter_pair_general_ops */
2426 1 /* gather_scatter_pair_pred_ops */
2429 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2431 &neoversen2_scalar_issue_info,
2432 &neoversen2_advsimd_issue_info,
2433 &neoversen2_sve_issue_info
2436 /* Neoverse N2 costs for vector insn classes. */
2437 static const struct cpu_vector_cost neoversen2_vector_cost =
2439 1, /* scalar_int_stmt_cost */
2440 2, /* scalar_fp_stmt_cost */
2441 4, /* scalar_load_cost */
2442 1, /* scalar_store_cost */
2443 1, /* cond_taken_branch_cost */
2444 1, /* cond_not_taken_branch_cost */
2445 &neoversen2_advsimd_vector_cost, /* advsimd */
2446 &neoversen2_sve_vector_cost, /* sve */
2447 &neoversen2_vec_issue_info /* issue_info */
2450 static const struct tune_params neoversen2_tunings =
2452 &cortexa76_extra_costs,
2453 &neoversen2_addrcost_table,
2454 &neoversen2_regmove_cost,
2455 &neoversen2_vector_cost,
2456 &generic_branch_cost,
2457 &generic_approx_modes,
2458 SVE_128, /* sve_width */
2459 { 4, /* load_int. */
2460 1, /* store_int. */
2461 6, /* load_fp. */
2462 2, /* store_fp. */
2463 6, /* load_pred. */
2464 1 /* store_pred. */
2465 }, /* memmov_cost. */
2466 3, /* issue_rate */
2467 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2468 "32:16", /* function_align. */
2469 "4", /* jump_align. */
2470 "32:16", /* loop_align. */
2471 2, /* int_reassoc_width. */
2472 4, /* fp_reassoc_width. */
2473 1, /* fma_reassoc_width. */
2474 2, /* vec_reassoc_width. */
2475 2, /* min_div_recip_mul_sf. */
2476 2, /* min_div_recip_mul_df. */
2477 0, /* max_case_values. */
2478 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2479 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2480 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2481 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2482 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2483 &generic_prefetch_tune
2486 static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
2488 2, /* int_stmt_cost */
2489 2, /* fp_stmt_cost */
2490 2, /* ld2_st2_permute_cost */
2491 2, /* ld3_st3_permute_cost */
2492 3, /* ld4_st4_permute_cost */
2493 3, /* permute_cost */
2494 4, /* reduc_i8_cost */
2495 4, /* reduc_i16_cost */
2496 2, /* reduc_i32_cost */
2497 2, /* reduc_i64_cost */
2498 6, /* reduc_f16_cost */
2499 3, /* reduc_f32_cost */
2500 2, /* reduc_f64_cost */
2501 2, /* store_elt_extra_cost */
2502 /* This value is just inherited from the Cortex-A57 table. */
2503 8, /* vec_to_scalar_cost */
2504 /* This depends very much on what the scalar value is and
2505 where it comes from. E.g. some constants take two dependent
2506 instructions or a load, while others might be moved from a GPR.
2507 4 seems to be a reasonable compromise in practice. */
2508 4, /* scalar_to_vec_cost */
2509 4, /* align_load_cost */
2510 4, /* unalign_load_cost */
2511 /* Although stores have a latency of 2 and compete for the
2512 vector pipes, in practice it's better not to model that. */
2513 1, /* unalign_store_cost */
2514 1 /* store_cost */
2517 static const sve_vec_cost neoversev2_sve_vector_cost =
2520 2, /* int_stmt_cost */
2521 2, /* fp_stmt_cost */
2522 3, /* ld2_st2_permute_cost */
2523 3, /* ld3_st3_permute_cost */
2524 4, /* ld4_st4_permute_cost */
2525 3, /* permute_cost */
2526 /* Theoretically, a reduction involving 15 scalar ADDs could
2527 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2528 completes in 11 cycles, so give it a cost of 15 + 8. */
2529 21, /* reduc_i8_cost */
2530 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2531 14, /* reduc_i16_cost */
2532 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2533 7, /* reduc_i32_cost */
2534 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2535 2, /* reduc_i64_cost */
2536 /* Theoretically, a reduction involving 7 scalar FADDs could
2537 complete in ~6 cycles and would have a cost of 14. FADDV
2538 completes in 8 cycles, so give it a cost of 14 + 2. */
2539 16, /* reduc_f16_cost */
2540 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2541 8, /* reduc_f32_cost */
2542 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2543 4, /* reduc_f64_cost */
2544 2, /* store_elt_extra_cost */
2545 /* This value is just inherited from the Cortex-A57 table. */
2546 8, /* vec_to_scalar_cost */
2547 /* See the comment above the Advanced SIMD versions. */
2548 4, /* scalar_to_vec_cost */
2549 4, /* align_load_cost */
2550 4, /* unalign_load_cost */
2551 /* Although stores have a latency of 2 and compete for the
2552 vector pipes, in practice it's better not to model that. */
2553 1, /* unalign_store_cost */
2554 1 /* store_cost */
2556 3, /* clast_cost */
2557 10, /* fadda_f16_cost */
2558 6, /* fadda_f32_cost */
2559 4, /* fadda_f64_cost */
2560 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2561 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2562 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2563 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2564 (cost 2) to that, to avoid the difference being lost in rounding.
2566 There is no easy comparison between a strided Advanced SIMD x32 load
2567 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2568 operation more than a 64-bit gather. */
2569 14, /* gather_load_x32_cost */
2570 12, /* gather_load_x64_cost */
2571 3 /* scatter_store_elt_cost */
2574 static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
2576 3, /* loads_stores_per_cycle */
2577 2, /* stores_per_cycle */
2578 6, /* general_ops_per_cycle */
2579 0, /* fp_simd_load_general_ops */
2580 1 /* fp_simd_store_general_ops */
2583 static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
2586 3, /* loads_stores_per_cycle */
2587 2, /* stores_per_cycle */
2588 4, /* general_ops_per_cycle */
2589 0, /* fp_simd_load_general_ops */
2590 1 /* fp_simd_store_general_ops */
2592 2, /* ld2_st2_general_ops */
2593 2, /* ld3_st3_general_ops */
2594 3 /* ld4_st4_general_ops */
2597 static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
2601 3, /* loads_per_cycle */
2602 2, /* stores_per_cycle */
2603 4, /* general_ops_per_cycle */
2604 0, /* fp_simd_load_general_ops */
2605 1 /* fp_simd_store_general_ops */
2607 2, /* ld2_st2_general_ops */
2608 3, /* ld3_st3_general_ops */
2609 3 /* ld4_st4_general_ops */
2611 2, /* pred_ops_per_cycle */
2612 2, /* while_pred_ops */
2613 2, /* int_cmp_pred_ops */
2614 1, /* fp_cmp_pred_ops */
2615 1, /* gather_scatter_pair_general_ops */
2616 1 /* gather_scatter_pair_pred_ops */
2619 static const aarch64_vec_issue_info neoversev2_vec_issue_info =
2621 &neoversev2_scalar_issue_info,
2622 &neoversev2_advsimd_issue_info,
2623 &neoversev2_sve_issue_info
2626 /* Neoverse V2 (Demeter) costs for vector insn classes. */
2627 static const struct cpu_vector_cost neoversev2_vector_cost =
2629 1, /* scalar_int_stmt_cost */
2630 2, /* scalar_fp_stmt_cost */
2631 4, /* scalar_load_cost */
2632 1, /* scalar_store_cost */
2633 1, /* cond_taken_branch_cost */
2634 1, /* cond_not_taken_branch_cost */
2635 &neoversev2_advsimd_vector_cost, /* advsimd */
2636 &neoversev2_sve_vector_cost, /* sve */
2637 &neoversev2_vec_issue_info /* issue_info */
2640 static const struct tune_params neoversev2_tunings =
2642 &cortexa76_extra_costs,
2643 &neoversev2_addrcost_table,
2644 &neoversev2_regmove_cost,
2645 &neoversev2_vector_cost,
2646 &generic_branch_cost,
2647 &generic_approx_modes,
2648 SVE_128, /* sve_width */
2649 { 4, /* load_int. */
2650 2, /* store_int. */
2651 6, /* load_fp. */
2652 1, /* store_fp. */
2653 6, /* load_pred. */
2654 2 /* store_pred. */
2655 }, /* memmov_cost. */
2656 5, /* issue_rate */
2657 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2658 "32:16", /* function_align. */
2659 "4", /* jump_align. */
2660 "32:16", /* loop_align. */
2661 3, /* int_reassoc_width. */
2662 6, /* fp_reassoc_width. */
2663 4, /* fma_reassoc_width. */
2664 3, /* vec_reassoc_width. */
2665 2, /* min_div_recip_mul_sf. */
2666 2, /* min_div_recip_mul_df. */
2667 0, /* max_case_values. */
2668 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2669 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2670 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2671 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2672 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2673 &generic_prefetch_tune
2676 static const struct tune_params a64fx_tunings =
2678 &a64fx_extra_costs,
2679 &a64fx_addrcost_table,
2680 &a64fx_regmove_cost,
2681 &a64fx_vector_cost,
2682 &generic_branch_cost,
2683 &generic_approx_modes,
2684 SVE_512, /* sve_width */
2685 { 4, /* load_int. */
2686 4, /* store_int. */
2687 4, /* load_fp. */
2688 4, /* store_fp. */
2689 4, /* load_pred. */
2690 4 /* store_pred. */
2691 }, /* memmov_cost. */
2692 7, /* issue_rate */
2693 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2694 "32", /* function_align. */
2695 "16", /* jump_align. */
2696 "32", /* loop_align. */
2697 4, /* int_reassoc_width. */
2698 2, /* fp_reassoc_width. */
2699 1, /* fma_reassoc_width. */
2700 2, /* vec_reassoc_width. */
2701 2, /* min_div_recip_mul_sf. */
2702 2, /* min_div_recip_mul_df. */
2703 0, /* max_case_values. */
2704 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2705 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2706 &a64fx_prefetch_tune
2709 /* Support for fine-grained override of the tuning structures. */
2710 struct aarch64_tuning_override_function
2712 const char* name;
2713 void (*parse_override)(const char*, struct tune_params*);
2716 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2717 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2718 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2720 static const struct aarch64_tuning_override_function
2721 aarch64_tuning_override_functions[] =
2723 { "fuse", aarch64_parse_fuse_string },
2724 { "tune", aarch64_parse_tune_string },
2725 { "sve_width", aarch64_parse_sve_width_string },
2726 { NULL, NULL }
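/* Illustrative use of the override table above (the exact -moverride
   syntax is defined by the option-parsing code later in this file, so
   this is a sketch): an option such as -moverride=sve_width=256 is split
   into name=value components, "sve_width" is matched against the table
   and aarch64_parse_sve_width_string is called with "256" and the current
   tune_params, letting developers tweak individual tuning decisions
   without defining a new tuning structure.  */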
2729 /* A processor implementing AArch64. */
2730 struct processor
2732 const char *name;
2733 aarch64_processor ident;
2734 aarch64_processor sched_core;
2735 aarch64_arch arch;
2736 aarch64_feature_flags flags;
2737 const tune_params *tune;
2740 /* Architectures implementing AArch64. */
2741 static CONSTEXPR const processor all_architectures[] =
2743 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
2744 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
2745 feature_deps::ARCH_IDENT ().enable, NULL},
2746 #include "aarch64-arches.def"
2747 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2750 /* Processor cores implementing AArch64. */
2751 static const struct processor all_cores[] =
2753 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
2754 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2755 feature_deps::cpu_##IDENT, &COSTS##_tunings},
2756 #include "aarch64-cores.def"
2757 {"generic", generic, cortexa53, AARCH64_ARCH_V8A,
2758 feature_deps::V8A ().enable, &generic_tunings},
2759 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2762 /* The current tuning set. */
2763 struct tune_params aarch64_tune_params = generic_tunings;
2765 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2767 static tree
2768 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2769 int, bool *no_add_attrs)
2771 /* Since we set fn_type_req to true, the caller should have checked
2772 this for us. */
2773 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2774 switch ((arm_pcs) fntype_abi (*node).id ())
2776 case ARM_PCS_AAPCS64:
2777 case ARM_PCS_SIMD:
2778 return NULL_TREE;
2780 case ARM_PCS_SVE:
2781 error ("the %qE attribute cannot be applied to an SVE function type",
2782 name);
2783 *no_add_attrs = true;
2784 return NULL_TREE;
2786 case ARM_PCS_TLSDESC:
2787 case ARM_PCS_UNKNOWN:
2788 break;
2790 gcc_unreachable ();
2793 /* Table of machine attributes. */
2794 static const struct attribute_spec aarch64_attribute_table[] =
2796 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2797 affects_type_identity, handler, exclude } */
2798 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2799 handle_aarch64_vector_pcs_attribute, NULL },
2800 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2801 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2802 NULL },
2803 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
2804 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
2805 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
2806 { NULL, 0, 0, false, false, false, false, NULL, NULL }
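/* Rough examples of how the user-visible attributes above are spelled in
   source code (illustrative declarations, not taken from this file):

     void f (float *x) __attribute__ ((aarch64_vector_pcs));
     typedef svint32_t vec256 __attribute__ ((arm_sve_vector_bits (256)));

   The entries whose names contain spaces ("Advanced SIMD type", "SVE type",
   "SVE sizeless type") cannot be written by users; they are attached
   internally to the compiler's built-in vector types.  */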
2809 typedef enum aarch64_cond_code
2811 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2812 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2813 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2815 aarch64_cc;
2817 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
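/* The enum above lists the codes in complementary pairs (EQ/NE, CS/CC,
   MI/PL, VS/VC, HI/LS, GE/LT, GT/LE, AL/NV), so inverting a condition is
   just a matter of flipping the low bit.  For example,
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE.  */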
2820 /* The condition codes of the processor, and the inverse function. */
2821 static const char * const aarch64_condition_codes[] =
2823 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2824 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2827 /* The preferred condition codes for SVE conditions. */
2828 static const char *const aarch64_sve_condition_codes[] =
2830 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2831 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2834 /* Return the assembly token for svpattern value VALUE. */
2836 static const char *
2837 svpattern_token (enum aarch64_svpattern pattern)
2839 switch (pattern)
2841 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2842 AARCH64_FOR_SVPATTERN (CASE)
2843 #undef CASE
2844 case AARCH64_NUM_SVPATTERNS:
2845 break;
2847 gcc_unreachable ();
2850 /* Return the location of a piece that is known to be passed or returned
2851 in registers. FIRST_ZR is the first unused vector argument register
2852 and FIRST_PR is the first unused predicate argument register. */
2855 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2856 unsigned int first_pr) const
2858 gcc_assert (VECTOR_MODE_P (mode)
2859 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2860 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2862 if (num_zr > 0 && num_pr == 0)
2863 return gen_rtx_REG (mode, first_zr);
2865 if (num_zr == 0 && num_pr == 1)
2866 return gen_rtx_REG (mode, first_pr);
2868 gcc_unreachable ();
2871 /* Return the total number of vector registers required by the PST. */
2873 unsigned int
2874 pure_scalable_type_info::num_zr () const
2876 unsigned int res = 0;
2877 for (unsigned int i = 0; i < pieces.length (); ++i)
2878 res += pieces[i].num_zr;
2879 return res;
2882 /* Return the total number of predicate registers required by the PST. */
2884 unsigned int
2885 pure_scalable_type_info::num_pr () const
2887 unsigned int res = 0;
2888 for (unsigned int i = 0; i < pieces.length (); ++i)
2889 res += pieces[i].num_pr;
2890 return res;
2893 /* Return the location of a PST that is known to be passed or returned
2894 in registers. FIRST_ZR is the first unused vector argument register
2895 and FIRST_PR is the first unused predicate argument register. */
2898 pure_scalable_type_info::get_rtx (machine_mode mode,
2899 unsigned int first_zr,
2900 unsigned int first_pr) const
2902 /* Try to return a single REG if possible. This leads to better
2903 code generation; it isn't required for correctness. */
2904 if (mode == pieces[0].mode)
2906 gcc_assert (pieces.length () == 1);
2907 return pieces[0].get_rtx (first_zr, first_pr);
2910 /* Build up a PARALLEL that contains the individual pieces. */
2911 rtvec rtxes = rtvec_alloc (pieces.length ());
2912 for (unsigned int i = 0; i < pieces.length (); ++i)
2914 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
2915 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
2916 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
2917 first_zr += pieces[i].num_zr;
2918 first_pr += pieces[i].num_pr;
2920 return gen_rtx_PARALLEL (mode, rtxes);
2923 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
2924 in the AAPCS64. */
2926 pure_scalable_type_info::analysis_result
2927 pure_scalable_type_info::analyze (const_tree type)
2929 /* Prevent accidental reuse. */
2930 gcc_assert (pieces.is_empty ());
2932 /* No code will be generated for erroneous types, so we won't establish
2933 an ABI mapping. */
2934 if (type == error_mark_node)
2935 return NO_ABI_IDENTITY;
2937 /* Zero-sized types disappear in the language->ABI mapping. */
2938 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2939 return NO_ABI_IDENTITY;
2941 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
2942 piece p = {};
2943 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
2945 machine_mode mode = TYPE_MODE_RAW (type);
2946 gcc_assert (VECTOR_MODE_P (mode)
2947 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
2949 p.mode = p.orig_mode = mode;
2950 add_piece (p);
2951 return IS_PST;
2954 /* Check for user-defined PSTs. */
2955 if (TREE_CODE (type) == ARRAY_TYPE)
2956 return analyze_array (type);
2957 if (TREE_CODE (type) == RECORD_TYPE)
2958 return analyze_record (type);
2960 return ISNT_PST;
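/* Illustrative example of the AAPCS64 rules implemented above (the struct
   is hypothetical, not taken from this file): a type such as

     struct pst { svfloat32_t vec; svbool_t pred; };

   is analyzed as a Pure Scalable Type with one vector piece (num_zr == 1)
   and one predicate piece (num_pr == 1), so when enough registers are
   available it is passed in a Z register and a P register rather than in
   memory.  A structure that also contains, say, an int is not a PST and
   falls back to the normal rules.  */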
2963 /* Analyze a type that is known not to be passed or returned in memory.
2964 Return true if it has an ABI identity and is a Pure Scalable Type. */
2966 bool
2967 pure_scalable_type_info::analyze_registers (const_tree type)
2969 analysis_result result = analyze (type);
2970 gcc_assert (result != DOESNT_MATTER);
2971 return result == IS_PST;
2974 /* Subroutine of analyze for handling ARRAY_TYPEs. */
2976 pure_scalable_type_info::analysis_result
2977 pure_scalable_type_info::analyze_array (const_tree type)
2979 /* Analyze the element type. */
2980 pure_scalable_type_info element_info;
2981 analysis_result result = element_info.analyze (TREE_TYPE (type));
2982 if (result != IS_PST)
2983 return result;
2985 /* An array of unknown, flexible or variable length will be passed and
2986 returned by reference whatever we do. */
2987 tree nelts_minus_one = array_type_nelts (type);
2988 if (!tree_fits_uhwi_p (nelts_minus_one))
2989 return DOESNT_MATTER;
2991 /* Likewise if the array is constant-sized but too big to be interesting.
2992 The double checks against MAX_PIECES are to protect against overflow. */
2993 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
2994 if (count > MAX_PIECES)
2995 return DOESNT_MATTER;
2996 count += 1;
2997 if (count * element_info.pieces.length () > MAX_PIECES)
2998 return DOESNT_MATTER;
3000 /* The above checks should have weeded out elements of unknown size. */
3001 poly_uint64 element_bytes;
3002 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3003 gcc_unreachable ();
3005 /* Build up the list of individual vectors and predicates. */
3006 gcc_assert (!element_info.pieces.is_empty ());
3007 for (unsigned int i = 0; i < count; ++i)
3008 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3010 piece p = element_info.pieces[j];
3011 p.offset += i * element_bytes;
3012 add_piece (p);
3014 return IS_PST;
3017 /* Subroutine of analyze for handling RECORD_TYPEs. */
3019 pure_scalable_type_info::analysis_result
3020 pure_scalable_type_info::analyze_record (const_tree type)
3022 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3024 if (TREE_CODE (field) != FIELD_DECL)
3025 continue;
3027 /* Zero-sized fields disappear in the language->ABI mapping. */
3028 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3029 continue;
3031 /* All fields with an ABI identity must be PSTs for the record as
3032 a whole to be a PST. If any individual field is too big to be
3033 interesting then the record is too. */
3034 pure_scalable_type_info field_info;
3035 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3036 if (subresult == NO_ABI_IDENTITY)
3037 continue;
3038 if (subresult != IS_PST)
3039 return subresult;
3041 /* Since all previous fields are PSTs, we ought to be able to track
3042 the field offset using poly_ints. */
3043 tree bitpos = bit_position (field);
3044 gcc_assert (poly_int_tree_p (bitpos));
3046 /* For the same reason, it shouldn't be possible to create a PST field
3047 whose offset isn't byte-aligned. */
3048 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3049 BITS_PER_UNIT);
3051 /* Punt if the record is too big to be interesting. */
3052 poly_uint64 bytepos;
3053 if (!wide_bytepos.to_uhwi (&bytepos)
3054 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3055 return DOESNT_MATTER;
3057 /* Add the individual vectors and predicates in the field to the
3058 record's list. */
3059 gcc_assert (!field_info.pieces.is_empty ());
3060 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3062 piece p = field_info.pieces[i];
3063 p.offset += bytepos;
3064 add_piece (p);
3067 /* Empty structures disappear in the language->ABI mapping. */
3068 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3071 /* Add P to the list of pieces in the type. */
3073 void
3074 pure_scalable_type_info::add_piece (const piece &p)
3076 /* Try to fold the new piece into the previous one to form a
3077 single-mode PST. For example, if we see three consecutive vectors
3078 of the same mode, we can represent them using the corresponding
3079 3-tuple mode.
3081 This is purely an optimization. */
3082 if (!pieces.is_empty ())
3084 piece &prev = pieces.last ();
3085 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3086 unsigned int nelems1, nelems2;
3087 if (prev.orig_mode == p.orig_mode
3088 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3089 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3090 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3091 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3092 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3093 && targetm.array_mode (p.orig_mode,
3094 nelems1 + nelems2).exists (&prev.mode))
3096 prev.num_zr += p.num_zr;
3097 prev.num_pr += p.num_pr;
3098 return;
3101 pieces.quick_push (p);
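/* Example of the folding above (a hypothetical input, derived from the
   rules rather than from a particular caller): if the previous piece is a
   VNx4SImode vector at offset 0 and the new piece is another VNx4SImode
   vector at the next whole-vector offset, targetm.array_mode gives the
   2-vector tuple mode VNx8SImode, so the two pieces are merged into one
   piece with num_zr == 2 instead of being pushed separately.  */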
3104 /* Return true if at least one possible value of type TYPE includes at
3105 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3107 This is a relatively expensive test for some types, so it should
3108 generally be made as late as possible. */
3110 static bool
3111 aarch64_some_values_include_pst_objects_p (const_tree type)
3113 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3114 return false;
3116 if (aarch64_sve::builtin_type_p (type))
3117 return true;
3119 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3120 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3122 if (RECORD_OR_UNION_TYPE_P (type))
3123 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3124 if (TREE_CODE (field) == FIELD_DECL
3125 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3126 return true;
3128 return false;
3131 /* Return the descriptor of the SIMD ABI. */
3133 static const predefined_function_abi &
3134 aarch64_simd_abi (void)
3136 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3137 if (!simd_abi.initialized_p ())
3139 HARD_REG_SET full_reg_clobbers
3140 = default_function_abi.full_reg_clobbers ();
3141 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3142 if (FP_SIMD_SAVED_REGNUM_P (regno))
3143 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3144 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3146 return simd_abi;
3149 /* Return the descriptor of the SVE PCS. */
3151 static const predefined_function_abi &
3152 aarch64_sve_abi (void)
3154 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3155 if (!sve_abi.initialized_p ())
3157 HARD_REG_SET full_reg_clobbers
3158 = default_function_abi.full_reg_clobbers ();
3159 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3160 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3161 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3162 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3163 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3165 return sve_abi;
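/* In other words, a call that uses the SVE PCS is modelled as preserving
   Z8-Z23 and P4-P15 in full, and a call that uses the SIMD PCS
   (aarch64_vector_pcs) as preserving V8-V23 in full, whereas the base
   AAPCS64 only guarantees the low 64 bits of V8-V15 across calls.  */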
3168 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3169 wraps, otherwise return X itself. */
3171 static rtx
3172 strip_salt (rtx x)
3174 rtx search = x;
3175 if (GET_CODE (search) == CONST)
3176 search = XEXP (search, 0);
3177 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3178 x = XVECEXP (search, 0, 0);
3179 return x;
3182 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3183 expression. */
3185 static rtx
3186 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3188 return strip_salt (strip_offset (addr, offset));
3191 /* Generate code to enable conditional branches in functions over 1 MiB. */
3192 const char *
3193 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3194 const char * branch_format)
3196 rtx_code_label * tmp_label = gen_label_rtx ();
3197 char label_buf[256];
3198 char buffer[128];
3199 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3200 CODE_LABEL_NUMBER (tmp_label));
3201 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3202 rtx dest_label = operands[pos_label];
3203 operands[pos_label] = tmp_label;
3205 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3206 output_asm_insn (buffer, operands);
3208 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3209 operands[pos_label] = dest_label;
3210 output_asm_insn (buffer, operands);
3211 return "";
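/* Sketch of the output, assuming the caller passes the inverted form of
   the condition in BRANCH_FORMAT (label names illustrative):

       <inverted short-range branch>  .Lskip_N
       b       <original destination>
     .Lskip_N:

   i.e. the short-range conditional branch skips over an unconditional B,
   whose +-128 MiB range covers the far destination.  */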
3214 void
3215 aarch64_err_no_fpadvsimd (machine_mode mode)
3217 if (TARGET_GENERAL_REGS_ONLY)
3218 if (FLOAT_MODE_P (mode))
3219 error ("%qs is incompatible with the use of floating-point types",
3220 "-mgeneral-regs-only");
3221 else
3222 error ("%qs is incompatible with the use of vector types",
3223 "-mgeneral-regs-only");
3224 else
3225 if (FLOAT_MODE_P (mode))
3226 error ("%qs feature modifier is incompatible with the use of"
3227 " floating-point types", "+nofp");
3228 else
3229 error ("%qs feature modifier is incompatible with the use of"
3230 " vector types", "+nofp");
3233 /* Report when we try to do something that requires SVE when SVE is disabled.
3234 This is an error of last resort and isn't very high-quality. It usually
3235 involves attempts to measure the vector length in some way. */
3236 static void
3237 aarch64_report_sve_required (void)
3239 static bool reported_p = false;
3241 /* Avoid reporting a slew of messages for a single oversight. */
3242 if (reported_p)
3243 return;
3245 error ("this operation requires the SVE ISA extension");
3246 inform (input_location, "you can enable SVE using the command-line"
3247 " option %<-march%>, or by using the %<target%>"
3248 " attribute or pragma");
3249 reported_p = true;
3252 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3253 registers. */
3254 inline bool
3255 pr_or_ffr_regnum_p (unsigned int regno)
3257 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3260 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3261 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3262 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3263 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3264 and GENERAL_REGS is lower than the memory cost (in this case the best class
3265 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
3266 cost results in bad allocations with many redundant int<->FP moves which
3267 are expensive on various cores.
3268 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3269 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3270 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3271 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
3272 As a result, it is no longer inefficient to have a memory move cost that
3273 is higher than the register move cost.
3276 static reg_class_t
3277 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3278 reg_class_t best_class)
3280 machine_mode mode;
3282 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3283 || !reg_class_subset_p (FP_REGS, allocno_class))
3284 return allocno_class;
3286 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3287 || !reg_class_subset_p (FP_REGS, best_class))
3288 return best_class;
3290 mode = PSEUDO_REGNO_MODE (regno);
3291 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
3294 static unsigned int
3295 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3297 if (GET_MODE_UNIT_SIZE (mode) == 4)
3298 return aarch64_tune_params.min_div_recip_mul_sf;
3299 return aarch64_tune_params.min_div_recip_mul_df;
3302 /* Return the reassociation width of treeop OPC with mode MODE. */
3303 static int
3304 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3306 if (VECTOR_MODE_P (mode))
3307 return aarch64_tune_params.vec_reassoc_width;
3308 if (INTEGRAL_MODE_P (mode))
3309 return aarch64_tune_params.int_reassoc_width;
3310 /* Reassociation reduces the number of FMAs which may result in worse
3311 performance. Use a per-CPU setting for FMA reassociation which allows
3312 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
3313 CPUs with many FP pipes to enable reassociation.
3314 Since the reassociation pass doesn't understand FMA at all, assume
3315 that any FP addition might turn into FMA. */
3316 if (FLOAT_MODE_P (mode))
3317 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
3318 : aarch64_tune_params.fp_reassoc_width;
3319 return 1;
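/* For example, with the neoversev2_tunings above, integer and vector
   modes reassociate with width 3, FP additions (which may become FMAs)
   with width 4 (fma_reassoc_width) and other FP operations with width 6,
   while ampere1's fma_reassoc_width of 1 disables reassociation of FP
   additions on that core entirely.  */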
3322 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
3323 unsigned
3324 aarch64_debugger_regno (unsigned regno)
3326 if (GP_REGNUM_P (regno))
3327 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3328 else if (regno == SP_REGNUM)
3329 return AARCH64_DWARF_SP;
3330 else if (FP_REGNUM_P (regno))
3331 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3332 else if (PR_REGNUM_P (regno))
3333 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3334 else if (regno == VG_REGNUM)
3335 return AARCH64_DWARF_VG;
3337 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3338 equivalent DWARF register. */
3339 return DWARF_FRAME_REGISTERS;
3342 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
3343 static machine_mode
3344 aarch64_dwarf_frame_reg_mode (int regno)
3346 /* Predicate registers are call-clobbered in the EH ABI (which is
3347 ARM_PCS_AAPCS64), so they should not be described by CFI.
3348 Their size changes as VL changes, so any values computed by
3349 __builtin_init_dwarf_reg_size_table might not be valid for
3350 all frames. */
3351 if (PR_REGNUM_P (regno))
3352 return VOIDmode;
3353 return default_dwarf_frame_reg_mode (regno);
3356 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3357 integer, otherwise return X unmodified. */
3358 static rtx
3359 aarch64_bit_representation (rtx x)
3361 if (CONST_DOUBLE_P (x))
3362 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3363 return x;
3366 /* Return an estimate for the number of quadwords in an SVE vector. This is
3367 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3368 static unsigned int
3369 aarch64_estimated_sve_vq ()
3371 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3374 /* Return true if MODE is an SVE predicate mode. */
3375 static bool
3376 aarch64_sve_pred_mode_p (machine_mode mode)
3378 return (TARGET_SVE
3379 && (mode == VNx16BImode
3380 || mode == VNx8BImode
3381 || mode == VNx4BImode
3382 || mode == VNx2BImode));
3385 /* Three mutually-exclusive flags describing a vector or predicate type. */
3386 const unsigned int VEC_ADVSIMD = 1;
3387 const unsigned int VEC_SVE_DATA = 2;
3388 const unsigned int VEC_SVE_PRED = 4;
3389 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3390 a structure of 2, 3 or 4 vectors. */
3391 const unsigned int VEC_STRUCT = 8;
3392 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3393 vector has fewer significant bytes than a full SVE vector. */
3394 const unsigned int VEC_PARTIAL = 16;
3395 /* Useful combinations of the above. */
3396 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3397 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3399 /* Return a set of flags describing the vector properties of mode MODE.
3400 Ignore modes that are not supported by the current target. */
3401 static unsigned int
3402 aarch64_classify_vector_mode (machine_mode mode)
3404 if (aarch64_sve_pred_mode_p (mode))
3405 return VEC_SVE_PRED;
3407 /* Make the decision based on the mode's enum value rather than its
3408 properties, so that we keep the correct classification regardless
3409 of -msve-vector-bits. */
3410 switch (mode)
3412 /* Partial SVE QI vectors. */
3413 case E_VNx2QImode:
3414 case E_VNx4QImode:
3415 case E_VNx8QImode:
3416 /* Partial SVE HI vectors. */
3417 case E_VNx2HImode:
3418 case E_VNx4HImode:
3419 /* Partial SVE SI vector. */
3420 case E_VNx2SImode:
3421 /* Partial SVE HF vectors. */
3422 case E_VNx2HFmode:
3423 case E_VNx4HFmode:
3424 /* Partial SVE BF vectors. */
3425 case E_VNx2BFmode:
3426 case E_VNx4BFmode:
3427 /* Partial SVE SF vector. */
3428 case E_VNx2SFmode:
3429 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3431 case E_VNx16QImode:
3432 case E_VNx8HImode:
3433 case E_VNx4SImode:
3434 case E_VNx2DImode:
3435 case E_VNx8BFmode:
3436 case E_VNx8HFmode:
3437 case E_VNx4SFmode:
3438 case E_VNx2DFmode:
3439 return TARGET_SVE ? VEC_SVE_DATA : 0;
3441 /* x2 SVE vectors. */
3442 case E_VNx32QImode:
3443 case E_VNx16HImode:
3444 case E_VNx8SImode:
3445 case E_VNx4DImode:
3446 case E_VNx16BFmode:
3447 case E_VNx16HFmode:
3448 case E_VNx8SFmode:
3449 case E_VNx4DFmode:
3450 /* x3 SVE vectors. */
3451 case E_VNx48QImode:
3452 case E_VNx24HImode:
3453 case E_VNx12SImode:
3454 case E_VNx6DImode:
3455 case E_VNx24BFmode:
3456 case E_VNx24HFmode:
3457 case E_VNx12SFmode:
3458 case E_VNx6DFmode:
3459 /* x4 SVE vectors. */
3460 case E_VNx64QImode:
3461 case E_VNx32HImode:
3462 case E_VNx16SImode:
3463 case E_VNx8DImode:
3464 case E_VNx32BFmode:
3465 case E_VNx32HFmode:
3466 case E_VNx16SFmode:
3467 case E_VNx8DFmode:
3468 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3470 case E_OImode:
3471 case E_CImode:
3472 case E_XImode:
3473 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3475 /* Structures of 64-bit Advanced SIMD vectors. */
3476 case E_V2x8QImode:
3477 case E_V2x4HImode:
3478 case E_V2x2SImode:
3479 case E_V2x1DImode:
3480 case E_V2x4BFmode:
3481 case E_V2x4HFmode:
3482 case E_V2x2SFmode:
3483 case E_V2x1DFmode:
3484 case E_V3x8QImode:
3485 case E_V3x4HImode:
3486 case E_V3x2SImode:
3487 case E_V3x1DImode:
3488 case E_V3x4BFmode:
3489 case E_V3x4HFmode:
3490 case E_V3x2SFmode:
3491 case E_V3x1DFmode:
3492 case E_V4x8QImode:
3493 case E_V4x4HImode:
3494 case E_V4x2SImode:
3495 case E_V4x1DImode:
3496 case E_V4x4BFmode:
3497 case E_V4x4HFmode:
3498 case E_V4x2SFmode:
3499 case E_V4x1DFmode:
3500 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3502 /* Structures of 128-bit Advanced SIMD vectors. */
3503 case E_V2x16QImode:
3504 case E_V2x8HImode:
3505 case E_V2x4SImode:
3506 case E_V2x2DImode:
3507 case E_V2x8BFmode:
3508 case E_V2x8HFmode:
3509 case E_V2x4SFmode:
3510 case E_V2x2DFmode:
3511 case E_V3x16QImode:
3512 case E_V3x8HImode:
3513 case E_V3x4SImode:
3514 case E_V3x2DImode:
3515 case E_V3x8BFmode:
3516 case E_V3x8HFmode:
3517 case E_V3x4SFmode:
3518 case E_V3x2DFmode:
3519 case E_V4x16QImode:
3520 case E_V4x8HImode:
3521 case E_V4x4SImode:
3522 case E_V4x2DImode:
3523 case E_V4x8BFmode:
3524 case E_V4x8HFmode:
3525 case E_V4x4SFmode:
3526 case E_V4x2DFmode:
3527 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3529 /* 64-bit Advanced SIMD vectors. */
3530 case E_V8QImode:
3531 case E_V4HImode:
3532 case E_V2SImode:
3533 case E_V1DImode:
3534 case E_V4HFmode:
3535 case E_V4BFmode:
3536 case E_V2SFmode:
3537 case E_V1DFmode:
3538 /* 128-bit Advanced SIMD vectors. */
3539 case E_V16QImode:
3540 case E_V8HImode:
3541 case E_V4SImode:
3542 case E_V2DImode:
3543 case E_V8HFmode:
3544 case E_V8BFmode:
3545 case E_V4SFmode:
3546 case E_V2DFmode:
3547 return TARGET_FLOAT ? VEC_ADVSIMD : 0;
3549 default:
3550 return 0;
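/* Worked examples of the classification above, assuming the relevant
   TARGET_FLOAT/TARGET_SVE support is enabled: V4SImode -> VEC_ADVSIMD;
   VNx4SImode -> VEC_SVE_DATA; VNx2SImode -> VEC_SVE_DATA | VEC_PARTIAL
   (32-bit elements stored in 64-bit containers); VNx8SImode ->
   VEC_SVE_DATA | VEC_STRUCT (an x2 tuple); V3x4SImode ->
   VEC_ADVSIMD | VEC_STRUCT.  */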
3554 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3555 bool
3556 aarch64_advsimd_struct_mode_p (machine_mode mode)
3558 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3559 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3562 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3563 static bool
3564 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3566 return (aarch64_classify_vector_mode (mode)
3567 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3570 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3571 static bool
3572 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3574 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3577 /* Return true if MODE is any of the data vector modes, including
3578 structure modes. */
3579 static bool
3580 aarch64_vector_data_mode_p (machine_mode mode)
3582 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3585 /* Return true if MODE is any form of SVE mode, including predicates,
3586 vectors and structures. */
3587 bool
3588 aarch64_sve_mode_p (machine_mode mode)
3590 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3593 /* Return true if MODE is an SVE data vector mode; either a single vector
3594 or a structure of vectors. */
3595 static bool
3596 aarch64_sve_data_mode_p (machine_mode mode)
3598 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3601 /* Return the number of defined bytes in one constituent vector of
3602 SVE mode MODE, which has vector flags VEC_FLAGS. */
3603 static poly_int64
3604 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3606 if (vec_flags & VEC_PARTIAL)
3607 /* A single partial vector. */
3608 return GET_MODE_SIZE (mode);
3610 if (vec_flags & VEC_SVE_DATA)
3611 /* A single vector or a tuple. */
3612 return BYTES_PER_SVE_VECTOR;
3614 /* A single predicate. */
3615 gcc_assert (vec_flags & VEC_SVE_PRED);
3616 return BYTES_PER_SVE_PRED;
3619 /* If MODE holds an array of vectors, return the number of vectors
3620 in the array, otherwise return 1. */
3622 static unsigned int
3623 aarch64_ldn_stn_vectors (machine_mode mode)
3625 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3626 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3627 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3628 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3629 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3630 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3631 return exact_div (GET_MODE_SIZE (mode),
3632 BYTES_PER_SVE_VECTOR).to_constant ();
3633 return 1;
3636 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3637 corresponding vector structure mode. */
3638 static opt_machine_mode
3639 aarch64_advsimd_vector_array_mode (machine_mode mode,
3640 unsigned HOST_WIDE_INT nelems)
3642 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3643 if (known_eq (GET_MODE_SIZE (mode), 8))
3644 flags |= VEC_PARTIAL;
3646 machine_mode struct_mode;
3647 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3648 if (aarch64_classify_vector_mode (struct_mode) == flags
3649 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3650 && known_eq (GET_MODE_NUNITS (struct_mode),
3651 GET_MODE_NUNITS (mode) * nelems))
3652 return struct_mode;
3653 return opt_machine_mode ();
3656 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3658 opt_machine_mode
3659 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3661 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3662 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3663 machine_mode mode;
3664 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3665 if (inner_mode == GET_MODE_INNER (mode)
3666 && known_eq (nunits, GET_MODE_NUNITS (mode))
3667 && aarch64_sve_data_mode_p (mode))
3668 return mode;
3669 return opt_machine_mode ();
3672 /* Implement target hook TARGET_ARRAY_MODE. */
3673 static opt_machine_mode
3674 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3676 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3677 && IN_RANGE (nelems, 2, 4))
3678 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3679 GET_MODE_NUNITS (mode) * nelems);
3680 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3681 && IN_RANGE (nelems, 2, 4))
3682 return aarch64_advsimd_vector_array_mode (mode, nelems);
3684 return opt_machine_mode ();
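/* For example, aarch64_array_mode (VNx4SImode, 3) yields VNx12SImode (the
   SVE x3 tuple mode listed earlier), while aarch64_array_mode (V16QImode, 2)
   goes through aarch64_advsimd_vector_array_mode and yields V2x16QImode.
   Element counts outside the range 2..4 return an empty opt_machine_mode.  */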
3687 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3688 static bool
3689 aarch64_array_mode_supported_p (machine_mode mode,
3690 unsigned HOST_WIDE_INT nelems)
3692 if (TARGET_SIMD
3693 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3694 || AARCH64_VALID_SIMD_DREG_MODE (mode))
3695 && (nelems >= 2 && nelems <= 4))
3696 return true;
3698 return false;
3701 /* MODE is some form of SVE vector mode. For data modes, return the number
3702 of vector register bits that each element of MODE occupies, such as 64
3703 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3704 in a 64-bit container). For predicate modes, return the number of
3705 data bits controlled by each significant predicate bit. */
3707 static unsigned int
3708 aarch64_sve_container_bits (machine_mode mode)
3710 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3711 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3712 ? BITS_PER_SVE_VECTOR
3713 : GET_MODE_BITSIZE (mode));
3714 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3717 /* Return the SVE predicate mode to use for elements that have
3718 ELEM_NBYTES bytes, if such a mode exists. */
3720 opt_machine_mode
3721 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3723 if (TARGET_SVE)
3725 if (elem_nbytes == 1)
3726 return VNx16BImode;
3727 if (elem_nbytes == 2)
3728 return VNx8BImode;
3729 if (elem_nbytes == 4)
3730 return VNx4BImode;
3731 if (elem_nbytes == 8)
3732 return VNx2BImode;
3734 return opt_machine_mode ();
3737 /* Return the SVE predicate mode that should be used to control
3738 SVE mode MODE. */
3740 machine_mode
3741 aarch64_sve_pred_mode (machine_mode mode)
3743 unsigned int bits = aarch64_sve_container_bits (mode);
3744 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3747 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3749 static opt_machine_mode
3750 aarch64_get_mask_mode (machine_mode mode)
3752 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3753 if (vec_flags & VEC_SVE_DATA)
3754 return aarch64_sve_pred_mode (mode);
3756 return default_get_mask_mode (mode);
3759 /* Return the integer element mode associated with SVE mode MODE. */
3761 static scalar_int_mode
3762 aarch64_sve_element_int_mode (machine_mode mode)
3764 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3765 ? BITS_PER_SVE_VECTOR
3766 : GET_MODE_BITSIZE (mode));
3767 unsigned int elt_bits = vector_element_size (vector_bits,
3768 GET_MODE_NUNITS (mode));
3769 return int_mode_for_size (elt_bits, 0).require ();
3772 /* Return an integer element mode that contains exactly
3773 aarch64_sve_container_bits (MODE) bits. This is wider than
3774 aarch64_sve_element_int_mode if MODE is a partial vector,
3775 otherwise it's the same. */
3777 static scalar_int_mode
3778 aarch64_sve_container_int_mode (machine_mode mode)
3780 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3783 /* Return the integer vector mode associated with SVE mode MODE.
3784 Unlike related_int_vector_mode, this can handle the case in which
3785 MODE is a predicate (and thus has a different total size). */
3787 machine_mode
3788 aarch64_sve_int_mode (machine_mode mode)
3790 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3791 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3794 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3796 static opt_machine_mode
3797 aarch64_vectorize_related_mode (machine_mode vector_mode,
3798 scalar_mode element_mode,
3799 poly_uint64 nunits)
3801 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3803 /* If we're operating on SVE vectors, try to return an SVE mode. */
3804 poly_uint64 sve_nunits;
3805 if ((vec_flags & VEC_SVE_DATA)
3806 && multiple_p (BYTES_PER_SVE_VECTOR,
3807 GET_MODE_SIZE (element_mode), &sve_nunits))
3809 machine_mode sve_mode;
3810 if (maybe_ne (nunits, 0U))
3812 /* Try to find a full or partial SVE mode with exactly
3813 NUNITS units. */
3814 if (multiple_p (sve_nunits, nunits)
3815 && aarch64_sve_data_mode (element_mode,
3816 nunits).exists (&sve_mode))
3817 return sve_mode;
3819 else
3821 /* Take the preferred number of units from the number of bytes
3822 that fit in VECTOR_MODE. We always start by "autodetecting"
3823 a full vector mode with preferred_simd_mode, so vectors
3824 chosen here will also be full vector modes. Then
3825 autovectorize_vector_modes tries smaller starting modes
3826 and thus smaller preferred numbers of units. */
3827 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3828 if (aarch64_sve_data_mode (element_mode,
3829 sve_nunits).exists (&sve_mode))
3830 return sve_mode;
3834 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3835 if (TARGET_SIMD
3836 && (vec_flags & VEC_ADVSIMD)
3837 && known_eq (nunits, 0U)
3838 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3839 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3840 * GET_MODE_NUNITS (vector_mode), 128U))
3842 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3843 if (VECTOR_MODE_P (res))
3844 return res;
3847 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3850 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
3852 static bool
3853 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
3855 machine_mode mode = TYPE_MODE (type);
3856 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3857 bool sve_p = (vec_flags & VEC_ANY_SVE);
3858 bool simd_p = (vec_flags & VEC_ADVSIMD);
3860 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
3863 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3864 prefer to use the first arithmetic operand as the else value if
3865 the else value doesn't matter, since that exactly matches the SVE
3866 destructive merging form. For ternary operations we could either
3867 pick the first operand and use FMAD-like instructions or the last
3868 operand and use FMLA-like instructions; the latter seems more
3869 natural. */
3871 static tree
3872 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3874 return nops == 3 ? ops[2] : ops[0];
3877 /* Implement TARGET_HARD_REGNO_NREGS. */
3879 static unsigned int
3880 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3882 /* ??? Logically we should only need to provide a value when
3883 HARD_REGNO_MODE_OK says that the combination is valid,
3884 but at the moment we need to handle all modes. Just ignore
3885 any runtime parts for registers that can't store them. */
3886 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3887 switch (aarch64_regno_regclass (regno))
3889 case FP_REGS:
3890 case FP_LO_REGS:
3891 case FP_LO8_REGS:
3893 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3894 if (vec_flags & VEC_SVE_DATA)
3895 return exact_div (GET_MODE_SIZE (mode),
3896 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3897 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3898 return GET_MODE_SIZE (mode).to_constant () / 8;
3899 return CEIL (lowest_size, UNITS_PER_VREG);
3901 case PR_REGS:
3902 case PR_LO_REGS:
3903 case PR_HI_REGS:
3904 case FFR_REGS:
3905 case PR_AND_FFR_REGS:
3906 return 1;
3907 default:
3908 return CEIL (lowest_size, UNITS_PER_WORD);
3910 gcc_unreachable ();
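/* For instance, assuming the usual UNITS_PER_WORD of 8 and
   UNITS_PER_VREG of 16: a TImode value occupies two GP registers but
   a single FP register, while an SVE structure of two data vectors
   occupies two FP registers regardless of the runtime vector length.  */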
3913 /* Implement TARGET_HARD_REGNO_MODE_OK. */
3915 static bool
3916 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3918 if (mode == V8DImode)
3919 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
3920 && multiple_p (regno - R0_REGNUM, 2);
3922 if (GET_MODE_CLASS (mode) == MODE_CC)
3923 return regno == CC_REGNUM;
3925 if (regno == VG_REGNUM)
3926 /* This must have the same size as _Unwind_Word. */
3927 return mode == DImode;
3929 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3930 if (vec_flags & VEC_SVE_PRED)
3931 return pr_or_ffr_regnum_p (regno);
3933 if (pr_or_ffr_regnum_p (regno))
3934 return false;
3936 if (regno == SP_REGNUM)
3937 /* The purpose of comparing with ptr_mode is to support the
3938 global register variable associated with the stack pointer
3939 register, declared via asm ("wsp") in ILP32. */
3940 return mode == Pmode || mode == ptr_mode;
3942 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
3943 return mode == Pmode;
3945 if (GP_REGNUM_P (regno))
3947 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
3948 return false;
3949 if (known_le (GET_MODE_SIZE (mode), 8))
3950 return true;
3951 if (known_le (GET_MODE_SIZE (mode), 16))
3952 return (regno & 1) == 0;
3954 else if (FP_REGNUM_P (regno))
3956 if (vec_flags & VEC_STRUCT)
3957 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
3958 else
3959 return !VECTOR_MODE_P (mode) || vec_flags != 0;
3962 return false;
3965 /* Return true if a function with type FNTYPE returns its value in
3966 SVE vector or predicate registers. */
3968 static bool
3969 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
3971 tree return_type = TREE_TYPE (fntype);
3973 pure_scalable_type_info pst_info;
3974 switch (pst_info.analyze (return_type))
3976 case pure_scalable_type_info::IS_PST:
3977 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
3978 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
3980 case pure_scalable_type_info::DOESNT_MATTER:
3981 gcc_assert (aarch64_return_in_memory_1 (return_type));
3982 return false;
3984 case pure_scalable_type_info::NO_ABI_IDENTITY:
3985 case pure_scalable_type_info::ISNT_PST:
3986 return false;
3988 gcc_unreachable ();
3991 /* Return true if a function with type FNTYPE takes arguments in
3992 SVE vector or predicate registers. */
3994 static bool
3995 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
3997 CUMULATIVE_ARGS args_so_far_v;
3998 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
3999 NULL_TREE, 0, true);
4000 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4002 for (tree chain = TYPE_ARG_TYPES (fntype);
4003 chain && chain != void_list_node;
4004 chain = TREE_CHAIN (chain))
4006 tree arg_type = TREE_VALUE (chain);
4007 if (arg_type == error_mark_node)
4008 return false;
4010 function_arg_info arg (arg_type, /*named=*/true);
4011 apply_pass_by_reference_rules (&args_so_far_v, arg);
4012 pure_scalable_type_info pst_info;
4013 if (pst_info.analyze_registers (arg.type))
4015 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4016 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4017 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4018 return true;
4021 targetm.calls.function_arg_advance (args_so_far, arg);
4023 return false;
4026 /* Implement TARGET_FNTYPE_ABI. */
4028 static const predefined_function_abi &
4029 aarch64_fntype_abi (const_tree fntype)
4031 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4032 return aarch64_simd_abi ();
4034 if (aarch64_returns_value_in_sve_regs_p (fntype)
4035 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4036 return aarch64_sve_abi ();
4038 return default_function_abi;
4041 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4043 static bool
4044 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4046 return (aarch64_sve::builtin_type_p (type1)
4047 == aarch64_sve::builtin_type_p (type2));
4050 /* Return true if we should emit CFI for register REGNO. */
4052 static bool
4053 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4055 return (GP_REGNUM_P (regno)
4056 || !default_function_abi.clobbers_full_reg_p (regno));
4059 /* Return the mode we should use to save and restore register REGNO. */
4061 static machine_mode
4062 aarch64_reg_save_mode (unsigned int regno)
4064 if (GP_REGNUM_P (regno))
4065 return DImode;
4067 if (FP_REGNUM_P (regno))
4068 switch (crtl->abi->id ())
4070 case ARM_PCS_AAPCS64:
4071 /* Only the low 64 bits are saved by the base PCS. */
4072 return DFmode;
4074 case ARM_PCS_SIMD:
4075 /* The vector PCS saves the low 128 bits (which is the full
4076 register on non-SVE targets). */
4077 return TFmode;
4079 case ARM_PCS_SVE:
4080 /* Use vectors of DImode for registers that need frame
4081 information, so that the first 64 bytes of the save slot
4082 are always the equivalent of what storing D<n> would give. */
4083 if (aarch64_emit_cfi_for_reg_p (regno))
4084 return VNx2DImode;
4086 /* Use vectors of bytes otherwise, so that the layout is
4087 endian-agnostic, and so that we can use LDR and STR for
4088 big-endian targets. */
4089 return VNx16QImode;
4091 case ARM_PCS_TLSDESC:
4092 case ARM_PCS_UNKNOWN:
4093 break;
4096 if (PR_REGNUM_P (regno))
4097 /* Save the full predicate register. */
4098 return VNx16BImode;
4100 gcc_unreachable ();
4103 /* Implement TARGET_INSN_CALLEE_ABI. */
4105 const predefined_function_abi &
4106 aarch64_insn_callee_abi (const rtx_insn *insn)
4108 rtx pat = PATTERN (insn);
4109 gcc_assert (GET_CODE (pat) == PARALLEL);
4110 rtx unspec = XVECEXP (pat, 0, 1);
4111 gcc_assert (GET_CODE (unspec) == UNSPEC
4112 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4113 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4116 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4117 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4118 clobbers the top 64 bits when restoring the bottom 64 bits. */
4120 static bool
4121 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4122 unsigned int regno,
4123 machine_mode mode)
4125 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4127 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4128 unsigned int nregs = hard_regno_nregs (regno, mode);
4129 if (nregs > 1)
4130 per_register_size = exact_div (per_register_size, nregs);
4131 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4132 return maybe_gt (per_register_size, 16);
4133 return maybe_gt (per_register_size, 8);
4135 return false;
4138 /* Implement REGMODE_NATURAL_SIZE. */
4139 poly_uint64
4140 aarch64_regmode_natural_size (machine_mode mode)
4142 /* The natural size for SVE data modes is one SVE data vector,
4143 and similarly for predicates. We can't independently modify
4144 anything smaller than that. */
4145 /* ??? For now, only do this for variable-width SVE registers.
4146 Doing it for constant-sized registers breaks lower-subreg.cc. */
4147 /* ??? And once that's fixed, we should probably have similar
4148 code for Advanced SIMD. */
4149 if (!aarch64_sve_vg.is_constant ())
4151 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4152 if (vec_flags & VEC_SVE_PRED)
4153 return BYTES_PER_SVE_PRED;
4154 if (vec_flags & VEC_SVE_DATA)
4155 return BYTES_PER_SVE_VECTOR;
4157 return UNITS_PER_WORD;
4160 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
4161 machine_mode
4162 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4163 machine_mode mode)
4165 /* The predicate mode determines which bits are significant and
4166 which are "don't care". Decreasing the number of lanes would
4167 lose data while increasing the number of lanes would make bits
4168 unnecessarily significant. */
4169 if (PR_REGNUM_P (regno))
4170 return mode;
4171 if (known_ge (GET_MODE_SIZE (mode), 4))
4172 return mode;
4173 else
4174 return SImode;
4177 /* Return true if I's bits are consecutive ones from the MSB. */
4178 bool
4179 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4181 return exact_log2 (-i) != HOST_WIDE_INT_M1;
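/* For example, (HOST_WIDE_INT) 0xffffffffffff0000 passes this test
   (-i == 0x10000, a power of two), as does -1, whereas
   0x00ff000000000000 and 0 do not.  */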
4184 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4185 that strcpy from constants will be faster. */
4187 static HOST_WIDE_INT
4188 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4190 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4191 return MAX (align, BITS_PER_WORD);
4192 return align;
4195 /* Return true if calls to DECL should be treated as
4196 long-calls (i.e. called via a register). */
4197 static bool
4198 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4200 return false;
4203 /* Return true if calls to symbol-ref SYM should be treated as
4204 long-calls (i.e. called via a register). */
4205 bool
4206 aarch64_is_long_call_p (rtx sym)
4208 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4211 /* Return true if calls to symbol-ref SYM should not go through
4212 plt stubs. */
4214 bool
4215 aarch64_is_noplt_call_p (rtx sym)
4217 const_tree decl = SYMBOL_REF_DECL (sym);
4219 if (flag_pic
4220 && decl
4221 && (!flag_plt
4222 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4223 && !targetm.binds_local_p (decl))
4224 return true;
4226 return false;
4229 /* Emit an insn that's a simple single-set. Both operands must be
4230 known to be valid. */
4231 inline static rtx_insn *
4232 emit_set_insn (rtx x, rtx y)
4234 return emit_insn (gen_rtx_SET (x, y));
4237 /* X and Y are two things to compare using CODE. Emit the compare insn and
4238 return the rtx for register 0 in the proper mode. */
4240 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4242 machine_mode cmp_mode = GET_MODE (x);
4243 machine_mode cc_mode;
4244 rtx cc_reg;
4246 if (cmp_mode == TImode)
4248 gcc_assert (code == NE);
4250 cc_mode = CCmode;
4251 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4253 rtx x_lo = operand_subword (x, 0, 0, TImode);
4254 rtx y_lo = operand_subword (y, 0, 0, TImode);
4255 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4257 rtx x_hi = operand_subword (x, 1, 0, TImode);
4258 rtx y_hi = operand_subword (y, 1, 0, TImode);
4259 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4260 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4261 GEN_INT (AARCH64_EQ)));
4263 else
4265 cc_mode = SELECT_CC_MODE (code, x, y);
4266 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4267 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4269 return cc_reg;
4272 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4274 static rtx
4275 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4276 machine_mode y_mode)
4278 if (y_mode == E_QImode || y_mode == E_HImode)
4280 if (CONST_INT_P (y))
4282 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4283 y_mode = SImode;
4285 else
4287 rtx t, cc_reg;
4288 machine_mode cc_mode;
4290 t = gen_rtx_ZERO_EXTEND (SImode, y);
4291 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4292 cc_mode = CC_SWPmode;
4293 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4294 emit_set_insn (cc_reg, t);
4295 return cc_reg;
4299 if (!aarch64_plus_operand (y, y_mode))
4300 y = force_reg (y_mode, y);
4302 return aarch64_gen_compare_reg (code, x, y);
4305 /* Consider the operation:
4307 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4309 where:
4311 - CODE is [SU]MAX or [SU]MIN
4312 - OPERANDS[2] and OPERANDS[3] are constant integers
4313 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4314 - all operands have mode MODE
4316 Decide whether it is possible to implement the operation using:
4318 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4320 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4322 followed by:
4324 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4326 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4327 If GENERATE_P is true, also update OPERANDS as follows:
4329 OPERANDS[4] = -OPERANDS[3]
4330 OPERANDS[5] = the rtl condition representing <cond>
4331 OPERANDS[6] = <tmp>
4332 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
4333 bool
4334 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4336 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4337 rtx dst = operands[0];
4338 rtx maxmin_op = operands[2];
4339 rtx add_op = operands[3];
4340 machine_mode mode = GET_MODE (dst);
4342 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4343 == (x >= y ? x : y) - z
4344 == (x > y ? x : y) - z
4345 == (x > y - 1 ? x : y) - z
4347 min (x, y) - z == (x <= y - 1 ? x : y) - z
4348 == (x <= y ? x : y) - z
4349 == (x < y ? x : y) - z
4350 == (x < y + 1 ? x : y) - z
4352 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4353 which x is compared with z. Set DIFF to y - z. Thus the supported
4354 combinations are as follows, with DIFF being the value after the ":":
4356 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4357 == x >= y ? x - y : 0 [z == y]
4358 == x > y ? x - y : 0 [z == y]
4359 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4361 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4362 == x <= y ? x - y : 0 [z == y]
4363 == x < y ? x - y : 0 [z == y]
4364 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4365 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4366 auto add_val = rtx_mode_t (add_op, mode);
4367 auto sub_val = wi::neg (add_val);
4368 auto diff = wi::sub (maxmin_val, sub_val);
4369 if (!(diff == 0
4370 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4371 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4372 return false;
4374 if (!generate_p)
4375 return true;
4377 rtx_code cmp;
4378 switch (code)
4380 case SMAX:
4381 cmp = diff == 1 ? GT : GE;
4382 break;
4383 case UMAX:
4384 cmp = diff == 1 ? GTU : GEU;
4385 break;
4386 case SMIN:
4387 cmp = diff == -1 ? LT : LE;
4388 break;
4389 case UMIN:
4390 cmp = diff == -1 ? LTU : LEU;
4391 break;
4392 default:
4393 gcc_unreachable ();
4395 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4397 operands[4] = immed_wide_int_const (sub_val, mode);
4398 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4399 if (can_create_pseudo_p ())
4400 operands[6] = gen_reg_rtx (mode);
4401 else
4402 operands[6] = dst;
4403 operands[7] = immed_wide_int_const (diff, mode);
4405 return true;
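/* A minimal worked example of the transformation above, with register
   names chosen arbitrarily: smax (x1, 4) - 4 has DIFF == 0 and can be
   emitted as
     subs x0, x1, #4
     csel x0, x0, xzr, ge
   while smax (x1, 4) - 3 has DIFF == 1 and would use GT with CSINC
   instead.  */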
4409 /* Build the SYMBOL_REF for __tls_get_addr. */
4411 static GTY(()) rtx tls_get_addr_libfunc;
4414 aarch64_tls_get_addr (void)
4416 if (!tls_get_addr_libfunc)
4417 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4418 return tls_get_addr_libfunc;
4421 /* Return the TLS model to use for ADDR. */
4423 static enum tls_model
4424 tls_symbolic_operand_type (rtx addr)
4426 enum tls_model tls_kind = TLS_MODEL_NONE;
4427 poly_int64 offset;
4428 addr = strip_offset_and_salt (addr, &offset);
4429 if (SYMBOL_REF_P (addr))
4430 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4432 return tls_kind;
4435 /* We allow lo_sum's in addresses as part of our legitimate addresses,
4436 so that combine can take care of combining addresses where
4437 necessary, but for generation purposes we generate the address
4438 as:
4439 RTL Absolute
4440 tmp = hi (symbol_ref); adrp x1, foo
4441 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
4444 PIC TLS
4445 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4446 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4447 bl __tls_get_addr
4450 Load TLS symbol, depending on TLS mechanism and TLS access model.
4452 Global Dynamic - Traditional TLS:
4453 adrp tmp, :tlsgd:imm
4454 add dest, tmp, #:tlsgd_lo12:imm
4455 bl __tls_get_addr
4457 Global Dynamic - TLS Descriptors:
4458 adrp dest, :tlsdesc:imm
4459 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4460 add dest, dest, #:tlsdesc_lo12:imm
4461 blr tmp
4462 mrs tp, tpidr_el0
4463 add dest, dest, tp
4465 Initial Exec:
4466 mrs tp, tpidr_el0
4467 adrp tmp, :gottprel:imm
4468 ldr dest, [tmp, #:gottprel_lo12:imm]
4469 add dest, dest, tp
4471 Local Exec:
4472 mrs tp, tpidr_el0
4473 add t0, tp, #:tprel_hi12:imm, lsl #12
4474 add t0, t0, #:tprel_lo12_nc:imm
4477 static void
4478 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4479 enum aarch64_symbol_type type)
4481 switch (type)
4483 case SYMBOL_SMALL_ABSOLUTE:
4485 /* In ILP32, the mode of dest can be either SImode or DImode. */
4486 rtx tmp_reg = dest;
4487 machine_mode mode = GET_MODE (dest);
4489 gcc_assert (mode == Pmode || mode == ptr_mode);
4491 if (can_create_pseudo_p ())
4492 tmp_reg = gen_reg_rtx (mode);
4494 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4495 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4496 return;
4499 case SYMBOL_TINY_ABSOLUTE:
4500 emit_insn (gen_rtx_SET (dest, imm));
4501 return;
4503 case SYMBOL_SMALL_GOT_28K:
4505 machine_mode mode = GET_MODE (dest);
4506 rtx gp_rtx = pic_offset_table_rtx;
4507 rtx insn;
4508 rtx mem;
4510 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4511 here before RTL expansion. Tree IVOPTS generates RTL patterns to
4512 decide rtx costs, in which case pic_offset_table_rtx is not
4513 initialized. In that case there is no need to generate the first
4514 adrp instruction, as the final cost for a global variable access
4515 is one instruction. */
4516 if (gp_rtx != NULL)
4518 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
4519 use the page base as the GOT base, the first page may be wasted;
4520 in the worst case there is only 28K of space for the GOT).
4522 The instruction sequence generated for accessing a global variable is:
4525 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4527 Only one instruction is needed. But we must initialize
4528 pic_offset_table_rtx properly. We generate an initialization insn
4529 for every global access, and rely on CSE to remove all redundant ones.
4531 The final instruction sequence will look like the following
4532 for multiple global variable accesses.
4534 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4536 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4537 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4538 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4539 ... */
4541 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4542 crtl->uses_pic_offset_table = 1;
4543 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4545 if (mode != GET_MODE (gp_rtx))
4546 gp_rtx = gen_lowpart (mode, gp_rtx);
4550 if (mode == ptr_mode)
4552 if (mode == DImode)
4553 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4554 else
4555 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4557 mem = XVECEXP (SET_SRC (insn), 0, 0);
4559 else
4561 gcc_assert (mode == Pmode);
4563 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4564 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4567 /* The operand is expected to be a MEM. Whenever the related insn
4568 pattern changes, the code above that calculates MEM should be
4569 updated. */
4570 gcc_assert (MEM_P (mem));
4571 MEM_READONLY_P (mem) = 1;
4572 MEM_NOTRAP_P (mem) = 1;
4573 emit_insn (insn);
4574 return;
4577 case SYMBOL_SMALL_GOT_4G:
4578 emit_insn (gen_rtx_SET (dest, imm));
4579 return;
4581 case SYMBOL_SMALL_TLSGD:
4583 rtx_insn *insns;
4584 /* The return type of __tls_get_addr is the C pointer type
4585 so use ptr_mode. */
4586 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4587 rtx tmp_reg = dest;
4589 if (GET_MODE (dest) != ptr_mode)
4590 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4592 start_sequence ();
4593 if (ptr_mode == SImode)
4594 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4595 else
4596 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4597 insns = get_insns ();
4598 end_sequence ();
4600 RTL_CONST_CALL_P (insns) = 1;
4601 emit_libcall_block (insns, tmp_reg, result, imm);
4602 /* Convert back to the mode of the dest by adding a zero_extend
4603 from SImode (ptr_mode) to DImode (Pmode). */
4604 if (dest != tmp_reg)
4605 convert_move (dest, tmp_reg, true);
4606 return;
4609 case SYMBOL_SMALL_TLSDESC:
4611 machine_mode mode = GET_MODE (dest);
4612 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4613 rtx tp;
4615 gcc_assert (mode == Pmode || mode == ptr_mode);
4617 /* In ILP32, the GOT entry is always of SImode size. Unlike
4618 the small GOT case, the dest is fixed at reg 0. */
4619 if (TARGET_ILP32)
4620 emit_insn (gen_tlsdesc_small_si (imm));
4621 else
4622 emit_insn (gen_tlsdesc_small_di (imm));
4623 tp = aarch64_load_tp (NULL);
4625 if (mode != Pmode)
4626 tp = gen_lowpart (mode, tp);
4628 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4629 if (REG_P (dest))
4630 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4631 return;
4634 case SYMBOL_SMALL_TLSIE:
4636 /* In ILP32, the mode of dest can be either SImode or DImode,
4637 while the GOT entry is always of SImode size. The mode of
4638 dest depends on how dest is used: if dest is assigned to a
4639 pointer (e.g. in memory), it has SImode; it may have
4640 DImode if dest is dereferenced to access memory.
4641 This is why we have to handle three different tlsie_small
4642 patterns here (two patterns for ILP32). */
4643 machine_mode mode = GET_MODE (dest);
4644 rtx tmp_reg = gen_reg_rtx (mode);
4645 rtx tp = aarch64_load_tp (NULL);
4647 if (mode == ptr_mode)
4649 if (mode == DImode)
4650 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4651 else
4653 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4654 tp = gen_lowpart (mode, tp);
4657 else
4659 gcc_assert (mode == Pmode);
4660 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4663 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4664 if (REG_P (dest))
4665 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4666 return;
4669 case SYMBOL_TLSLE12:
4670 case SYMBOL_TLSLE24:
4671 case SYMBOL_TLSLE32:
4672 case SYMBOL_TLSLE48:
4674 machine_mode mode = GET_MODE (dest);
4675 rtx tp = aarch64_load_tp (NULL);
4677 if (mode != Pmode)
4678 tp = gen_lowpart (mode, tp);
4680 switch (type)
4682 case SYMBOL_TLSLE12:
4683 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4684 (dest, tp, imm));
4685 break;
4686 case SYMBOL_TLSLE24:
4687 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4688 (dest, tp, imm));
4689 break;
4690 case SYMBOL_TLSLE32:
4691 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4692 (dest, imm));
4693 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4694 (dest, dest, tp));
4695 break;
4696 case SYMBOL_TLSLE48:
4697 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4698 (dest, imm));
4699 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4700 (dest, dest, tp));
4701 break;
4702 default:
4703 gcc_unreachable ();
4706 if (REG_P (dest))
4707 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4708 return;
4711 case SYMBOL_TINY_GOT:
4713 rtx insn;
4714 machine_mode mode = GET_MODE (dest);
4716 if (mode == ptr_mode)
4717 insn = gen_ldr_got_tiny (mode, dest, imm);
4718 else
4720 gcc_assert (mode == Pmode);
4721 insn = gen_ldr_got_tiny_sidi (dest, imm);
4724 emit_insn (insn);
4725 return;
4728 case SYMBOL_TINY_TLSIE:
4730 machine_mode mode = GET_MODE (dest);
4731 rtx tp = aarch64_load_tp (NULL);
4733 if (mode == ptr_mode)
4735 if (mode == DImode)
4736 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4737 else
4739 tp = gen_lowpart (mode, tp);
4740 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4743 else
4745 gcc_assert (mode == Pmode);
4746 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4749 if (REG_P (dest))
4750 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4751 return;
4754 default:
4755 gcc_unreachable ();
4759 /* Emit a move from SRC to DEST. Assume that the move expanders can
4760 handle all moves if !can_create_pseudo_p (). The distinction is
4761 important because, unlike emit_move_insn, the move expanders know
4762 how to force Pmode objects into the constant pool even when the
4763 constant pool address is not itself legitimate. */
4764 static rtx
4765 aarch64_emit_move (rtx dest, rtx src)
4767 return (can_create_pseudo_p ()
4768 ? emit_move_insn (dest, src)
4769 : emit_move_insn_1 (dest, src));
4772 /* Apply UNOPTAB to OP and store the result in DEST. */
4774 static void
4775 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4777 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4778 if (dest != tmp)
4779 emit_move_insn (dest, tmp);
4782 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4784 static void
4785 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4787 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4788 OPTAB_DIRECT);
4789 if (dest != tmp)
4790 emit_move_insn (dest, tmp);
4793 /* Split a 128-bit move operation into two 64-bit move operations,
4794 taking care to handle partial overlap of register to register
4795 copies. Special cases are needed when moving between GP regs and
4796 FP regs. SRC can be a register, constant or memory; DST a register
4797 or memory. If either operand is memory it must not have any side
4798 effects. */
4799 void
4800 aarch64_split_128bit_move (rtx dst, rtx src)
4802 rtx dst_lo, dst_hi;
4803 rtx src_lo, src_hi;
4805 machine_mode mode = GET_MODE (dst);
4807 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
4808 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4809 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4811 if (REG_P (dst) && REG_P (src))
4813 int src_regno = REGNO (src);
4814 int dst_regno = REGNO (dst);
4816 /* Handle FP <-> GP regs. */
4817 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4819 src_lo = gen_lowpart (word_mode, src);
4820 src_hi = gen_highpart (word_mode, src);
4822 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4823 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4824 return;
4826 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4828 dst_lo = gen_lowpart (word_mode, dst);
4829 dst_hi = gen_highpart (word_mode, dst);
4831 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4832 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4833 return;
4837 dst_lo = gen_lowpart (word_mode, dst);
4838 dst_hi = gen_highpart (word_mode, dst);
4839 src_lo = gen_lowpart (word_mode, src);
4840 src_hi = gen_highpart_mode (word_mode, mode, src);
4842 /* At most one pairing may overlap. */
4843 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4845 aarch64_emit_move (dst_hi, src_hi);
4846 aarch64_emit_move (dst_lo, src_lo);
4848 else
4850 aarch64_emit_move (dst_lo, src_lo);
4851 aarch64_emit_move (dst_hi, src_hi);
4855 /* Return true if we should split a move from 128-bit value SRC
4856 to 128-bit register DEST. */
4858 bool
4859 aarch64_split_128bit_move_p (rtx dst, rtx src)
4861 if (FP_REGNUM_P (REGNO (dst)))
4862 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4863 /* All moves to GPRs need to be split. */
4864 return true;
4867 /* Split a complex SIMD move. */
4869 void
4870 aarch64_split_simd_move (rtx dst, rtx src)
4872 machine_mode src_mode = GET_MODE (src);
4873 machine_mode dst_mode = GET_MODE (dst);
4875 gcc_assert (VECTOR_MODE_P (dst_mode));
4877 if (REG_P (dst) && REG_P (src))
4879 gcc_assert (VECTOR_MODE_P (src_mode));
4880 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
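/* Return true if X (of mode XMODE) is equal to the zero-extension of
   Y from mode YMODE.  */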
4884 bool
4885 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4886 machine_mode ymode, rtx y)
4888 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4889 gcc_assert (r != NULL);
4890 return rtx_equal_p (x, r);
4893 /* Return TARGET if it is nonnull and a register of mode MODE.
4894 Otherwise, return a fresh register of mode MODE if we can,
4895 or TARGET reinterpreted as MODE if we can't. */
4897 static rtx
4898 aarch64_target_reg (rtx target, machine_mode mode)
4900 if (target && REG_P (target) && GET_MODE (target) == mode)
4901 return target;
4902 if (!can_create_pseudo_p ())
4904 gcc_assert (target);
4905 return gen_lowpart (mode, target);
4907 return gen_reg_rtx (mode);
4910 /* Return a register that contains the constant in BUILDER, given that
4911 the constant is a legitimate move operand. Use TARGET as the register
4912 if it is nonnull and convenient. */
4914 static rtx
4915 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4917 rtx src = builder.build ();
4918 target = aarch64_target_reg (target, GET_MODE (src));
4919 emit_insn (gen_rtx_SET (target, src));
4920 return target;
4923 static rtx
4924 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
4926 if (can_create_pseudo_p ())
4927 return force_reg (mode, value);
4928 else
4930 gcc_assert (x);
4931 aarch64_emit_move (x, value);
4932 return x;
4936 /* Return true if predicate value X is a constant in which every element
4937 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
4938 value, i.e. as a predicate in which all bits are significant. */
4940 static bool
4941 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
4943 if (!CONST_VECTOR_P (x))
4944 return false;
4946 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
4947 GET_MODE_NUNITS (GET_MODE (x)));
4948 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
4949 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
4950 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
4952 unsigned int nelts = const_vector_encoded_nelts (x);
4953 for (unsigned int i = 0; i < nelts; ++i)
4955 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
4956 if (!CONST_INT_P (elt))
4957 return false;
4959 builder.quick_push (elt);
4960 for (unsigned int j = 1; j < factor; ++j)
4961 builder.quick_push (const0_rtx);
4963 builder.finalize ();
4964 return true;
4967 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
4968 widest predicate element size it can have (that is, the largest size
4969 for which each element would still be 0 or 1). */
4971 unsigned int
4972 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
4974 /* Start with the most optimistic assumption: that we only need
4975 one bit per pattern. This is what we will use if only the first
4976 bit in each pattern is ever set. */
4977 unsigned int mask = GET_MODE_SIZE (DImode);
4978 mask |= builder.npatterns ();
4980 /* Look for set bits. */
4981 unsigned int nelts = builder.encoded_nelts ();
4982 for (unsigned int i = 1; i < nelts; ++i)
4983 if (INTVAL (builder.elt (i)) != 0)
4985 if (i & 1)
4986 return 1;
4987 mask |= i;
4989 return mask & -mask;
4992 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
4993 return that predicate mode, otherwise return opt_machine_mode (). */
4995 opt_machine_mode
4996 aarch64_ptrue_all_mode (rtx x)
4998 gcc_assert (GET_MODE (x) == VNx16BImode);
4999 if (!CONST_VECTOR_P (x)
5000 || !CONST_VECTOR_DUPLICATE_P (x)
5001 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5002 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5003 return opt_machine_mode ();
5005 unsigned int nelts = const_vector_encoded_nelts (x);
5006 for (unsigned int i = 1; i < nelts; ++i)
5007 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5008 return opt_machine_mode ();
5010 return aarch64_sve_pred_mode (nelts);
5013 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5014 that the constant would have with predicate element size ELT_SIZE
5015 (ignoring the upper bits in each element) and return:
5017 * -1 if all bits are set
5018 * N if the predicate has N leading set bits followed by all clear bits
5019 * 0 if the predicate does not have any of these forms. */
5022 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5023 unsigned int elt_size)
5025 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5026 followed by set bits. */
5027 if (builder.nelts_per_pattern () == 3)
5028 return 0;
5030 /* Skip over leading set bits. */
5031 unsigned int nelts = builder.encoded_nelts ();
5032 unsigned int i = 0;
5033 for (; i < nelts; i += elt_size)
5034 if (INTVAL (builder.elt (i)) == 0)
5035 break;
5036 unsigned int vl = i / elt_size;
5038 /* Check for the all-true case. */
5039 if (i == nelts)
5040 return -1;
5042 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5043 repeating pattern of set bits followed by clear bits. */
5044 if (builder.nelts_per_pattern () != 2)
5045 return 0;
5047 /* We have a "foreground" value and a duplicated "background" value.
5048 If the background might repeat and the last set bit belongs to it,
5049 we might have set bits followed by clear bits followed by set bits. */
5050 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5051 return 0;
5053 /* Make sure that the rest are all clear. */
5054 for (; i < nelts; i += elt_size)
5055 if (INTVAL (builder.elt (i)) != 0)
5056 return 0;
5058 return vl;
5061 /* See if there is an svpattern that encodes an SVE predicate of mode
5062 PRED_MODE in which the first VL bits are set and the rest are clear.
5063 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5064 A VL of -1 indicates an all-true vector. */
5066 aarch64_svpattern
5067 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5069 if (vl < 0)
5070 return AARCH64_SV_ALL;
5072 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5073 return AARCH64_NUM_SVPATTERNS;
5075 if (vl >= 1 && vl <= 8)
5076 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5078 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5079 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5081 int max_vl;
5082 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5084 if (vl == (max_vl / 3) * 3)
5085 return AARCH64_SV_MUL3;
5086 /* These would only trigger for non-power-of-2 lengths. */
5087 if (vl == (max_vl & -4))
5088 return AARCH64_SV_MUL4;
5089 if (vl == (1 << floor_log2 (max_vl)))
5090 return AARCH64_SV_POW2;
5091 if (vl == max_vl)
5092 return AARCH64_SV_ALL;
5094 return AARCH64_NUM_SVPATTERNS;
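/* Some illustrative mappings, assuming VL does not exceed the number
   of elements in PRED_MODE: -1 gives AARCH64_SV_ALL, 1..8 map to
   AARCH64_SV_VL1..AARCH64_SV_VL8, and a power of two such as 64 maps
   to AARCH64_SV_VL64.  Other lengths are only matched when the number
   of elements is known at compile time (the MUL3, MUL4, POW2 and ALL
   cases).  */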
5097 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5098 bits has the lowest bit set and the upper bits clear. This is the
5099 VNx16BImode equivalent of a PTRUE for controlling elements of
5100 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5101 all bits are significant, even the upper zeros. */
5104 aarch64_ptrue_all (unsigned int elt_size)
5106 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5107 builder.quick_push (const1_rtx);
5108 for (unsigned int i = 1; i < elt_size; ++i)
5109 builder.quick_push (const0_rtx);
5110 return builder.build ();
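/* For example, aarch64_ptrue_all (2) builds the VNx16BI constant
   { 1, 0, 1, 0, ... }, the all-bits-significant image of a PTRUE for
   2-byte (.H) elements.  */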
5113 /* Return an all-true predicate register of mode MODE. */
5116 aarch64_ptrue_reg (machine_mode mode)
5118 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5119 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5120 return gen_lowpart (mode, reg);
5123 /* Return an all-false predicate register of mode MODE. */
5126 aarch64_pfalse_reg (machine_mode mode)
5128 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5129 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5130 return gen_lowpart (mode, reg);
5133 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5134 for it. PRED2[0] is the predicate for the instruction whose result
5135 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5136 for it. Return true if we can prove that the two predicates are
5137 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5138 with PRED1[0] without changing behavior. */
5140 bool
5141 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5143 machine_mode mode = GET_MODE (pred1[0]);
5144 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5145 && mode == GET_MODE (pred2[0])
5146 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5147 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5149 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5150 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5151 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5152 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5153 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5156 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
5157 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5158 Use TARGET as the target register if nonnull and convenient. */
5160 static rtx
5161 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5162 machine_mode data_mode, rtx op1, rtx op2)
5164 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5165 expand_operand ops[5];
5166 create_output_operand (&ops[0], target, pred_mode);
5167 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5168 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5169 create_input_operand (&ops[3], op1, data_mode);
5170 create_input_operand (&ops[4], op2, data_mode);
5171 expand_insn (icode, 5, ops);
5172 return ops[0].value;
5175 /* Use a comparison to convert integer vector SRC into MODE, which is
5176 the corresponding SVE predicate mode. Use TARGET for the result
5177 if it's nonnull and convenient. */
5180 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5182 machine_mode src_mode = GET_MODE (src);
5183 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5184 src, CONST0_RTX (src_mode));
5187 /* Return the assembly token for svprfop value PRFOP. */
5189 static const char *
5190 svprfop_token (enum aarch64_svprfop prfop)
5192 switch (prfop)
5194 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5195 AARCH64_FOR_SVPRFOP (CASE)
5196 #undef CASE
5197 case AARCH64_NUM_SVPRFOPS:
5198 break;
5200 gcc_unreachable ();
5203 /* Return the assembly string for an SVE prefetch operation with
5204 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5205 and that SUFFIX is the format for the remaining operands. */
5207 char *
5208 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5209 const char *suffix)
5211 static char buffer[128];
5212 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5213 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5214 mnemonic, svprfop_token (prfop), suffix);
5215 gcc_assert (written < sizeof (buffer));
5216 return buffer;
5219 /* Check whether we can calculate the number of elements in PATTERN
5220 at compile time, given that there are NELTS_PER_VQ elements per
5221 128-bit block. Return the value if so, otherwise return -1. */
5223 HOST_WIDE_INT
5224 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5226 unsigned int vl, const_vg;
5227 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5228 vl = 1 + (pattern - AARCH64_SV_VL1);
5229 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5230 vl = 16 << (pattern - AARCH64_SV_VL16);
5231 else if (aarch64_sve_vg.is_constant (&const_vg))
5233 /* There are two vector granules per quadword. */
5234 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5235 switch (pattern)
5237 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5238 case AARCH64_SV_MUL4: return nelts & -4;
5239 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5240 case AARCH64_SV_ALL: return nelts;
5241 default: gcc_unreachable ();
5244 else
5245 return -1;
5247 /* There are two vector granules per quadword. */
5248 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5249 if (known_le (vl, nelts_all))
5250 return vl;
5252 /* Requesting more elements than are available results in a PFALSE. */
5253 if (known_gt (vl, nelts_all))
5254 return 0;
5256 return -1;
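/* A couple of illustrative cases, not exhaustive: with a variable
   vector length, AARCH64_SV_VL4 with four elements per quadword folds
   to 4, since at least one 128-bit quadword is always available,
   whereas AARCH64_SV_MUL3 only folds when the vector length is fixed,
   e.g. to 6 for 256-bit vectors with four elements per quadword.  */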
5259 /* Return true if we can move VALUE into a register using a single
5260 CNT[BHWD] instruction. */
5262 static bool
5263 aarch64_sve_cnt_immediate_p (poly_int64 value)
5265 HOST_WIDE_INT factor = value.coeffs[0];
5266 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5267 return (value.coeffs[1] == factor
5268 && IN_RANGE (factor, 2, 16 * 16)
5269 && (factor & 1) == 0
5270 && factor <= 16 * (factor & -factor));
5273 /* Likewise for rtx X. */
5275 bool
5276 aarch64_sve_cnt_immediate_p (rtx x)
5278 poly_int64 value;
5279 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
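/* As an illustration, assuming the usual encoding in which both
   coefficients give the element count per 128-bit quadword: (2, 2)
   corresponds to CNTD, (8, 8) to CNTH and (32, 32) to CNTB with
   MUL #2, whereas odd values and values above (256, 256), the
   equivalent of CNTB with MUL #16, are rejected.  */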
5282 /* Return the asm string for an instruction with a CNT-like vector size
5283 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5284 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5285 first part of the operands template (the part that comes before the
5286 vector size itself). PATTERN is the pattern to use. FACTOR is the
5287 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5288 in each quadword. If it is zero, we can use any element size. */
5290 static char *
5291 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5292 aarch64_svpattern pattern,
5293 unsigned int factor,
5294 unsigned int nelts_per_vq)
5296 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5298 if (nelts_per_vq == 0)
5299 /* There is some overlap in the ranges of the four CNT instructions.
5300 Here we always use the smallest possible element size, so that the
5301 multiplier is 1 wherever possible. */
5302 nelts_per_vq = factor & -factor;
5303 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5304 gcc_assert (IN_RANGE (shift, 1, 4));
5305 char suffix = "dwhb"[shift - 1];
5307 factor >>= shift;
5308 unsigned int written;
5309 if (pattern == AARCH64_SV_ALL && factor == 1)
5310 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5311 prefix, suffix, operands);
5312 else if (factor == 1)
5313 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5314 prefix, suffix, operands, svpattern_token (pattern));
5315 else
5316 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5317 prefix, suffix, operands, svpattern_token (pattern),
5318 factor);
5319 gcc_assert (written < sizeof (buffer));
5320 return buffer;
5323 /* Return the asm string for an instruction with a CNT-like vector size
5324 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5325 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5326 first part of the operands template (the part that comes before the
5327 vector size itself). X is the value of the vector size operand,
5328 as a polynomial integer rtx; we need to convert this into an "all"
5329 pattern with a multiplier. */
5331 char *
5332 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5333 rtx x)
5335 poly_int64 value = rtx_to_poly_int64 (x);
5336 gcc_assert (aarch64_sve_cnt_immediate_p (value));
5337 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5338 value.coeffs[1], 0);
5341 /* Return the asm string for an instruction with a CNT-like vector size
5342 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5343 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5344 first part of the operands template (the part that comes before the
5345 vector size itself). CNT_PAT[0..2] are the operands of the
5346 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5348 char *
5349 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5350 const char *operands, rtx *cnt_pat)
5352 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5353 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5354 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5355 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5356 factor, nelts_per_vq);
5359 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5361 bool
5362 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5364 poly_int64 value;
5365 return (poly_int_rtx_p (x, &value)
5366 && (aarch64_sve_cnt_immediate_p (value)
5367 || aarch64_sve_cnt_immediate_p (-value)));
5370 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5371 operand 0. */
5373 char *
5374 aarch64_output_sve_scalar_inc_dec (rtx offset)
5376 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5377 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5378 if (offset_value.coeffs[1] > 0)
5379 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5380 offset_value.coeffs[1], 0);
5381 else
5382 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5383 -offset_value.coeffs[1], 0);
5386 /* Return true if we can add VALUE to a register using a single ADDVL
5387 or ADDPL instruction. */
5389 static bool
5390 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5392 HOST_WIDE_INT factor = value.coeffs[0];
5393 if (factor == 0 || value.coeffs[1] != factor)
5394 return false;
5395 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5396 and a value of 16 is one vector width. */
5397 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5398 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5401 /* Likewise for rtx X. */
5403 bool
5404 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5406 poly_int64 value;
5407 return (poly_int_rtx_p (x, &value)
5408 && aarch64_sve_addvl_addpl_immediate_p (value));
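/* In other words, the offset must be a multiple of 16 in the range of
   -32 to 31 vector lengths (ADDVL) or an even value in the range of
   -32 to 31 predicate lengths (ADDPL); e.g. a factor of 48 becomes
   ADDVL #3 and a factor of -6 becomes ADDPL #-3.  */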
5411 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5412 to operand 1 and storing the result in operand 0. */
5414 char *
5415 aarch64_output_sve_addvl_addpl (rtx offset)
5417 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5418 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5419 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5421 int factor = offset_value.coeffs[1];
5422 if ((factor & 15) == 0)
5423 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5424 else
5425 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5426 return buffer;
5429 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5430 instruction. If it is, store the number of elements in each vector
5431 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5432 factor in *FACTOR_OUT (if nonnull). */
5434 bool
5435 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5436 unsigned int *nelts_per_vq_out)
5438 rtx elt;
5439 poly_int64 value;
5441 if (!const_vec_duplicate_p (x, &elt)
5442 || !poly_int_rtx_p (elt, &value))
5443 return false;
5445 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5446 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5447 /* There's no vector INCB. */
5448 return false;
5450 HOST_WIDE_INT factor = value.coeffs[0];
5451 if (value.coeffs[1] != factor)
5452 return false;
5454 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5455 if ((factor % nelts_per_vq) != 0
5456 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5457 return false;
5459 if (factor_out)
5460 *factor_out = factor;
5461 if (nelts_per_vq_out)
5462 *nelts_per_vq_out = nelts_per_vq;
5463 return true;
5466 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5467 instruction. */
5469 bool
5470 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5472 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5475 /* Return the asm template for an SVE vector INC or DEC instruction.
5476 OPERANDS gives the operands before the vector count and X is the
5477 value of the vector count operand itself. */
5479 char *
5480 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5482 int factor;
5483 unsigned int nelts_per_vq;
5484 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5485 gcc_unreachable ();
5486 if (factor < 0)
5487 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5488 -factor, nelts_per_vq);
5489 else
5490 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5491 factor, nelts_per_vq);
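/* For example, a duplicated (8, 8) added to a vector of 16-bit
   elements prints as "inch" plus the operands, subtracting it prints
   as "dech", and a duplicated (16, 16) with the same element size
   appends ", all, mul #2".  */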
5494 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5496 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5498 0x0000000100000001ull,
5499 0x0001000100010001ull,
5500 0x0101010101010101ull,
5501 0x1111111111111111ull,
5502 0x5555555555555555ull,
5507 /* Return true if 64-bit VAL is a valid bitmask immediate. */
5508 static bool
5509 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
5511 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
5512 int bits;
5514 /* Check for a single sequence of one bits and return quickly if so.
5515 The special cases of all ones and all zeroes return false. */
5516 tmp = val + (val & -val);
5518 if (tmp == (tmp & -tmp))
5519 return (val + 1) > 1;
5521 /* Invert if the immediate doesn't start with a zero bit - this means we
5522 only need to search for sequences of one bits. */
5523 if (val & 1)
5524 val = ~val;
5526 /* Find the first set bit and set tmp to val with the first sequence of one
5527 bits removed. Return success if there is a single sequence of ones. */
5528 first_one = val & -val;
5529 tmp = val & (val + first_one);
5531 if (tmp == 0)
5532 return true;
5534 /* Find the next set bit and compute the difference in bit position. */
5535 next_one = tmp & -tmp;
5536 bits = clz_hwi (first_one) - clz_hwi (next_one);
5537 mask = val ^ tmp;
5539 /* Check the bit position difference is a power of 2, and that the first
5540 sequence of one bits fits within 'bits' bits. */
5541 if ((mask >> bits) != 0 || bits != (bits & -bits))
5542 return false;
5544 /* Check the sequence of one bits is repeated 64/bits times. */
5545 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
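/* Some sample values for the test above: 0x5555555555555555 (a 1-bit
   run repeated every 2 bits), 0x0000ffff0000ffff (a 16-bit run
   repeated every 32 bits) and 0x0003fffffffc0000 (a single run of
   ones) are all valid bitmask immediates, whereas 0, ~0 and
   0x0000000012345678 are not.  */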
5549 /* Return true if VAL is a valid bitmask immediate for MODE. */
5550 bool
5551 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5553 if (mode == DImode)
5554 return aarch64_bitmask_imm (val);
5556 if (mode == SImode)
5557 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
5559 /* Replicate small immediates to fit 64 bits. */
5560 int size = GET_MODE_UNIT_PRECISION (mode);
5561 val &= (HOST_WIDE_INT_1U << size) - 1;
5562 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
5564 return aarch64_bitmask_imm (val);
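/* Illustrative example: an SImode VAL of 0x0000ffff is widened to
   0x0000ffff0000ffff before the 64-bit check, which accepts it (16 ones
   per 32-bit element).  For a narrower mode such as HImode, a value like
   0x00f0 is replicated via bitmask_imm_mul[__builtin_clz (16) - 26] into
   0x00f000f000f000f0 and then checked in the same way.  */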
5568 /* Return true if the immediate VAL can be a bitmask immediate
5569 by changing the given MASK bits in VAL to zeroes, ones or bits
5570 from the other half of VAL. Return the new immediate in VAL2. */
5571 static inline bool
5572 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
5573 unsigned HOST_WIDE_INT &val2,
5574 unsigned HOST_WIDE_INT mask)
5576 val2 = val & ~mask;
5577 if (val2 != val && aarch64_bitmask_imm (val2))
5578 return true;
5579 val2 = val | mask;
5580 if (val2 != val && aarch64_bitmask_imm (val2))
5581 return true;
5582 val = val & ~mask;
5583 val2 = val | (((val >> 32) | (val << 32)) & mask);
5584 if (val2 != val && aarch64_bitmask_imm (val2))
5585 return true;
5586 val2 = val | (((val >> 16) | (val << 48)) & mask);
5587 if (val2 != val && aarch64_bitmask_imm (val2))
5588 return true;
5589 return false;
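/* Illustrative example: for VAL = 0x1234ffffffffffff and MASK = 0xffff << 48,
   clearing the masked bits gives VAL2 = 0x0000ffffffffffff, a valid bitmask
   immediate (48 consecutive ones), so the caller can materialize VAL with a
   bitmask MOV of VAL2 followed by MOVK #0x1234, LSL #48.  */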
5593 /* Return true if VAL is a valid MOVZ immediate. */
5594 static inline bool
5595 aarch64_is_movz (unsigned HOST_WIDE_INT val)
5597 return (val >> (ctz_hwi (val) & 48)) < 65536;
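/* Illustrative examples: 0x0000000000350000 is a MOVZ immediate, since
   ctz_hwi is 16 and the value shifted right by 16 is 0x35 < 65536
   (MOVZ Xn, #0x35, LSL #16).  0x0000000000350001 is not, because its
   nonzero low 16-bit chunk forces a shift of 0 and the value is too big.  */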
5601 /* Return true if immediate VAL can be created by a 64-bit MOVZ, MOVN or bitmask MOV. */
5602 bool
5603 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
5605 return aarch64_is_movz (val) || aarch64_is_movz (~val)
5606 || aarch64_bitmask_imm (val);
5610 /* Return true if VAL is an immediate that can be created by a single
5611 MOV instruction. */
5612 bool
5613 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5615 gcc_assert (mode == SImode || mode == DImode);
5617 if (val < 65536)
5618 return true;
5620 unsigned HOST_WIDE_INT mask =
5621 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
5623 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
5624 return true;
5626 val = (val & mask) | ((val << 32) & ~mask);
5627 return aarch64_bitmask_imm (val);
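/* Illustrative DImode examples: 0xffff0000ffff0000 is a single instruction,
   since it is a valid bitmask immediate (the ORR form of MOV), whereas
   0x0000000000012345 is neither a MOVZ, MOVN nor bitmask immediate and so
   needs at least two instructions (e.g. MOVZ #0x2345; MOVK #0x1, LSL #16),
   making the function return false for it.  */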
5631 static int
5632 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5633 machine_mode mode)
5635 int i;
5636 unsigned HOST_WIDE_INT val, val2, mask;
5637 int one_match, zero_match;
5638 int num_insns;
5640 gcc_assert (mode == SImode || mode == DImode);
5642 val = INTVAL (imm);
5644 if (aarch64_move_imm (val, mode))
5646 if (generate)
5647 emit_insn (gen_rtx_SET (dest, imm));
5648 return 1;
5651 if ((val >> 32) == 0 || mode == SImode)
5653 if (generate)
5655 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5656 if (mode == SImode)
5657 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5658 GEN_INT ((val >> 16) & 0xffff)));
5659 else
5660 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5661 GEN_INT ((val >> 16) & 0xffff)));
5663 return 2;
5666 /* Remaining cases are all for DImode. */
5668 mask = 0xffff;
5669 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5670 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5671 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5672 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5674 /* Try a bitmask immediate and a movk to generate the immediate
5675 in 2 instructions. */
5677 if (zero_match < 2 && one_match < 2)
5679 for (i = 0; i < 64; i += 16)
5681 if (aarch64_check_bitmask (val, val2, mask << i))
5682 break;
5684 val2 = val & ~(mask << i);
5685 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
5686 break;
5689 if (i != 64)
5691 if (generate)
5693 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5694 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5695 GEN_INT ((val >> i) & 0xffff)));
5697 return 2;
5701 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
5702 if (zero_match + one_match == 0)
5704 for (i = 0; i < 48; i += 16)
5705 for (int j = i + 16; j < 64; j += 16)
5706 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
5708 if (generate)
5710 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5711 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5712 GEN_INT ((val >> i) & 0xffff)));
5713 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
5714 GEN_INT ((val >> j) & 0xffff)));
5716 return 3;
5720 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5721 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5722 otherwise skip zero bits. */
5724 num_insns = 1;
5725 mask = 0xffff;
5726 val2 = one_match > zero_match ? ~val : val;
5727 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5729 if (generate)
5730 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5731 ? (val | ~(mask << i))
5732 : (val & (mask << i)))));
5733 for (i += 16; i < 64; i += 16)
5735 if ((val2 & (mask << i)) == 0)
5736 continue;
5737 if (generate)
5738 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5739 GEN_INT ((val >> i) & 0xffff)));
5740 num_insns ++;
5743 return num_insns;
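/* Illustrative example: for SImode IMM = 0x12345678, which is not a
   single-instruction immediate, the code above emits a move of #0x5678
   followed by a 16-bit insert of #0x1234 at position 16
   (MOV w0, #0x5678; MOVK w0, #0x1234, LSL #16) and returns 2.  */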
5746 /* Return whether imm is a 128-bit immediate which is simple enough to
5747 expand inline. */
5748 bool
5749 aarch64_mov128_immediate (rtx imm)
5751 if (CONST_INT_P (imm))
5752 return true;
5754 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5756 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5757 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5759 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5760 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5764 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5765 a left shift of 0 or 12 bits. */
5766 bool
5767 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
5769 return val < 4096 || (val & 0xfff000) == val;
5772 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5773 that can be created with a left shift of 0 or 12. */
5774 static HOST_WIDE_INT
5775 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
5777 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5778 handle correctly. */
5779 gcc_assert (val < 0x1000000);
5781 if (val < 4096)
5782 return val;
5784 return val & 0xfff000;
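/* Illustrative examples: 0xabc (shift 0) and 0x123000 (shift 12) are valid
   uimm12 immediates, whereas 0x1001f is not, since its nonzero bits do not
   fit a single 12-bit field at either shift; aarch64_clamp_to_uimm12_shift
   rounds it down to 0x10000.  */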
5788 /* Test whether:
5790 X = (X & AND_VAL) | IOR_VAL;
5792 can be implemented using:
5794 MOVK X, #(IOR_VAL >> shift), LSL #shift
5796 Return the shift if so, otherwise return -1. */
5797 int
5798 aarch64_movk_shift (const wide_int_ref &and_val,
5799 const wide_int_ref &ior_val)
5801 unsigned int precision = and_val.get_precision ();
5802 unsigned HOST_WIDE_INT mask = 0xffff;
5803 for (unsigned int shift = 0; shift < precision; shift += 16)
5805 if (and_val == ~mask && (ior_val & mask) == ior_val)
5806 return shift;
5807 mask <<= 16;
5809 return -1;
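/* Illustrative example: with 64-bit operands, AND_VAL = 0xffffffff0000ffff
   and IOR_VAL = 0x0000000012340000 match at shift 16, since AND_VAL is the
   complement of 0xffff << 16 and IOR_VAL lies entirely within that field,
   so the combined operation is a single MOVK Xn, #0x1234, LSL #16.  */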
5812 /* Create a mask of ones covering the range from the lowest set bit
5813 to the highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
5815 unsigned HOST_WIDE_INT
5816 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5818 int lowest_bit_set = ctz_hwi (val_in);
5819 int highest_bit_set = floor_log2 (val_in);
5820 gcc_assert (val_in != 0);
5822 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5823 (HOST_WIDE_INT_1U << lowest_bit_set));
5826 /* Create a constant in which the bits outside the range from the lowest
5827 set bit to the highest set bit of VAL_IN are set to 1. */
5829 unsigned HOST_WIDE_INT
5830 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5832 return val_in | ~aarch64_and_split_imm1 (val_in);
5835 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5837 bool
5838 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5840 scalar_int_mode int_mode;
5841 if (!is_a <scalar_int_mode> (mode, &int_mode))
5842 return false;
5844 if (aarch64_bitmask_imm (val_in, int_mode))
5845 return false;
5847 if (aarch64_move_imm (val_in, int_mode))
5848 return false;
5850 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5852 return aarch64_bitmask_imm (imm2, int_mode);
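/* Illustrative example: VAL_IN = 0x0000ff0000ff0000 is neither a bitmask
   nor a MOV immediate, but imm1 = 0x0000ffffffff0000 (ones from bit 16 to
   bit 47) and imm2 = 0xffffff0000ffffff are both bitmask immediates and
   imm1 & imm2 equals VAL_IN, so a single AND with VAL_IN can be split into
   two ANDs with imm1 and imm2.  */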
5855 /* Return the number of temporary registers that aarch64_add_offset_1
5856 would need to add OFFSET to a register. */
5858 static unsigned int
5859 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5861 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5864 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5865 a non-polynomial OFFSET. MODE is the mode of the addition.
5866 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5867 be set and CFA adjustments added to the generated instructions.
5869 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5870 temporary if register allocation is already complete. This temporary
5871 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5872 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5873 the immediate again.
5875 Since this function may be used to adjust the stack pointer, we must
5876 ensure that it cannot cause transient stack deallocation (for example
5877 by first incrementing SP and then decrementing when adjusting by a
5878 large immediate). */
5880 static void
5881 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5882 rtx src, HOST_WIDE_INT offset, rtx temp1,
5883 bool frame_related_p, bool emit_move_imm)
5885 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5886 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5888 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5889 rtx_insn *insn;
5891 if (!moffset)
5893 if (!rtx_equal_p (dest, src))
5895 insn = emit_insn (gen_rtx_SET (dest, src));
5896 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5898 return;
5901 /* Single instruction adjustment. */
5902 if (aarch64_uimm12_shift (moffset))
5904 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5905 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5906 return;
5909 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
5910 and either:
5912 a) the offset cannot be loaded by a 16-bit move or
5913 b) there is no spare register into which we can move it. */
5914 if (moffset < 0x1000000
5915 && ((!temp1 && !can_create_pseudo_p ())
5916 || !aarch64_move_imm (moffset, mode)))
5918 HOST_WIDE_INT low_off = moffset & 0xfff;
5920 low_off = offset < 0 ? -low_off : low_off;
5921 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
5922 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5923 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
5924 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5925 return;
5928 /* Emit a move immediate if required and an addition/subtraction. */
5929 if (emit_move_imm)
5931 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
5932 temp1 = aarch64_force_temporary (mode, temp1,
5933 gen_int_mode (moffset, mode));
5935 insn = emit_insn (offset < 0
5936 ? gen_sub3_insn (dest, src, temp1)
5937 : gen_add3_insn (dest, src, temp1));
5938 if (frame_related_p)
5940 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5941 rtx adj = plus_constant (mode, src, offset);
5942 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
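/* Illustrative example: OFFSET = 0x123456 is below 1 << 24 but is not a
   single MOV immediate, so the addition is split into two immediate
   additions: ADD dest, src, #0x456 followed by ADD dest, dest, #0x123000,
   both of which satisfy aarch64_uimm12_shift.  */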
5946 /* Return the number of temporary registers that aarch64_add_offset
5947 would need to move OFFSET into a register or add OFFSET to a register;
5948 ADD_P is true if we want the latter rather than the former. */
5950 static unsigned int
5951 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
5953 /* This follows the same structure as aarch64_add_offset. */
5954 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
5955 return 0;
5957 unsigned int count = 0;
5958 HOST_WIDE_INT factor = offset.coeffs[1];
5959 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5960 poly_int64 poly_offset (factor, factor);
5961 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5962 /* Need one register for the ADDVL/ADDPL result. */
5963 count += 1;
5964 else if (factor != 0)
5966 factor = abs (factor);
5967 if (factor > 16 * (factor & -factor))
5968 /* Need one register for the CNT result and one for the multiplication
5969 factor. If necessary, the second temporary can be reused for the
5970 constant part of the offset. */
5971 return 2;
5972 /* Need one register for the CNT result (which might then
5973 be shifted). */
5974 count += 1;
5976 return count + aarch64_add_offset_1_temporaries (constant);
5979 /* If X can be represented as a poly_int64, return the number
5980 of temporaries that are required to add it to a register.
5981 Return -1 otherwise. */
5983 int
5984 aarch64_add_offset_temporaries (rtx x)
5986 poly_int64 offset;
5987 if (!poly_int_rtx_p (x, &offset))
5988 return -1;
5989 return aarch64_offset_temporaries (true, offset);
5992 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
5993 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5994 be set and CFA adjustments added to the generated instructions.
5996 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5997 temporary if register allocation is already complete. This temporary
5998 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
5999 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
6000 false to avoid emitting the immediate again.
6002 TEMP2, if nonnull, is a second temporary register that doesn't
6003 overlap either DEST or SRC.
6005 Since this function may be used to adjust the stack pointer, we must
6006 ensure that it cannot cause transient stack deallocation (for example
6007 by first incrementing SP and then decrementing when adjusting by a
6008 large immediate). */
6010 static void
6011 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6012 poly_int64 offset, rtx temp1, rtx temp2,
6013 bool frame_related_p, bool emit_move_imm = true)
6015 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
6016 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
6017 gcc_assert (temp1 == NULL_RTX
6018 || !frame_related_p
6019 || !reg_overlap_mentioned_p (temp1, dest));
6020 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
6022 /* Try using ADDVL or ADDPL to add the whole value. */
6023 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
6025 rtx offset_rtx = gen_int_mode (offset, mode);
6026 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6027 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6028 return;
6031 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
6032 SVE vector register, over and above the minimum size of 128 bits.
6033 This is equivalent to half the value returned by CNTD with a
6034 vector shape of ALL. */
6035 HOST_WIDE_INT factor = offset.coeffs[1];
6036 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6038 /* Try using ADDVL or ADDPL to add the VG-based part. */
6039 poly_int64 poly_offset (factor, factor);
6040 if (src != const0_rtx
6041 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6043 rtx offset_rtx = gen_int_mode (poly_offset, mode);
6044 if (frame_related_p)
6046 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6047 RTX_FRAME_RELATED_P (insn) = true;
6048 src = dest;
6050 else
6052 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
6053 src = aarch64_force_temporary (mode, temp1, addr);
6054 temp1 = temp2;
6055 temp2 = NULL_RTX;
6058 /* Otherwise use a CNT-based sequence. */
6059 else if (factor != 0)
6061 /* Use a subtraction if we have a negative factor. */
6062 rtx_code code = PLUS;
6063 if (factor < 0)
6065 factor = -factor;
6066 code = MINUS;
6069 /* Calculate CNTD * FACTOR / 2. First try to fold the division
6070 into the multiplication. */
6071 rtx val;
6072 int shift = 0;
6073 if (factor & 1)
6074 /* Use a right shift by 1. */
6075 shift = -1;
6076 else
6077 factor /= 2;
6078 HOST_WIDE_INT low_bit = factor & -factor;
6079 if (factor <= 16 * low_bit)
6081 if (factor > 16 * 8)
6083 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
6084 the value with the minimum multiplier and shift it into
6085 position. */
6086 int extra_shift = exact_log2 (low_bit);
6087 shift += extra_shift;
6088 factor >>= extra_shift;
6090 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
6092 else
6094 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
6095 directly, since that should increase the chances of being
6096 able to use a shift and add sequence. If LOW_BIT itself
6097 is out of range, just use CNTD. */
6098 if (low_bit <= 16 * 8)
6099 factor /= low_bit;
6100 else
6101 low_bit = 1;
6103 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
6104 val = aarch64_force_temporary (mode, temp1, val);
6106 if (can_create_pseudo_p ())
6108 rtx coeff1 = gen_int_mode (factor, mode);
6109 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
6111 else
6113 /* Go back to using a negative multiplication factor if we have
6114 no register from which to subtract. */
6115 if (code == MINUS && src == const0_rtx)
6117 factor = -factor;
6118 code = PLUS;
6120 rtx coeff1 = gen_int_mode (factor, mode);
6121 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
6122 val = gen_rtx_MULT (mode, val, coeff1);
6126 if (shift > 0)
6128 /* Multiply by 1 << SHIFT. */
6129 val = aarch64_force_temporary (mode, temp1, val);
6130 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
6132 else if (shift == -1)
6134 /* Divide by 2. */
6135 val = aarch64_force_temporary (mode, temp1, val);
6136 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
6139 /* Calculate SRC +/- CNTD * FACTOR / 2. */
6140 if (src != const0_rtx)
6142 val = aarch64_force_temporary (mode, temp1, val);
6143 val = gen_rtx_fmt_ee (code, mode, src, val);
6145 else if (code == MINUS)
6147 val = aarch64_force_temporary (mode, temp1, val);
6148 val = gen_rtx_NEG (mode, val);
6151 if (constant == 0 || frame_related_p)
6153 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
6154 if (frame_related_p)
6156 RTX_FRAME_RELATED_P (insn) = true;
6157 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6158 gen_rtx_SET (dest, plus_constant (Pmode, src,
6159 poly_offset)));
6161 src = dest;
6162 if (constant == 0)
6163 return;
6165 else
6167 src = aarch64_force_temporary (mode, temp1, val);
6168 temp1 = temp2;
6169 temp2 = NULL_RTX;
6172 emit_move_imm = true;
6175 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
6176 frame_related_p, emit_move_imm);
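/* Illustrative example: adding the poly_int64 offset (0, 16), i.e.
   16 * (VQ - 1) bytes, decomposes into factor = 16 and constant = -16.
   The (16, 16) part is one vector length and hence an ADDVL immediate,
   so the emitted sequence is roughly ADDVL dest, src, #1 followed by
   SUB dest, dest, #16.  Factors outside the ADDVL/ADDPL range instead
   use the CNT-based multiplication and shift sequence above.  */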
6179 /* Like aarch64_add_offset, but the offset is given as an rtx rather
6180 than a poly_int64. */
6182 void
6183 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6184 rtx offset_rtx, rtx temp1, rtx temp2)
6186 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
6187 temp1, temp2, false);
6190 /* Add DELTA to the stack pointer, marking the instructions frame-related.
6191 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
6192 if TEMP1 already contains abs (DELTA). */
6194 static inline void
6195 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
6197 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
6198 temp1, temp2, true, emit_move_imm);
6201 /* Subtract DELTA from the stack pointer, marking the instructions
6202 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
6203 if nonnull. */
6205 static inline void
6206 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
6207 bool emit_move_imm = true)
6209 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
6210 temp1, temp2, frame_related_p, emit_move_imm);
6213 /* Set DEST to (vec_series BASE STEP). */
6215 static void
6216 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6218 machine_mode mode = GET_MODE (dest);
6219 scalar_mode inner = GET_MODE_INNER (mode);
6221 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6222 if (!aarch64_sve_index_immediate_p (base))
6223 base = force_reg (inner, base);
6224 if (!aarch64_sve_index_immediate_p (step))
6225 step = force_reg (inner, step);
6227 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
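/* Illustrative example: the series with BASE = 0 and STEP = 1 maps directly
   to INDEX Zd.<T>, #0, #1, since both operands are in [-16, 15].  A step
   such as 20 is out of that range and is first forced into a scalar
   register, giving e.g. INDEX Zd.S, #0, Wm instead.  */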
6230 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6231 register of mode MODE. Use TARGET for the result if it's nonnull
6232 and convenient.
6234 The two vector modes must have the same element mode. The behavior
6235 is to duplicate architectural lane N of SRC into architectural lanes
6236 N + I * STEP of the result. On big-endian targets, architectural
6237 lane 0 of an Advanced SIMD vector is the last element of the vector
6238 in memory layout, so for big-endian targets this operation has the
6239 effect of reversing SRC before duplicating it. Callers need to
6240 account for this. */
6242 rtx
6243 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6245 machine_mode src_mode = GET_MODE (src);
6246 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6247 insn_code icode = (BYTES_BIG_ENDIAN
6248 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6249 : code_for_aarch64_vec_duplicate_vq_le (mode));
6251 unsigned int i = 0;
6252 expand_operand ops[3];
6253 create_output_operand (&ops[i++], target, mode);
6254 create_output_operand (&ops[i++], src, src_mode);
6255 if (BYTES_BIG_ENDIAN)
6257 /* Create a PARALLEL describing the reversal of SRC. */
6258 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6259 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6260 nelts_per_vq - 1, -1);
6261 create_fixed_operand (&ops[i++], sel);
6263 expand_insn (icode, i, ops);
6264 return ops[0].value;
6267 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6268 the memory image into DEST. Return true on success. */
6270 static bool
6271 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6273 src = force_const_mem (GET_MODE (src), src);
6274 if (!src)
6275 return false;
6277 /* Make sure that the address is legitimate. */
6278 if (!aarch64_sve_ld1rq_operand_p (src))
6280 rtx addr = force_reg (Pmode, XEXP (src, 0));
6281 src = replace_equiv_address (src, addr);
6284 machine_mode mode = GET_MODE (dest);
6285 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6286 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6287 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6288 return true;
6291 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6292 by N "background" values. Try to move it into TARGET using:
6294 PTRUE PRED.<T>, VL<N>
6295 MOV TRUE.<T>, #<foreground>
6296 MOV FALSE.<T>, #<background>
6297 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6299 The PTRUE is always a single instruction but the MOVs might need a
6300 longer sequence. If the background value is zero (as it often is),
6301 the sequence can sometimes collapse to a PTRUE followed by a
6302 zero-predicated move.
6304 Return the target on success, otherwise return null. */
6306 static rtx
6307 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6309 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6311 /* Make sure that the PTRUE is valid. */
6312 machine_mode mode = GET_MODE (src);
6313 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6314 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6315 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6316 == AARCH64_NUM_SVPATTERNS)
6317 return NULL_RTX;
6319 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6320 rtx_vector_builder true_builder (mode, npatterns, 1);
6321 rtx_vector_builder false_builder (mode, npatterns, 1);
6322 for (unsigned int i = 0; i < npatterns; ++i)
6324 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6325 pred_builder.quick_push (CONST1_RTX (BImode));
6327 for (unsigned int i = 0; i < npatterns; ++i)
6329 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6330 pred_builder.quick_push (CONST0_RTX (BImode));
6332 expand_operand ops[4];
6333 create_output_operand (&ops[0], target, mode);
6334 create_input_operand (&ops[1], true_builder.build (), mode);
6335 create_input_operand (&ops[2], false_builder.build (), mode);
6336 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6337 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6338 return target;
6341 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6342 SVE data mode and isn't a legitimate constant. Use TARGET for the
6343 result if convenient.
6345 The returned register can have whatever mode seems most natural
6346 given the contents of SRC. */
6348 static rtx
6349 aarch64_expand_sve_const_vector (rtx target, rtx src)
6351 machine_mode mode = GET_MODE (src);
6352 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6353 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6354 scalar_mode elt_mode = GET_MODE_INNER (mode);
6355 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6356 unsigned int container_bits = aarch64_sve_container_bits (mode);
6357 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6359 if (nelts_per_pattern == 1
6360 && encoded_bits <= 128
6361 && container_bits != elt_bits)
6363 /* We have a partial vector mode and a constant whose full-vector
6364 equivalent would occupy a repeating 128-bit sequence. Build that
6365 full-vector equivalent instead, so that we have the option of
6366 using LD1RQ and Advanced SIMD operations. */
6367 unsigned int repeat = container_bits / elt_bits;
6368 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6369 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6370 for (unsigned int i = 0; i < npatterns; ++i)
6371 for (unsigned int j = 0; j < repeat; ++j)
6372 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6373 target = aarch64_target_reg (target, full_mode);
6374 return aarch64_expand_sve_const_vector (target, builder.build ());
6377 if (nelts_per_pattern == 1 && encoded_bits == 128)
6379 /* The constant is a duplicated quadword but can't be narrowed
6380 beyond a quadword. Get the memory image of the first quadword
6381 as a 128-bit vector and try using LD1RQ to load it from memory.
6383 The effect for both endiannesses is to load memory lane N into
6384 architectural lanes N + I * STEP of the result. On big-endian
6385 targets, the layout of the 128-bit vector in an Advanced SIMD
6386 register would be different from its layout in an SVE register,
6387 but this 128-bit vector is a memory value only. */
6388 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6389 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6390 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6391 return target;
6394 if (nelts_per_pattern == 1 && encoded_bits < 128)
6396 /* The vector is a repeating sequence of 64 bits or fewer.
6397 See if we can load them using an Advanced SIMD move and then
6398 duplicate it to fill a vector. This is better than using a GPR
6399 move because it keeps everything in the same register file. */
6400 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6401 rtx_vector_builder builder (vq_mode, npatterns, 1);
6402 for (unsigned int i = 0; i < npatterns; ++i)
6404 /* We want memory lane N to go into architectural lane N,
6405 so reverse for big-endian targets. The DUP .Q pattern
6406 has a compensating reverse built-in. */
6407 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6408 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6410 rtx vq_src = builder.build ();
6411 if (aarch64_simd_valid_immediate (vq_src, NULL))
6413 vq_src = force_reg (vq_mode, vq_src);
6414 return aarch64_expand_sve_dupq (target, mode, vq_src);
6417 /* Get an integer representation of the repeating part of Advanced
6418 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6419 which for big-endian targets is lane-swapped wrt a normal
6420 Advanced SIMD vector. This means that for both endiannesses,
6421 memory lane N of SVE vector SRC corresponds to architectural
6422 lane N of a register holding VQ_SRC. This in turn means that
6423 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6424 as a single 128-bit value) and thus that memory lane 0 of SRC is
6425 in the lsb of the integer. Duplicating the integer therefore
6426 ensures that memory lane N of SRC goes into architectural lane
6427 N + I * INDEX of the SVE register. */
6428 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6429 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6430 if (elt_value)
6432 /* Pretend that we had a vector of INT_MODE to start with. */
6433 elt_mode = int_mode;
6434 mode = aarch64_full_sve_mode (int_mode).require ();
6436 /* If the integer can be moved into a general register by a
6437 single instruction, do that and duplicate the result. */
6438 if (CONST_INT_P (elt_value)
6439 && aarch64_move_imm (INTVAL (elt_value),
6440 encoded_bits <= 32 ? SImode : DImode))
6442 elt_value = force_reg (elt_mode, elt_value);
6443 return expand_vector_broadcast (mode, elt_value);
6446 else if (npatterns == 1)
6447 /* We're duplicating a single value, but can't do better than
6448 force it to memory and load from there. This handles things
6449 like symbolic constants. */
6450 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6452 if (elt_value)
6454 /* Load the element from memory if we can, otherwise move it into
6455 a register and use a DUP. */
6456 rtx op = force_const_mem (elt_mode, elt_value);
6457 if (!op)
6458 op = force_reg (elt_mode, elt_value);
6459 return expand_vector_broadcast (mode, op);
6463 /* Try using INDEX. */
6464 rtx base, step;
6465 if (const_vec_series_p (src, &base, &step))
6467 aarch64_expand_vec_series (target, base, step);
6468 return target;
6471 /* From here on, it's better to force the whole constant to memory
6472 if we can. */
6473 if (GET_MODE_NUNITS (mode).is_constant ())
6474 return NULL_RTX;
6476 if (nelts_per_pattern == 2)
6477 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6478 return res;
6480 /* Expand each pattern individually. */
6481 gcc_assert (npatterns > 1);
6482 rtx_vector_builder builder;
6483 auto_vec<rtx, 16> vectors (npatterns);
6484 for (unsigned int i = 0; i < npatterns; ++i)
6486 builder.new_vector (mode, 1, nelts_per_pattern);
6487 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6488 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6489 vectors.quick_push (force_reg (mode, builder.build ()));
6492 /* Use permutes to interleave the separate vectors. */
6493 while (npatterns > 1)
6495 npatterns /= 2;
6496 for (unsigned int i = 0; i < npatterns; ++i)
6498 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6499 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6500 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6501 vectors[i] = tmp;
6504 gcc_assert (vectors[0] == target);
6505 return target;
6508 /* Use WHILE to set a predicate register of mode MODE in which the first
6509 VL bits are set and the rest are clear. Use TARGET for the register
6510 if it's nonnull and convenient. */
6512 static rtx
6513 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6514 unsigned int vl)
6516 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6517 target = aarch64_target_reg (target, mode);
6518 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6519 target, const0_rtx, limit));
6520 return target;
6523 static rtx
6524 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6526 /* BUILDER is a constant predicate in which the index of every set bit
6527 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6528 by inverting every element at a multiple of ELT_SIZE and EORing the
6529 result with an ELT_SIZE PTRUE.
6531 Return a register that contains the constant on success, otherwise
6532 return null. Use TARGET as the register if it is nonnull and
6533 convenient. */
6535 static rtx
6536 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6537 unsigned int elt_size)
6539 /* Invert every element at a multiple of ELT_SIZE, keeping the
6540 other bits zero. */
6541 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6542 builder.nelts_per_pattern ());
6543 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6544 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6545 inv_builder.quick_push (const1_rtx);
6546 else
6547 inv_builder.quick_push (const0_rtx);
6548 inv_builder.finalize ();
6550 /* See if we can load the constant cheaply. */
6551 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6552 if (!inv)
6553 return NULL_RTX;
6555 /* EOR the result with an ELT_SIZE PTRUE. */
6556 rtx mask = aarch64_ptrue_all (elt_size);
6557 mask = force_reg (VNx16BImode, mask);
6558 inv = gen_lowpart (VNx16BImode, inv);
6559 target = aarch64_target_reg (target, VNx16BImode);
6560 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6561 return target;
6564 /* BUILDER is a constant predicate in which the index of every set bit
6565 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6566 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6567 register on success, otherwise return null. Use TARGET as the register
6568 if nonnull and convenient. */
6570 static rtx
6571 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6572 unsigned int elt_size,
6573 unsigned int permute_size)
6575 /* We're going to split the constant into two new constants A and B,
6576 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6577 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6579 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6580 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6582 where _ indicates elements that will be discarded by the permute.
6584 First calculate the ELT_SIZEs for A and B. */
6585 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6586 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6587 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6588 if (INTVAL (builder.elt (i)) != 0)
6590 if (i & permute_size)
6591 b_elt_size |= i - permute_size;
6592 else
6593 a_elt_size |= i;
6595 a_elt_size &= -a_elt_size;
6596 b_elt_size &= -b_elt_size;
6598 /* Now construct the vectors themselves. */
6599 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6600 builder.nelts_per_pattern ());
6601 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6602 builder.nelts_per_pattern ());
6603 unsigned int nelts = builder.encoded_nelts ();
6604 for (unsigned int i = 0; i < nelts; ++i)
6605 if (i & (elt_size - 1))
6607 a_builder.quick_push (const0_rtx);
6608 b_builder.quick_push (const0_rtx);
6610 else if ((i & permute_size) == 0)
6612 /* The A and B elements are significant. */
6613 a_builder.quick_push (builder.elt (i));
6614 b_builder.quick_push (builder.elt (i + permute_size));
6616 else
6618 /* The A and B elements are going to be discarded, so pick whatever
6619 is likely to give a nice constant. We are targeting element
6620 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6621 with the aim of each being a sequence of ones followed by
6622 a sequence of zeros. So:
6624 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6625 duplicate the last X_ELT_SIZE element, to extend the
6626 current sequence of ones or zeros.
6628 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6629 zero, so that the constant really does have X_ELT_SIZE and
6630 not a smaller size. */
6631 if (a_elt_size > permute_size)
6632 a_builder.quick_push (const0_rtx);
6633 else
6634 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6635 if (b_elt_size > permute_size)
6636 b_builder.quick_push (const0_rtx);
6637 else
6638 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6640 a_builder.finalize ();
6641 b_builder.finalize ();
6643 /* Try loading A into a register. */
6644 rtx_insn *last = get_last_insn ();
6645 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6646 if (!a)
6647 return NULL_RTX;
6649 /* Try loading B into a register. */
6650 rtx b = a;
6651 if (a_builder != b_builder)
6653 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6654 if (!b)
6656 delete_insns_since (last);
6657 return NULL_RTX;
6661 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6662 operands but permutes them as though they had mode MODE. */
6663 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6664 target = aarch64_target_reg (target, GET_MODE (a));
6665 rtx type_reg = CONST0_RTX (mode);
6666 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6667 return target;
6670 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6671 constant in BUILDER into an SVE predicate register. Return the register
6672 on success, otherwise return null. Use TARGET for the register if
6673 nonnull and convenient.
6675 ALLOW_RECURSE_P is true if we can use methods that would call this
6676 function recursively. */
6678 static rtx
6679 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6680 bool allow_recurse_p)
6682 if (builder.encoded_nelts () == 1)
6683 /* A PFALSE or a PTRUE .B ALL. */
6684 return aarch64_emit_set_immediate (target, builder);
6686 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6687 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6689 /* If we can load the constant using PTRUE, use it as-is. */
6690 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6691 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6692 return aarch64_emit_set_immediate (target, builder);
6694 /* Otherwise use WHILE to set the first VL bits. */
6695 return aarch64_sve_move_pred_via_while (target, mode, vl);
6698 if (!allow_recurse_p)
6699 return NULL_RTX;
6701 /* Try inverting the vector in element size ELT_SIZE and then EORing
6702 the result with an ELT_SIZE PTRUE. */
6703 if (INTVAL (builder.elt (0)) == 0)
6704 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6705 elt_size))
6706 return res;
6708 /* Try using TRN1 to permute two simpler constants. */
6709 for (unsigned int i = elt_size; i <= 8; i *= 2)
6710 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6711 elt_size, i))
6712 return res;
6714 return NULL_RTX;
6717 /* Return an SVE predicate register that contains the VNx16BImode
6718 constant in BUILDER, without going through the move expanders.
6720 The returned register can have whatever mode seems most natural
6721 given the contents of BUILDER. Use TARGET for the result if
6722 convenient. */
6724 static rtx
6725 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6727 /* Try loading the constant using pure predicate operations. */
6728 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6729 return res;
6731 /* Try forcing the constant to memory. */
6732 if (builder.full_nelts ().is_constant ())
6733 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6735 target = aarch64_target_reg (target, VNx16BImode);
6736 emit_move_insn (target, mem);
6737 return target;
6740 /* The last resort is to load the constant as an integer and then
6741 compare it against zero. Use -1 for set bits in order to increase
6742 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6743 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6744 builder.nelts_per_pattern ());
6745 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6746 int_builder.quick_push (INTVAL (builder.elt (i))
6747 ? constm1_rtx : const0_rtx);
6748 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6749 int_builder.build ());
6752 /* Set DEST to immediate IMM. */
6754 void
6755 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6757 machine_mode mode = GET_MODE (dest);
6759 /* Check on what type of symbol it is. */
6760 scalar_int_mode int_mode;
6761 if ((SYMBOL_REF_P (imm)
6762 || LABEL_REF_P (imm)
6763 || GET_CODE (imm) == CONST
6764 || GET_CODE (imm) == CONST_POLY_INT)
6765 && is_a <scalar_int_mode> (mode, &int_mode))
6767 rtx mem;
6768 poly_int64 offset;
6769 HOST_WIDE_INT const_offset;
6770 enum aarch64_symbol_type sty;
6772 /* If we have (const (plus symbol offset)), separate out the offset
6773 before we start classifying the symbol. */
6774 rtx base = strip_offset (imm, &offset);
6776 /* We must always add an offset involving VL separately, rather than
6777 folding it into the relocation. */
6778 if (!offset.is_constant (&const_offset))
6780 if (!TARGET_SVE)
6782 aarch64_report_sve_required ();
6783 return;
6785 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6786 emit_insn (gen_rtx_SET (dest, imm));
6787 else
6789 /* Do arithmetic on 32-bit values if the result is smaller
6790 than that. */
6791 if (partial_subreg_p (int_mode, SImode))
6793 /* It is invalid to do symbol calculations in modes
6794 narrower than SImode. */
6795 gcc_assert (base == const0_rtx);
6796 dest = gen_lowpart (SImode, dest);
6797 int_mode = SImode;
6799 if (base != const0_rtx)
6801 base = aarch64_force_temporary (int_mode, dest, base);
6802 aarch64_add_offset (int_mode, dest, base, offset,
6803 NULL_RTX, NULL_RTX, false);
6805 else
6806 aarch64_add_offset (int_mode, dest, base, offset,
6807 dest, NULL_RTX, false);
6809 return;
6812 sty = aarch64_classify_symbol (base, const_offset);
6813 switch (sty)
6815 case SYMBOL_FORCE_TO_MEM:
6816 if (int_mode != ptr_mode)
6817 imm = convert_memory_address (ptr_mode, imm);
6819 if (const_offset != 0
6820 && targetm.cannot_force_const_mem (ptr_mode, imm))
6822 gcc_assert (can_create_pseudo_p ());
6823 base = aarch64_force_temporary (int_mode, dest, base);
6824 aarch64_add_offset (int_mode, dest, base, const_offset,
6825 NULL_RTX, NULL_RTX, false);
6826 return;
6829 mem = force_const_mem (ptr_mode, imm);
6830 gcc_assert (mem);
6832 /* If we aren't generating PC relative literals, then
6833 we need to expand the literal pool access carefully.
6834 This is something that needs to be done in a number
6835 of places, so could well live as a separate function. */
6836 if (!aarch64_pcrelative_literal_loads)
6838 gcc_assert (can_create_pseudo_p ());
6839 base = gen_reg_rtx (ptr_mode);
6840 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6841 if (ptr_mode != Pmode)
6842 base = convert_memory_address (Pmode, base);
6843 mem = gen_rtx_MEM (ptr_mode, base);
6846 if (int_mode != ptr_mode)
6847 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6849 emit_insn (gen_rtx_SET (dest, mem));
6851 return;
6853 case SYMBOL_SMALL_TLSGD:
6854 case SYMBOL_SMALL_TLSDESC:
6855 case SYMBOL_SMALL_TLSIE:
6856 case SYMBOL_SMALL_GOT_28K:
6857 case SYMBOL_SMALL_GOT_4G:
6858 case SYMBOL_TINY_GOT:
6859 case SYMBOL_TINY_TLSIE:
6860 if (const_offset != 0)
6862 gcc_assert (can_create_pseudo_p ());
6863 base = aarch64_force_temporary (int_mode, dest, base);
6864 aarch64_add_offset (int_mode, dest, base, const_offset,
6865 NULL_RTX, NULL_RTX, false);
6866 return;
6868 /* FALLTHRU */
6870 case SYMBOL_SMALL_ABSOLUTE:
6871 case SYMBOL_TINY_ABSOLUTE:
6872 case SYMBOL_TLSLE12:
6873 case SYMBOL_TLSLE24:
6874 case SYMBOL_TLSLE32:
6875 case SYMBOL_TLSLE48:
6876 aarch64_load_symref_appropriately (dest, imm, sty);
6877 return;
6879 default:
6880 gcc_unreachable ();
6884 if (!CONST_INT_P (imm))
6886 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6888 /* Only the low bit of each .H, .S and .D element is defined,
6889 so we can set the upper bits to whatever we like. If the
6890 predicate is all-true in MODE, prefer to set all the undefined
6891 bits as well, so that we can share a single .B predicate for
6892 all modes. */
6893 if (imm == CONSTM1_RTX (mode))
6894 imm = CONSTM1_RTX (VNx16BImode);
6896 /* All methods for constructing predicate modes wider than VNx16BI
6897 will set the upper bits of each element to zero. Expose this
6898 by moving such constants as a VNx16BI, so that all bits are
6899 significant and so that constants for different modes can be
6900 shared. The wider constant will still be available as a
6901 REG_EQUAL note. */
6902 rtx_vector_builder builder;
6903 if (aarch64_get_sve_pred_bits (builder, imm))
6905 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6906 if (dest != res)
6907 emit_move_insn (dest, gen_lowpart (mode, res));
6908 return;
6912 if (GET_CODE (imm) == HIGH
6913 || aarch64_simd_valid_immediate (imm, NULL))
6915 emit_insn (gen_rtx_SET (dest, imm));
6916 return;
6919 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6920 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6922 if (dest != res)
6923 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6924 return;
6927 rtx mem = force_const_mem (mode, imm);
6928 gcc_assert (mem);
6929 emit_move_insn (dest, mem);
6930 return;
6933 aarch64_internal_mov_immediate (dest, imm, true, mode);
6936 /* Return the MEM rtx that provides the canary value that should be used
6937 for stack-smashing protection. MODE is the mode of the memory.
6938 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6939 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6940 indicates whether the caller is performing a SET or a TEST operation. */
6942 rtx
6943 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6944 aarch64_salt_type salt_type)
6946 rtx addr;
6947 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6949 gcc_assert (MEM_P (decl_rtl));
6950 addr = XEXP (decl_rtl, 0);
6951 poly_int64 offset;
6952 rtx base = strip_offset_and_salt (addr, &offset);
6953 if (!SYMBOL_REF_P (base))
6954 return decl_rtl;
6956 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6957 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6958 addr = gen_rtx_CONST (Pmode, addr);
6959 addr = plus_constant (Pmode, addr, offset);
6961 else
6963 /* Calculate the address from the system register. */
6964 rtx salt = GEN_INT (salt_type);
6965 addr = gen_reg_rtx (mode);
6966 if (mode == DImode)
6967 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6968 else
6970 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6971 addr = convert_memory_address (Pmode, addr);
6973 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6975 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6978 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6979 that is known to contain PTRUE. */
6981 void
6982 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6984 expand_operand ops[3];
6985 machine_mode mode = GET_MODE (dest);
6986 create_output_operand (&ops[0], dest, mode);
6987 create_input_operand (&ops[1], pred, GET_MODE (pred));
6988 create_input_operand (&ops[2], src, mode);
6989 temporary_volatile_ok v (true);
6990 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6993 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6994 operand is in memory. In this case we need to use the predicated LD1
6995 and ST1 instead of LDR and STR, both for correctness on big-endian
6996 targets and because LD1 and ST1 support a wider range of addressing modes.
6997 PRED_MODE is the mode of the predicate.
6999 See the comment at the head of aarch64-sve.md for details about the
7000 big-endian handling. */
7002 void
7003 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
7005 machine_mode mode = GET_MODE (dest);
7006 rtx ptrue = aarch64_ptrue_reg (pred_mode);
7007 if (!register_operand (src, mode)
7008 && !register_operand (dest, mode))
7010 rtx tmp = gen_reg_rtx (mode);
7011 if (MEM_P (src))
7012 aarch64_emit_sve_pred_move (tmp, ptrue, src);
7013 else
7014 emit_move_insn (tmp, src);
7015 src = tmp;
7017 aarch64_emit_sve_pred_move (dest, ptrue, src);
7020 /* Called only on big-endian targets. See whether an SVE vector move
7021 from SRC to DEST is effectively a REV[BHW] instruction, because at
7022 least one operand is a subreg of an SVE vector that has wider or
7023 narrower elements. Return true and emit the instruction if so.
7025 For example:
7027 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
7029 represents a VIEW_CONVERT between the following vectors, viewed
7030 in memory order:
7032 R2: { [0].high, [0].low, [1].high, [1].low, ... }
7033 R1: { [0], [1], [2], [3], ... }
7035 The high part of lane X in R2 should therefore correspond to lane X*2
7036 of R1, but the register representations are:
7038 msb lsb
7039 R2: ...... [1].high [1].low [0].high [0].low
7040 R1: ...... [3] [2] [1] [0]
7042 where the low part of lane X in R2 corresponds to lane X*2 in R1.
7043 We therefore need a reverse operation to swap the high and low values
7044 around.
7046 This is purely an optimization. Without it we would spill the
7047 subreg operand to the stack in one mode and reload it in the
7048 other mode, which has the same effect as the REV. */
7050 bool
7051 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
7053 gcc_assert (BYTES_BIG_ENDIAN);
7055 /* Do not try to optimize subregs that LRA has created for matched
7056 reloads. These subregs only exist as a temporary measure to make
7057 the RTL well-formed, but they are exempt from the usual
7058 TARGET_CAN_CHANGE_MODE_CLASS rules.
7060 For example, if we have:
7062 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
7064 and the constraints require R1 and R2 to be in the same register,
7065 LRA may need to create RTL such as:
7067 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
7068 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
7069 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
7071 which forces both the input and output of the original instruction
7072 to use the same hard register. But for this to work, the normal
7073 rules have to be suppressed on the subreg input, otherwise LRA
7074 would need to reload that input too, meaning that the process
7075 would never terminate. To compensate for this, the normal rules
7076 are also suppressed for the subreg output of the first move.
7077 Ignoring the special case and handling the first move normally
7078 would therefore generate wrong code: we would reverse the elements
7079 for the first subreg but not reverse them back for the second subreg. */
7080 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
7081 dest = SUBREG_REG (dest);
7082 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
7083 src = SUBREG_REG (src);
7085 /* The optimization handles two single SVE REGs with different element
7086 sizes. */
7087 if (!REG_P (dest)
7088 || !REG_P (src)
7089 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
7090 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
7091 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
7092 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
7093 return false;
7095 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
7096 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
7097 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
7098 UNSPEC_REV_SUBREG);
7099 emit_insn (gen_rtx_SET (dest, unspec));
7100 return true;
7103 /* Return a copy of X with mode MODE, without changing its other
7104 attributes. Unlike gen_lowpart, this doesn't care whether the
7105 mode change is valid. */
7107 rtx
7108 aarch64_replace_reg_mode (rtx x, machine_mode mode)
7110 if (GET_MODE (x) == mode)
7111 return x;
7113 x = shallow_copy_rtx (x);
7114 set_mode_and_regno (x, mode, REGNO (x));
7115 return x;
7118 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
7119 stored in wider integer containers. */
7121 static unsigned int
7122 aarch64_sve_rev_unspec (machine_mode mode)
7124 switch (GET_MODE_UNIT_SIZE (mode))
7126 case 1: return UNSPEC_REVB;
7127 case 2: return UNSPEC_REVH;
7128 case 4: return UNSPEC_REVW;
7130 gcc_unreachable ();
7133 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
7134 operands. */
7136 void
7137 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
7139 /* Decide which REV operation we need. The mode with wider elements
7140 determines the mode of the operands and the mode with the narrower
7141 elements determines the reverse width. */
7142 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
7143 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
7144 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
7145 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
7146 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
7148 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
7149 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
7151 /* Get the operands in the appropriate modes and emit the instruction. */
7152 ptrue = gen_lowpart (pred_mode, ptrue);
7153 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
7154 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
7155 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
7156 dest, ptrue, src));
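/* Illustrative example: for a big-endian subreg move between VNx8HI and
   VNx4SI values, the wider-element mode is the .S one and the narrower
   unit size is 2 bytes, so the code above selects UNSPEC_REVH and emits
   roughly REVH Zd.S, Pg/M, Zs.S under a .S governing predicate.  */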
7159 static bool
7160 aarch64_function_ok_for_sibcall (tree, tree exp)
7162 if (crtl->abi->id () != expr_callee_abi (exp).id ())
7163 return false;
7165 return true;
7168 /* Subroutine of aarch64_pass_by_reference for arguments that are not
7169 passed in SVE registers. */
7171 static bool
7172 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
7173 const function_arg_info &arg)
7175 HOST_WIDE_INT size;
7176 machine_mode dummymode;
7177 int nregs;
7179 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
7180 if (arg.mode == BLKmode && arg.type)
7181 size = int_size_in_bytes (arg.type);
7182 else
7183 /* No frontends can create types with variable-sized modes, so we
7184 shouldn't be asked to pass or return them. */
7185 size = GET_MODE_SIZE (arg.mode).to_constant ();
7187 /* Aggregates are passed by reference based on their size. */
7188 if (arg.aggregate_type_p ())
7189 size = int_size_in_bytes (arg.type);
7191 /* Variable sized arguments are always returned by reference. */
7192 if (size < 0)
7193 return true;
7195 /* Can this be a candidate to be passed in fp/simd register(s)? */
7196 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
7197 &dummymode, &nregs, NULL,
7198 !pcum || pcum->silent_p))
7199 return false;
7201 /* Arguments which are variable sized or larger than 2 registers are
7202 passed by reference unless they are a homogeneous floating-point
7203 aggregate. */
7204 return size > 2 * UNITS_PER_WORD;
7207 /* Implement TARGET_PASS_BY_REFERENCE. */
7209 static bool
7210 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7211 const function_arg_info &arg)
7213 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7215 if (!arg.type)
7216 return aarch64_pass_by_reference_1 (pcum, arg);
7218 pure_scalable_type_info pst_info;
7219 switch (pst_info.analyze (arg.type))
7221 case pure_scalable_type_info::IS_PST:
7222 if (pcum && !pcum->silent_p && !TARGET_SVE)
7223 /* We can't gracefully recover at this point, so make this a
7224 fatal error. */
7225 fatal_error (input_location, "arguments of type %qT require"
7226 " the SVE ISA extension", arg.type);
7228 /* Variadic SVE types are passed by reference. Normal non-variadic
7229 arguments are too if we've run out of registers. */
7230 return (!arg.named
7231 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7232 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7234 case pure_scalable_type_info::DOESNT_MATTER:
7235 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7236 return true;
7238 case pure_scalable_type_info::NO_ABI_IDENTITY:
7239 case pure_scalable_type_info::ISNT_PST:
7240 return aarch64_pass_by_reference_1 (pcum, arg);
7242 gcc_unreachable ();
7245 /* Return TRUE if VALTYPE is padded to its least significant bits. */
7246 static bool
7247 aarch64_return_in_msb (const_tree valtype)
7249 machine_mode dummy_mode;
7250 int dummy_int;
7252 /* Never happens in little-endian mode. */
7253 if (!BYTES_BIG_ENDIAN)
7254 return false;
7256 /* Only composite types smaller than or equal to 16 bytes can
7257 be potentially returned in registers. */
7258 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7259 || int_size_in_bytes (valtype) <= 0
7260 || int_size_in_bytes (valtype) > 16)
7261 return false;
7263 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7264 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7265 is always passed/returned in the least significant bits of fp/simd
7266 register(s). */
7267 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7268 &dummy_mode, &dummy_int, NULL,
7269 false))
7270 return false;
7272 /* Likewise pure scalable types for SVE vector and predicate registers. */
7273 pure_scalable_type_info pst_info;
7274 if (pst_info.analyze_registers (valtype))
7275 return false;
7277 return true;
7280 /* Implement TARGET_FUNCTION_VALUE.
7281 Define how to find the value returned by a function. */
7283 static rtx
7284 aarch64_function_value (const_tree type, const_tree func,
7285 bool outgoing ATTRIBUTE_UNUSED)
7287 machine_mode mode;
7288 int unsignedp;
7290 mode = TYPE_MODE (type);
7291 if (INTEGRAL_TYPE_P (type))
7292 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7294 pure_scalable_type_info pst_info;
7295 if (type && pst_info.analyze_registers (type))
7296 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7298 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7299 are returned in memory, not by value. */
7300 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7301 bool sve_p = (vec_flags & VEC_ANY_SVE);
7303 if (aarch64_return_in_msb (type))
7305 HOST_WIDE_INT size = int_size_in_bytes (type);
7307 if (size % UNITS_PER_WORD != 0)
7309 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7310 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7314 int count;
7315 machine_mode ag_mode;
7316 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7317 NULL, false))
7319 gcc_assert (!sve_p);
7320 if (!aarch64_composite_type_p (type, mode))
7322 gcc_assert (count == 1 && mode == ag_mode);
7323 return gen_rtx_REG (mode, V0_REGNUM);
7325 else if (aarch64_advsimd_full_struct_mode_p (mode)
7326 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7327 return gen_rtx_REG (mode, V0_REGNUM);
7328 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7329 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7330 return gen_rtx_REG (mode, V0_REGNUM);
7331 else
7333 int i;
7334 rtx par;
7336 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7337 for (i = 0; i < count; i++)
7339 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7340 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7341 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7342 XVECEXP (par, 0, i) = tmp;
7344 return par;
7347 else
7349 if (sve_p)
7351 /* Vector types can acquire a partial SVE mode using things like
7352 __attribute__((vector_size(N))), and this is potentially useful.
7353 However, the choice of mode doesn't affect the type's ABI
7354 identity, so we should treat the types as though they had
7355 the associated integer mode, just like they did before SVE
7356 was introduced.
7358 We know that the vector must be 128 bits or smaller,
7359 otherwise we'd have returned it in memory instead. */
7360 gcc_assert (type
7361 && (aarch64_some_values_include_pst_objects_p (type)
7362 || (vec_flags & VEC_PARTIAL)));
7364 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7365 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7366 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7367 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7369 return gen_rtx_REG (mode, R0_REGNUM);
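/* A small worked example of the cases above (illustrative only):
   returning

     struct rgba { float r, g, b, a; };

   is the HFA case: each member goes in its own FP register, so the
   value comes back in s0-s3, expressed here as a PARALLEL of four
   SFmode registers at byte offsets 0, 4, 8 and 12.  Returning __int128
   instead takes the final R0_REGNUM path and uses x0/x1.  */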
7373 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7374 Return true if REGNO is the number of a hard register in which the values
7375 of called function may come back. */
7377 static bool
7378 aarch64_function_value_regno_p (const unsigned int regno)
7380 /* A maximum of 16 bytes can be returned in the general registers. Examples
7381 of 16-byte return values are: 128-bit integers and 16-byte small
7382 structures (excluding homogeneous floating-point aggregates). */
7383 if (regno == R0_REGNUM || regno == R1_REGNUM)
7384 return true;
7386 /* Up to four fp/simd registers can return a function value, e.g. a
7387 homogeneous floating-point aggregate having four members. */
7388 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7389 return TARGET_FLOAT;
7391 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
7392 return TARGET_SVE;
7394 return false;
7397 /* Subroutine for aarch64_return_in_memory for types that are not returned
7398 in SVE registers. */
7400 static bool
7401 aarch64_return_in_memory_1 (const_tree type)
7403 HOST_WIDE_INT size;
7404 machine_mode ag_mode;
7405 int count;
7407 if (!AGGREGATE_TYPE_P (type)
7408 && TREE_CODE (type) != COMPLEX_TYPE
7409 && TREE_CODE (type) != VECTOR_TYPE)
7410 /* Simple scalar types are always returned in registers. */
7411 return false;
7413 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7414 &ag_mode, &count, NULL, false))
7415 return false;
7417 /* Types larger than 2 registers are returned in memory. */
7418 size = int_size_in_bytes (type);
7419 return (size < 0 || size > 2 * UNITS_PER_WORD);
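/* Illustrative examples of the size cut-off (assuming LP64 sizes):
   "struct { long a, b; }" is 16 bytes and comes back in x0/x1, while
   "struct { long a, b, c; }" is 24 bytes and is returned in memory,
   with the caller supplying the address of the result buffer (in x8
   under AAPCS64).  HFAs such as "struct { double a, b, c, d; }" are
   excluded by the check above and stay in FP registers.  */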
7422 /* Implement TARGET_RETURN_IN_MEMORY.
7424 If the type T of the result of a function is such that
7425 void func (T arg)
7426 would require that arg be passed as a value in a register (or set of
7427 registers) according to the parameter passing rules, then the result
7428 is returned in the same registers as would be used for such an
7429 argument. */
7431 static bool
7432 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7434 pure_scalable_type_info pst_info;
7435 switch (pst_info.analyze (type))
7437 case pure_scalable_type_info::IS_PST:
7438 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7439 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7441 case pure_scalable_type_info::DOESNT_MATTER:
7442 gcc_assert (aarch64_return_in_memory_1 (type));
7443 return true;
7445 case pure_scalable_type_info::NO_ABI_IDENTITY:
7446 case pure_scalable_type_info::ISNT_PST:
7447 return aarch64_return_in_memory_1 (type);
7449 gcc_unreachable ();
7452 static bool
7453 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7454 const_tree type, int *nregs)
7456 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7457 return aarch64_vfp_is_call_or_return_candidate (mode, type,
7458 &pcum->aapcs_vfp_rmode,
7459 nregs, NULL, pcum->silent_p);
7462 /* Given MODE and TYPE of a function argument, return the alignment in
7463 bits. The idea is to suppress any stronger alignment requested by
7464 the user and opt for the natural alignment (specified in AAPCS64 \S
7465 4.1). ABI_BREAK is set to the old alignment if the alignment was
7466 incorrectly calculated in versions of GCC prior to GCC-9.
7467 ABI_BREAK_PACKED is set to the old alignment if it was incorrectly
7468 calculated in versions between GCC-9 and GCC-13. This is a helper
7469 function for local use only. */
7471 static unsigned int
7472 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7473 unsigned int *abi_break,
7474 unsigned int *abi_break_packed)
7476 *abi_break = 0;
7477 *abi_break_packed = 0;
7478 if (!type)
7479 return GET_MODE_ALIGNMENT (mode);
7481 if (integer_zerop (TYPE_SIZE (type)))
7482 return 0;
7484 gcc_assert (TYPE_MODE (type) == mode);
7486 if (!AGGREGATE_TYPE_P (type))
7488 /* The ABI alignment is the natural alignment of the type, without
7489 any attributes applied. Normally this is the alignment of the
7490 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
7491 For now we just handle the known exceptions explicitly. */
7492 type = TYPE_MAIN_VARIANT (type);
7493 if (POINTER_TYPE_P (type))
7495 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
7496 return POINTER_SIZE;
7498 gcc_assert (!TYPE_USER_ALIGN (type));
7499 return TYPE_ALIGN (type);
7502 if (TREE_CODE (type) == ARRAY_TYPE)
7503 return TYPE_ALIGN (TREE_TYPE (type));
7505 unsigned int alignment = 0;
7506 unsigned int bitfield_alignment_with_packed = 0;
7507 unsigned int bitfield_alignment = 0;
7508 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7509 if (TREE_CODE (field) == FIELD_DECL)
7511 /* Note that we explicitly consider zero-sized fields here,
7512 even though they don't map to AAPCS64 machine types.
7513 For example, in:
7515 struct __attribute__((aligned(8))) empty {};
7517 struct s {
7518 [[no_unique_address]] empty e;
7519 int x;
7522 "s" contains only one Fundamental Data Type (the int field)
7523 but gains 8-byte alignment and size thanks to "e". */
7524 alignment = std::max (alignment, DECL_ALIGN (field));
7525 if (DECL_BIT_FIELD_TYPE (field))
7527 /* Take the bit-field type's alignment into account only
7528 if the user didn't reduce this field's alignment with
7529 the packed attribute. */
7530 if (!DECL_PACKED (field))
7531 bitfield_alignment
7532 = std::max (bitfield_alignment,
7533 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7535 /* Compute the alignment even if the bit-field is
7536 packed, so that we can emit a warning in case the
7537 alignment changed between GCC versions. */
7538 bitfield_alignment_with_packed
7539 = std::max (bitfield_alignment_with_packed,
7540 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7544 /* Emit a warning if the alignment is different when taking the
7545 'packed' attribute into account. */
7546 if (bitfield_alignment != bitfield_alignment_with_packed
7547 && bitfield_alignment_with_packed > alignment)
7548 *abi_break_packed = bitfield_alignment_with_packed;
7550 if (bitfield_alignment > alignment)
7552 *abi_break = alignment;
7553 return bitfield_alignment;
7556 return alignment;
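/* A hedged illustration of the two bit-field cases tracked above
   (intended only to show which kinds of types are affected):

     struct s1 { long long x : 8; char y; };
     struct s2 { long long x : 8; char y; } __attribute__((packed));

   For s1, the bit-field's underlying "long long" contributes its
   8-byte alignment (the GCC 9.1 behaviour; *ABI_BREAK records the
   older, smaller value).  For s2, the packed attribute is supposed to
   discard that contribution, but GCC 9 to GCC 12 still honoured it,
   which is what *ABI_BREAK_PACKED records.  */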
7559 /* Layout a function argument according to the AAPCS64 rules. The rule
7560 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7561 mode that was originally given to us by the target hook, whereas the
7562 mode in ARG might be the result of replacing partial SVE modes with
7563 the equivalent integer mode. */
7565 static void
7566 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7568 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7569 tree type = arg.type;
7570 machine_mode mode = arg.mode;
7571 int ncrn, nvrn, nregs;
7572 bool allocate_ncrn, allocate_nvrn;
7573 HOST_WIDE_INT size;
7574 unsigned int abi_break;
7575 unsigned int abi_break_packed;
7577 /* We need to do this once per argument. */
7578 if (pcum->aapcs_arg_processed)
7579 return;
7581 bool warn_pcs_change
7582 = (warn_psabi
7583 && !pcum->silent_p
7584 && (currently_expanding_function_start
7585 || currently_expanding_gimple_stmt));
7587 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
7589 typedef struct foo {
7590 __Int8x16_t foo[2] __attribute__((aligned(32)));
7591 } foo;
7593 is still an HVA despite its larger-than-normal alignment.
7594 However, such over-aligned HFAs and HVAs are guaranteed to have
7595 no padding.
7597 If we exclude HFAs and HVAs from the discussion below, then there
7598 are several things to note:
7600 - Both the C and AAPCS64 interpretations of a type's alignment should
7601 give a value that is no greater than the type's size.
7603 - Types bigger than 16 bytes are passed indirectly.
7605 - If an argument of type T is passed indirectly, TYPE and MODE describe
7606 a pointer to T rather than T itself.
7608 It follows that the AAPCS64 alignment of TYPE must be no greater
7609 than 16 bytes.
7611 Versions prior to GCC 9.1 ignored a bitfield's underlying type
7612 and so could calculate an alignment that was too small. If this
7613 happened for TYPE then ABI_BREAK is this older, too-small alignment.
7615 Although GCC 9.1 fixed that bug, it introduced a different one:
7616 it would consider the alignment of a bitfield's underlying type even
7617 if the field was packed (which should have the effect of overriding
7618 the alignment of the underlying type). This was fixed in GCC 13.1.
7620 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
7621 that was too big. If this happened for TYPE, ABI_BREAK_PACKED is
7622 this older, too-big alignment.
7624 Also, the fact that GCC 9 to GCC 12 considered irrelevant
7625 alignments meant they could calculate type alignments that were
7626 bigger than the type's size, contrary to the assumption above.
7627 The handling of register arguments was nevertheless (and justifiably)
7628 written to follow the assumption that the alignment can never be
7629 greater than the size. The same was not true for stack arguments;
7630 their alignment was instead handled by MIN bounds in
7631 aarch64_function_arg_boundary.
7633 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
7634 an alignment of more than 16 bytes for TYPE then:
7636 - If the argument was passed in registers, these GCC versions
7637 would treat the alignment as though it was *less than* 16 bytes.
7639 - If the argument was passed on the stack, these GCC versions
7640 would treat the alignment as though it was *equal to* 16 bytes.
7642 Both behaviors were wrong, but in different cases. */
7644 pcum->aapcs_arg_processed = true;
7646 pure_scalable_type_info pst_info;
7647 if (type && pst_info.analyze_registers (type))
7649 /* aarch64_function_arg_alignment has never had an effect on
7650 this case. */
7652 /* The PCS says that it is invalid to pass an SVE value to an
7653 unprototyped function. There is no ABI-defined location we
7654 can return in this case, so we have no real choice but to raise
7655 an error immediately, even though this is only a query function. */
7656 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7658 gcc_assert (!pcum->silent_p);
7659 error ("SVE type %qT cannot be passed to an unprototyped function",
7660 arg.type);
7661 /* Avoid repeating the message, and avoid tripping the assert
7662 below. */
7663 pcum->pcs_variant = ARM_PCS_SVE;
7666 /* We would have converted the argument into pass-by-reference
7667 form if it didn't fit in registers. */
7668 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7669 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7670 gcc_assert (arg.named
7671 && pcum->pcs_variant == ARM_PCS_SVE
7672 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7673 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7674 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7675 P0_REGNUM + pcum->aapcs_nprn);
7676 return;
7679 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7680 are passed by reference, not by value. */
7681 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7682 bool sve_p = (vec_flags & VEC_ANY_SVE);
7683 if (sve_p)
7684 /* Vector types can acquire a partial SVE mode using things like
7685 __attribute__((vector_size(N))), and this is potentially useful.
7686 However, the choice of mode doesn't affect the type's ABI
7687 identity, so we should treat the types as though they had
7688 the associated integer mode, just like they did before SVE
7689 was introduced.
7691 We know that the vector must be 128 bits or smaller,
7692 otherwise we'd have passed it in memory instead. */
7693 gcc_assert (type
7694 && (aarch64_some_values_include_pst_objects_p (type)
7695 || (vec_flags & VEC_PARTIAL)));
7697 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
7698 if (type)
7699 size = int_size_in_bytes (type);
7700 else
7701 /* No frontends can create types with variable-sized modes, so we
7702 shouldn't be asked to pass or return them. */
7703 size = GET_MODE_SIZE (mode).to_constant ();
7704 size = ROUND_UP (size, UNITS_PER_WORD);
7706 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7707 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7708 mode,
7709 type,
7710 &nregs);
7711 gcc_assert (!sve_p || !allocate_nvrn);
7713 unsigned int alignment
7714 = aarch64_function_arg_alignment (mode, type, &abi_break,
7715 &abi_break_packed);
7717 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
7718 && (!alignment || abi_break < alignment)
7719 && (!abi_break_packed || alignment < abi_break_packed));
7721 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
7722 The following code thus handles passing by SIMD/FP registers first. */
7724 nvrn = pcum->aapcs_nvrn;
7726 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7727 and homogeneous short-vector aggregates (HVA). */
7728 if (allocate_nvrn)
7730 /* aarch64_function_arg_alignment has never had an effect on
7731 this case. */
7732 if (!pcum->silent_p && !TARGET_FLOAT)
7733 aarch64_err_no_fpadvsimd (mode);
7735 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7737 pcum->aapcs_nextnvrn = nvrn + nregs;
7738 if (!aarch64_composite_type_p (type, mode))
7740 gcc_assert (nregs == 1);
7741 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7743 else if (aarch64_advsimd_full_struct_mode_p (mode)
7744 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7745 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7746 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7747 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7748 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7749 else
7751 rtx par;
7752 int i;
7753 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7754 for (i = 0; i < nregs; i++)
7756 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7757 V0_REGNUM + nvrn + i);
7758 rtx offset = gen_int_mode
7759 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7760 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7761 XVECEXP (par, 0, i) = tmp;
7763 pcum->aapcs_reg = par;
7765 return;
7767 else
7769 /* C.3 NSRN is set to 8. */
7770 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7771 goto on_stack;
7775 ncrn = pcum->aapcs_ncrn;
7776 nregs = size / UNITS_PER_WORD;
7778 /* C6 - C9, though the sign and zero extension semantics are
7779 handled elsewhere. This is the case where the argument fits
7780 entirely in general registers. */
7781 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7783 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7785 /* C.8 if the argument has an alignment of 16 then the NGRN is
7786 rounded up to the next even number. */
7787 if (nregs == 2
7788 && ncrn % 2)
7790 /* Emit a warning if the alignment changed when taking the
7791 'packed' attribute into account. */
7792 if (warn_pcs_change
7793 && abi_break_packed
7794 && ((abi_break_packed == 16 * BITS_PER_UNIT)
7795 != (alignment == 16 * BITS_PER_UNIT)))
7796 inform (input_location, "parameter passing for argument of type "
7797 "%qT changed in GCC 13.1", type);
7799 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7800 comparison is there because for > 16 * BITS_PER_UNIT
7801 alignment nregs should be > 2, and therefore the argument
7802 should be passed by reference rather than by value. */
7803 if (alignment == 16 * BITS_PER_UNIT)
7805 if (warn_pcs_change && abi_break)
7806 inform (input_location, "parameter passing for argument of type "
7807 "%qT changed in GCC 9.1", type);
7808 ++ncrn;
7809 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
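/* A concrete (but purely illustrative) instance of rule C.8: for a call
   such as f (int a, __int128 b), "a" occupies w0 and "b" needs two
   registers with 16-byte alignment, so NGRN is rounded up from 1 to 2
   and "b" is passed in x2/x3, leaving x1 unused.  */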
7813 /* If an argument with an SVE mode needs to be shifted up to the
7814 high part of the register, treat it as though it had an integer mode.
7815 Using the normal (parallel [...]) would suppress the shifting. */
7816 if (sve_p
7817 && BYTES_BIG_ENDIAN
7818 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7819 && aarch64_pad_reg_upward (mode, type, false))
7821 mode = int_mode_for_mode (mode).require ();
7822 sve_p = false;
7825 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7826 A reg is still generated for it, but the caller should be smart
7827 enough not to use it. */
7828 if (nregs == 0
7829 || (nregs == 1 && !sve_p)
7830 || GET_MODE_CLASS (mode) == MODE_INT)
7831 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7832 else
7834 rtx par;
7835 int i;
7837 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7838 for (i = 0; i < nregs; i++)
7840 scalar_int_mode reg_mode = word_mode;
7841 if (nregs == 1)
7842 reg_mode = int_mode_for_mode (mode).require ();
7843 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7844 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7845 GEN_INT (i * UNITS_PER_WORD));
7846 XVECEXP (par, 0, i) = tmp;
7848 pcum->aapcs_reg = par;
7851 pcum->aapcs_nextncrn = ncrn + nregs;
7852 return;
7855 /* C.11 */
7856 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7858 /* The argument is passed on the stack; record the number of words needed
7859 for this argument and align the total size if necessary. */
7860 on_stack:
7861 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7863 if (warn_pcs_change
7864 && abi_break_packed
7865 && ((abi_break_packed >= 16 * BITS_PER_UNIT)
7866 != (alignment >= 16 * BITS_PER_UNIT)))
7867 inform (input_location, "parameter passing for argument of type "
7868 "%qT changed in GCC 13.1", type);
7870 if (alignment == 16 * BITS_PER_UNIT)
7872 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7873 if (pcum->aapcs_stack_size != new_size)
7875 if (warn_pcs_change && abi_break)
7876 inform (input_location, "parameter passing for argument of type "
7877 "%qT changed in GCC 9.1", type);
7878 pcum->aapcs_stack_size = new_size;
7881 return;
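/* A sketch of the stack case just above (not ABI text): if one word of
   stack arguments has already been allocated and the next argument is a
   16-byte-aligned __int128 that no longer fits in registers, the running
   aapcs_stack_size is first rounded up to an even number of words
   (matching the AAPCS64 requirement that the NSAA be realigned for such
   types) before the argument's own two words are accounted for.  */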
7884 /* Implement TARGET_FUNCTION_ARG. */
7886 static rtx
7887 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7889 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7890 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7891 || pcum->pcs_variant == ARM_PCS_SIMD
7892 || pcum->pcs_variant == ARM_PCS_SVE);
7894 if (arg.end_marker_p ())
7895 return gen_int_mode (pcum->pcs_variant, DImode);
7897 aarch64_layout_arg (pcum_v, arg);
7898 return pcum->aapcs_reg;
7901 void
7902 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7903 const_tree fntype,
7904 rtx libname ATTRIBUTE_UNUSED,
7905 const_tree fndecl ATTRIBUTE_UNUSED,
7906 unsigned n_named ATTRIBUTE_UNUSED,
7907 bool silent_p)
7909 pcum->aapcs_ncrn = 0;
7910 pcum->aapcs_nvrn = 0;
7911 pcum->aapcs_nprn = 0;
7912 pcum->aapcs_nextncrn = 0;
7913 pcum->aapcs_nextnvrn = 0;
7914 pcum->aapcs_nextnprn = 0;
7915 if (fntype)
7916 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7917 else
7918 pcum->pcs_variant = ARM_PCS_AAPCS64;
7919 pcum->aapcs_reg = NULL_RTX;
7920 pcum->aapcs_arg_processed = false;
7921 pcum->aapcs_stack_words = 0;
7922 pcum->aapcs_stack_size = 0;
7923 pcum->silent_p = silent_p;
7925 if (!silent_p
7926 && !TARGET_FLOAT
7927 && fntype && fntype != error_mark_node)
7929 const_tree type = TREE_TYPE (fntype);
7930 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7931 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7932 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7933 &mode, &nregs, NULL, false))
7934 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7937 if (!silent_p
7938 && !TARGET_SVE
7939 && pcum->pcs_variant == ARM_PCS_SVE)
7941 /* We can't gracefully recover at this point, so make this a
7942 fatal error. */
7943 if (fndecl)
7944 fatal_error (input_location, "%qE requires the SVE ISA extension",
7945 fndecl);
7946 else
7947 fatal_error (input_location, "calls to functions of type %qT require"
7948 " the SVE ISA extension", fntype);
7952 static void
7953 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7954 const function_arg_info &arg)
7956 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7957 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7958 || pcum->pcs_variant == ARM_PCS_SIMD
7959 || pcum->pcs_variant == ARM_PCS_SVE)
7961 aarch64_layout_arg (pcum_v, arg);
7962 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7963 != (pcum->aapcs_stack_words != 0));
7964 pcum->aapcs_arg_processed = false;
7965 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7966 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7967 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7968 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7969 pcum->aapcs_stack_words = 0;
7970 pcum->aapcs_reg = NULL_RTX;
7974 bool
7975 aarch64_function_arg_regno_p (unsigned regno)
7977 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7978 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7979 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7982 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7983 PARM_BOUNDARY bits of alignment, but will be given anything up
7984 to STACK_BOUNDARY bits if the type requires it. This makes sure
7985 that both before and after the layout of each argument, the Next
7986 Stacked Argument Address (NSAA) will have a minimum alignment of
7987 8 bytes. */
7989 static unsigned int
7990 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7992 unsigned int abi_break;
7993 unsigned int abi_break_packed;
7994 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7995 &abi_break,
7996 &abi_break_packed);
7997 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7998 to emit warnings about ABI incompatibility. */
7999 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
8000 return alignment;
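/* For instance (assuming the usual PARM_BOUNDARY of 64 and
   STACK_BOUNDARY of 128 on this target): a "char" argument is still
   given a full 8-byte slot, while a type with a 32-byte user alignment
   is only guaranteed 16-byte alignment on the stack.  */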
8003 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
8005 static fixed_size_mode
8006 aarch64_get_reg_raw_mode (int regno)
8008 if (TARGET_SVE && FP_REGNUM_P (regno))
8009 /* Don't use the SVE part of the register for __builtin_apply and
8010 __builtin_return. The SVE registers aren't used by the normal PCS,
8011 so using them there would be a waste of time. The PCS extensions
8012 for SVE types are fundamentally incompatible with the
8013 __builtin_return/__builtin_apply interface. */
8014 return as_a <fixed_size_mode> (V16QImode);
8015 if (PR_REGNUM_P (regno))
8016 /* For SVE PR regs, indicate that they should be ignored for
8017 __builtin_apply/__builtin_return. */
8018 return as_a <fixed_size_mode> (VOIDmode);
8019 return default_get_reg_raw_mode (regno);
8022 /* Implement TARGET_FUNCTION_ARG_PADDING.
8024 Small aggregate types are placed in the lowest memory address.
8026 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
8028 static pad_direction
8029 aarch64_function_arg_padding (machine_mode mode, const_tree type)
8031 /* On little-endian targets, the least significant byte of every stack
8032 argument is passed at the lowest byte address of the stack slot. */
8033 if (!BYTES_BIG_ENDIAN)
8034 return PAD_UPWARD;
8036 /* Otherwise, integral, floating-point and pointer types are padded downward:
8037 the least significant byte of a stack argument is passed at the highest
8038 byte address of the stack slot. */
8039 if (type
8040 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
8041 || POINTER_TYPE_P (type))
8042 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
8043 return PAD_DOWNWARD;
8045 /* Everything else padded upward, i.e. data in first byte of stack slot. */
8046 return PAD_UPWARD;
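/* Putting the rules above together (illustrative only): on big-endian,
   a "char" passed on the stack is padded downward and so lives in the
   highest-addressed byte of its slot, whereas a small composite such as
   "struct { char c; }" is padded upward and starts at the lowest byte
   address; on little-endian everything starts at the lowest address.  */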
8049 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
8051 It specifies the padding for the last (and possibly the only)
8052 element of a block move between registers and memory. Viewing
8053 the block as if it were in memory, padding upward means that
8054 the last element is padded after its most significant byte,
8055 while padding downward means that the last element is padded
8056 on its least significant byte side.
8058 Small aggregates and small complex types are always padded
8059 upwards.
8061 We don't need to worry about homogeneous floating-point or
8062 short-vector aggregates; their move is not affected by the
8063 padding direction determined here. Regardless of endianness,
8064 each element of such an aggregate is put in the least
8065 significant bits of a fp/simd register.
8067 Return !BYTES_BIG_ENDIAN if the least significant byte of the
8068 register has useful data, and return the opposite if the most
8069 significant byte does. */
8071 bool
8072 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
8073 bool first ATTRIBUTE_UNUSED)
8076 /* Aside from pure scalable types, small composite types are always
8077 padded upward. */
8078 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
8080 HOST_WIDE_INT size;
8081 if (type)
8082 size = int_size_in_bytes (type);
8083 else
8084 /* No frontends can create types with variable-sized modes, so we
8085 shouldn't be asked to pass or return them. */
8086 size = GET_MODE_SIZE (mode).to_constant ();
8087 if (size < 2 * UNITS_PER_WORD)
8089 pure_scalable_type_info pst_info;
8090 if (pst_info.analyze_registers (type))
8091 return false;
8092 return true;
8096 /* Otherwise, use the default padding. */
8097 return !BYTES_BIG_ENDIAN;
8100 static scalar_int_mode
8101 aarch64_libgcc_cmp_return_mode (void)
8103 return SImode;
8106 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
8108 /* We use the 12-bit shifted immediate arithmetic instructions so values
8109 must be multiple of (1 << 12), i.e. 4096. */
8110 #define ARITH_FACTOR 4096
8112 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
8113 #error Cannot use simple address calculation for stack probing
8114 #endif
8116 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
8117 inclusive. These are offsets from the current stack pointer. */
8119 static void
8120 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
8122 HOST_WIDE_INT size;
8123 if (!poly_size.is_constant (&size))
8125 sorry ("stack probes for SVE frames");
8126 return;
8129 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
8131 /* See the same assertion on PROBE_INTERVAL above. */
8132 gcc_assert ((first % ARITH_FACTOR) == 0);
8134 /* See if we have a constant small number of probes to generate. If so,
8135 that's the easy case. */
8136 if (size <= PROBE_INTERVAL)
8138 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
8140 emit_set_insn (reg1,
8141 plus_constant (Pmode,
8142 stack_pointer_rtx, -(first + base)));
8143 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
8146 /* The run-time loop is made up of 8 insns in the generic case while the
8147 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
8148 else if (size <= 4 * PROBE_INTERVAL)
8150 HOST_WIDE_INT i, rem;
8152 emit_set_insn (reg1,
8153 plus_constant (Pmode,
8154 stack_pointer_rtx,
8155 -(first + PROBE_INTERVAL)));
8156 emit_stack_probe (reg1);
8158 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
8159 it exceeds SIZE. If only two probes are needed, this will not
8160 generate any code. Then probe at FIRST + SIZE. */
8161 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
8163 emit_set_insn (reg1,
8164 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
8165 emit_stack_probe (reg1);
8168 rem = size - (i - PROBE_INTERVAL);
8169 if (rem > 256)
8171 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8173 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
8174 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
8176 else
8177 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
8180 /* Otherwise, do the same as above, but in a loop. Note that we must be
8181 extra careful with variables wrapping around because we might be at
8182 the very top (or the very bottom) of the address space and we have
8183 to be able to handle this case properly; in particular, we use an
8184 equality test for the loop condition. */
8185 else
8187 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
8189 /* Step 1: round SIZE to the previous multiple of the interval. */
8191 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
8194 /* Step 2: compute initial and final value of the loop counter. */
8196 /* TEST_ADDR = SP + FIRST. */
8197 emit_set_insn (reg1,
8198 plus_constant (Pmode, stack_pointer_rtx, -first));
8200 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
8201 HOST_WIDE_INT adjustment = - (first + rounded_size);
8202 if (! aarch64_uimm12_shift (adjustment))
8204 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
8205 true, Pmode);
8206 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
8208 else
8209 emit_set_insn (reg2,
8210 plus_constant (Pmode, stack_pointer_rtx, adjustment));
8212 /* Step 3: the loop
8216 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
8217 probe at TEST_ADDR
8219 while (TEST_ADDR != LAST_ADDR)
8221 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
8222 until it is equal to ROUNDED_SIZE. */
8224 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
8227 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
8228 that SIZE is equal to ROUNDED_SIZE. */
8230 if (size != rounded_size)
8232 HOST_WIDE_INT rem = size - rounded_size;
8234 if (rem > 256)
8236 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8238 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
8239 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
8241 else
8242 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
8246 /* Make sure nothing is scheduled before we are done. */
8247 emit_insn (gen_blockage ());
8250 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
8251 absolute addresses. */
8253 const char *
8254 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
8256 static int labelno = 0;
8257 char loop_lab[32];
8258 rtx xops[2];
8260 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
8262 /* Loop. */
8263 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
8265 HOST_WIDE_INT stack_clash_probe_interval
8266 = 1 << param_stack_clash_protection_guard_size;
8268 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
8269 xops[0] = reg1;
8270 HOST_WIDE_INT interval;
8271 if (flag_stack_clash_protection)
8272 interval = stack_clash_probe_interval;
8273 else
8274 interval = PROBE_INTERVAL;
8276 gcc_assert (aarch64_uimm12_shift (interval));
8277 xops[1] = GEN_INT (interval);
8279 output_asm_insn ("sub\t%0, %0, %1", xops);
8281 /* If doing stack clash protection then we probe up by the ABI specified
8282 amount. We do this because we're dropping full pages at a time in the
8283 loop. But if we're doing non-stack clash probing, probe at SP 0. */
8284 if (flag_stack_clash_protection)
8285 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
8286 else
8287 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
8289 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
8290 by this amount for each iteration. */
8291 output_asm_insn ("str\txzr, [%0, %1]", xops);
8293 /* Test if TEST_ADDR == LAST_ADDR. */
8294 xops[1] = reg2;
8295 output_asm_insn ("cmp\t%0, %1", xops);
8297 /* Branch. */
8298 fputs ("\tb.ne\t", asm_out_file);
8299 assemble_name_raw (asm_out_file, loop_lab);
8300 fputc ('\n', asm_out_file);
8302 return "";
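/* With the default 4096-byte PROBE_INTERVAL and no stack-clash
   protection, the loop printed above looks roughly like the following
   (x9/x10 are shown purely for illustration; the actual registers are
   whatever REG1 and REG2 hold):

   .LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0
*/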
8305 /* Emit the probe loop for doing stack clash probes and stack adjustments for
8306 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
8307 of GUARD_SIZE. When a probe is emitted it is done at most
8308 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
8309 at most MIN_PROBE_THRESHOLD. By the end of this function
8310 BASE = BASE - ADJUSTMENT. */
8312 const char *
8313 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
8314 rtx min_probe_threshold, rtx guard_size)
8316 /* This function is not allowed to use any instruction generation function
8317 like gen_ and friends. If you do you'll likely ICE during CFG validation,
8318 so instead emit the code you want using output_asm_insn. */
8319 gcc_assert (flag_stack_clash_protection);
8320 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
8321 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
8323 /* The minimum required allocation before the residual requires probing. */
8324 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
8326 /* Clamp the value down to the nearest value that can be used with a cmp. */
8327 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
8328 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
8330 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
8331 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
8333 static int labelno = 0;
8334 char loop_start_lab[32];
8335 char loop_end_lab[32];
8336 rtx xops[2];
8338 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
8339 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
8341 /* Emit loop start label. */
8342 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
8344 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8345 xops[0] = adjustment;
8346 xops[1] = probe_offset_value_rtx;
8347 output_asm_insn ("cmp\t%0, %1", xops);
8349 /* Branch to end if not enough adjustment to probe. */
8350 fputs ("\tb.lt\t", asm_out_file);
8351 assemble_name_raw (asm_out_file, loop_end_lab);
8352 fputc ('\n', asm_out_file);
8354 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8355 xops[0] = base;
8356 xops[1] = probe_offset_value_rtx;
8357 output_asm_insn ("sub\t%0, %0, %1", xops);
8359 /* Probe at BASE. */
8360 xops[1] = const0_rtx;
8361 output_asm_insn ("str\txzr, [%0, %1]", xops);
8363 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8364 xops[0] = adjustment;
8365 xops[1] = probe_offset_value_rtx;
8366 output_asm_insn ("sub\t%0, %0, %1", xops);
8368 /* Branch to start if still more bytes to allocate. */
8369 fputs ("\tb\t", asm_out_file);
8370 assemble_name_raw (asm_out_file, loop_start_lab);
8371 fputc ('\n', asm_out_file);
8374 /* No probe needed; leave the loop. */
8374 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8376 /* BASE = BASE - ADJUSTMENT. */
8377 xops[0] = base;
8378 xops[1] = adjustment;
8379 output_asm_insn ("sub\t%0, %0, %1", xops);
8380 return "";
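/* The printed sequence is roughly the following, with BASE and
   ADJUSTMENT standing for whichever registers the caller supplied and
   GUARD for the clamped residual_probe_guard value:

   .SVLPSPL0:
	cmp	ADJUSTMENT, GUARD
	b.lt	.SVLPEND0
	sub	BASE, BASE, GUARD
	str	xzr, [BASE, 0]
	sub	ADJUSTMENT, ADJUSTMENT, GUARD
	b	.SVLPSPL0
   .SVLPEND0:
	sub	BASE, BASE, ADJUSTMENT
*/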
8383 /* Determine whether a frame chain needs to be generated. */
8384 static bool
8385 aarch64_needs_frame_chain (void)
8387 /* Force a frame chain for EH returns so the return address is at FP+8. */
8388 if (frame_pointer_needed || crtl->calls_eh_return)
8389 return true;
8391 /* A leaf function cannot have calls or write LR. */
8392 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8394 /* Don't use a frame chain in leaf functions if leaf frame pointers
8395 are disabled. */
8396 if (flag_omit_leaf_frame_pointer && is_leaf)
8397 return false;
8399 return aarch64_use_frame_pointer;
8402 /* Mark the registers that need to be saved by the callee and calculate
8403 the size of the callee-saved registers area and frame record (both FP
8404 and LR may be omitted). */
8405 static void
8406 aarch64_layout_frame (void)
8408 poly_int64 offset = 0;
8409 int regno, last_fp_reg = INVALID_REGNUM;
8410 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8411 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8412 bool frame_related_fp_reg_p = false;
8413 aarch64_frame &frame = cfun->machine->frame;
8415 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8417 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8418 the mid-end is doing. */
8419 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8421 #define SLOT_NOT_REQUIRED (-2)
8422 #define SLOT_REQUIRED (-1)
8424 frame.wb_push_candidate1 = INVALID_REGNUM;
8425 frame.wb_push_candidate2 = INVALID_REGNUM;
8426 frame.spare_pred_reg = INVALID_REGNUM;
8428 /* First mark all the registers that really need to be saved... */
8429 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8430 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8432 /* ... that includes the eh data registers (if needed)... */
8433 if (crtl->calls_eh_return)
8434 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8435 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8437 /* ... and any callee saved register that dataflow says is live. */
8438 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8439 if (df_regs_ever_live_p (regno)
8440 && !fixed_regs[regno]
8441 && (regno == R30_REGNUM
8442 || !crtl->abi->clobbers_full_reg_p (regno)))
8443 frame.reg_offset[regno] = SLOT_REQUIRED;
8445 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8446 if (df_regs_ever_live_p (regno)
8447 && !fixed_regs[regno]
8448 && !crtl->abi->clobbers_full_reg_p (regno))
8450 frame.reg_offset[regno] = SLOT_REQUIRED;
8451 last_fp_reg = regno;
8452 if (aarch64_emit_cfi_for_reg_p (regno))
8453 frame_related_fp_reg_p = true;
8456 /* Big-endian SVE frames need a spare predicate register in order
8457 to save Z8-Z15. Decide which register they should use. Prefer
8458 an unused argument register if possible, so that we don't force P4
8459 to be saved unnecessarily. */
8460 if (frame_related_fp_reg_p
8461 && crtl->abi->id () == ARM_PCS_SVE
8462 && BYTES_BIG_ENDIAN)
8464 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8465 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8466 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8467 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8468 break;
8469 gcc_assert (regno <= P7_REGNUM);
8470 frame.spare_pred_reg = regno;
8471 df_set_regs_ever_live (regno, true);
8474 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8475 if (df_regs_ever_live_p (regno)
8476 && !fixed_regs[regno]
8477 && !crtl->abi->clobbers_full_reg_p (regno))
8478 frame.reg_offset[regno] = SLOT_REQUIRED;
8480 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8481 LR counts as an implicit probe which allows us to maintain the invariant
8482 described in the comment at expand_prologue. */
8483 gcc_assert (crtl->is_leaf
8484 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8486 /* Now assign stack slots for the registers. Start with the predicate
8487 registers, since predicate LDR and STR have a relatively small
8488 offset range. These saves happen below the hard frame pointer. */
8489 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8490 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8492 frame.reg_offset[regno] = offset;
8493 offset += BYTES_PER_SVE_PRED;
8496 if (maybe_ne (offset, 0))
8498 /* If we have any vector registers to save above the predicate registers,
8499 the offset of the vector register save slots need to be a multiple
8500 of the vector size. This lets us use the immediate forms of LDR/STR
8501 (or LD1/ST1 for big-endian).
8503 A vector register is 8 times the size of a predicate register,
8504 and we need to save a maximum of 12 predicate registers, so the
8505 first vector register will be at either #1, MUL VL or #2, MUL VL.
8507 If we don't have any vector registers to save, and we know how
8508 big the predicate save area is, we can just round it up to the
8509 next 16-byte boundary. */
8510 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8511 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8512 else
8514 if (known_le (offset, vector_save_size))
8515 offset = vector_save_size;
8516 else if (known_le (offset, vector_save_size * 2))
8517 offset = vector_save_size * 2;
8518 else
8519 gcc_unreachable ();
8523 /* If we need to save any SVE vector registers, add them next. */
8524 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8525 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8526 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8528 frame.reg_offset[regno] = offset;
8529 offset += vector_save_size;
8532 /* OFFSET is now the offset of the hard frame pointer from the bottom
8533 of the callee save area. */
8534 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8535 frame.below_hard_fp_saved_regs_size = offset;
8536 if (frame.emit_frame_chain)
8538 /* FP and LR are placed in the linkage record. */
8539 frame.reg_offset[R29_REGNUM] = offset;
8540 frame.wb_push_candidate1 = R29_REGNUM;
8541 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
8542 frame.wb_push_candidate2 = R30_REGNUM;
8543 offset += 2 * UNITS_PER_WORD;
8546 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8547 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8549 frame.reg_offset[regno] = offset;
8550 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8551 frame.wb_push_candidate1 = regno;
8552 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8553 frame.wb_push_candidate2 = regno;
8554 offset += UNITS_PER_WORD;
8557 poly_int64 max_int_offset = offset;
8558 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8559 bool has_align_gap = maybe_ne (offset, max_int_offset);
8561 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8562 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8564 /* If there is an alignment gap between integer and fp callee-saves,
8565 allocate the last fp register to it if possible. */
8566 if (regno == last_fp_reg
8567 && has_align_gap
8568 && known_eq (vector_save_size, 8)
8569 && multiple_p (offset, 16))
8571 frame.reg_offset[regno] = max_int_offset;
8572 break;
8575 frame.reg_offset[regno] = offset;
8576 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8577 frame.wb_push_candidate1 = regno;
8578 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8579 && frame.wb_push_candidate1 >= V0_REGNUM)
8580 frame.wb_push_candidate2 = regno;
8581 offset += vector_save_size;
8584 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8586 frame.saved_regs_size = offset;
8588 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
8590 poly_int64 above_outgoing_args
8591 = aligned_upper_bound (varargs_and_saved_regs_size
8592 + get_frame_size (),
8593 STACK_BOUNDARY / BITS_PER_UNIT);
8595 frame.hard_fp_offset
8596 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8598 /* Both these values are already aligned. */
8599 gcc_assert (multiple_p (crtl->outgoing_args_size,
8600 STACK_BOUNDARY / BITS_PER_UNIT));
8601 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
8603 frame.locals_offset = frame.saved_varargs_size;
8605 frame.initial_adjust = 0;
8606 frame.final_adjust = 0;
8607 frame.callee_adjust = 0;
8608 frame.sve_callee_adjust = 0;
8609 frame.callee_offset = 0;
8611 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8612 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8614 /* The shadow call stack is only used in functions that push the LR
8615 onto the stack and that do not opt out via the "no_sanitize"
8616 attribute with the argument "shadow-call-stack". */
8617 frame.is_scs_enabled
8618 = (!crtl->calls_eh_return
8619 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8620 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8622 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8623 restore x30, and we don't need to pop x30 again in the traditional
8624 way. Pop candidates record the registers that need to be popped
8625 eventually. */
8626 if (frame.is_scs_enabled)
8628 if (frame.wb_pop_candidate2 == R30_REGNUM)
8629 frame.wb_pop_candidate2 = INVALID_REGNUM;
8630 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8631 frame.wb_pop_candidate1 = INVALID_REGNUM;
8634 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8635 256 to ensure that the offset meets the requirements of emit_move_insn.
8636 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8637 max_push_offset to 0, because no registers are popped at this time,
8638 so callee_adjust cannot be adjusted. */
8639 HOST_WIDE_INT max_push_offset = 0;
8640 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8641 max_push_offset = 512;
8642 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8643 max_push_offset = 256;
8645 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
8646 HOST_WIDE_INT const_saved_regs_size;
8647 if (frame.frame_size.is_constant (&const_size)
8648 && const_size < max_push_offset
8649 && known_eq (frame.hard_fp_offset, const_size))
8651 /* Simple, small frame with no outgoing arguments:
8653 stp reg1, reg2, [sp, -frame_size]!
8654 stp reg3, reg4, [sp, 16] */
8655 frame.callee_adjust = const_size;
8657 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
8658 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8659 && const_outgoing_args_size + const_saved_regs_size < 512
8660 /* We could handle this case even with outgoing args, provided
8661 that the number of args left us with valid offsets for all
8662 predicate and vector save slots. It's such a rare case that
8663 it hardly seems worth the effort though. */
8664 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
8665 && !(cfun->calls_alloca
8666 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8667 && const_fp_offset < max_push_offset))
8669 /* Frame with small outgoing arguments:
8671 sub sp, sp, frame_size
8672 stp reg1, reg2, [sp, outgoing_args_size]
8673 stp reg3, reg4, [sp, outgoing_args_size + 16] */
8674 frame.initial_adjust = frame.frame_size;
8675 frame.callee_offset = const_outgoing_args_size;
8677 else if (saves_below_hard_fp_p
8678 && known_eq (frame.saved_regs_size,
8679 frame.below_hard_fp_saved_regs_size))
8681 /* Frame in which all saves are SVE saves:
8683 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8684 save SVE registers relative to SP
8685 sub sp, sp, outgoing_args_size */
8686 frame.initial_adjust = (frame.hard_fp_offset
8687 + frame.below_hard_fp_saved_regs_size);
8688 frame.final_adjust = crtl->outgoing_args_size;
8690 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
8691 && const_fp_offset < max_push_offset)
8693 /* Frame with large outgoing arguments or SVE saves, but with
8694 a small local area:
8696 stp reg1, reg2, [sp, -hard_fp_offset]!
8697 stp reg3, reg4, [sp, 16]
8698 [sub sp, sp, below_hard_fp_saved_regs_size]
8699 [save SVE registers relative to SP]
8700 sub sp, sp, outgoing_args_size */
8701 frame.callee_adjust = const_fp_offset;
8702 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8703 frame.final_adjust = crtl->outgoing_args_size;
8705 else
8707 /* Frame with large local area and outgoing arguments or SVE saves,
8708 using frame pointer:
8710 sub sp, sp, hard_fp_offset
8711 stp x29, x30, [sp, 0]
8712 add x29, sp, 0
8713 stp reg3, reg4, [sp, 16]
8714 [sub sp, sp, below_hard_fp_saved_regs_size]
8715 [save SVE registers relative to SP]
8716 sub sp, sp, outgoing_args_size */
8717 frame.initial_adjust = frame.hard_fp_offset;
8718 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8719 frame.final_adjust = crtl->outgoing_args_size;
8722 /* Make sure the individual adjustments add up to the full frame size. */
8723 gcc_assert (known_eq (frame.initial_adjust
8724 + frame.callee_adjust
8725 + frame.sve_callee_adjust
8726 + frame.final_adjust, frame.frame_size));
8728 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8730 /* We've decided not to associate any register saves with the initial
8731 stack allocation. */
8732 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8733 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8736 frame.laid_out = true;
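/* A deliberately simple worked example of the classification above:
   a non-leaf function that needs a frame chain, has 16 bytes of locals,
   no SVE state and no outgoing arguments saves only x29/x30, so
   frame_size and hard_fp_offset are both 32 and the first ("simple,
   small frame") case applies, giving a prologue of essentially

	stp	x29, x30, [sp, -32]!

   plus the usual frame-chain setup.  */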
8739 /* Return true if the register REGNO is saved on entry to
8740 the current function. */
8742 static bool
8743 aarch64_register_saved_on_entry (int regno)
8745 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8748 /* Return the next register, from REGNO up to LIMIT, that the callee
8749 needs to save. */
8751 static unsigned
8752 aarch64_next_callee_save (unsigned regno, unsigned limit)
8754 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8755 regno ++;
8756 return regno;
8759 /* Push the register number REGNO of mode MODE to the stack with write-back
8760 adjusting the stack by ADJUSTMENT. */
8762 static void
8763 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8764 HOST_WIDE_INT adjustment)
8766 rtx base_rtx = stack_pointer_rtx;
8767 rtx insn, reg, mem;
8769 reg = gen_rtx_REG (mode, regno);
8770 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8771 plus_constant (Pmode, base_rtx, -adjustment));
8772 mem = gen_frame_mem (mode, mem);
8774 insn = emit_move_insn (mem, reg);
8775 RTX_FRAME_RELATED_P (insn) = 1;
8778 /* Generate and return an instruction to store the pair of registers
8779 REG and REG2 of mode MODE to location BASE with write-back adjusting
8780 the stack location BASE by ADJUSTMENT. */
8782 static rtx
8783 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8784 HOST_WIDE_INT adjustment)
8786 switch (mode)
8788 case E_DImode:
8789 return gen_storewb_pairdi_di (base, base, reg, reg2,
8790 GEN_INT (-adjustment),
8791 GEN_INT (UNITS_PER_WORD - adjustment));
8792 case E_DFmode:
8793 return gen_storewb_pairdf_di (base, base, reg, reg2,
8794 GEN_INT (-adjustment),
8795 GEN_INT (UNITS_PER_WORD - adjustment));
8796 case E_TFmode:
8797 return gen_storewb_pairtf_di (base, base, reg, reg2,
8798 GEN_INT (-adjustment),
8799 GEN_INT (UNITS_PER_VREG - adjustment));
8800 default:
8801 gcc_unreachable ();
8805 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8806 stack pointer by ADJUSTMENT. */
8808 static void
8809 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8811 rtx_insn *insn;
8812 machine_mode mode = aarch64_reg_save_mode (regno1);
8814 if (regno2 == INVALID_REGNUM)
8815 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8817 rtx reg1 = gen_rtx_REG (mode, regno1);
8818 rtx reg2 = gen_rtx_REG (mode, regno2);
8820 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8821 reg2, adjustment));
8822 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8823 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8824 RTX_FRAME_RELATED_P (insn) = 1;
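/* For example, pushing x29 and x30 with an adjustment of 32 produces a
   single write-back store pair, conceptually

	stp	x29, x30, [sp, -32]!

   with the constituent stores and the adjustment marked frame-related.  */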
8827 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8828 adjusting it by ADJUSTMENT afterwards. */
8830 static rtx
8831 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8832 HOST_WIDE_INT adjustment)
8834 switch (mode)
8836 case E_DImode:
8837 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8838 GEN_INT (UNITS_PER_WORD));
8839 case E_DFmode:
8840 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8841 GEN_INT (UNITS_PER_WORD));
8842 case E_TFmode:
8843 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8844 GEN_INT (UNITS_PER_VREG));
8845 default:
8846 gcc_unreachable ();
8850 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8851 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8852 into CFI_OPS. */
8854 static void
8855 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8856 rtx *cfi_ops)
8858 machine_mode mode = aarch64_reg_save_mode (regno1);
8859 rtx reg1 = gen_rtx_REG (mode, regno1);
8861 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8863 if (regno2 == INVALID_REGNUM)
8865 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8866 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8867 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8869 else
8871 rtx reg2 = gen_rtx_REG (mode, regno2);
8872 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8873 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8874 reg2, adjustment));
8878 /* Generate and return a store pair instruction of mode MODE to store
8879 register REG1 to MEM1 and register REG2 to MEM2. */
8881 static rtx
8882 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8883 rtx reg2)
8885 switch (mode)
8887 case E_DImode:
8888 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8890 case E_DFmode:
8891 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8893 case E_TFmode:
8894 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8896 case E_V4SImode:
8897 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8899 case E_V16QImode:
8900 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8902 default:
8903 gcc_unreachable ();
8907 /* Generate and return a load pair instruction of mode MODE to load register
8908 REG1 from MEM1 and register REG2 from MEM2. */
8910 static rtx
8911 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8912 rtx mem2)
8914 switch (mode)
8916 case E_DImode:
8917 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8919 case E_DFmode:
8920 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8922 case E_TFmode:
8923 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8925 case E_V4SImode:
8926 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8928 default:
8929 gcc_unreachable ();
8933 /* Return TRUE if return address signing should be enabled for the current
8934 function, otherwise return FALSE. */
8936 bool
8937 aarch64_return_address_signing_enabled (void)
8939 /* This function should only be called after the frame is laid out. */
8940 gcc_assert (cfun->machine->frame.laid_out);
8942 /* Turn return address signing off in any function that uses
8943 __builtin_eh_return. The address passed to __builtin_eh_return
8944 is not signed so either it has to be signed (with original sp)
8945 or the code path that uses it has to avoid authenticating it.
8946 Currently eh return introduces a return-to-anywhere gadget, no
8947 matter what we do here, since it uses ret with a user-provided
8948 address. An ideal fix for that is to use an indirect branch,
8949 which can be protected with BTI j (to some extent). */
8950 if (crtl->calls_eh_return)
8951 return false;
8953 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8954 if its LR is pushed onto stack. */
8955 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8956 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8957 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8960 /* Only used by the arm backend. */
8961 void aarch_bti_arch_check (void)
8964 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8965 bool
8966 aarch_bti_enabled (void)
8968 return (aarch_enable_bti == 1);
8971 /* Check if INSN is a BTI J insn. */
8972 bool
8973 aarch_bti_j_insn_p (rtx_insn *insn)
8975 if (!insn || !INSN_P (insn))
8976 return false;
8978 rtx pat = PATTERN (insn);
8979 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8982 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8983 bool
8984 aarch_pac_insn_p (rtx x)
8986 if (!INSN_P (x))
8987 return false;
8989 subrtx_var_iterator::array_type array;
8990 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8992 rtx sub = *iter;
8993 if (sub && GET_CODE (sub) == UNSPEC)
8995 int unspec_val = XINT (sub, 1);
8996 switch (unspec_val)
8998 case UNSPEC_PACIASP:
8999 case UNSPEC_PACIBSP:
9000 return true;
9002 default:
9003 return false;
9005 iter.skip_subrtxes ();
9008 return false;
9011 rtx aarch_gen_bti_c (void)
9013 return gen_bti_c ();
9016 rtx aarch_gen_bti_j (void)
9018 return gen_bti_j ();
9021 /* The caller is going to use ST1D or LD1D to save or restore an SVE
9022 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
9023 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
9025 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
9026 or LD1D address
9028 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
9029 if the variable isn't already nonnull
9031 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
9032 Handle this case using a temporary base register that is suitable for
9033 all offsets in that range. Use ANCHOR_REG as this base register if it
9034 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
9036 static inline void
9037 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
9038 rtx &anchor_reg, poly_int64 &offset,
9039 rtx &ptrue)
9041 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
9043 /* This is the maximum valid offset of the anchor from the base.
9044 Lower values would be valid too. */
9045 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
9046 if (!anchor_reg)
9048 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9049 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9050 gen_int_mode (anchor_offset, Pmode)));
9052 base_rtx = anchor_reg;
9053 offset -= anchor_offset;
9055 if (!ptrue)
9057 int pred_reg = cfun->machine->frame.spare_pred_reg;
9058 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
9059 CONSTM1_RTX (VNx16BImode));
9060 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
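/* Worked example (illustrative): if OFFSET is 10 * GET_MODE_SIZE (MODE),
   it is outside the [-8, 7] * GET_MODE_SIZE (MODE) range that ST1D/LD1D
   can encode directly.  The code above then sets ANCHOR_REG to
   BASE_RTX + 16 * GET_MODE_SIZE (MODE) and rewrites the access as
   ANCHOR_REG - 6 * GET_MODE_SIZE (MODE), which is in range.  */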
9064 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
9065 is saved at BASE + OFFSET. */
9067 static void
9068 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
9069 rtx base, poly_int64 offset)
9071 rtx mem = gen_frame_mem (GET_MODE (reg),
9072 plus_constant (Pmode, base, offset));
9073 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
9076 /* Emit code to save the callee-saved registers from register number START
9077 to LIMIT to the stack at the location starting at offset START_OFFSET,
9078 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
9079 is true if the hard frame pointer has been set up. */
9081 static void
9082 aarch64_save_callee_saves (poly_int64 start_offset,
9083 unsigned start, unsigned limit, bool skip_wb,
9084 bool hard_fp_valid_p)
9086 rtx_insn *insn;
9087 unsigned regno;
9088 unsigned regno2;
9089 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9091 for (regno = aarch64_next_callee_save (start, limit);
9092 regno <= limit;
9093 regno = aarch64_next_callee_save (regno + 1, limit))
9095 rtx reg, mem;
9096 poly_int64 offset;
9097 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9099 if (skip_wb
9100 && (regno == cfun->machine->frame.wb_push_candidate1
9101 || regno == cfun->machine->frame.wb_push_candidate2))
9102 continue;
9104 if (cfun->machine->reg_is_wrapped_separately[regno])
9105 continue;
9107 machine_mode mode = aarch64_reg_save_mode (regno);
9108 reg = gen_rtx_REG (mode, regno);
9109 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9110 rtx base_rtx = stack_pointer_rtx;
9111 poly_int64 sp_offset = offset;
9113 HOST_WIDE_INT const_offset;
9114 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9115 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9116 offset, ptrue);
9117 else if (GP_REGNUM_P (regno)
9118 && (!offset.is_constant (&const_offset) || const_offset >= 512))
9120 gcc_assert (known_eq (start_offset, 0));
9121 poly_int64 fp_offset
9122 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9123 if (hard_fp_valid_p)
9124 base_rtx = hard_frame_pointer_rtx;
9125 else
9127 if (!anchor_reg)
9129 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9130 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9131 gen_int_mode (fp_offset, Pmode)));
9133 base_rtx = anchor_reg;
9135 offset -= fp_offset;
9137 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9138 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
9140 if (!aarch64_sve_mode_p (mode)
9141 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9142 && !cfun->machine->reg_is_wrapped_separately[regno2]
9143 && known_eq (GET_MODE_SIZE (mode),
9144 cfun->machine->frame.reg_offset[regno2]
9145 - cfun->machine->frame.reg_offset[regno]))
9147 rtx reg2 = gen_rtx_REG (mode, regno2);
9148 rtx mem2;
9150 offset += GET_MODE_SIZE (mode);
9151 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9152 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
9153 reg2));
9155 /* The first part of a frame-related parallel insn is
9156 always assumed to be relevant to the frame
9157 calculations; subsequent parts are only
9158 frame-related if explicitly marked. */
9159 if (aarch64_emit_cfi_for_reg_p (regno2))
9161 if (need_cfa_note_p)
9162 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
9163 sp_offset + GET_MODE_SIZE (mode));
9164 else
9165 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
9168 regno = regno2;
9170 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9172 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
9173 need_cfa_note_p = true;
9175 else if (aarch64_sve_mode_p (mode))
9176 insn = emit_insn (gen_rtx_SET (mem, reg));
9177 else
9178 insn = emit_move_insn (mem, reg);
9180 RTX_FRAME_RELATED_P (insn) = frame_related_p;
9181 if (frame_related_p && need_cfa_note_p)
9182 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
9186 /* Emit code to restore the callee registers from register number START
9187 up to and including LIMIT. Restore from the stack offset START_OFFSET,
9188 skipping any write-back candidates if SKIP_WB is true. Write the
9189 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
9191 static void
9192 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
9193 unsigned limit, bool skip_wb, rtx *cfi_ops)
9195 unsigned regno;
9196 unsigned regno2;
9197 poly_int64 offset;
9198 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9200 for (regno = aarch64_next_callee_save (start, limit);
9201 regno <= limit;
9202 regno = aarch64_next_callee_save (regno + 1, limit))
9204 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9205 if (cfun->machine->reg_is_wrapped_separately[regno])
9206 continue;
9208 rtx reg, mem;
9210 if (skip_wb
9211 && (regno == cfun->machine->frame.wb_pop_candidate1
9212 || regno == cfun->machine->frame.wb_pop_candidate2))
9213 continue;
9215 machine_mode mode = aarch64_reg_save_mode (regno);
9216 reg = gen_rtx_REG (mode, regno);
9217 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9218 rtx base_rtx = stack_pointer_rtx;
9219 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9220 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9221 offset, ptrue);
9222 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9224 if (!aarch64_sve_mode_p (mode)
9225 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9226 && !cfun->machine->reg_is_wrapped_separately[regno2]
9227 && known_eq (GET_MODE_SIZE (mode),
9228 cfun->machine->frame.reg_offset[regno2]
9229 - cfun->machine->frame.reg_offset[regno]))
9231 rtx reg2 = gen_rtx_REG (mode, regno2);
9232 rtx mem2;
9234 offset += GET_MODE_SIZE (mode);
9235 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9236 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9238 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9239 regno = regno2;
9241 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9242 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9243 else if (aarch64_sve_mode_p (mode))
9244 emit_insn (gen_rtx_SET (reg, mem));
9245 else
9246 emit_move_insn (reg, mem);
9247 if (frame_related_p)
9248 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
9252 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9253 of MODE. */
9255 static inline bool
9256 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9258 HOST_WIDE_INT multiple;
9259 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9260 && IN_RANGE (multiple, -8, 7));
9263 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9264 of MODE. */
9266 static inline bool
9267 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9269 HOST_WIDE_INT multiple;
9270 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9271 && IN_RANGE (multiple, -32, 31));
9274 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9275 of MODE. */
9277 static inline bool
9278 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9280 HOST_WIDE_INT multiple;
9281 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9282 && IN_RANGE (multiple, 0, 63));
9285 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9286 of MODE. */
9288 bool
9289 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9291 HOST_WIDE_INT multiple;
9292 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9293 && IN_RANGE (multiple, -64, 63));
9296 /* Return true if OFFSET is a signed 9-bit value. */
9298 bool
9299 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9300 poly_int64 offset)
9302 HOST_WIDE_INT const_offset;
9303 return (offset.is_constant (&const_offset)
9304 && IN_RANGE (const_offset, -256, 255));
9307 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9308 of MODE. */
9310 static inline bool
9311 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9313 HOST_WIDE_INT multiple;
9314 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9315 && IN_RANGE (multiple, -256, 255));
9318 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9319 of MODE. */
9321 static inline bool
9322 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9324 HOST_WIDE_INT multiple;
9325 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9326 && IN_RANGE (multiple, 0, 4095));
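/* Worked example (for illustration, assuming 8-byte DImode): the 7-bit
   signed scaled check above accepts multiples of 8 in [-64*8, 63*8]
   = [-512, 504], matching the LDP/STP immediate range; the 12-bit
   unsigned scaled check accepts multiples of 8 in [0, 4095*8]
   = [0, 32760], matching LDR/STR with an unsigned scaled offset; and
   the 9-bit unscaled check accepts any byte offset in [-256, 255],
   matching LDUR/STUR.  */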
9329 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9331 static sbitmap
9332 aarch64_get_separate_components (void)
9334 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9335 bitmap_clear (components);
9337 /* The registers we need saved to the frame. */
9338 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9339 if (aarch64_register_saved_on_entry (regno))
9341 /* Punt on saves and restores that use ST1D and LD1D. We could
9342 try to be smarter, but it would involve making sure that the
9343 spare predicate register itself is safe to use at the save
9344 and restore points. Also, when a frame pointer is being used,
9345 the slots are often out of reach of ST1D and LD1D anyway. */
9346 machine_mode mode = aarch64_reg_save_mode (regno);
9347 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9348 continue;
9350 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9352 /* If the register is saved in the first SVE save slot, we use
9353 it as a stack probe for -fstack-clash-protection. */
9354 if (flag_stack_clash_protection
9355 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
9356 && known_eq (offset, 0))
9357 continue;
9359 /* Get the offset relative to the register we'll use. */
9360 if (frame_pointer_needed)
9361 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9362 else
9363 offset += crtl->outgoing_args_size;
9365 /* Check that we can access the stack slot of the register with one
9366 direct load with no adjustments needed. */
9367 if (aarch64_sve_mode_p (mode)
9368 ? offset_9bit_signed_scaled_p (mode, offset)
9369 : offset_12bit_unsigned_scaled_p (mode, offset))
9370 bitmap_set_bit (components, regno);
9373 /* Don't mess with the hard frame pointer. */
9374 if (frame_pointer_needed)
9375 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9377 /* If the spare predicate register used by big-endian SVE code
9378 is call-preserved, it must be saved in the main prologue
9379 before any saves that use it. */
9380 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
9381 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
9383 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9384 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9385 /* If registers have been chosen to be stored/restored with
9386 writeback, don't interfere with them, to avoid having to output explicit
9387 stack adjustment instructions. */
9388 if (reg2 != INVALID_REGNUM)
9389 bitmap_clear_bit (components, reg2);
9390 if (reg1 != INVALID_REGNUM)
9391 bitmap_clear_bit (components, reg1);
9393 bitmap_clear_bit (components, LR_REGNUM);
9394 bitmap_clear_bit (components, SP_REGNUM);
9396 return components;
9399 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9401 static sbitmap
9402 aarch64_components_for_bb (basic_block bb)
9404 bitmap in = DF_LIVE_IN (bb);
9405 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9406 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9408 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9409 bitmap_clear (components);
9411 /* Clobbered registers don't generate values in any meaningful sense,
9412 since nothing after the clobber can rely on their value. And we can't
9413 say that partially-clobbered registers are unconditionally killed,
9414 because whether they're killed or not depends on the mode of the
9415 value they're holding. Thus partially call-clobbered registers
9416 appear in neither the kill set nor the gen set.
9418 Check manually for any calls that clobber more of a register than the
9419 current function can. */
9420 function_abi_aggregator callee_abis;
9421 rtx_insn *insn;
9422 FOR_BB_INSNS (bb, insn)
9423 if (CALL_P (insn))
9424 callee_abis.note_callee_abi (insn_callee_abi (insn));
9425 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9427 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9428 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9429 if (!fixed_regs[regno]
9430 && !crtl->abi->clobbers_full_reg_p (regno)
9431 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9432 || bitmap_bit_p (in, regno)
9433 || bitmap_bit_p (gen, regno)
9434 || bitmap_bit_p (kill, regno)))
9436 bitmap_set_bit (components, regno);
9438 /* If there is a callee-save at an adjacent offset, add it as well
9439 to increase the use of LDP/STP. */
9440 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9441 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9443 if (regno2 <= LAST_SAVED_REGNUM)
9445 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9446 if (regno < regno2
9447 ? known_eq (offset + 8, offset2)
9448 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9449 bitmap_set_bit (components, regno2);
9453 return components;
9456 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9457 Nothing to do for aarch64. */
9459 static void
9460 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9464 /* Return the next set bit in BMP from START onwards. Return the total number
9465 of bits in BMP if no set bit is found at or after START. */
9467 static unsigned int
9468 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9470 unsigned int nbits = SBITMAP_SIZE (bmp);
9471 if (start == nbits)
9472 return start;
9474 gcc_assert (start < nbits);
9475 for (unsigned int i = start; i < nbits; i++)
9476 if (bitmap_bit_p (bmp, i))
9477 return i;
9479 return nbits;
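/* Usage sketch (mirrors the loops below; handle_reg is a placeholder):

     unsigned int nbits = SBITMAP_SIZE (bmp);
     for (unsigned int regno = aarch64_get_next_set_bit (bmp, 0);
	  regno != nbits;
	  regno = aarch64_get_next_set_bit (bmp, regno + 1))
       handle_reg (regno);

   The function returns SBITMAP_SIZE (bmp) once no set bit remains, which
   terminates the loop.  */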
9482 /* Do the work for aarch64_emit_prologue_components and
9483 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9484 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9485 for these components or the epilogue sequence. That is, it determines
9486 whether we should emit stores or loads and what kind of CFA notes to attach
9487 to the insns. Otherwise the logic for the two sequences is very
9488 similar. */
9490 static void
9491 aarch64_process_components (sbitmap components, bool prologue_p)
9493 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9494 ? HARD_FRAME_POINTER_REGNUM
9495 : STACK_POINTER_REGNUM);
9497 unsigned last_regno = SBITMAP_SIZE (components);
9498 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9499 rtx_insn *insn = NULL;
9501 while (regno != last_regno)
9503 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9504 machine_mode mode = aarch64_reg_save_mode (regno);
9506 rtx reg = gen_rtx_REG (mode, regno);
9507 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9508 if (frame_pointer_needed)
9509 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9510 else
9511 offset += crtl->outgoing_args_size;
9513 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9514 rtx mem = gen_frame_mem (mode, addr);
9516 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9517 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9518 /* No more registers to handle after REGNO.
9519 Emit a single save/restore and exit. */
9520 if (regno2 == last_regno)
9522 insn = emit_insn (set);
9523 if (frame_related_p)
9525 RTX_FRAME_RELATED_P (insn) = 1;
9526 if (prologue_p)
9527 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9528 else
9529 add_reg_note (insn, REG_CFA_RESTORE, reg);
9531 break;
9534 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9535 /* The next register is not of the same class or its offset is not
9536 mergeable with the current one into a pair. */
9537 if (aarch64_sve_mode_p (mode)
9538 || !satisfies_constraint_Ump (mem)
9539 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9540 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9541 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9542 GET_MODE_SIZE (mode)))
9544 insn = emit_insn (set);
9545 if (frame_related_p)
9547 RTX_FRAME_RELATED_P (insn) = 1;
9548 if (prologue_p)
9549 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9550 else
9551 add_reg_note (insn, REG_CFA_RESTORE, reg);
9554 regno = regno2;
9555 continue;
9558 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9560 /* REGNO2 can be saved/restored in a pair with REGNO. */
9561 rtx reg2 = gen_rtx_REG (mode, regno2);
9562 if (frame_pointer_needed)
9563 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9564 else
9565 offset2 += crtl->outgoing_args_size;
9566 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9567 rtx mem2 = gen_frame_mem (mode, addr2);
9568 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9569 : gen_rtx_SET (reg2, mem2);
9571 if (prologue_p)
9572 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9573 else
9574 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9576 if (frame_related_p || frame_related2_p)
9578 RTX_FRAME_RELATED_P (insn) = 1;
9579 if (prologue_p)
9581 if (frame_related_p)
9582 add_reg_note (insn, REG_CFA_OFFSET, set);
9583 if (frame_related2_p)
9584 add_reg_note (insn, REG_CFA_OFFSET, set2);
9586 else
9588 if (frame_related_p)
9589 add_reg_note (insn, REG_CFA_RESTORE, reg);
9590 if (frame_related2_p)
9591 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9595 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9599 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9601 static void
9602 aarch64_emit_prologue_components (sbitmap components)
9604 aarch64_process_components (components, true);
9607 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9609 static void
9610 aarch64_emit_epilogue_components (sbitmap components)
9612 aarch64_process_components (components, false);
9615 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9617 static void
9618 aarch64_set_handled_components (sbitmap components)
9620 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9621 if (bitmap_bit_p (components, regno))
9622 cfun->machine->reg_is_wrapped_separately[regno] = true;
9625 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
9626 determine the probe offset for alloca. */
9628 static HOST_WIDE_INT
9629 aarch64_stack_clash_protection_alloca_probe_range (void)
9631 return STACK_CLASH_CALLER_GUARD;
9635 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9636 registers. If POLY_SIZE is not large enough to require a probe this function
9637 will only adjust the stack. When allocating the stack space
9638 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9639 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9640 arguments. If we are, we ensure that any allocation larger than the
9641 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer is
9642 maintained.
9644 We emit barriers after each stack adjustment to prevent optimizations from
9645 breaking the invariant that we never drop the stack more than a page. This
9646 invariant is needed to make it easier to correctly handle asynchronous
9647 events: if we were to allow the stack to be dropped by more than a page
9648 and then emit multiple probes, a signal taken somewhere in between would
9649 leave the signal handler unable to know the state of the stack and unable
9650 to make any assumptions about which pages have been probed. */
9652 static void
9653 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9654 poly_int64 poly_size,
9655 bool frame_related_p,
9656 bool final_adjustment_p)
9658 HOST_WIDE_INT guard_size
9659 = 1 << param_stack_clash_protection_guard_size;
9660 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9661 HOST_WIDE_INT min_probe_threshold
9662 = (final_adjustment_p
9663 ? guard_used_by_caller
9664 : guard_size - guard_used_by_caller);
9665 /* When doing the final adjustment for the outgoing arguments, take into
9666 account any unprobed space there is above the current SP. There are
9667 two cases:
9669 - When saving SVE registers below the hard frame pointer, we force
9670 the lowest save to take place in the prologue before doing the final
9671 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9672 This acts as a probe at SP, so there is no unprobed space.
9674 - When there are no SVE register saves, we use the store of the link
9675 register as a probe. We can't assume that LR was saved at position 0
9676 though, so treat any space below it as unprobed. */
9677 if (final_adjustment_p
9678 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9680 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9681 if (known_ge (lr_offset, 0))
9682 min_probe_threshold -= lr_offset.to_constant ();
9683 else
9684 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9687 poly_int64 frame_size = cfun->machine->frame.frame_size;
9689 /* We should always have a positive probe threshold. */
9690 gcc_assert (min_probe_threshold > 0);
9692 if (flag_stack_clash_protection && !final_adjustment_p)
9694 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9695 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9696 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9698 if (known_eq (frame_size, 0))
9700 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9702 else if (known_lt (initial_adjust + sve_callee_adjust,
9703 guard_size - guard_used_by_caller)
9704 && known_lt (final_adjust, guard_used_by_caller))
9706 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9710 /* If SIZE is not large enough to require probing, just adjust the stack and
9711 exit. */
9712 if (known_lt (poly_size, min_probe_threshold)
9713 || !flag_stack_clash_protection)
9715 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9716 return;
9719 HOST_WIDE_INT size;
9720 /* Handle the SVE non-constant case first. */
9721 if (!poly_size.is_constant (&size))
9723 if (dump_file)
9725 fprintf (dump_file, "Stack clash SVE prologue: ");
9726 print_dec (poly_size, dump_file);
9727 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9730 /* First calculate the number of bytes we're actually spilling. */
9731 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9732 poly_size, temp1, temp2, false, true);
9734 rtx_insn *insn = get_last_insn ();
9736 if (frame_related_p)
9738 /* This is done to provide unwinding information for the stack
9739 adjustments we're about to do, however to prevent the optimizers
9740 from removing the R11 move and leaving the CFA note (which would be
9741 very wrong) we tie the old and new stack pointer together.
9742 The tie will expand to nothing but the optimizers will not touch
9743 the instruction. */
9744 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9745 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9746 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9748 /* We want the CFA independent of the stack pointer for the
9749 duration of the loop. */
9750 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9751 RTX_FRAME_RELATED_P (insn) = 1;
9754 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9755 rtx guard_const = gen_int_mode (guard_size, Pmode);
9757 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9758 stack_pointer_rtx, temp1,
9759 probe_const, guard_const));
9761 /* Now reset the CFA register if needed. */
9762 if (frame_related_p)
9764 add_reg_note (insn, REG_CFA_DEF_CFA,
9765 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9766 gen_int_mode (poly_size, Pmode)));
9767 RTX_FRAME_RELATED_P (insn) = 1;
9770 return;
9773 if (dump_file)
9774 fprintf (dump_file,
9775 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9776 " bytes, probing will be required.\n", size);
9778 /* Round size to the nearest multiple of guard_size, and calculate the
9779 residual as the difference between the original size and the rounded
9780 size. */
9781 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9782 HOST_WIDE_INT residual = size - rounded_size;
9784 /* We can handle a small number of allocations/probes inline. Otherwise
9785 punt to a loop. */
9786 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9788 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9790 aarch64_sub_sp (NULL, temp2, guard_size, true);
9791 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9792 guard_used_by_caller));
9793 emit_insn (gen_blockage ());
9795 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9797 else
9799 /* Compute the ending address. */
9800 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9801 temp1, NULL, false, true);
9802 rtx_insn *insn = get_last_insn ();
9804 /* For the initial allocation, we don't have a frame pointer
9805 set up, so we always need CFI notes. If we're doing the
9806 final allocation, then we may have a frame pointer, in which
9807 case it is the CFA, otherwise we need CFI notes.
9809 We can determine which allocation we are doing by looking at
9810 the value of FRAME_RELATED_P since the final allocations are not
9811 frame related. */
9812 if (frame_related_p)
9814 /* We want the CFA independent of the stack pointer for the
9815 duration of the loop. */
9816 add_reg_note (insn, REG_CFA_DEF_CFA,
9817 plus_constant (Pmode, temp1, rounded_size));
9818 RTX_FRAME_RELATED_P (insn) = 1;
9821 /* This allocates and probes the stack. Note that this re-uses some of
9822 the existing Ada stack protection code. However we are guaranteed not
9823 to enter the non-loop or residual branches of that code.
9825 The non-loop part won't be entered because if our allocation amount
9826 doesn't require a loop, the case above would handle it.
9828 The residual amount won't be entered because TEMP1 is a multiple of
9829 the allocation size. The residual will always be 0. As such, the only
9830 part we are actually using from that code is the loop setup. The
9831 actual probing is done in aarch64_output_probe_stack_range. */
9832 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9833 stack_pointer_rtx, temp1));
9835 /* Now reset the CFA register if needed. */
9836 if (frame_related_p)
9838 add_reg_note (insn, REG_CFA_DEF_CFA,
9839 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9840 RTX_FRAME_RELATED_P (insn) = 1;
9843 emit_insn (gen_blockage ());
9844 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9847 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9848 be probed. This maintains the requirement that each page is probed at
9849 least once. For initial probing we probe only if the allocation is
9850 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9851 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9852 GUARD_SIZE. This means that for any allocation large enough to
9853 trigger a probe here, we'll have at least one, and for any allocation
9854 too small for this code to emit anything, the page will already have been
9855 probed by the saving of FP/LR, either in this function or in a callee. If
9856 we don't have any callees then we won't have more stack adjustments and so
9857 are still safe. */
9858 if (residual)
9860 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9861 /* If we're doing final adjustments, and we've done any full page
9862 allocations then any residual needs to be probed. */
9863 if (final_adjustment_p && rounded_size != 0)
9864 min_probe_threshold = 0;
9865 /* If doing a small final adjustment, we always probe at offset 0.
9866 This is done to avoid issues when LR is not at position 0 or when
9867 the final adjustment is smaller than the probing offset. */
9868 else if (final_adjustment_p && rounded_size == 0)
9869 residual_probe_offset = 0;
9871 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9872 if (residual >= min_probe_threshold)
9874 if (dump_file)
9875 fprintf (dump_file,
9876 "Stack clash AArch64 prologue residuals: "
9877 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9878 "\n", residual);
9880 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9881 residual_probe_offset));
9882 emit_insn (gen_blockage ());
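/* Rough illustration (not literal compiler output): with a 64KB guard and
   a 160KB constant allocation, the inline-probe path above would emit
   something along the lines of

	sub	sp, sp, #65536
	str	xzr, [sp, 1024]
	sub	sp, sp, #65536
	str	xzr, [sp, 1024]
	sub	sp, sp, #32768		// residual, probed only if large enough

   with scheduling barriers between the adjustments.  */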
9887 /* Return 1 if the register is used by the epilogue. We need to say the
9888 return register is used, but only after epilogue generation is complete.
9889 Note that in the case of sibcalls, the values "used by the epilogue" are
9890 considered live at the start of the called function.
9892 For SIMD functions we need to return 1 for FP registers that are saved and
9893 restored by a function but are not zero in call_used_regs. If we do not do
9894 this, optimizations may remove the restore of the register. */
9896 int
9897 aarch64_epilogue_uses (int regno)
9899 if (epilogue_completed)
9901 if (regno == LR_REGNUM)
9902 return 1;
9904 return 0;
9907 /* AArch64 stack frames generated by this compiler look like:
9909 +-------------------------------+
9911 | incoming stack arguments |
9913 +-------------------------------+
9914 | | <-- incoming stack pointer (aligned)
9915 | callee-allocated save area |
9916 | for register varargs |
9918 +-------------------------------+
9919 | local variables | <-- frame_pointer_rtx
9921 +-------------------------------+
9922 | padding | \
9923 +-------------------------------+ |
9924 | callee-saved registers | | frame.saved_regs_size
9925 +-------------------------------+ |
9926 | LR' | |
9927 +-------------------------------+ |
9928 | FP' | |
9929 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9930 | SVE vector registers | | \
9931 +-------------------------------+ | | below_hard_fp_saved_regs_size
9932 | SVE predicate registers | / /
9933 +-------------------------------+
9934 | dynamic allocation |
9935 +-------------------------------+
9936 | padding |
9937 +-------------------------------+
9938 | outgoing stack arguments | <-- arg_pointer
9940 +-------------------------------+
9941 | | <-- stack_pointer_rtx (aligned)
9943 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9944 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9945 unchanged.
9947 By default for stack-clash we assume the guard is at least 64KB, but this
9948 value is configurable to either 4KB or 64KB. We also force the guard size to
9949 be the same as the probing interval and both values are kept in sync.
9951 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9952 on the guard size) of stack space without probing.
9954 When probing is needed, we emit a probe at the start of the prologue
9955 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9957 We have to track how much space has been allocated, and the only stores
9958 to the stack that we track as implicit probes are the FP/LR stores.
9960 For outgoing arguments we probe if the size is larger than 1KB, so that
9961 the ABI-specified buffer is maintained for the next callee.
9963 The following registers are reserved during frame layout and should not be
9964 used for any other purpose:
9966 - r11: Used by stack clash protection when SVE is enabled, and also
9967 as an anchor register when saving and restoring registers
9968 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9969 - r14 and r15: Used for speculation tracking.
9970 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9971 - r30(LR), r29(FP): Used by standard frame layout.
9973 These registers must be avoided in frame layout related code unless the
9974 explicit intention is to interact with one of the features listed above. */
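/* A concrete (simplified) example of the layout above: for a function
   that needs a frame chain and saves x19, a typical prologue/epilogue
   pair looks roughly like

	stp	x29, x30, [sp, -32]!	// FP'/LR', callee_adjust = 32
	mov	x29, sp			// hard_frame_pointer_rtx
	str	x19, [sp, 16]		// callee-saved register
	...
	ldr	x19, [sp, 16]
	ldp	x29, x30, [sp], 32
	ret

   Larger or variable-sized frames add the initial, SVE and final
   adjustments described above.  */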
9976 /* Generate the prologue instructions for entry into a function.
9977 Establish the stack frame by decreasing the stack pointer with a
9978 properly calculated size and, if necessary, create a frame record
9979 filled with the values of LR and previous frame pointer. The
9980 current FP is also set up if it is in use. */
9982 void
9983 aarch64_expand_prologue (void)
9985 poly_int64 frame_size = cfun->machine->frame.frame_size;
9986 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9987 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9988 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9989 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9990 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9991 poly_int64 below_hard_fp_saved_regs_size
9992 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9993 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9994 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9995 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
9996 rtx_insn *insn;
9998 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
10000 /* Fold the SVE allocation into the initial allocation.
10001 We don't do this in aarch64_layout_frame to avoid pessimizing
10002 the epilogue code. */
10003 initial_adjust += sve_callee_adjust;
10004 sve_callee_adjust = 0;
10007 /* Sign return address for functions. */
10008 if (aarch64_return_address_signing_enabled ())
10010 switch (aarch_ra_sign_key)
10012 case AARCH_KEY_A:
10013 insn = emit_insn (gen_paciasp ());
10014 break;
10015 case AARCH_KEY_B:
10016 insn = emit_insn (gen_pacibsp ());
10017 break;
10018 default:
10019 gcc_unreachable ();
10021 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10022 RTX_FRAME_RELATED_P (insn) = 1;
10025 /* Push return address to shadow call stack. */
10026 if (cfun->machine->frame.is_scs_enabled)
10027 emit_insn (gen_scs_push ());
10029 if (flag_stack_usage_info)
10030 current_function_static_stack_size = constant_lower_bound (frame_size);
10032 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10034 if (crtl->is_leaf && !cfun->calls_alloca)
10036 if (maybe_gt (frame_size, PROBE_INTERVAL)
10037 && maybe_gt (frame_size, get_stack_check_protect ()))
10038 aarch64_emit_probe_stack_range (get_stack_check_protect (),
10039 (frame_size
10040 - get_stack_check_protect ()));
10042 else if (maybe_gt (frame_size, 0))
10043 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
10046 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10047 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10049 /* In theory we should never have both an initial adjustment
10050 and a callee save adjustment. Verify that is the case since the
10051 code below does not handle it for -fstack-clash-protection. */
10052 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
10054 /* Will only probe if the initial adjustment is larger than the guard
10055 less the amount of the guard reserved for use by the caller's
10056 outgoing args. */
10057 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
10058 true, false);
10060 if (callee_adjust != 0)
10061 aarch64_push_regs (reg1, reg2, callee_adjust);
10063 /* The offset of the frame chain record (if any) from the current SP. */
10064 poly_int64 chain_offset = (initial_adjust + callee_adjust
10065 - cfun->machine->frame.hard_fp_offset);
10066 gcc_assert (known_ge (chain_offset, 0));
10068 /* The offset of the bottom of the save area from the current SP. */
10069 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
10071 if (emit_frame_chain)
10073 if (callee_adjust == 0)
10075 reg1 = R29_REGNUM;
10076 reg2 = R30_REGNUM;
10077 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
10078 false, false);
10080 else
10081 gcc_assert (known_eq (chain_offset, 0));
10082 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
10083 stack_pointer_rtx, chain_offset,
10084 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
10085 if (frame_pointer_needed && !frame_size.is_constant ())
10087 /* Variable-sized frames need to describe the save slot
10088 address using DW_CFA_expression rather than DW_CFA_offset.
10089 This means that, without taking further action, the
10090 locations of the registers that we've already saved would
10091 remain based on the stack pointer even after we redefine
10092 the CFA based on the frame pointer. We therefore need new
10093 DW_CFA_expressions to re-express the save slots with addresses
10094 based on the frame pointer. */
10095 rtx_insn *insn = get_last_insn ();
10096 gcc_assert (RTX_FRAME_RELATED_P (insn));
10098 /* Add an explicit CFA definition if this was previously
10099 implicit. */
10100 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
10102 rtx src = plus_constant (Pmode, stack_pointer_rtx,
10103 callee_offset);
10104 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10105 gen_rtx_SET (hard_frame_pointer_rtx, src));
10108 /* Change the save slot expressions for the registers that
10109 we've already saved. */
10110 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
10111 hard_frame_pointer_rtx, UNITS_PER_WORD);
10112 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
10113 hard_frame_pointer_rtx, 0);
10115 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
10118 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
10119 callee_adjust != 0 || emit_frame_chain,
10120 emit_frame_chain);
10121 if (maybe_ne (sve_callee_adjust, 0))
10123 gcc_assert (!flag_stack_clash_protection
10124 || known_eq (initial_adjust, 0));
10125 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
10126 sve_callee_adjust,
10127 !frame_pointer_needed, false);
10128 saved_regs_offset += sve_callee_adjust;
10130 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
10131 false, emit_frame_chain);
10132 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
10133 callee_adjust != 0 || emit_frame_chain,
10134 emit_frame_chain);
10136 /* We may need to probe the final adjustment if it is larger than the guard
10137 that is assumed by the callee. */
10138 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
10139 !frame_pointer_needed, true);
10142 /* Return TRUE if we can use a simple_return insn.
10144 This function checks whether the callee saved stack is empty, which
10145 means no restore actions are needed. The pro_and_epilogue pass will use
10146 this to check whether the shrink-wrapping optimization is feasible. */
10148 bool
10149 aarch64_use_return_insn_p (void)
10151 if (!reload_completed)
10152 return false;
10154 if (crtl->profile)
10155 return false;
10157 return known_eq (cfun->machine->frame.frame_size, 0);
10160 /* Generate the epilogue instructions for returning from a function.
10161 This is almost exactly the reverse of the prolog sequence, except
10162 that we need to insert barriers to avoid scheduling loads that read
10163 from a deallocated stack, and we optimize the unwind records by
10164 emitting them all together if possible. */
10165 void
10166 aarch64_expand_epilogue (bool for_sibcall)
10168 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
10169 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
10170 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
10171 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
10172 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
10173 poly_int64 below_hard_fp_saved_regs_size
10174 = cfun->machine->frame.below_hard_fp_saved_regs_size;
10175 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
10176 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
10177 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
10178 ? R29_REGNUM : R30_REGNUM);
10179 rtx cfi_ops = NULL;
10180 rtx_insn *insn;
10181 /* A stack clash protection prologue may not have left EP0_REGNUM or
10182 EP1_REGNUM in a usable state. The same is true for allocations
10183 with an SVE component, since we then need both temporary registers
10184 for each allocation. For stack clash we are in a usable state if
10185 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10186 HOST_WIDE_INT guard_size
10187 = 1 << param_stack_clash_protection_guard_size;
10188 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10190 /* We can re-use the registers when:
10192 (a) the deallocation amount is the same as the corresponding
10193 allocation amount (which is false if we combine the initial
10194 and SVE callee save allocations in the prologue); and
10196 (b) the allocation amount doesn't need a probe (which is false
10197 if the amount is guard_size - guard_used_by_caller or greater).
10199 In such situations the register should remain live with the correct
10200 value. */
10201 bool can_inherit_p = (initial_adjust.is_constant ()
10202 && final_adjust.is_constant ()
10203 && (!flag_stack_clash_protection
10204 || (known_lt (initial_adjust,
10205 guard_size - guard_used_by_caller)
10206 && known_eq (sve_callee_adjust, 0))));
10208 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
10209 bool need_barrier_p
10210 = maybe_ne (get_frame_size ()
10211 + cfun->machine->frame.saved_varargs_size, 0);
10213 /* Emit a barrier to prevent loads from a deallocated stack. */
10214 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10215 || cfun->calls_alloca
10216 || crtl->calls_eh_return)
10218 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10219 need_barrier_p = false;
10222 /* Restore the stack pointer from the frame pointer if it may not
10223 be the same as the stack pointer. */
10224 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10225 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10226 if (frame_pointer_needed
10227 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
10228 /* If writeback is used when restoring callee-saves, the CFA
10229 is restored on the instruction doing the writeback. */
10230 aarch64_add_offset (Pmode, stack_pointer_rtx,
10231 hard_frame_pointer_rtx,
10232 -callee_offset - below_hard_fp_saved_regs_size,
10233 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
10234 else
10235 /* The case where we need to re-use the register here is very rare, so
10236 avoid the complicated condition and just always emit a move if the
10237 immediate doesn't fit. */
10238 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
10240 /* Restore the vector registers before the predicate registers,
10241 so that we can use P4 as a temporary for big-endian SVE frames. */
10242 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
10243 callee_adjust != 0, &cfi_ops);
10244 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
10245 false, &cfi_ops);
10246 if (maybe_ne (sve_callee_adjust, 0))
10247 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
10249 /* When shadow call stack is enabled, the scs_pop in the epilogue will
10250 restore x30, so we don't need to restore x30 again in the traditional
10251 way. */
10252 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
10253 R0_REGNUM, last_gpr,
10254 callee_adjust != 0, &cfi_ops);
10256 if (need_barrier_p)
10257 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10259 if (callee_adjust != 0)
10260 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10262 /* If we have no register restore information, the CFA must have been
10263 defined in terms of the stack pointer since the end of the prologue. */
10264 gcc_assert (cfi_ops || !frame_pointer_needed);
10266 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10268 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10269 insn = get_last_insn ();
10270 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10271 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10272 RTX_FRAME_RELATED_P (insn) = 1;
10273 cfi_ops = NULL;
10276 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
10277 restrict the emit_move optimization to leaf functions. */
10278 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
10279 (!can_inherit_p || !crtl->is_leaf
10280 || df_regs_ever_live_p (EP0_REGNUM)));
10282 if (cfi_ops)
10284 /* Emit delayed restores and reset the CFA to be SP. */
10285 insn = get_last_insn ();
10286 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10287 REG_NOTES (insn) = cfi_ops;
10288 RTX_FRAME_RELATED_P (insn) = 1;
10291 /* Pop return address from shadow call stack. */
10292 if (cfun->machine->frame.is_scs_enabled)
10294 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10295 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10297 insn = emit_insn (gen_scs_pop ());
10298 add_reg_note (insn, REG_CFA_RESTORE, reg);
10299 RTX_FRAME_RELATED_P (insn) = 1;
10302 /* We prefer to emit the combined return/authenticate instruction RETAA,
10303 however there are two cases in which we must instead emit an explicit
10304 authentication instruction.
10306 1) Sibcalls don't return in a normal way, so if we're about to call one
10307 we must authenticate.
10309 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10310 generating code for !TARGET_ARMV8_3 we can't use it and must
10311 explicitly authenticate. */
10313 if (aarch64_return_address_signing_enabled ()
10314 && (for_sibcall || !TARGET_ARMV8_3))
10316 switch (aarch_ra_sign_key)
10318 case AARCH_KEY_A:
10319 insn = emit_insn (gen_autiasp ());
10320 break;
10321 case AARCH_KEY_B:
10322 insn = emit_insn (gen_autibsp ());
10323 break;
10324 default:
10325 gcc_unreachable ();
10327 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10328 RTX_FRAME_RELATED_P (insn) = 1;
10331 /* Stack adjustment for exception handler. */
10332 if (crtl->calls_eh_return && !for_sibcall)
10334 /* We need to unwind the stack by the offset computed by
10335 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
10336 to be SP; letting the CFA move during this adjustment
10337 is just as correct as retaining the CFA from the body
10338 of the function. Therefore, do nothing special. */
10339 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
10342 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10343 if (!for_sibcall)
10344 emit_jump_insn (ret_rtx);
10347 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
10348 normally or return to a previous frame after unwinding.
10350 An EH return uses a single shared return sequence. The epilogue is
10351 exactly like a normal epilogue except that it has an extra input
10352 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10353 that must be applied after the frame has been destroyed. An extra label
10354 is inserted before the epilogue which initializes this register to zero,
10355 and this is the entry point for a normal return.
10357 An actual EH return updates the return address, initializes the stack
10358 adjustment and jumps directly into the epilogue (bypassing the zeroing
10359 of the adjustment). Since the return address is typically saved on the
10360 stack when a function makes a call, the saved LR must be updated outside
10361 the epilogue.
10363 This poses problems as the store is generated well before the epilogue,
10364 so the offset of LR is not known yet. Also optimizations will remove the
10365 store as it appears dead, even after the epilogue is generated (as the
10366 base or offset for loading LR is different in many cases).
10368 To avoid these problems this implementation forces the frame pointer
10369 in eh_return functions so that the location of LR is fixed and known early.
10370 It also marks the store volatile, so no optimization is permitted to
10371 remove the store. */
10372 rtx
10373 aarch64_eh_return_handler_rtx (void)
10375 rtx tmp = gen_frame_mem (Pmode,
10376 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
10378 /* Mark the store volatile, so no optimization is permitted to remove it. */
10379 MEM_VOLATILE_P (tmp) = true;
10380 return tmp;
10383 /* Output code to add DELTA to the first argument, and then jump
10384 to FUNCTION. Used for C++ multiple inheritance. */
10385 static void
10386 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10387 HOST_WIDE_INT delta,
10388 HOST_WIDE_INT vcall_offset,
10389 tree function)
10391 /* The this pointer is always in x0. Note that this differs from
10392 Arm where the this pointer may be bumped to r1 if r0 is required
10393 to return a pointer to an aggregate. On AArch64 a result value
10394 pointer will be in x8. */
10395 int this_regno = R0_REGNUM;
10396 rtx this_rtx, temp0, temp1, addr, funexp;
10397 rtx_insn *insn;
10398 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10400 if (aarch_bti_enabled ())
10401 emit_insn (gen_bti_c());
10403 reload_completed = 1;
10404 emit_note (NOTE_INSN_PROLOGUE_END);
10406 this_rtx = gen_rtx_REG (Pmode, this_regno);
10407 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10408 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10410 if (vcall_offset == 0)
10411 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10412 else
10414 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10416 addr = this_rtx;
10417 if (delta != 0)
10419 if (delta >= -256 && delta < 256)
10420 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10421 plus_constant (Pmode, this_rtx, delta));
10422 else
10423 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10424 temp1, temp0, false);
10427 if (Pmode == ptr_mode)
10428 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10429 else
10430 aarch64_emit_move (temp0,
10431 gen_rtx_ZERO_EXTEND (Pmode,
10432 gen_rtx_MEM (ptr_mode, addr)));
10434 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10435 addr = plus_constant (Pmode, temp0, vcall_offset);
10436 else
10438 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10439 Pmode);
10440 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10443 if (Pmode == ptr_mode)
10444 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10445 else
10446 aarch64_emit_move (temp1,
10447 gen_rtx_SIGN_EXTEND (Pmode,
10448 gen_rtx_MEM (ptr_mode, addr)));
10450 emit_insn (gen_add2_insn (this_rtx, temp1));
10453 /* Generate a tail call to the target function. */
10454 if (!TREE_USED (function))
10456 assemble_external (function);
10457 TREE_USED (function) = 1;
10459 funexp = XEXP (DECL_RTL (function), 0);
10460 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10461 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10462 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10463 SIBLING_CALL_P (insn) = 1;
10465 insn = get_insns ();
10466 shorten_branches (insn);
10468 assemble_start_function (thunk, fnname);
10469 final_start_function (insn, file, 1);
10470 final (insn, file, 1);
10471 final_end_function ();
10472 assemble_end_function (thunk, fnname);
10474 /* Stop pretending to be a post-reload pass. */
10475 reload_completed = 0;
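/* For illustration (hypothetical output): a thunk with DELTA == 8 and
   VCALL_OFFSET == 0 reduces to adjusting the this pointer in x0 and
   tail-calling the target, roughly

	add	x0, x0, 8
	b	<function>

   with a BTI C landing pad first when branch protection is enabled.  */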
10478 static bool
10479 aarch64_tls_referenced_p (rtx x)
10481 if (!TARGET_HAVE_TLS)
10482 return false;
10483 subrtx_iterator::array_type array;
10484 FOR_EACH_SUBRTX (iter, array, x, ALL)
10486 const_rtx x = *iter;
10487 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10488 return true;
10489 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10490 TLS offsets, not real symbol references. */
10491 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10492 iter.skip_subrtxes ();
10494 return false;
10498 static bool
10499 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10501 if (GET_CODE (x) == HIGH)
10502 return true;
10504 /* There's no way to calculate VL-based values using relocations. */
10505 subrtx_iterator::array_type array;
10506 FOR_EACH_SUBRTX (iter, array, x, ALL)
10507 if (GET_CODE (*iter) == CONST_POLY_INT)
10508 return true;
10510 poly_int64 offset;
10511 rtx base = strip_offset_and_salt (x, &offset);
10512 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10514 /* We checked for POLY_INT_CST offsets above. */
10515 if (aarch64_classify_symbol (base, offset.to_constant ())
10516 != SYMBOL_FORCE_TO_MEM)
10517 return true;
10518 else
10519 /* Avoid generating a 64-bit relocation in ILP32; leave it
10520 to aarch64_expand_mov_immediate to handle it properly. */
10521 return mode != ptr_mode;
10524 return aarch64_tls_referenced_p (x);
10527 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10528 The expansion for a table switch is quite expensive due to the number
10529 of instructions, the table lookup and the hard-to-predict indirect jump.
10530 When optimizing for speed with -O3 enabled, use the per-core tuning if
10531 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10532 performance. When optimizing for size, use 8 for the smallest code size. */
10534 static unsigned int
10535 aarch64_case_values_threshold (void)
10537 /* Use the specified limit for the number of cases before using jump
10538 tables at higher optimization levels. */
10539 if (optimize > 2
10540 && aarch64_tune_params.max_case_values != 0)
10541 return aarch64_tune_params.max_case_values;
10542 else
10543 return optimize_size ? 8 : 11;
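/* In other words (illustrative): with the default tuning, a switch needs
   at least 11 case values to be expanded as a jump table at -O2/-O3, and
   at least 8 when optimizing for size.  */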
10546 /* Return true if register REGNO is a valid index register.
10547 STRICT_P is true if REG_OK_STRICT is in effect. */
10549 bool
10550 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10552 if (!HARD_REGISTER_NUM_P (regno))
10554 if (!strict_p)
10555 return true;
10557 if (!reg_renumber)
10558 return false;
10560 regno = reg_renumber[regno];
10562 return GP_REGNUM_P (regno);
10565 /* Return true if register REGNO is a valid base register for mode MODE.
10566 STRICT_P is true if REG_OK_STRICT is in effect. */
10568 bool
10569 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10571 if (!HARD_REGISTER_NUM_P (regno))
10573 if (!strict_p)
10574 return true;
10576 if (!reg_renumber)
10577 return false;
10579 regno = reg_renumber[regno];
10582 /* The fake registers will be eliminated to either the stack or
10583 hard frame pointer, both of which are usually valid base registers.
10584 Reload deals with the cases where the eliminated form isn't valid. */
10585 return (GP_REGNUM_P (regno)
10586 || regno == SP_REGNUM
10587 || regno == FRAME_POINTER_REGNUM
10588 || regno == ARG_POINTER_REGNUM);
10591 /* Return true if X is a valid base register for mode MODE.
10592 STRICT_P is true if REG_OK_STRICT is in effect. */
10594 static bool
10595 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10597 if (!strict_p
10598 && SUBREG_P (x)
10599 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10600 x = SUBREG_REG (x);
10602 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10605 /* Return true if address offset is a valid index. If it is, fill in INFO
10606 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10608 static bool
10609 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10610 machine_mode mode, bool strict_p)
10612 enum aarch64_address_type type;
10613 rtx index;
10614 int shift;
10616 /* (reg:P) */
10617 if ((REG_P (x) || SUBREG_P (x))
10618 && GET_MODE (x) == Pmode)
10620 type = ADDRESS_REG_REG;
10621 index = x;
10622 shift = 0;
10624 /* (sign_extend:DI (reg:SI)) */
10625 else if ((GET_CODE (x) == SIGN_EXTEND
10626 || GET_CODE (x) == ZERO_EXTEND)
10627 && GET_MODE (x) == DImode
10628 && GET_MODE (XEXP (x, 0)) == SImode)
10630 type = (GET_CODE (x) == SIGN_EXTEND)
10631 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10632 index = XEXP (x, 0);
10633 shift = 0;
10635 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10636 else if (GET_CODE (x) == MULT
10637 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10638 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10639 && GET_MODE (XEXP (x, 0)) == DImode
10640 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10641 && CONST_INT_P (XEXP (x, 1)))
10643 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10644 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10645 index = XEXP (XEXP (x, 0), 0);
10646 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10648 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10649 else if (GET_CODE (x) == ASHIFT
10650 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10651 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10652 && GET_MODE (XEXP (x, 0)) == DImode
10653 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10654 && CONST_INT_P (XEXP (x, 1)))
10656 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10657 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10658 index = XEXP (XEXP (x, 0), 0);
10659 shift = INTVAL (XEXP (x, 1));
10661 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10662 (const_int 0xffffffff<<shift)) */
10663 else if (GET_CODE (x) == AND
10664 && GET_MODE (x) == DImode
10665 && GET_CODE (XEXP (x, 0)) == MULT
10666 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10667 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10668 && CONST_INT_P (XEXP (x, 1)))
10670 type = ADDRESS_REG_UXTW;
10671 index = XEXP (XEXP (x, 0), 0);
10672 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10673 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10674 shift = -1;
10676 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10677 (const_int 0xffffffff<<shift)) */
10678 else if (GET_CODE (x) == AND
10679 && GET_MODE (x) == DImode
10680 && GET_CODE (XEXP (x, 0)) == ASHIFT
10681 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10682 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10683 && CONST_INT_P (XEXP (x, 1)))
10685 type = ADDRESS_REG_UXTW;
10686 index = XEXP (XEXP (x, 0), 0);
10687 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10688 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10689 shift = -1;
10691 /* (mult:P (reg:P) (const_int scale)) */
10692 else if (GET_CODE (x) == MULT
10693 && GET_MODE (x) == Pmode
10694 && GET_MODE (XEXP (x, 0)) == Pmode
10695 && CONST_INT_P (XEXP (x, 1)))
10697 type = ADDRESS_REG_REG;
10698 index = XEXP (x, 0);
10699 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10701 /* (ashift:P (reg:P) (const_int shift)) */
10702 else if (GET_CODE (x) == ASHIFT
10703 && GET_MODE (x) == Pmode
10704 && GET_MODE (XEXP (x, 0)) == Pmode
10705 && CONST_INT_P (XEXP (x, 1)))
10707 type = ADDRESS_REG_REG;
10708 index = XEXP (x, 0);
10709 shift = INTVAL (XEXP (x, 1));
10711 else
10712 return false;
10714 if (!strict_p
10715 && SUBREG_P (index)
10716 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10717 index = SUBREG_REG (index);
10719 if (aarch64_sve_data_mode_p (mode))
10721 if (type != ADDRESS_REG_REG
10722 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10723 return false;
10725 else
10727 if (shift != 0
10728 && !(IN_RANGE (shift, 1, 3)
10729 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10730 return false;
10733 if (REG_P (index)
10734 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10736 info->type = type;
10737 info->offset = index;
10738 info->shift = shift;
10739 return true;
10742 return false;
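/* Worked example (illustrative only): for a DImode access through
   [x0, w1, sxtw #3], the index part of the address reaches this function
   as (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 8)), which is
   classified as ADDRESS_REG_SXTW with shift 3; the shift is accepted
   because 1 << 3 matches the 8-byte access size.  */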
10745 /* Return true if MODE is one of the modes for which we
10746 support LDP/STP operations. */
10748 static bool
10749 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10751 return mode == SImode || mode == DImode
10752 || mode == SFmode || mode == DFmode
10753 || mode == SDmode || mode == DDmode
10754 || (aarch64_vector_mode_supported_p (mode)
10755 && (known_eq (GET_MODE_SIZE (mode), 8)
10756 || (known_eq (GET_MODE_SIZE (mode), 16)
10757 && (aarch64_tune_params.extra_tuning_flags
10758 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10761 /* Return true if REGNO is a virtual pointer register, or an eliminable
10762 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10763 include stack_pointer or hard_frame_pointer. */
10764 static bool
10765 virt_or_elim_regno_p (unsigned regno)
10767 return ((regno >= FIRST_VIRTUAL_REGISTER
10768 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10769 || regno == FRAME_POINTER_REGNUM
10770 || regno == ARG_POINTER_REGNUM);
10773 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10774 If it is, fill in INFO appropriately. STRICT_P is true if
10775 REG_OK_STRICT is in effect. */
10777 bool
10778 aarch64_classify_address (struct aarch64_address_info *info,
10779 rtx x, machine_mode mode, bool strict_p,
10780 aarch64_addr_query_type type)
10782 enum rtx_code code = GET_CODE (x);
10783 rtx op0, op1;
10784 poly_int64 offset;
10786 HOST_WIDE_INT const_size;
10788 /* Whether a vector mode is partial doesn't affect address legitimacy.
10789 Partial vectors like VNx8QImode allow the same indexed addressing
10790 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10791 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10793 vec_flags &= ~VEC_PARTIAL;
10795 /* On BE, we use load/store pair for all large int mode load/stores.
10796 TI/TF/TDmode may also use a load/store pair. */
10797 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10798 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10799 || type == ADDR_QUERY_LDP_STP_N
10800 || mode == TImode
10801 || mode == TFmode
10802 || mode == TDmode
10803 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10804 && advsimd_struct_p));
10805 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the full size
10806 of the memory being loaded/stored, while the mode used to validate the
10807 addressing mode is half of that (one element of the pair). */
10808 if (type == ADDR_QUERY_LDP_STP_N)
10810 if (known_eq (GET_MODE_SIZE (mode), 16))
10811 mode = DFmode;
10812 else if (known_eq (GET_MODE_SIZE (mode), 8))
10813 mode = SFmode;
10814 else
10815 return false;
10818 bool allow_reg_index_p = (!load_store_pair_p
10819 && ((vec_flags == 0
10820 && known_lt (GET_MODE_SIZE (mode), 16))
10821 || vec_flags == VEC_ADVSIMD
10822 || vec_flags & VEC_SVE_DATA));
10824 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10825 The latter is not valid for SVE predicates, and that's rejected through
10826 allow_reg_index_p above. */
10827 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10828 && (code != REG && code != PLUS))
10829 return false;
10831 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10832 REG addressing. */
10833 if (advsimd_struct_p
10834 && TARGET_SIMD
10835 && !BYTES_BIG_ENDIAN
10836 && (code != POST_INC && code != REG))
10837 return false;
10839 gcc_checking_assert (GET_MODE (x) == VOIDmode
10840 || SCALAR_INT_MODE_P (GET_MODE (x)));
10842 switch (code)
10844 case REG:
10845 case SUBREG:
10846 info->type = ADDRESS_REG_IMM;
10847 info->base = x;
10848 info->offset = const0_rtx;
10849 info->const_offset = 0;
10850 return aarch64_base_register_rtx_p (x, strict_p);
10852 case PLUS:
10853 op0 = XEXP (x, 0);
10854 op1 = XEXP (x, 1);
10856 if (! strict_p
10857 && REG_P (op0)
10858 && virt_or_elim_regno_p (REGNO (op0))
10859 && poly_int_rtx_p (op1, &offset))
10861 info->type = ADDRESS_REG_IMM;
10862 info->base = op0;
10863 info->offset = op1;
10864 info->const_offset = offset;
10866 return true;
10869 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10870 && aarch64_base_register_rtx_p (op0, strict_p)
10871 && poly_int_rtx_p (op1, &offset))
10873 info->type = ADDRESS_REG_IMM;
10874 info->base = op0;
10875 info->offset = op1;
10876 info->const_offset = offset;
10878 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10879 registers and individual Q registers. The available
10880 address modes are:
10881 X,X: 7-bit signed scaled offset
10882 Q: 9-bit signed offset
10883 We conservatively require an offset representable in either mode.
10884 When performing the check for pairs of X registers i.e. LDP/STP
10885 pass down DImode since that is the natural size of the LDP/STP
10886 instruction memory accesses. */
10887 if (mode == TImode || mode == TFmode || mode == TDmode)
10888 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10889 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10890 || offset_12bit_unsigned_scaled_p (mode, offset)));
10892 if (mode == V8DImode)
10893 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10894 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10896 /* A 7-bit offset check because OImode will emit an ldp/stp
10897 instruction (only !TARGET_SIMD or big endian will get here).
10898 For ldp/stp instructions, the offset is scaled for the size of a
10899 single element of the pair. */
10900 if (aarch64_advsimd_partial_struct_mode_p (mode)
10901 && known_eq (GET_MODE_SIZE (mode), 16))
10902 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10903 if (aarch64_advsimd_full_struct_mode_p (mode)
10904 && known_eq (GET_MODE_SIZE (mode), 32))
10905 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10907 /* Three 9/12-bit offset checks because CImode will emit three
10908 ldr/str instructions (only !TARGET_SIMD or big endian will
10909 get here). */
10910 if (aarch64_advsimd_partial_struct_mode_p (mode)
10911 && known_eq (GET_MODE_SIZE (mode), 24))
10912 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10913 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10914 offset + 16)
10915 || offset_12bit_unsigned_scaled_p (DImode,
10916 offset + 16)));
10917 if (aarch64_advsimd_full_struct_mode_p (mode)
10918 && known_eq (GET_MODE_SIZE (mode), 48))
10919 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10920 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10921 offset + 32)
10922 || offset_12bit_unsigned_scaled_p (TImode,
10923 offset + 32)));
10925 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10926 instructions (only big endian will get here). */
10927 if (aarch64_advsimd_partial_struct_mode_p (mode)
10928 && known_eq (GET_MODE_SIZE (mode), 32))
10929 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10930 && aarch64_offset_7bit_signed_scaled_p (DImode,
10931 offset + 16));
10932 if (aarch64_advsimd_full_struct_mode_p (mode)
10933 && known_eq (GET_MODE_SIZE (mode), 64))
10934 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10935 && aarch64_offset_7bit_signed_scaled_p (TImode,
10936 offset + 32));
10938 /* Make "m" use the LD1 offset range for SVE data modes, so
10939 that pre-RTL optimizers like ivopts will work to that
10940 instead of the wider LDR/STR range. */
10941 if (vec_flags == VEC_SVE_DATA)
10942 return (type == ADDR_QUERY_M
10943 ? offset_4bit_signed_scaled_p (mode, offset)
10944 : offset_9bit_signed_scaled_p (mode, offset));
10946 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10948 poly_int64 end_offset = (offset
10949 + GET_MODE_SIZE (mode)
10950 - BYTES_PER_SVE_VECTOR);
10951 return (type == ADDR_QUERY_M
10952 ? offset_4bit_signed_scaled_p (mode, offset)
10953 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10954 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10955 end_offset)));
10958 if (vec_flags == VEC_SVE_PRED)
10959 return offset_9bit_signed_scaled_p (mode, offset);
10961 if (load_store_pair_p)
10962 return ((known_eq (GET_MODE_SIZE (mode), 4)
10963 || known_eq (GET_MODE_SIZE (mode), 8)
10964 || known_eq (GET_MODE_SIZE (mode), 16))
10965 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10966 else
10967 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10968 || offset_12bit_unsigned_scaled_p (mode, offset));
10971 if (allow_reg_index_p)
10973 /* Look for base + (scaled/extended) index register. */
10974 if (aarch64_base_register_rtx_p (op0, strict_p)
10975 && aarch64_classify_index (info, op1, mode, strict_p))
10977 info->base = op0;
10978 return true;
10980 if (aarch64_base_register_rtx_p (op1, strict_p)
10981 && aarch64_classify_index (info, op0, mode, strict_p))
10983 info->base = op1;
10984 return true;
10988 return false;
10990 case POST_INC:
10991 case POST_DEC:
10992 case PRE_INC:
10993 case PRE_DEC:
10994 info->type = ADDRESS_REG_WB;
10995 info->base = XEXP (x, 0);
10996 info->offset = NULL_RTX;
10997 return aarch64_base_register_rtx_p (info->base, strict_p);
10999 case POST_MODIFY:
11000 case PRE_MODIFY:
11001 info->type = ADDRESS_REG_WB;
11002 info->base = XEXP (x, 0);
11003 if (GET_CODE (XEXP (x, 1)) == PLUS
11004 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
11005 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
11006 && aarch64_base_register_rtx_p (info->base, strict_p))
11008 info->offset = XEXP (XEXP (x, 1), 1);
11009 info->const_offset = offset;
11011 /* TImode, TFmode and TDmode values are allowed in both pairs of X
11012 registers and individual Q registers. The available
11013 address modes are:
11014 X,X: 7-bit signed scaled offset
11015 Q: 9-bit signed offset
11016 We conservatively require an offset representable in either mode.
11018 if (mode == TImode || mode == TFmode || mode == TDmode)
11019 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
11020 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
11022 if (load_store_pair_p)
11023 return ((known_eq (GET_MODE_SIZE (mode), 4)
11024 || known_eq (GET_MODE_SIZE (mode), 8)
11025 || known_eq (GET_MODE_SIZE (mode), 16))
11026 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
11027 else
11028 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
11030 return false;
11032 case CONST:
11033 case SYMBOL_REF:
11034 case LABEL_REF:
11035 /* load literal: pc-relative constant pool entry. Only supported
11036 for SI mode or larger. */
11037 info->type = ADDRESS_SYMBOLIC;
11039 if (!load_store_pair_p
11040 && GET_MODE_SIZE (mode).is_constant (&const_size)
11041 && const_size >= 4)
11043 poly_int64 offset;
11044 rtx sym = strip_offset_and_salt (x, &offset);
11045 return ((LABEL_REF_P (sym)
11046 || (SYMBOL_REF_P (sym)
11047 && CONSTANT_POOL_ADDRESS_P (sym)
11048 && aarch64_pcrelative_literal_loads)));
11050 return false;
11052 case LO_SUM:
11053 info->type = ADDRESS_LO_SUM;
11054 info->base = XEXP (x, 0);
11055 info->offset = XEXP (x, 1);
11056 if (allow_reg_index_p
11057 && aarch64_base_register_rtx_p (info->base, strict_p))
11059 poly_int64 offset;
11060 HOST_WIDE_INT const_offset;
11061 rtx sym = strip_offset_and_salt (info->offset, &offset);
11062 if (SYMBOL_REF_P (sym)
11063 && offset.is_constant (&const_offset)
11064 && (aarch64_classify_symbol (sym, const_offset)
11065 == SYMBOL_SMALL_ABSOLUTE))
11067 /* The symbol and offset must be aligned to the access size. */
11068 unsigned int align;
11070 if (CONSTANT_POOL_ADDRESS_P (sym))
11071 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
11072 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
11074 tree exp = SYMBOL_REF_DECL (sym);
11075 align = TYPE_ALIGN (TREE_TYPE (exp));
11076 align = aarch64_constant_alignment (exp, align);
11078 else if (SYMBOL_REF_DECL (sym))
11079 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
11080 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
11081 && SYMBOL_REF_BLOCK (sym) != NULL)
11082 align = SYMBOL_REF_BLOCK (sym)->alignment;
11083 else
11084 align = BITS_PER_UNIT;
11086 poly_int64 ref_size = GET_MODE_SIZE (mode);
11087 if (known_eq (ref_size, 0))
11088 ref_size = GET_MODE_SIZE (DImode);
11090 return (multiple_p (const_offset, ref_size)
11091 && multiple_p (align / BITS_PER_UNIT, ref_size));
11094 return false;
11096 default:
11097 return false;
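/* A minimal usage sketch (illustrative only; the helper below is not part
   of the original file): classify the address [x0, #16] for a DImode
   access under strict checking.  */

static bool ATTRIBUTE_UNUSED
aarch64_example_classify_address (void)
{
  struct aarch64_address_info info;
  rtx base = gen_rtx_REG (Pmode, R0_REGNUM);
  rtx addr = plus_constant (Pmode, base, 16);
  /* Expected to succeed with info.type == ADDRESS_REG_IMM and
     info.const_offset == 16.  */
  return aarch64_classify_address (&info, addr, DImode, true, ADDR_QUERY_M);
}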
11101 /* Return true if the address X is valid for a PRFM instruction.
11102 STRICT_P is true if we should do strict checking with
11103 aarch64_classify_address. */
11105 bool
11106 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
11108 struct aarch64_address_info addr;
11110 /* PRFM accepts the same addresses as DImode... */
11111 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11112 if (!res)
11113 return false;
11115 /* ... except writeback forms. */
11116 return addr.type != ADDRESS_REG_WB;
11119 bool
11120 aarch64_symbolic_address_p (rtx x)
11122 poly_int64 offset;
11123 x = strip_offset_and_salt (x, &offset);
11124 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11127 /* Classify the base of symbolic expression X. */
11129 enum aarch64_symbol_type
11130 aarch64_classify_symbolic_expression (rtx x)
11132 rtx offset;
11134 split_const (x, &x, &offset);
11135 return aarch64_classify_symbol (x, INTVAL (offset));
11139 /* Return TRUE if X is a legitimate address for accessing memory in
11140 mode MODE. */
11141 static bool
11142 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
11144 struct aarch64_address_info addr;
11146 return aarch64_classify_address (&addr, x, mode, strict_p);
11149 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11150 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11151 bool
11152 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11153 aarch64_addr_query_type type)
11155 struct aarch64_address_info addr;
11157 return aarch64_classify_address (&addr, x, mode, strict_p, type);
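/* For reference (illustrative, not exhaustive), the forms accepted above
   for a scalar DImode access include:
     [x0]                  base register
     [x0, #504]            scaled unsigned 12-bit or signed 9-bit immediate
     [x0, x1]              base + register
     [x0, x1, lsl #3]      base + scaled register
     [x0, w1, sxtw #3]     base + sign/zero-extended (and scaled) 32-bit register
     [x0, #8]! / [x0], #8  pre/post-indexed writeback
   plus PC-relative literal loads for constant-pool entries.  */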
11160 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11162 static bool
11163 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11164 poly_int64 orig_offset,
11165 machine_mode mode)
11167 HOST_WIDE_INT size;
11168 if (GET_MODE_SIZE (mode).is_constant (&size))
11170 HOST_WIDE_INT const_offset, second_offset;
11172 /* A general SVE offset is A * VQ + B. Remove the A component from
11173 coefficient 0 in order to get the constant B. */
11174 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11176 /* Split an out-of-range address displacement into a base and an
11177 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
11178 range otherwise, to increase opportunities for sharing the base
11179 address between accesses of different sizes. Unaligned accesses use
11180 the signed 9-bit range; TImode/TFmode/TDmode use the intersection of
11181 the signed scaled 7-bit and signed 9-bit offset ranges. */
11182 if (mode == TImode || mode == TFmode || mode == TDmode)
11183 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11184 else if ((const_offset & (size - 1)) != 0)
11185 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11186 else
11187 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11189 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11190 return false;
11192 /* Split the offset into second_offset and the rest. */
11193 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11194 *offset2 = gen_int_mode (second_offset, Pmode);
11195 return true;
11197 else
11199 /* Get the mode we should use as the basis of the range. For structure
11200 modes this is the mode of one vector. */
11201 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11202 machine_mode step_mode
11203 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11205 /* Get the "mul vl" multiplier we'd like to use. */
11206 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11207 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11208 if (vec_flags & VEC_SVE_DATA)
11209 /* LDR supports a 9-bit range, but the move patterns for
11210 structure modes require all vectors to be in range of the
11211 same base. The simplest way of accommodating that while still
11212 promoting reuse of anchor points between different modes is
11213 to use an 8-bit range unconditionally. */
11214 vnum = ((vnum + 128) & 255) - 128;
11215 else
11216 /* Predicates are only handled singly, so we might as well use
11217 the full range. */
11218 vnum = ((vnum + 256) & 511) - 256;
11219 if (vnum == 0)
11220 return false;
11222 /* Convert the "mul vl" multiplier into a byte offset. */
11223 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11224 if (known_eq (second_offset, orig_offset))
11225 return false;
11227 /* Split the offset into second_offset and the rest. */
11228 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11229 *offset2 = gen_int_mode (second_offset, Pmode);
11230 return true;
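/* Worked example (illustrative only): for an aligned DImode access at
   offset 0x10008, the constant-size path above computes
   second_offset = 0x10008 & 0x3ffc = 0x8, so the displacement is split
   into an anchor of 0x10000 (*offset1) plus a residual of 8 (*offset2)
   that fits the scaled 12-bit LDR/STR range.  */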
11234 /* Return the binary representation of floating point constant VALUE in INTVAL.
11235 If the value cannot be converted, return false without setting INTVAL.
11236 The conversion is done in the given MODE. */
11237 bool
11238 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11241 /* We make a general exception for 0. */
11242 if (aarch64_float_const_zero_rtx_p (value))
11244 *intval = 0;
11245 return true;
11248 scalar_float_mode mode;
11249 if (!CONST_DOUBLE_P (value)
11250 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11251 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11252 /* Only support up to DF mode. */
11253 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11254 return false;
11256 unsigned HOST_WIDE_INT ival = 0;
11258 long res[2];
11259 real_to_target (res,
11260 CONST_DOUBLE_REAL_VALUE (value),
11261 REAL_MODE_FORMAT (mode));
11263 if (mode == DFmode || mode == DDmode)
11265 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11266 ival = zext_hwi (res[order], 32);
11267 ival |= (zext_hwi (res[1 - order], 32) << 32);
11269 else
11270 ival = zext_hwi (res[0], 32);
11272 *intval = ival;
11273 return true;
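/* For example, 1.0 in DFmode is returned as 0x3ff0000000000000 and 1.0 in
   SFmode as 0x3f800000 (zero-extended to 64 bits), matching the IEEE
   encodings of those constants.  */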
11276 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11277 single MOV(+MOVK) followed by an FMOV. */
11278 bool
11279 aarch64_float_const_rtx_p (rtx x)
11281 machine_mode mode = GET_MODE (x);
11282 if (mode == VOIDmode)
11283 return false;
11285 /* Determine whether it's cheaper to write float constants as
11286 mov/movk pairs over ldr/adrp pairs. */
11287 unsigned HOST_WIDE_INT ival;
11289 if (CONST_DOUBLE_P (x)
11290 && SCALAR_FLOAT_MODE_P (mode)
11291 && aarch64_reinterpret_float_as_int (x, &ival))
11293 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
11294 int num_instr = aarch64_internal_mov_immediate
11295 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11296 return num_instr < 3;
11299 return false;
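/* A minimal sketch (illustrative only; the helper below is not part of the
   original file): 1.0 reinterpreted as an integer is 0x3ff0000000000000,
   which a single MOVZ (#0x3ff0, lsl #48) can materialize, so moving it via
   MOV+FMOV is cheaper than an ADRP+LDR literal load and the predicate is
   expected to return true.  */

static bool ATTRIBUTE_UNUSED
aarch64_example_float_const_rtx_p (void)
{
  rtx one = const_double_from_real_value (dconst1, DFmode);
  return aarch64_float_const_rtx_p (one);
}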
11302 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11303 Floating Point). */
11304 bool
11305 aarch64_float_const_zero_rtx_p (rtx x)
11307 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11308 zr as our callers expect, so no need to check the actual
11309 value if X is of Decimal Floating Point type. */
11310 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11311 return false;
11313 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11314 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11315 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11318 /* Return TRUE if rtx X is immediate constant that fits in a single
11319 MOVI immediate operation. */
11320 bool
11321 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11323 if (!TARGET_SIMD)
11324 return false;
11326 machine_mode vmode;
11327 scalar_int_mode imode;
11328 unsigned HOST_WIDE_INT ival;
11330 if (CONST_DOUBLE_P (x)
11331 && SCALAR_FLOAT_MODE_P (mode))
11333 if (!aarch64_reinterpret_float_as_int (x, &ival))
11334 return false;
11336 /* We make a general exception for 0. */
11337 if (aarch64_float_const_zero_rtx_p (x))
11338 return true;
11340 imode = int_mode_for_mode (mode).require ();
11342 else if (CONST_INT_P (x)
11343 && is_a <scalar_int_mode> (mode, &imode))
11344 ival = INTVAL (x);
11345 else
11346 return false;
11348 /* Use a 64-bit container mode for everything except DI/DF/DD mode, where we
11349 use a 128-bit vector mode. */
11350 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11352 vmode = aarch64_simd_container_mode (imode, width);
11353 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11355 return aarch64_simd_valid_immediate (v_op, NULL);
11359 /* Return the fixed registers used for condition codes. */
11361 static bool
11362 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11364 *p1 = CC_REGNUM;
11365 *p2 = INVALID_REGNUM;
11366 return true;
11369 /* This function is used by the call expanders of the machine description.
11370 RESULT is the register in which the result is returned. It's NULL for
11371 "call" and "sibcall".
11372 MEM is the location of the function call.
11373 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11374 SIBCALL indicates whether this function call is a normal call or a sibling call;
11375 a different pattern is generated accordingly. */
11377 void
11378 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11380 rtx call, callee, tmp;
11381 rtvec vec;
11382 machine_mode mode;
11384 gcc_assert (MEM_P (mem));
11385 callee = XEXP (mem, 0);
11386 mode = GET_MODE (callee);
11387 gcc_assert (mode == Pmode);
11389 /* Decide if we should generate indirect calls by loading the
11390 address of the callee into a register before performing
11391 the branch-and-link. */
11392 if (SYMBOL_REF_P (callee)
11393 ? (aarch64_is_long_call_p (callee)
11394 || aarch64_is_noplt_call_p (callee))
11395 : !REG_P (callee))
11396 XEXP (mem, 0) = force_reg (mode, callee);
11398 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11400 if (result != NULL_RTX)
11401 call = gen_rtx_SET (result, call);
11403 if (sibcall)
11404 tmp = ret_rtx;
11405 else
11406 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11408 gcc_assert (CONST_INT_P (callee_abi));
11409 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11410 UNSPEC_CALLEE_ABI);
11412 vec = gen_rtvec (3, call, callee_abi, tmp);
11413 call = gen_rtx_PARALLEL (VOIDmode, vec);
11415 aarch64_emit_call_insn (call);
11418 /* Emit call insn with PAT and do aarch64-specific handling. */
11420 void
11421 aarch64_emit_call_insn (rtx pat)
11423 rtx insn = emit_call_insn (pat);
11425 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11426 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11427 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
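/* IP0_REGNUM and IP1_REGNUM are the AAPCS64 intra-procedure-call scratch
   registers (x16 and x17); recording them in CALL_INSN_FUNCTION_USAGE
   models the fact that linker-generated veneers and PLT stubs are allowed
   to clobber them across a call.  */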
11430 machine_mode
11431 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11433 machine_mode mode_x = GET_MODE (x);
11434 rtx_code code_x = GET_CODE (x);
11436 /* All floating point compares return CCFP if it is an equality
11437 comparison, and CCFPE otherwise. */
11438 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11440 switch (code)
11442 case EQ:
11443 case NE:
11444 case UNORDERED:
11445 case ORDERED:
11446 case UNLT:
11447 case UNLE:
11448 case UNGT:
11449 case UNGE:
11450 case UNEQ:
11451 return CCFPmode;
11453 case LT:
11454 case LE:
11455 case GT:
11456 case GE:
11457 case LTGT:
11458 return CCFPEmode;
11460 default:
11461 gcc_unreachable ();
11465 /* Equality comparisons of short modes against zero can be performed
11466 using the TST instruction with the appropriate bitmask. */
11467 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11468 && (code == EQ || code == NE)
11469 && (mode_x == HImode || mode_x == QImode))
11470 return CC_Zmode;
11472 /* Similarly, comparisons of zero_extends from shorter modes can
11473 be performed using an ANDS with an immediate mask. */
11474 if (y == const0_rtx && code_x == ZERO_EXTEND
11475 && (mode_x == SImode || mode_x == DImode)
11476 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11477 && (code == EQ || code == NE))
11478 return CC_Zmode;
11480 /* Zero extracts support equality comparisons. */
11481 if ((mode_x == SImode || mode_x == DImode)
11482 && y == const0_rtx
11483 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11484 && CONST_INT_P (XEXP (x, 2)))
11485 && (code == EQ || code == NE))
11486 return CC_Zmode;
11488 /* ANDS/BICS/TST support equality and all signed comparisons. */
11489 if ((mode_x == SImode || mode_x == DImode)
11490 && y == const0_rtx
11491 && (code_x == AND)
11492 && (code == EQ || code == NE || code == LT || code == GE
11493 || code == GT || code == LE))
11494 return CC_NZVmode;
11496 /* ADDS/SUBS correctly set N and Z flags. */
11497 if ((mode_x == SImode || mode_x == DImode)
11498 && y == const0_rtx
11499 && (code == EQ || code == NE || code == LT || code == GE)
11500 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11501 return CC_NZmode;
11503 /* A compare with a shifted operand. Because of canonicalization,
11504 the comparison will have to be swapped when we emit the assembly
11505 code. */
11506 if ((mode_x == SImode || mode_x == DImode)
11507 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11508 && (code_x == ASHIFT || code_x == ASHIFTRT
11509 || code_x == LSHIFTRT
11510 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11511 return CC_SWPmode;
11513 /* Similarly for a negated operand, but we can only do this for
11514 equalities. */
11515 if ((mode_x == SImode || mode_x == DImode)
11516 && (REG_P (y) || SUBREG_P (y))
11517 && (code == EQ || code == NE)
11518 && code_x == NEG)
11519 return CC_Zmode;
11521 /* A test for unsigned overflow from an addition. */
11522 if ((mode_x == DImode || mode_x == TImode)
11523 && (code == LTU || code == GEU)
11524 && code_x == PLUS
11525 && rtx_equal_p (XEXP (x, 0), y))
11526 return CC_Cmode;
11528 /* A test for unsigned overflow from an add with carry. */
11529 if ((mode_x == DImode || mode_x == TImode)
11530 && (code == LTU || code == GEU)
11531 && code_x == PLUS
11532 && CONST_SCALAR_INT_P (y)
11533 && (rtx_mode_t (y, mode_x)
11534 == (wi::shwi (1, mode_x)
11535 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11536 return CC_ADCmode;
11538 /* A test for signed overflow. */
11539 if ((mode_x == DImode || mode_x == TImode)
11540 && code == NE
11541 && code_x == PLUS
11542 && GET_CODE (y) == SIGN_EXTEND)
11543 return CC_Vmode;
11545 /* For everything else, return CCmode. */
11546 return CCmode;
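/* Illustrative examples of the classification above: an EQ/NE test of a
   HImode register against zero selects CC_Zmode (a TST with 0xffff);
   comparing (and:SI (reg) (const_int 255)) against zero with EQ selects
   CC_NZVmode (ANDS); and comparing a shifted operand such as
   (ashift:DI (reg) (const_int 2)) against a register selects CC_SWPmode,
   so the condition is swapped when the assembly is emitted.  */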
11549 static int
11550 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11553 aarch64_get_condition_code (rtx x)
11555 machine_mode mode = GET_MODE (XEXP (x, 0));
11556 enum rtx_code comp_code = GET_CODE (x);
11558 if (GET_MODE_CLASS (mode) != MODE_CC)
11559 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11560 return aarch64_get_condition_code_1 (mode, comp_code);
11563 static int
11564 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11566 switch (mode)
11568 case E_CCFPmode:
11569 case E_CCFPEmode:
11570 switch (comp_code)
11572 case GE: return AARCH64_GE;
11573 case GT: return AARCH64_GT;
11574 case LE: return AARCH64_LS;
11575 case LT: return AARCH64_MI;
11576 case NE: return AARCH64_NE;
11577 case EQ: return AARCH64_EQ;
11578 case ORDERED: return AARCH64_VC;
11579 case UNORDERED: return AARCH64_VS;
11580 case UNLT: return AARCH64_LT;
11581 case UNLE: return AARCH64_LE;
11582 case UNGT: return AARCH64_HI;
11583 case UNGE: return AARCH64_PL;
11584 default: return -1;
11586 break;
11588 case E_CCmode:
11589 switch (comp_code)
11591 case NE: return AARCH64_NE;
11592 case EQ: return AARCH64_EQ;
11593 case GE: return AARCH64_GE;
11594 case GT: return AARCH64_GT;
11595 case LE: return AARCH64_LE;
11596 case LT: return AARCH64_LT;
11597 case GEU: return AARCH64_CS;
11598 case GTU: return AARCH64_HI;
11599 case LEU: return AARCH64_LS;
11600 case LTU: return AARCH64_CC;
11601 default: return -1;
11603 break;
11605 case E_CC_SWPmode:
11606 switch (comp_code)
11608 case NE: return AARCH64_NE;
11609 case EQ: return AARCH64_EQ;
11610 case GE: return AARCH64_LE;
11611 case GT: return AARCH64_LT;
11612 case LE: return AARCH64_GE;
11613 case LT: return AARCH64_GT;
11614 case GEU: return AARCH64_LS;
11615 case GTU: return AARCH64_CC;
11616 case LEU: return AARCH64_CS;
11617 case LTU: return AARCH64_HI;
11618 default: return -1;
11620 break;
11622 case E_CC_NZCmode:
11623 switch (comp_code)
11625 case NE: return AARCH64_NE; /* = any */
11626 case EQ: return AARCH64_EQ; /* = none */
11627 case GE: return AARCH64_PL; /* = nfrst */
11628 case LT: return AARCH64_MI; /* = first */
11629 case GEU: return AARCH64_CS; /* = nlast */
11630 case GTU: return AARCH64_HI; /* = pmore */
11631 case LEU: return AARCH64_LS; /* = plast */
11632 case LTU: return AARCH64_CC; /* = last */
11633 default: return -1;
11635 break;
11637 case E_CC_NZVmode:
11638 switch (comp_code)
11640 case NE: return AARCH64_NE;
11641 case EQ: return AARCH64_EQ;
11642 case GE: return AARCH64_PL;
11643 case LT: return AARCH64_MI;
11644 case GT: return AARCH64_GT;
11645 case LE: return AARCH64_LE;
11646 default: return -1;
11648 break;
11650 case E_CC_NZmode:
11651 switch (comp_code)
11653 case NE: return AARCH64_NE;
11654 case EQ: return AARCH64_EQ;
11655 case GE: return AARCH64_PL;
11656 case LT: return AARCH64_MI;
11657 default: return -1;
11659 break;
11661 case E_CC_Zmode:
11662 switch (comp_code)
11664 case NE: return AARCH64_NE;
11665 case EQ: return AARCH64_EQ;
11666 default: return -1;
11668 break;
11670 case E_CC_Cmode:
11671 switch (comp_code)
11673 case LTU: return AARCH64_CS;
11674 case GEU: return AARCH64_CC;
11675 default: return -1;
11677 break;
11679 case E_CC_ADCmode:
11680 switch (comp_code)
11682 case GEU: return AARCH64_CS;
11683 case LTU: return AARCH64_CC;
11684 default: return -1;
11686 break;
11688 case E_CC_Vmode:
11689 switch (comp_code)
11691 case NE: return AARCH64_VS;
11692 case EQ: return AARCH64_VC;
11693 default: return -1;
11695 break;
11697 default:
11698 return -1;
11701 return -1;
11704 bool
11705 aarch64_const_vec_all_same_in_range_p (rtx x,
11706 HOST_WIDE_INT minval,
11707 HOST_WIDE_INT maxval)
11709 rtx elt;
11710 return (const_vec_duplicate_p (x, &elt)
11711 && CONST_INT_P (elt)
11712 && IN_RANGE (INTVAL (elt), minval, maxval));
11715 bool
11716 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11718 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11721 /* Return true if VEC is a constant in which every element is in the range
11722 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11724 static bool
11725 aarch64_const_vec_all_in_range_p (rtx vec,
11726 HOST_WIDE_INT minval,
11727 HOST_WIDE_INT maxval)
11729 if (!CONST_VECTOR_P (vec)
11730 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11731 return false;
11733 int nunits;
11734 if (!CONST_VECTOR_STEPPED_P (vec))
11735 nunits = const_vector_encoded_nelts (vec);
11736 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11737 return false;
11739 for (int i = 0; i < nunits; i++)
11741 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11742 if (!CONST_INT_P (vec_elem)
11743 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11744 return false;
11746 return true;
11749 /* N Z C V. */
11750 #define AARCH64_CC_V 1
11751 #define AARCH64_CC_C (1 << 1)
11752 #define AARCH64_CC_Z (1 << 2)
11753 #define AARCH64_CC_N (1 << 3)
11755 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11756 static const int aarch64_nzcv_codes[] =
11758 0, /* EQ, Z == 1. */
11759 AARCH64_CC_Z, /* NE, Z == 0. */
11760 0, /* CS, C == 1. */
11761 AARCH64_CC_C, /* CC, C == 0. */
11762 0, /* MI, N == 1. */
11763 AARCH64_CC_N, /* PL, N == 0. */
11764 0, /* VS, V == 1. */
11765 AARCH64_CC_V, /* VC, V == 0. */
11766 0, /* HI, C == 1 && Z == 0. */
11767 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11768 AARCH64_CC_V, /* GE, N == V. */
11769 0, /* LT, N != V. */
11770 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11771 0, /* LE, !(Z == 0 && N == V). */
11772 0, /* AL, Any. */
11773 0 /* NV, Any. */
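/* For example (assuming the usual AArch64 condition-code numbering, with
   AARCH64_NE == 1): aarch64_nzcv_codes[AARCH64_NE] is AARCH64_CC_Z == 4,
   so the '%k' output modifier below prints "4" for an NE condition
   operand.  */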
11776 /* Print floating-point vector immediate operand X to F, negating it
11777 first if NEGATE is true. Return true on success, false if it isn't
11778 a constant we can handle. */
11780 static bool
11781 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11783 rtx elt;
11785 if (!const_vec_duplicate_p (x, &elt))
11786 return false;
11788 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11789 if (negate)
11790 r = real_value_negate (&r);
11792 /* Handle the SVE single-bit immediates specially, since they have a
11793 fixed form in the assembly syntax. */
11794 if (real_equal (&r, &dconst0))
11795 asm_fprintf (f, "0.0");
11796 else if (real_equal (&r, &dconst2))
11797 asm_fprintf (f, "2.0");
11798 else if (real_equal (&r, &dconst1))
11799 asm_fprintf (f, "1.0");
11800 else if (real_equal (&r, &dconsthalf))
11801 asm_fprintf (f, "0.5");
11802 else
11804 const int buf_size = 20;
11805 char float_buf[buf_size] = {'\0'};
11806 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11807 1, GET_MODE (elt));
11808 asm_fprintf (f, "%s", float_buf);
11811 return true;
11814 /* Return the equivalent letter for size. */
11815 static char
11816 sizetochar (int size)
11818 switch (size)
11820 case 64: return 'd';
11821 case 32: return 's';
11822 case 16: return 'h';
11823 case 8 : return 'b';
11824 default: gcc_unreachable ();
11828 /* Print operand X to file F in a target specific manner according to CODE.
11829 The acceptable formatting commands given by CODE are:
11830 'c': An integer or symbol address without a preceding #
11831 sign.
11832 'C': Take the duplicated element in a vector constant
11833 and print it in hex.
11834 'D': Take the duplicated element in a vector constant
11835 and print it as an unsigned integer, in decimal.
11836 'e': Print the sign/zero-extend size as a character 8->b,
11837 16->h, 32->w. Can also be used for masks:
11838 0xff->b, 0xffff->h, 0xffffffff->w.
11839 'I': If the operand is a duplicated vector constant,
11840 replace it with the duplicated scalar. If the
11841 operand is then a floating-point constant, replace
11842 it with the integer bit representation. Print the
11843 transformed constant as a signed decimal number.
11844 'p': Prints N such that 2^N == X (X must be a power of 2 and
11845 a const_int).
11846 'P': Print the number of non-zero bits in X (a const_int).
11847 'H': Print the higher numbered register of a pair (TImode)
11848 of regs.
11849 'm': Print a condition (eq, ne, etc).
11850 'M': Same as 'm', but invert condition.
11851 'N': Take the duplicated element in a vector constant
11852 and print the negative of it in decimal.
11853 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11854 'S/T/U/V': Print a FP/SIMD register name for a register list.
11855 The register printed is the FP/SIMD register name
11856 of X + 0/1/2/3 for S/T/U/V.
11857 'R': Print a scalar Integer/FP/SIMD register name + 1.
11858 'X': Print bottom 16 bits of integer constant in hex.
11859 'w/x': Print a general register name or the zero register
11860 (32-bit or 64-bit).
11861 '0': Print a normal operand; if it's a general register,
11862 then we assume DImode.
11863 'k': Print NZCV for conditional compare instructions.
11864 'A': Output address constant representing the first
11865 argument of X, specifying a relocation offset
11866 if appropriate.
11867 'L': Output constant address specified by X
11868 with a relocation offset if appropriate.
11869 'G': Prints address of X, specifying a PC relative
11870 relocation mode if appropriate.
11871 'y': Output address of LDP or STP - this is used for
11872 some LDP/STPs which don't use a PARALLEL in their
11873 pattern (so the mode needs to be adjusted).
11874 'z': Output address of a typical LDP or STP. */
11876 static void
11877 aarch64_print_operand (FILE *f, rtx x, int code)
11879 rtx elt;
11880 switch (code)
11882 case 'c':
11883 if (CONST_INT_P (x))
11884 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11885 else
11887 poly_int64 offset;
11888 rtx base = strip_offset_and_salt (x, &offset);
11889 if (SYMBOL_REF_P (base))
11890 output_addr_const (f, x);
11891 else
11892 output_operand_lossage ("unsupported operand for code '%c'", code);
11894 break;
11896 case 'e':
11898 x = unwrap_const_vec_duplicate (x);
11899 if (!CONST_INT_P (x))
11901 output_operand_lossage ("invalid operand for '%%%c'", code);
11902 return;
11905 HOST_WIDE_INT val = INTVAL (x);
11906 if ((val & ~7) == 8 || val == 0xff)
11907 fputc ('b', f);
11908 else if ((val & ~7) == 16 || val == 0xffff)
11909 fputc ('h', f);
11910 else if ((val & ~7) == 32 || val == 0xffffffff)
11911 fputc ('w', f);
11912 else
11914 output_operand_lossage ("invalid operand for '%%%c'", code);
11915 return;
11918 break;
11920 case 'p':
11922 int n;
11924 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11926 output_operand_lossage ("invalid operand for '%%%c'", code);
11927 return;
11930 asm_fprintf (f, "%d", n);
11932 break;
11934 case 'P':
11935 if (!CONST_INT_P (x))
11937 output_operand_lossage ("invalid operand for '%%%c'", code);
11938 return;
11941 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11942 break;
11944 case 'H':
11945 if (x == const0_rtx)
11947 asm_fprintf (f, "xzr");
11948 break;
11951 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11953 output_operand_lossage ("invalid operand for '%%%c'", code);
11954 return;
11957 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11958 break;
11960 case 'I':
11962 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11963 if (CONST_INT_P (x))
11964 asm_fprintf (f, "%wd", INTVAL (x));
11965 else
11967 output_operand_lossage ("invalid operand for '%%%c'", code);
11968 return;
11970 break;
11973 case 'M':
11974 case 'm':
11976 int cond_code;
11977 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11978 if (x == const_true_rtx)
11980 if (code == 'M')
11981 fputs ("nv", f);
11982 return;
11985 if (!COMPARISON_P (x))
11987 output_operand_lossage ("invalid operand for '%%%c'", code);
11988 return;
11991 cond_code = aarch64_get_condition_code (x);
11992 gcc_assert (cond_code >= 0);
11993 if (code == 'M')
11994 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11995 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11996 fputs (aarch64_sve_condition_codes[cond_code], f);
11997 else
11998 fputs (aarch64_condition_codes[cond_code], f);
12000 break;
12002 case 'N':
12003 if (!const_vec_duplicate_p (x, &elt))
12005 output_operand_lossage ("invalid vector constant");
12006 return;
12009 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12010 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12011 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12012 && aarch64_print_vector_float_operand (f, x, true))
12014 else
12016 output_operand_lossage ("invalid vector constant");
12017 return;
12019 break;
12021 case 'b':
12022 case 'h':
12023 case 's':
12024 case 'd':
12025 case 'q':
12026 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12028 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12029 return;
12031 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12032 break;
12034 case 'S':
12035 case 'T':
12036 case 'U':
12037 case 'V':
12038 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12040 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12041 return;
12043 asm_fprintf (f, "%c%d",
12044 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12045 REGNO (x) - V0_REGNUM + (code - 'S'));
12046 break;
12048 case 'R':
12049 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12050 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12051 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12052 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12053 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12054 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12055 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12056 else
12057 output_operand_lossage ("incompatible register operand for '%%%c'",
12058 code);
12059 break;
12061 case 'X':
12062 if (!CONST_INT_P (x))
12064 output_operand_lossage ("invalid operand for '%%%c'", code);
12065 return;
12067 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12068 break;
12070 case 'C':
12072 /* Print a replicated constant in hex. */
12073 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12075 output_operand_lossage ("invalid operand for '%%%c'", code);
12076 return;
12078 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12079 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12081 break;
12083 case 'D':
12085 /* Print a replicated constant in decimal, treating it as
12086 unsigned. */
12087 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12089 output_operand_lossage ("invalid operand for '%%%c'", code);
12090 return;
12092 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12093 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12095 break;
12097 case 'w':
12098 case 'x':
12099 if (x == const0_rtx
12100 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
12102 asm_fprintf (f, "%czr", code);
12103 break;
12106 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12108 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12109 break;
12112 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12114 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12115 break;
12118 /* Fall through */
12120 case 0:
12121 if (x == NULL)
12123 output_operand_lossage ("missing operand");
12124 return;
12127 switch (GET_CODE (x))
12129 case REG:
12130 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12132 if (REG_NREGS (x) == 1)
12133 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12134 else
12136 char suffix
12137 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12138 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12139 REGNO (x) - V0_REGNUM, suffix,
12140 END_REGNO (x) - V0_REGNUM - 1, suffix);
12143 else
12144 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12145 break;
12147 case MEM:
12148 output_address (GET_MODE (x), XEXP (x, 0));
12149 break;
12151 case LABEL_REF:
12152 case SYMBOL_REF:
12153 output_addr_const (asm_out_file, x);
12154 break;
12156 case CONST_INT:
12157 asm_fprintf (f, "%wd", INTVAL (x));
12158 break;
12160 case CONST:
12161 if (!VECTOR_MODE_P (GET_MODE (x)))
12163 output_addr_const (asm_out_file, x);
12164 break;
12166 /* fall through */
12168 case CONST_VECTOR:
12169 if (!const_vec_duplicate_p (x, &elt))
12171 output_operand_lossage ("invalid vector constant");
12172 return;
12175 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12176 asm_fprintf (f, "%wd", INTVAL (elt));
12177 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12178 && aarch64_print_vector_float_operand (f, x, false))
12180 else
12182 output_operand_lossage ("invalid vector constant");
12183 return;
12185 break;
12187 case CONST_DOUBLE:
12188 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12189 be getting CONST_DOUBLEs holding integers. */
12190 gcc_assert (GET_MODE (x) != VOIDmode);
12191 if (aarch64_float_const_zero_rtx_p (x))
12193 fputc ('0', f);
12194 break;
12196 else if (aarch64_float_const_representable_p (x))
12198 #define buf_size 20
12199 char float_buf[buf_size] = {'\0'};
12200 real_to_decimal_for_mode (float_buf,
12201 CONST_DOUBLE_REAL_VALUE (x),
12202 buf_size, buf_size,
12203 1, GET_MODE (x));
12204 asm_fprintf (asm_out_file, "%s", float_buf);
12205 break;
12206 #undef buf_size
12208 output_operand_lossage ("invalid constant");
12209 return;
12210 default:
12211 output_operand_lossage ("invalid operand");
12212 return;
12214 break;
12216 case 'A':
12217 if (GET_CODE (x) == HIGH)
12218 x = XEXP (x, 0);
12220 switch (aarch64_classify_symbolic_expression (x))
12222 case SYMBOL_SMALL_GOT_4G:
12223 asm_fprintf (asm_out_file, ":got:");
12224 break;
12226 case SYMBOL_SMALL_TLSGD:
12227 asm_fprintf (asm_out_file, ":tlsgd:");
12228 break;
12230 case SYMBOL_SMALL_TLSDESC:
12231 asm_fprintf (asm_out_file, ":tlsdesc:");
12232 break;
12234 case SYMBOL_SMALL_TLSIE:
12235 asm_fprintf (asm_out_file, ":gottprel:");
12236 break;
12238 case SYMBOL_TLSLE24:
12239 asm_fprintf (asm_out_file, ":tprel:");
12240 break;
12242 case SYMBOL_TINY_GOT:
12243 gcc_unreachable ();
12244 break;
12246 default:
12247 break;
12249 output_addr_const (asm_out_file, x);
12250 break;
12252 case 'L':
12253 switch (aarch64_classify_symbolic_expression (x))
12255 case SYMBOL_SMALL_GOT_4G:
12256 asm_fprintf (asm_out_file, ":got_lo12:");
12257 break;
12259 case SYMBOL_SMALL_TLSGD:
12260 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12261 break;
12263 case SYMBOL_SMALL_TLSDESC:
12264 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12265 break;
12267 case SYMBOL_SMALL_TLSIE:
12268 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12269 break;
12271 case SYMBOL_TLSLE12:
12272 asm_fprintf (asm_out_file, ":tprel_lo12:");
12273 break;
12275 case SYMBOL_TLSLE24:
12276 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12277 break;
12279 case SYMBOL_TINY_GOT:
12280 asm_fprintf (asm_out_file, ":got:");
12281 break;
12283 case SYMBOL_TINY_TLSIE:
12284 asm_fprintf (asm_out_file, ":gottprel:");
12285 break;
12287 default:
12288 break;
12290 output_addr_const (asm_out_file, x);
12291 break;
12293 case 'G':
12294 switch (aarch64_classify_symbolic_expression (x))
12296 case SYMBOL_TLSLE24:
12297 asm_fprintf (asm_out_file, ":tprel_hi12:");
12298 break;
12299 default:
12300 break;
12302 output_addr_const (asm_out_file, x);
12303 break;
12305 case 'k':
12307 HOST_WIDE_INT cond_code;
12309 if (!CONST_INT_P (x))
12311 output_operand_lossage ("invalid operand for '%%%c'", code);
12312 return;
12315 cond_code = INTVAL (x);
12316 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12317 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12319 break;
12321 case 'y':
12322 case 'z':
12324 machine_mode mode = GET_MODE (x);
12326 if (!MEM_P (x)
12327 || (code == 'y'
12328 && maybe_ne (GET_MODE_SIZE (mode), 8)
12329 && maybe_ne (GET_MODE_SIZE (mode), 16)))
12331 output_operand_lossage ("invalid operand for '%%%c'", code);
12332 return;
12335 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12336 code == 'y'
12337 ? ADDR_QUERY_LDP_STP_N
12338 : ADDR_QUERY_LDP_STP))
12339 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12341 break;
12343 default:
12344 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12345 return;
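/* A few illustrative expansions of the modifiers documented above:
   '%w' on (reg:SI x5) prints "w5" and on const0_rtx prints "wzr";
   '%x' on the same operands prints "x5" and "xzr";
   '%H' on a TImode value held in {x0, x1} prints "x1";
   '%X' on (const_int 0x12345678) prints "0x5678".  */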
12349 /* Print address 'x' of a memory access with mode 'mode'.
12350 'type' is the aarch64_addr_query_type context required by aarch64_classify_address,
12351 e.g. ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
12352 static bool
12353 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12354 aarch64_addr_query_type type)
12356 struct aarch64_address_info addr;
12357 unsigned int size, vec_flags;
12359 /* Check that all addresses are Pmode - including for ILP32. */
12360 if (GET_MODE (x) != Pmode
12361 && (!CONST_INT_P (x)
12362 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12364 output_operand_lossage ("invalid address mode");
12365 return false;
12368 if (aarch64_classify_address (&addr, x, mode, true, type))
12369 switch (addr.type)
12371 case ADDRESS_REG_IMM:
12372 if (known_eq (addr.const_offset, 0))
12374 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12375 return true;
12378 vec_flags = aarch64_classify_vector_mode (mode);
12379 if (vec_flags & VEC_ANY_SVE)
12381 HOST_WIDE_INT vnum
12382 = exact_div (addr.const_offset,
12383 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12384 asm_fprintf (f, "[%s, #%wd, mul vl]",
12385 reg_names[REGNO (addr.base)], vnum);
12386 return true;
12389 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12390 INTVAL (addr.offset));
12391 return true;
12393 case ADDRESS_REG_REG:
12394 if (addr.shift == 0)
12395 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12396 reg_names [REGNO (addr.offset)]);
12397 else
12398 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12399 reg_names [REGNO (addr.offset)], addr.shift);
12400 return true;
12402 case ADDRESS_REG_UXTW:
12403 if (addr.shift == 0)
12404 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12405 REGNO (addr.offset) - R0_REGNUM);
12406 else
12407 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12408 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12409 return true;
12411 case ADDRESS_REG_SXTW:
12412 if (addr.shift == 0)
12413 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12414 REGNO (addr.offset) - R0_REGNUM);
12415 else
12416 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12417 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12418 return true;
12420 case ADDRESS_REG_WB:
12421 /* Writeback is only supported for fixed-width modes. */
12422 size = GET_MODE_SIZE (mode).to_constant ();
12423 switch (GET_CODE (x))
12425 case PRE_INC:
12426 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12427 return true;
12428 case POST_INC:
12429 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12430 return true;
12431 case PRE_DEC:
12432 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12433 return true;
12434 case POST_DEC:
12435 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12436 return true;
12437 case PRE_MODIFY:
12438 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12439 INTVAL (addr.offset));
12440 return true;
12441 case POST_MODIFY:
12442 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12443 INTVAL (addr.offset));
12444 return true;
12445 default:
12446 break;
12448 break;
12450 case ADDRESS_LO_SUM:
12451 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12452 output_addr_const (f, addr.offset);
12453 asm_fprintf (f, "]");
12454 return true;
12456 case ADDRESS_SYMBOLIC:
12457 output_addr_const (f, x);
12458 return true;
12461 return false;
12464 /* Print address 'x' of a memory access with mode 'mode'. */
12465 static void
12466 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12468 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12469 output_addr_const (f, x);
12472 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12474 static bool
12475 aarch64_output_addr_const_extra (FILE *file, rtx x)
12477 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12479 output_addr_const (file, XVECEXP (x, 0, 0));
12480 return true;
12482 return false;
12485 bool
12486 aarch64_label_mentioned_p (rtx x)
12488 const char *fmt;
12489 int i;
12491 if (LABEL_REF_P (x))
12492 return true;
12494 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12495 referencing instruction, but they are constant offsets, not
12496 symbols. */
12497 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12498 return false;
12500 fmt = GET_RTX_FORMAT (GET_CODE (x));
12501 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12503 if (fmt[i] == 'E')
12505 int j;
12507 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12508 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12509 return 1;
12511 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12512 return 1;
12515 return 0;
12518 /* Implement REGNO_REG_CLASS. */
12520 enum reg_class
12521 aarch64_regno_regclass (unsigned regno)
12523 if (STUB_REGNUM_P (regno))
12524 return STUB_REGS;
12526 if (GP_REGNUM_P (regno))
12527 return GENERAL_REGS;
12529 if (regno == SP_REGNUM)
12530 return STACK_REG;
12532 if (regno == FRAME_POINTER_REGNUM
12533 || regno == ARG_POINTER_REGNUM)
12534 return POINTER_REGS;
12536 if (FP_REGNUM_P (regno))
12537 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12538 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12540 if (PR_REGNUM_P (regno))
12541 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12543 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12544 return FFR_REGS;
12546 return NO_REGS;
12549 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12550 If OFFSET is out of range, return an offset of an anchor point
12551 that is in range. Return 0 otherwise. */
12553 static HOST_WIDE_INT
12554 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12555 machine_mode mode)
12557 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12558 if (size > 16)
12559 return (offset + 0x400) & ~0x7f0;
12561 /* For offsets that aren't a multiple of the access size, the limit is
12562 -256...255. */
12563 if (offset & (size - 1))
12565 /* BLKmode typically uses LDP of X-registers. */
12566 if (mode == BLKmode)
12567 return (offset + 512) & ~0x3ff;
12568 return (offset + 0x100) & ~0x1ff;
12571 /* Small negative offsets are supported. */
12572 if (IN_RANGE (offset, -256, 0))
12573 return 0;
12575 if (mode == TImode || mode == TFmode || mode == TDmode)
12576 return (offset + 0x100) & ~0x1ff;
12578   /* Otherwise, use a 12-bit unsigned offset scaled by the access size.  */
12579 return offset & (~0xfff * size);
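/* Presumably the TARGET_LEGITIMIZE_ADDRESS hook (inferred from the
   signature): try to rewrite address X for an access of mode MODE into a
   form the machine can use, returning the (possibly unchanged) address.  */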
12582 static rtx
12583 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12585 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12586 where mask is selected by alignment and size of the offset.
12587 We try to pick as large a range for the offset as possible to
12588 maximize the chance of a CSE. However, for aligned addresses
12589 we limit the range to 4k so that structures with different sized
12590 elements are likely to use the same base. We need to be careful
12591 not to split a CONST for some forms of address expression, otherwise
12592 it will generate sub-optimal code. */
12594 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12596 rtx base = XEXP (x, 0);
12597 rtx offset_rtx = XEXP (x, 1);
12598 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12600 if (GET_CODE (base) == PLUS)
12602 rtx op0 = XEXP (base, 0);
12603 rtx op1 = XEXP (base, 1);
12605 /* Force any scaling into a temp for CSE. */
12606 op0 = force_reg (Pmode, op0);
12607 op1 = force_reg (Pmode, op1);
12609 /* Let the pointer register be in op0. */
12610 if (REG_POINTER (op1))
12611 std::swap (op0, op1);
12613 /* If the pointer is virtual or frame related, then we know that
12614 virtual register instantiation or register elimination is going
12615 to apply a second constant. We want the two constants folded
12616 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12617 if (virt_or_elim_regno_p (REGNO (op0)))
12619 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12620 NULL_RTX, true, OPTAB_DIRECT);
12621 return gen_rtx_PLUS (Pmode, base, op1);
12624 /* Otherwise, in order to encourage CSE (and thence loop strength
12625 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12626 base = expand_binop (Pmode, add_optab, op0, op1,
12627 NULL_RTX, true, OPTAB_DIRECT);
12628 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12631 HOST_WIDE_INT size;
12632 if (GET_MODE_SIZE (mode).is_constant (&size))
12634 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12635 mode);
12636 if (base_offset != 0)
12638 base = plus_constant (Pmode, base, base_offset);
12639 base = force_operand (base, NULL_RTX);
12640 return plus_constant (Pmode, base, offset - base_offset);
12645 return x;
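/* Presumably the TARGET_SECONDARY_RELOAD hook (inferred from the
   signature): decide whether moving X of mode MODE to or from a register
   of class RCLASS needs help.  Either set SRI->icode to a reload pattern
   and return NO_REGS, return the class of a required scratch register, or
   return NO_REGS if nothing special is needed.  */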
12648 static reg_class_t
12649 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12650 reg_class_t rclass,
12651 machine_mode mode,
12652 secondary_reload_info *sri)
12654 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12655 LDR and STR. See the comment at the head of aarch64-sve.md for
12656 more details about the big-endian handling. */
12657 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12658 if (reg_class_subset_p (rclass, FP_REGS)
12659 && !((REG_P (x) && HARD_REGISTER_P (x))
12660 || aarch64_simd_valid_immediate (x, NULL))
12661 && mode != VNx16QImode
12662 && (vec_flags & VEC_SVE_DATA)
12663 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12665 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12666 return NO_REGS;
12669 /* If we have to disable direct literal pool loads and stores because the
12670 function is too big, then we need a scratch register. */
12671 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12672 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12673 || targetm.vector_mode_supported_p (GET_MODE (x)))
12674 && !aarch64_pcrelative_literal_loads)
12676 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12677 return NO_REGS;
12680 /* Without the TARGET_SIMD instructions we cannot move a Q register
12681 to a Q register directly. We need a scratch. */
12682 if (REG_P (x)
12683 && (mode == TFmode
12684 || mode == TImode
12685 || mode == TDmode
12686 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12687 && mode == GET_MODE (x)
12688 && !TARGET_SIMD
12689 && FP_REGNUM_P (REGNO (x))
12690 && reg_class_subset_p (rclass, FP_REGS))
12692 sri->icode = code_for_aarch64_reload_mov (mode);
12693 return NO_REGS;
12696   /* A TFmode, TImode or TDmode memory access should be handled via FP_REGS
12697      because AArch64 has richer addressing modes for LDR/STR instructions
12698      than for LDP/STP instructions.  */
12699 if (TARGET_FLOAT && rclass == GENERAL_REGS
12700 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12701 return FP_REGS;
12703 if (rclass == FP_REGS
12704 && (mode == TImode || mode == TFmode || mode == TDmode)
12705 && CONSTANT_P(x))
12706 return GENERAL_REGS;
12708 return NO_REGS;
12711 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12713 static bool
12714 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12715 reg_class_t class2)
12717 if (!TARGET_SIMD
12718 && reg_classes_intersect_p (class1, FP_REGS)
12719 && reg_classes_intersect_p (class2, FP_REGS))
12721 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12722 so we can't easily split a move involving tuples of 128-bit
12723 vectors. Force the copy through memory instead.
12725 (Tuples of 64-bit vectors are fine.) */
12726 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12727 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12728 return true;
12730 return false;
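/* Presumably the TARGET_CAN_ELIMINATE hook: return true if register FROM
   can be eliminated in favour of register TO.  */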
12733 static bool
12734 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12736 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12738 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12739 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12740 if (frame_pointer_needed)
12741 return to == HARD_FRAME_POINTER_REGNUM;
12742 return true;
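/* Return the offset to add when eliminating register FROM in favour of
   register TO, based on the current frame layout (presumably used by
   INITIAL_ELIMINATION_OFFSET).  */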
12745 poly_int64
12746 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12748 if (to == HARD_FRAME_POINTER_REGNUM)
12750 if (from == ARG_POINTER_REGNUM)
12751 return cfun->machine->frame.hard_fp_offset;
12753 if (from == FRAME_POINTER_REGNUM)
12754 return cfun->machine->frame.hard_fp_offset
12755 - cfun->machine->frame.locals_offset;
12758 if (to == STACK_POINTER_REGNUM)
12760 if (from == FRAME_POINTER_REGNUM)
12761 return cfun->machine->frame.frame_size
12762 - cfun->machine->frame.locals_offset;
12765 return cfun->machine->frame.frame_size;
12769 /* Get return address without mangling. */
12772 aarch64_return_addr_rtx (void)
12774 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12775 /* Note: aarch64_return_address_signing_enabled only
12776 works after cfun->machine->frame.laid_out is set,
12777 so here we don't know if the return address will
12778 be signed or not. */
12779 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12780 emit_move_insn (lr, val);
12781 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12782 return lr;
12786 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12787 previous frame. */
12790 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12792 if (count != 0)
12793 return const0_rtx;
12794 return aarch64_return_addr_rtx ();
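/* Output the assembly template for a nested-function trampoline
   (presumably TARGET_ASM_TRAMPOLINE_TEMPLATE): a BTI landing pad, loads of
   the jump target and static chain from the literal slots appended below,
   an indirect branch, and a speculation barrier.  */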
12797 static void
12798 aarch64_asm_trampoline_template (FILE *f)
12800 /* Even if the current function doesn't have branch protection, some
12801 later function might, so since this template is only generated once
12802 we have to add a BTI just in case. */
12803 asm_fprintf (f, "\thint\t34 // bti c\n");
12805 if (TARGET_ILP32)
12807 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12808 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12810 else
12812 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12813 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12815 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12817   /* We always emit a speculation barrier.
12818      This is because the same trampoline template is used for every nested
12819      function.  Since nested functions are not particularly common or
12820      performance-critical, we don't worry too much about the extra
12821      instructions this copies around.
12822      This is not yet a problem, since we have not yet implemented
12823      function-specific attributes to choose between hardening against
12824      straight-line speculation or not, but such attributes are likely to
12825      appear in the future.  */
12826 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12828 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12829 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
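/* Presumably the TARGET_TRAMPOLINE_INIT hook: copy the trampoline template
   into M_TRAMP, store FNDECL's address and CHAIN_VALUE in the trailing
   pointer slots, and flush the instruction cache over the trampoline.  */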
12832 static void
12833 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12835 rtx fnaddr, mem, a_tramp;
12836 const int tramp_code_sz = 24;
12838   /* We don't need to copy the trailing D-words; we fill those in below.  */
12839 /* We create our own memory address in Pmode so that `emit_block_move` can
12840 use parts of the backend which expect Pmode addresses. */
12841 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12842 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12843 assemble_trampoline_template (),
12844 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12845 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12846 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12847 if (GET_MODE (fnaddr) != ptr_mode)
12848 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12849 emit_move_insn (mem, fnaddr);
12851 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12852 emit_move_insn (mem, chain_value);
12854 /* XXX We should really define a "clear_cache" pattern and use
12855 gen_clear_cache(). */
12856 a_tramp = XEXP (m_tramp, 0);
12857 maybe_emit_call_builtin___clear_cache (a_tramp,
12858 plus_constant (ptr_mode,
12859 a_tramp,
12860 TRAMPOLINE_SIZE));
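/* Presumably the TARGET_CLASS_MAX_NREGS hook: return the maximum number of
   registers of class REGCLASS needed to hold a value of mode MODE.  */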
12863 static unsigned char
12864 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12866 /* ??? Logically we should only need to provide a value when
12867 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12868 can hold MODE, but at the moment we need to handle all modes.
12869 Just ignore any runtime parts for registers that can't store them. */
12870 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12871 unsigned int nregs, vec_flags;
12872 switch (regclass)
12874 case STUB_REGS:
12875 case TAILCALL_ADDR_REGS:
12876 case POINTER_REGS:
12877 case GENERAL_REGS:
12878 case ALL_REGS:
12879 case POINTER_AND_FP_REGS:
12880 case FP_REGS:
12881 case FP_LO_REGS:
12882 case FP_LO8_REGS:
12883 vec_flags = aarch64_classify_vector_mode (mode);
12884 if ((vec_flags & VEC_SVE_DATA)
12885 && constant_multiple_p (GET_MODE_SIZE (mode),
12886 aarch64_vl_bytes (mode, vec_flags), &nregs))
12887 return nregs;
12888 return (vec_flags & VEC_ADVSIMD
12889 ? CEIL (lowest_size, UNITS_PER_VREG)
12890 : CEIL (lowest_size, UNITS_PER_WORD));
12891 case STACK_REG:
12892 case PR_REGS:
12893 case PR_LO_REGS:
12894 case PR_HI_REGS:
12895 case FFR_REGS:
12896 case PR_AND_FFR_REGS:
12897 return 1;
12899 case NO_REGS:
12900 return 0;
12902 default:
12903 break;
12905 gcc_unreachable ();
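/* Presumably the TARGET_PREFERRED_RELOAD_CLASS hook: return the register
   class to use when reloading X into a register of class REGCLASS.  */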
12908 static reg_class_t
12909 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12911 if (regclass == POINTER_REGS)
12912 return GENERAL_REGS;
12914 if (regclass == STACK_REG)
12916 if (REG_P(x)
12917 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12918 return regclass;
12920 return NO_REGS;
12923   /* Register elimination can result in a request for
12924      SP+constant->FP_REGS.  We cannot support such operations, which
12925      use SP as the source and an FP_REG as the destination, so reject
12926      them right now.  */
12927 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12929 rtx lhs = XEXP (x, 0);
12931 /* Look through a possible SUBREG introduced by ILP32. */
12932 if (SUBREG_P (lhs))
12933 lhs = SUBREG_REG (lhs);
12935 gcc_assert (REG_P (lhs));
12936 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12937 POINTER_REGS));
12938 return NO_REGS;
12941 return regclass;
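/* Output to F a reference to the assembler name NAME, prefixed with %U so
   that the user-label prefix is applied (presumably used by
   ASM_OUTPUT_LABELREF).  */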
12944 void
12945 aarch64_asm_output_labelref (FILE* f, const char *name)
12947 asm_fprintf (f, "%U%s", name);
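/* Presumably the TARGET_ASM_CONSTRUCTOR hook: record constructor SYMBOL,
   placing non-default priorities in a numbered .init_array.NNNNN section.  */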
12950 static void
12951 aarch64_elf_asm_constructor (rtx symbol, int priority)
12953 if (priority == DEFAULT_INIT_PRIORITY)
12954 default_ctor_section_asm_out_constructor (symbol, priority);
12955 else
12957 section *s;
12958       /* Although priority is known to be in the range [0, 65535], and so 18
12959 	 bytes would be enough, the compiler might not know that.  To avoid a
12960 	 -Wformat-truncation false positive, use a larger size.  */
12961 char buf[23];
12962 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12963 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12964 switch_to_section (s);
12965 assemble_align (POINTER_SIZE);
12966 assemble_aligned_integer (POINTER_BYTES, symbol);
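/* Presumably the TARGET_ASM_DESTRUCTOR hook: record destructor SYMBOL,
   placing non-default priorities in a numbered .fini_array.NNNNN section.  */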
12970 static void
12971 aarch64_elf_asm_destructor (rtx symbol, int priority)
12973 if (priority == DEFAULT_INIT_PRIORITY)
12974 default_dtor_section_asm_out_destructor (symbol, priority);
12975 else
12977 section *s;
12978       /* Although priority is known to be in the range [0, 65535], and so 18
12979 	 bytes would be enough, the compiler might not know that.  To avoid a
12980 	 -Wformat-truncation false positive, use a larger size.  */
12981 char buf[23];
12982 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12983 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12984 switch_to_section (s);
12985 assemble_align (POINTER_SIZE);
12986 assemble_aligned_integer (POINTER_BYTES, symbol);
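/* Output the assembly for a switch-table dispatch sequence: load the table
   entry selected by the index, form the target address relative to an ADR
   of the table label, branch to it, and emit an SLS barrier.  OPERANDS are
   presumably those of the casesi dispatch pattern in aarch64.md.  */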
12990 const char*
12991 aarch64_output_casesi (rtx *operands)
12993 char buf[100];
12994 char label[100];
12995 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12996 int index;
12997 static const char *const patterns[4][2] =
13000 "ldrb\t%w3, [%0,%w1,uxtw]",
13001 "add\t%3, %4, %w3, sxtb #2"
13004 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13005 "add\t%3, %4, %w3, sxth #2"
13008 "ldr\t%w3, [%0,%w1,uxtw #2]",
13009 "add\t%3, %4, %w3, sxtw #2"
13011 /* We assume that DImode is only generated when not optimizing and
13012 that we don't really need 64-bit address offsets. That would
13013 imply an object file with 8GB of code in a single function! */
13015 "ldr\t%w3, [%0,%w1,uxtw #2]",
13016 "add\t%3, %4, %w3, sxtw #2"
13020 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13022 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13023 index = exact_log2 (GET_MODE_SIZE (mode));
13025 gcc_assert (index >= 0 && index <= 3);
13027   /* Need to implement table size reduction, by changing the code below.  */
13028 output_asm_insn (patterns[index][0], operands);
13029 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13030 snprintf (buf, sizeof (buf),
13031 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13032 output_asm_insn (buf, operands);
13033 output_asm_insn (patterns[index][1], operands);
13034 output_asm_insn ("br\t%3", operands);
13035 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13036 operands);
13037 assemble_label (asm_out_file, label);
13038 return "";
13042 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13043 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13044 operator. */
13047 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13049 if (shift >= 0 && shift <= 4)
13051 int size;
13052 for (size = 8; size <= 32; size *= 2)
13054 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13055 if (mask == bits << shift)
13056 return size;
13059 return 0;
13062 /* Constant pools are per-function only when PC-relative literal
13063    loads are enabled or we are using the large memory
13064    model.  */
13066 static inline bool
13067 aarch64_can_use_per_function_literal_pools_p (void)
13069 return (aarch64_pcrelative_literal_loads
13070 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
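/* Presumably the TARGET_USE_BLOCKS_FOR_CONSTANT_P hook.  */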
13073 static bool
13074 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13076 /* We can't use blocks for constants when we're using a per-function
13077 constant pool. */
13078 return !aarch64_can_use_per_function_literal_pools_p ();
13081 /* Select appropriate section for constants depending
13082 on where we place literal pools. */
13084 static section *
13085 aarch64_select_rtx_section (machine_mode mode,
13086 rtx x,
13087 unsigned HOST_WIDE_INT align)
13089 if (aarch64_can_use_per_function_literal_pools_p ())
13090 return function_section (current_function_decl);
13092 return default_elf_select_rtx_section (mode, x, align);
13095 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13096 void
13097 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13098 HOST_WIDE_INT offset)
13100   /* When using per-function literal pools, we must ensure that any code
13101      section is aligned to the minimal instruction length, otherwise the
13102      assembler will complain about "unaligned instructions".  */
13103 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13104 ASM_OUTPUT_ALIGN (f, 2);
13107 /* Costs. */
13109 /* Helper function for rtx cost calculation. Strip a shift expression
13110 from X. Returns the inner operand if successful, or the original
13111 expression on failure. */
13112 static rtx
13113 aarch64_strip_shift (rtx x)
13115 rtx op = x;
13117 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13118 we can convert both to ROR during final output. */
13119 if ((GET_CODE (op) == ASHIFT
13120 || GET_CODE (op) == ASHIFTRT
13121 || GET_CODE (op) == LSHIFTRT
13122 || GET_CODE (op) == ROTATERT
13123 || GET_CODE (op) == ROTATE)
13124 && CONST_INT_P (XEXP (op, 1)))
13125 return XEXP (op, 0);
13127 if (GET_CODE (op) == MULT
13128 && CONST_INT_P (XEXP (op, 1))
13129 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13130 return XEXP (op, 0);
13132 return x;
13135 /* Helper function for rtx cost calculation. Strip an extend
13136 expression from X. Returns the inner operand if successful, or the
13137 original expression on failure. We deal with a number of possible
13138 canonicalization variations here. If STRIP_SHIFT is true, then
13139 we can strip off a shift also. */
13140 static rtx
13141 aarch64_strip_extend (rtx x, bool strip_shift)
13143 scalar_int_mode mode;
13144 rtx op = x;
13146 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13147 return op;
13149 if (GET_CODE (op) == AND
13150 && GET_CODE (XEXP (op, 0)) == MULT
13151 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13152 && CONST_INT_P (XEXP (op, 1))
13153 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13154 INTVAL (XEXP (op, 1))) != 0)
13155 return XEXP (XEXP (op, 0), 0);
13157 /* Now handle extended register, as this may also have an optional
13158 left shift by 1..4. */
13159 if (strip_shift
13160 && GET_CODE (op) == ASHIFT
13161 && CONST_INT_P (XEXP (op, 1))
13162 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13163 op = XEXP (op, 0);
13165 if (GET_CODE (op) == ZERO_EXTEND
13166 || GET_CODE (op) == SIGN_EXTEND)
13167 op = XEXP (op, 0);
13169 if (op != x)
13170 return op;
13172 return x;
13175 /* Helper function for rtx cost calculation. Strip extension as well as any
13176 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13177 successful, or the original expression on failure. */
13178 static rtx
13179 aarch64_strip_extend_vec_half (rtx x)
13181 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13183 x = XEXP (x, 0);
13184 if (GET_CODE (x) == VEC_SELECT
13185 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13186 XEXP (x, 1)))
13187 x = XEXP (x, 0);
13189 return x;
13192 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13193 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13194 operand if successful, or the original expression on failure. */
13195 static rtx
13196 aarch64_strip_duplicate_vec_elt (rtx x)
13198 if (GET_CODE (x) == VEC_DUPLICATE
13199 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13201 x = XEXP (x, 0);
13202 if (GET_CODE (x) == VEC_SELECT)
13203 x = XEXP (x, 0);
13204 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13205 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13206 x = XEXP (XEXP (x, 0), 0);
13208 return x;
13211 /* Return true iff CODE is a shift supported in combination
13212 with arithmetic instructions. */
13214 static bool
13215 aarch64_shift_p (enum rtx_code code)
13217 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13221 /* Return true iff X is a cheap shift without a sign extend. */
13223 static bool
13224 aarch64_cheap_mult_shift_p (rtx x)
13226 rtx op0, op1;
13228 op0 = XEXP (x, 0);
13229 op1 = XEXP (x, 1);
13231 if (!(aarch64_tune_params.extra_tuning_flags
13232 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13233 return false;
13235 if (GET_CODE (op0) == SIGN_EXTEND)
13236 return false;
13238 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13239 && UINTVAL (op1) <= 4)
13240 return true;
13242 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13243 return false;
13245 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13247 if (l2 > 0 && l2 <= 4)
13248 return true;
13250 return false;
13253 /* Helper function for rtx cost calculation. Calculate the cost of
13254 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13255 Return the calculated cost of the expression, recursing manually in to
13256 operands where needed. */
13258 static int
13259 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13261 rtx op0, op1;
13262 const struct cpu_cost_table *extra_cost
13263 = aarch64_tune_params.insn_extra_cost;
13264 int cost = 0;
13265 bool compound_p = (outer == PLUS || outer == MINUS);
13266 machine_mode mode = GET_MODE (x);
13268 gcc_checking_assert (code == MULT);
13270 op0 = XEXP (x, 0);
13271 op1 = XEXP (x, 1);
13273 if (VECTOR_MODE_P (mode))
13275 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13276 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13278 /* The select-operand-high-half versions of the instruction have the
13279 same cost as the three vector version - don't add the costs of the
13280 extension or selection into the costs of the multiply. */
13281 op0 = aarch64_strip_extend_vec_half (op0);
13282 op1 = aarch64_strip_extend_vec_half (op1);
13283 /* The by-element versions of the instruction have the same costs as
13284 the normal 3-vector version. We make an assumption that the input
13285 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13286 costing of a MUL by element pre RA is a bit optimistic. */
13287 op0 = aarch64_strip_duplicate_vec_elt (op0);
13288 op1 = aarch64_strip_duplicate_vec_elt (op1);
13290 cost += rtx_cost (op0, mode, MULT, 0, speed);
13291 cost += rtx_cost (op1, mode, MULT, 1, speed);
13292 if (speed)
13294 if (GET_CODE (x) == MULT)
13295 cost += extra_cost->vect.mult;
13296 /* This is to catch the SSRA costing currently flowing here. */
13297 else
13298 cost += extra_cost->vect.alu;
13300 return cost;
13303 /* Integer multiply/fma. */
13304 if (GET_MODE_CLASS (mode) == MODE_INT)
13306 /* The multiply will be canonicalized as a shift, cost it as such. */
13307 if (aarch64_shift_p (GET_CODE (x))
13308 || (CONST_INT_P (op1)
13309 && exact_log2 (INTVAL (op1)) > 0))
13311 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13312 || GET_CODE (op0) == SIGN_EXTEND;
13313 if (speed)
13315 if (compound_p)
13317 /* If the shift is considered cheap,
13318 then don't add any cost. */
13319 if (aarch64_cheap_mult_shift_p (x))
13321 else if (REG_P (op1))
13322 /* ARITH + shift-by-register. */
13323 cost += extra_cost->alu.arith_shift_reg;
13324 else if (is_extend)
13325 /* ARITH + extended register. We don't have a cost field
13326 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13327 cost += extra_cost->alu.extend_arith;
13328 else
13329 /* ARITH + shift-by-immediate. */
13330 cost += extra_cost->alu.arith_shift;
13332 else
13333 /* LSL (immediate). */
13334 cost += extra_cost->alu.shift;
13337 /* Strip extends as we will have costed them in the case above. */
13338 if (is_extend)
13339 op0 = aarch64_strip_extend (op0, true);
13341 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13343 return cost;
13346       /* MNEG or [US]MNEGL.  Extract the NEG operand, mark the expression as a
13347 	 compound, and let the cases below handle it.  After all, MNEG is a
13348 	 special-case alias of MSUB.  */
13349 if (GET_CODE (op0) == NEG)
13351 op0 = XEXP (op0, 0);
13352 compound_p = true;
13355 /* Integer multiplies or FMAs have zero/sign extending variants. */
13356 if ((GET_CODE (op0) == ZERO_EXTEND
13357 && GET_CODE (op1) == ZERO_EXTEND)
13358 || (GET_CODE (op0) == SIGN_EXTEND
13359 && GET_CODE (op1) == SIGN_EXTEND))
13361 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13362 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13364 if (speed)
13366 if (compound_p)
13367 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13368 cost += extra_cost->mult[0].extend_add;
13369 else
13370 /* MUL/SMULL/UMULL. */
13371 cost += extra_cost->mult[0].extend;
13374 return cost;
13377 /* This is either an integer multiply or a MADD. In both cases
13378 we want to recurse and cost the operands. */
13379 cost += rtx_cost (op0, mode, MULT, 0, speed);
13380 cost += rtx_cost (op1, mode, MULT, 1, speed);
13382 if (speed)
13384 if (compound_p)
13385 /* MADD/MSUB. */
13386 cost += extra_cost->mult[mode == DImode].add;
13387 else
13388 /* MUL. */
13389 cost += extra_cost->mult[mode == DImode].simple;
13392 return cost;
13394 else
13396 if (speed)
13398 	  /* Floating-point FMA/FMUL can also support negations of the
13399 	     operands, unless the rounding mode is upward or downward, in
13400 	     which case FNMUL is different from FMUL with operand negation.  */
13401 bool neg0 = GET_CODE (op0) == NEG;
13402 bool neg1 = GET_CODE (op1) == NEG;
13403 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13405 if (neg0)
13406 op0 = XEXP (op0, 0);
13407 if (neg1)
13408 op1 = XEXP (op1, 0);
13411 if (compound_p)
13412 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13413 cost += extra_cost->fp[mode == DFmode].fma;
13414 else
13415 /* FMUL/FNMUL. */
13416 cost += extra_cost->fp[mode == DFmode].mult;
13419 cost += rtx_cost (op0, mode, MULT, 0, speed);
13420 cost += rtx_cost (op1, mode, MULT, 1, speed);
13421 return cost;
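/* Return the cost of address X for an access of mode MODE, based on the
   tuning-specific address cost tables (presumably the TARGET_ADDRESS_COST
   hook, given the signature).  */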
13425 static int
13426 aarch64_address_cost (rtx x,
13427 machine_mode mode,
13428 addr_space_t as ATTRIBUTE_UNUSED,
13429 bool speed)
13431 enum rtx_code c = GET_CODE (x);
13432 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13433 struct aarch64_address_info info;
13434 int cost = 0;
13435 info.shift = 0;
13437 if (!aarch64_classify_address (&info, x, mode, false))
13439 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13441 /* This is a CONST or SYMBOL ref which will be split
13442 in a different way depending on the code model in use.
13443 Cost it through the generic infrastructure. */
13444 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13445 /* Divide through by the cost of one instruction to
13446 bring it to the same units as the address costs. */
13447 cost_symbol_ref /= COSTS_N_INSNS (1);
13448 /* The cost is then the cost of preparing the address,
13449 followed by an immediate (possibly 0) offset. */
13450 return cost_symbol_ref + addr_cost->imm_offset;
13452 else
13454 /* This is most likely a jump table from a case
13455 statement. */
13456 return addr_cost->register_offset;
13460 switch (info.type)
13462 case ADDRESS_LO_SUM:
13463 case ADDRESS_SYMBOLIC:
13464 case ADDRESS_REG_IMM:
13465 cost += addr_cost->imm_offset;
13466 break;
13468 case ADDRESS_REG_WB:
13469 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13470 cost += addr_cost->pre_modify;
13471 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13473 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13474 if (nvectors == 3)
13475 cost += addr_cost->post_modify_ld3_st3;
13476 else if (nvectors == 4)
13477 cost += addr_cost->post_modify_ld4_st4;
13478 else
13479 cost += addr_cost->post_modify;
13481 else
13482 gcc_unreachable ();
13484 break;
13486 case ADDRESS_REG_REG:
13487 cost += addr_cost->register_offset;
13488 break;
13490 case ADDRESS_REG_SXTW:
13491 cost += addr_cost->register_sextend;
13492 break;
13494 case ADDRESS_REG_UXTW:
13495 cost += addr_cost->register_zextend;
13496 break;
13498 default:
13499 gcc_unreachable ();
13503 if (info.shift > 0)
13505       /* For the sake of calculating the cost of the shifted register
13506 	 component, we can treat same-sized modes in the same way.  */
13507 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13508 cost += addr_cost->addr_scale_costs.hi;
13509 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13510 cost += addr_cost->addr_scale_costs.si;
13511 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13512 cost += addr_cost->addr_scale_costs.di;
13513 else
13514 /* We can't tell, or this is a 128-bit vector. */
13515 cost += addr_cost->addr_scale_costs.ti;
13518 return cost;
13521 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13522 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13523 to be taken. */
13526 aarch64_branch_cost (bool speed_p, bool predictable_p)
13528 /* When optimizing for speed, use the cost of unpredictable branches. */
13529 const struct cpu_branch_cost *branch_costs =
13530 aarch64_tune_params.branch_costs;
13532 if (!speed_p || predictable_p)
13533 return branch_costs->predictable;
13534 else
13535 return branch_costs->unpredictable;
13538 /* Return true if X is a zero or sign extract
13539 usable in an ADD or SUB (extended register) instruction. */
13540 static bool
13541 aarch64_rtx_arith_op_extract_p (rtx x)
13543 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13544 No shift. */
13545 if (GET_CODE (x) == SIGN_EXTEND
13546 || GET_CODE (x) == ZERO_EXTEND)
13547 return REG_P (XEXP (x, 0));
13549 return false;
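/* Return true if U is one of the FRINT* floating-point rounding unspecs.  */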
13552 static bool
13553 aarch64_frint_unspec_p (unsigned int u)
13555 switch (u)
13557 case UNSPEC_FRINTZ:
13558 case UNSPEC_FRINTP:
13559 case UNSPEC_FRINTM:
13560 case UNSPEC_FRINTA:
13561 case UNSPEC_FRINTN:
13562 case UNSPEC_FRINTX:
13563 case UNSPEC_FRINTI:
13564 return true;
13566 default:
13567 return false;
13571 /* Return true iff X is an rtx that will match an extr instruction
13572 i.e. as described in the *extr<mode>5_insn family of patterns.
13573 OP0 and OP1 will be set to the operands of the shifts involved
13574 on success and will be NULL_RTX otherwise. */
13576 static bool
13577 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13579 rtx op0, op1;
13580 scalar_int_mode mode;
13581 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13582 return false;
13584 *res_op0 = NULL_RTX;
13585 *res_op1 = NULL_RTX;
13587 if (GET_CODE (x) != IOR)
13588 return false;
13590 op0 = XEXP (x, 0);
13591 op1 = XEXP (x, 1);
13593 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13594 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13596 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13597 if (GET_CODE (op1) == ASHIFT)
13598 std::swap (op0, op1);
13600 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13601 return false;
13603 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13604 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13606 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13607 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13609 *res_op0 = XEXP (op0, 0);
13610 *res_op1 = XEXP (op1, 0);
13611 return true;
13615 return false;
13618 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13619 storing it in *COST. Result is true if the total cost of the operation
13620 has now been calculated. */
13621 static bool
13622 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13624 rtx inner;
13625 rtx comparator;
13626 enum rtx_code cmpcode;
13627 const struct cpu_cost_table *extra_cost
13628 = aarch64_tune_params.insn_extra_cost;
13630 if (COMPARISON_P (op0))
13632 inner = XEXP (op0, 0);
13633 comparator = XEXP (op0, 1);
13634 cmpcode = GET_CODE (op0);
13636 else
13638 inner = op0;
13639 comparator = const0_rtx;
13640 cmpcode = NE;
13643 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13645 /* Conditional branch. */
13646 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13647 return true;
13648 else
13650 if (cmpcode == NE || cmpcode == EQ)
13652 if (comparator == const0_rtx)
13654 /* TBZ/TBNZ/CBZ/CBNZ. */
13655 if (GET_CODE (inner) == ZERO_EXTRACT)
13656 /* TBZ/TBNZ. */
13657 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13658 ZERO_EXTRACT, 0, speed);
13659 else
13660 /* CBZ/CBNZ. */
13661 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13663 return true;
13665 if (register_operand (inner, VOIDmode)
13666 && aarch64_imm24 (comparator, VOIDmode))
13668 /* SUB and SUBS. */
13669 *cost += COSTS_N_INSNS (2);
13670 if (speed)
13671 *cost += extra_cost->alu.arith * 2;
13672 return true;
13675 else if (cmpcode == LT || cmpcode == GE)
13677 /* TBZ/TBNZ. */
13678 if (comparator == const0_rtx)
13679 return true;
13683 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13685 /* CCMP. */
13686 if (GET_CODE (op1) == COMPARE)
13688 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13689 if (XEXP (op1, 1) == const0_rtx)
13690 *cost += 1;
13691 if (speed)
13693 machine_mode mode = GET_MODE (XEXP (op1, 0));
13695 if (GET_MODE_CLASS (mode) == MODE_INT)
13696 *cost += extra_cost->alu.arith;
13697 else
13698 *cost += extra_cost->fp[mode == DFmode].compare;
13700 return true;
13703 /* It's a conditional operation based on the status flags,
13704 so it must be some flavor of CSEL. */
13706 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13707 if (GET_CODE (op1) == NEG
13708 || GET_CODE (op1) == NOT
13709 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13710 op1 = XEXP (op1, 0);
13711 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13713 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13714 op1 = XEXP (op1, 0);
13715 op2 = XEXP (op2, 0);
13717 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13719 inner = XEXP (op1, 0);
13720 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13721 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13722 op1 = XEXP (inner, 0);
13725 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13726 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13727 return true;
13730   /* We don't know what this is, so cost all operands.  */
13731 return false;
13734 /* Check whether X is a bitfield operation of the form shift + extend that
13735 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13736 operand to which the bitfield operation is applied. Otherwise return
13737 NULL_RTX. */
13739 static rtx
13740 aarch64_extend_bitfield_pattern_p (rtx x)
13742 rtx_code outer_code = GET_CODE (x);
13743 machine_mode outer_mode = GET_MODE (x);
13745 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13746 && outer_mode != SImode && outer_mode != DImode)
13747 return NULL_RTX;
13749 rtx inner = XEXP (x, 0);
13750 rtx_code inner_code = GET_CODE (inner);
13751 machine_mode inner_mode = GET_MODE (inner);
13752 rtx op = NULL_RTX;
13754 switch (inner_code)
13756 case ASHIFT:
13757 if (CONST_INT_P (XEXP (inner, 1))
13758 && (inner_mode == QImode || inner_mode == HImode))
13759 op = XEXP (inner, 0);
13760 break;
13761 case LSHIFTRT:
13762 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13763 && (inner_mode == QImode || inner_mode == HImode))
13764 op = XEXP (inner, 0);
13765 break;
13766 case ASHIFTRT:
13767 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13768 && (inner_mode == QImode || inner_mode == HImode))
13769 op = XEXP (inner, 0);
13770 break;
13771 default:
13772 break;
13775 return op;
13778 /* Return true if the mask and a shift amount from an RTX of the form
13779 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13780 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13782 bool
13783 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13784 rtx shft_amnt)
13786 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13787 && INTVAL (mask) > 0
13788 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13789 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13790 && (UINTVAL (mask)
13791 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
13794 /* Return true if the masks and a shift amount from an RTX of the form
13795 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13796    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
13798 bool
13799 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13800 unsigned HOST_WIDE_INT mask1,
13801 unsigned HOST_WIDE_INT shft_amnt,
13802 unsigned HOST_WIDE_INT mask2)
13804 unsigned HOST_WIDE_INT t;
13806 /* Verify that there is no overlap in what bits are set in the two masks. */
13807 if (mask1 != ~mask2)
13808 return false;
13810 /* Verify that mask2 is not all zeros or ones. */
13811 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13812 return false;
13814 /* The shift amount should always be less than the mode size. */
13815 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13817 /* Verify that the mask being shifted is contiguous and would be in the
13818 least significant bits after shifting by shft_amnt. */
13819 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13820 return (t == (t & -t));
13823 /* Calculate the cost of calculating X, storing it in *COST. Result
13824 is true if the total cost of the operation has now been calculated. */
13825 static bool
13826 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13827 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13829 rtx op0, op1, op2;
13830 const struct cpu_cost_table *extra_cost
13831 = aarch64_tune_params.insn_extra_cost;
13832 rtx_code code = GET_CODE (x);
13833 scalar_int_mode int_mode;
13835 /* By default, assume that everything has equivalent cost to the
13836 cheapest instruction. Any additional costs are applied as a delta
13837 above this default. */
13838 *cost = COSTS_N_INSNS (1);
13840 switch (code)
13842 case SET:
13843 /* The cost depends entirely on the operands to SET. */
13844 *cost = 0;
13845 op0 = SET_DEST (x);
13846 op1 = SET_SRC (x);
13848 switch (GET_CODE (op0))
13850 case MEM:
13851 if (speed)
13853 rtx address = XEXP (op0, 0);
13854 if (VECTOR_MODE_P (mode))
13855 *cost += extra_cost->ldst.storev;
13856 else if (GET_MODE_CLASS (mode) == MODE_INT)
13857 *cost += extra_cost->ldst.store;
13858 else if (mode == SFmode || mode == SDmode)
13859 *cost += extra_cost->ldst.storef;
13860 else if (mode == DFmode || mode == DDmode)
13861 *cost += extra_cost->ldst.stored;
13863 *cost +=
13864 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13865 0, speed));
13868 *cost += rtx_cost (op1, mode, SET, 1, speed);
13869 return true;
13871 case SUBREG:
13872 if (! REG_P (SUBREG_REG (op0)))
13873 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13875 /* Fall through. */
13876 case REG:
13877 /* The cost is one per vector-register copied. */
13878 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13880 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13881 *cost = COSTS_N_INSNS (nregs);
13883 /* const0_rtx is in general free, but we will use an
13884 instruction to set a register to 0. */
13885 else if (REG_P (op1) || op1 == const0_rtx)
13887 /* The cost is 1 per register copied. */
13888 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13889 *cost = COSTS_N_INSNS (nregs);
13891 else
13892 /* Cost is just the cost of the RHS of the set. */
13893 *cost += rtx_cost (op1, mode, SET, 1, speed);
13894 return true;
13896 case ZERO_EXTRACT:
13897 case SIGN_EXTRACT:
13898 /* Bit-field insertion. Strip any redundant widening of
13899 the RHS to meet the width of the target. */
13900 if (SUBREG_P (op1))
13901 op1 = SUBREG_REG (op1);
13902 if ((GET_CODE (op1) == ZERO_EXTEND
13903 || GET_CODE (op1) == SIGN_EXTEND)
13904 && CONST_INT_P (XEXP (op0, 1))
13905 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13906 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13907 op1 = XEXP (op1, 0);
13909 if (CONST_INT_P (op1))
13911 /* MOV immediate is assumed to always be cheap. */
13912 *cost = COSTS_N_INSNS (1);
13914 else
13916 /* BFM. */
13917 if (speed)
13918 *cost += extra_cost->alu.bfi;
13919 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13922 return true;
13924 default:
13925 	  /* We can't make sense of this; assume the default cost.  */
13926 *cost = COSTS_N_INSNS (1);
13927 return false;
13929 return false;
13931 case CONST_INT:
13932 /* If an instruction can incorporate a constant within the
13933 instruction, the instruction's expression avoids calling
13934 rtx_cost() on the constant. If rtx_cost() is called on a
13935 constant, then it is usually because the constant must be
13936 moved into a register by one or more instructions.
13938 The exception is constant 0, which can be expressed
13939 as XZR/WZR and is therefore free. The exception to this is
13940 if we have (set (reg) (const0_rtx)) in which case we must cost
13941 the move. However, we can catch that when we cost the SET, so
13942 we don't need to consider that here. */
13943 if (x == const0_rtx)
13944 *cost = 0;
13945 else
13947 	  /* To an approximation, the cost of building any other constant is
13948 	     proportional to the number of instructions
13949 	     required to build that constant.  This is true whether we
13950 	     are compiling for SPEED or otherwise.  */
13951 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
13952 ? SImode : DImode;
13953 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13954 (NULL_RTX, x, false, imode));
13956 return true;
13958 case CONST_DOUBLE:
13960 /* First determine number of instructions to do the move
13961 as an integer constant. */
13962 if (!aarch64_float_const_representable_p (x)
13963 && !aarch64_can_const_movi_rtx_p (x, mode)
13964 && aarch64_float_const_rtx_p (x))
13966 unsigned HOST_WIDE_INT ival;
13967 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13968 gcc_assert (succeed);
13970 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
13971 ? DImode : SImode;
13972 int ncost = aarch64_internal_mov_immediate
13973 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13974 *cost += COSTS_N_INSNS (ncost);
13975 return true;
13978 if (speed)
13980 /* mov[df,sf]_aarch64. */
13981 if (aarch64_float_const_representable_p (x))
13982 /* FMOV (scalar immediate). */
13983 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
13984 else if (!aarch64_float_const_zero_rtx_p (x))
13986 /* This will be a load from memory. */
13987 if (mode == DFmode || mode == DDmode)
13988 *cost += extra_cost->ldst.loadd;
13989 else
13990 *cost += extra_cost->ldst.loadf;
13992 else
13993 	  /* Otherwise this is +0.0.  We get this using MOVI d0, #0
13994 	     or MOV v0.s[0], wzr - neither of which is modelled by the
13995 	     cost tables.  Just use the default cost.  */
14000 return true;
14002 case MEM:
14003 if (speed)
14005 /* For loads we want the base cost of a load, plus an
14006 approximation for the additional cost of the addressing
14007 mode. */
14008 rtx address = XEXP (x, 0);
14009 if (VECTOR_MODE_P (mode))
14010 *cost += extra_cost->ldst.loadv;
14011 else if (GET_MODE_CLASS (mode) == MODE_INT)
14012 *cost += extra_cost->ldst.load;
14013 else if (mode == SFmode || mode == SDmode)
14014 *cost += extra_cost->ldst.loadf;
14015 else if (mode == DFmode || mode == DDmode)
14016 *cost += extra_cost->ldst.loadd;
14018 *cost +=
14019 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14020 0, speed));
14023 return true;
14025 case NEG:
14026 op0 = XEXP (x, 0);
14028 if (VECTOR_MODE_P (mode))
14030 if (speed)
14032 /* FNEG. */
14033 *cost += extra_cost->vect.alu;
14035 return false;
14038 if (GET_MODE_CLASS (mode) == MODE_INT)
14040 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14041 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14043 /* CSETM. */
14044 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14045 return true;
14048 /* Cost this as SUB wzr, X. */
14049 op0 = CONST0_RTX (mode);
14050 op1 = XEXP (x, 0);
14051 goto cost_minus;
14054 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14056 	  /* Support (neg(fma...)) as a single instruction only if the
14057 	     sign of zeros is unimportant.  This matches the decision
14058 	     making in aarch64.md.  */
14059 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14061 /* FNMADD. */
14062 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14063 return true;
14065 if (GET_CODE (op0) == MULT)
14067 /* FNMUL. */
14068 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14069 return true;
14071 if (speed)
14072 /* FNEG. */
14073 *cost += extra_cost->fp[mode == DFmode].neg;
14074 return false;
14077 return false;
14079 case CLRSB:
14080 case CLZ:
14081 if (speed)
14083 if (VECTOR_MODE_P (mode))
14084 *cost += extra_cost->vect.alu;
14085 else
14086 *cost += extra_cost->alu.clz;
14089 return false;
14091 case CTZ:
14092 *cost = COSTS_N_INSNS (2);
14094 if (speed)
14095 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14096 return false;
14098 case COMPARE:
14099 op0 = XEXP (x, 0);
14100 op1 = XEXP (x, 1);
14102 if (op1 == const0_rtx
14103 && GET_CODE (op0) == AND)
14105 x = op0;
14106 mode = GET_MODE (op0);
14107 goto cost_logic;
14110 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14112 	  /* TODO: A write to the CC flags possibly costs extra; this
14113 	     needs encoding in the cost tables.  */
14115 mode = GET_MODE (op0);
14116 /* ANDS. */
14117 if (GET_CODE (op0) == AND)
14119 x = op0;
14120 goto cost_logic;
14123 if (GET_CODE (op0) == PLUS)
14125 /* ADDS (and CMN alias). */
14126 x = op0;
14127 goto cost_plus;
14130 if (GET_CODE (op0) == MINUS)
14132 /* SUBS. */
14133 x = op0;
14134 goto cost_minus;
14137 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14138 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14139 && CONST_INT_P (XEXP (op0, 2)))
14141 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14142 Handle it here directly rather than going to cost_logic
14143 since we know the immediate generated for the TST is valid
14144 so we can avoid creating an intermediate rtx for it only
14145 for costing purposes. */
14146 if (speed)
14147 *cost += extra_cost->alu.logical;
14149 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14150 ZERO_EXTRACT, 0, speed);
14151 return true;
14154 if (GET_CODE (op1) == NEG)
14156 /* CMN. */
14157 if (speed)
14158 *cost += extra_cost->alu.arith;
14160 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14161 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14162 return true;
14165 /* CMP.
14167 Compare can freely swap the order of operands, and
14168 canonicalization puts the more complex operation first.
14169 But the integer MINUS logic expects the shift/extend
14170 operation in op1. */
14171 if (! (REG_P (op0)
14172 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14174 op0 = XEXP (x, 1);
14175 op1 = XEXP (x, 0);
14177 goto cost_minus;
14180 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14182 /* FCMP. */
14183 if (speed)
14184 *cost += extra_cost->fp[mode == DFmode].compare;
14186 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14188 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14189 /* FCMP supports constant 0.0 for no extra cost. */
14190 return true;
14192 return false;
14195 if (VECTOR_MODE_P (mode))
14197 /* Vector compare. */
14198 if (speed)
14199 *cost += extra_cost->vect.alu;
14201 if (aarch64_float_const_zero_rtx_p (op1))
14203 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14204 cost. */
14205 return true;
14207 return false;
14209 return false;
14211 case MINUS:
14213 op0 = XEXP (x, 0);
14214 op1 = XEXP (x, 1);
14216 cost_minus:
14217 if (VECTOR_MODE_P (mode))
14219 /* SUBL2 and SUBW2. */
14220 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14221 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14223 /* The select-operand-high-half versions of the sub instruction
14224 have the same cost as the regular three vector version -
14225 don't add the costs of the select into the costs of the sub.
14227 op0 = aarch64_strip_extend_vec_half (op0);
14228 op1 = aarch64_strip_extend_vec_half (op1);
14232 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14234 /* Detect valid immediates. */
14235 if ((GET_MODE_CLASS (mode) == MODE_INT
14236 || (GET_MODE_CLASS (mode) == MODE_CC
14237 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14238 && CONST_INT_P (op1)
14239 && aarch64_uimm12_shift (INTVAL (op1)))
14241 if (speed)
14242 /* SUB(S) (immediate). */
14243 *cost += extra_cost->alu.arith;
14244 return true;
14247 /* Look for SUB (extended register). */
14248 if (is_a <scalar_int_mode> (mode)
14249 && aarch64_rtx_arith_op_extract_p (op1))
14251 if (speed)
14252 *cost += extra_cost->alu.extend_arith;
14254 op1 = aarch64_strip_extend (op1, true);
14255 *cost += rtx_cost (op1, VOIDmode,
14256 (enum rtx_code) GET_CODE (op1), 0, speed);
14257 return true;
14260 rtx new_op1 = aarch64_strip_extend (op1, false);
14262 /* Cost this as an FMA-alike operation. */
14263 if ((GET_CODE (new_op1) == MULT
14264 || aarch64_shift_p (GET_CODE (new_op1)))
14265 && code != COMPARE)
14267 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14268 (enum rtx_code) code,
14269 speed);
14270 return true;
14273 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14275 if (speed)
14277 if (VECTOR_MODE_P (mode))
14279 /* Vector SUB. */
14280 *cost += extra_cost->vect.alu;
14282 else if (GET_MODE_CLASS (mode) == MODE_INT)
14284 /* SUB(S). */
14285 *cost += extra_cost->alu.arith;
14287 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14289 /* FSUB. */
14290 *cost += extra_cost->fp[mode == DFmode].addsub;
14293 return true;
14296 case PLUS:
14298 rtx new_op0;
14300 op0 = XEXP (x, 0);
14301 op1 = XEXP (x, 1);
14303 cost_plus:
14304 if (VECTOR_MODE_P (mode))
14306 /* ADDL2 and ADDW2. */
14307 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14308 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14310 /* The select-operand-high-half versions of the add instruction
14311 have the same cost as the regular three vector version -
14312 don't add the costs of the select into the costs of the add.
14314 op0 = aarch64_strip_extend_vec_half (op0);
14315 op1 = aarch64_strip_extend_vec_half (op1);
14319 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14320 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14322 /* CSINC. */
14323 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14324 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14325 return true;
14328 if (GET_MODE_CLASS (mode) == MODE_INT
14329 && (aarch64_plus_immediate (op1, mode)
14330 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14332 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14334 if (speed)
14336 /* ADD (immediate). */
14337 *cost += extra_cost->alu.arith;
14339 /* Some tunings prefer to not use the VL-based scalar ops.
14340 Increase the cost of the poly immediate to prevent their
14341 formation. */
14342 if (GET_CODE (op1) == CONST_POLY_INT
14343 && (aarch64_tune_params.extra_tuning_flags
14344 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14345 *cost += COSTS_N_INSNS (1);
14347 return true;
14350 if (aarch64_pluslong_immediate (op1, mode))
14352 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14353 if ((INTVAL (op1) & 0xfff) != 0)
14354 *cost += COSTS_N_INSNS (1);
14356 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14357 return true;
14360 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14362 /* Look for ADD (extended register). */
14363 if (is_a <scalar_int_mode> (mode)
14364 && aarch64_rtx_arith_op_extract_p (op0))
14366 if (speed)
14367 *cost += extra_cost->alu.extend_arith;
14369 op0 = aarch64_strip_extend (op0, true);
14370 *cost += rtx_cost (op0, VOIDmode,
14371 (enum rtx_code) GET_CODE (op0), 0, speed);
14372 return true;
14375 /* Strip any extend, leave shifts behind as we will
14376 cost them through mult_cost. */
14377 new_op0 = aarch64_strip_extend (op0, false);
14379 if (GET_CODE (new_op0) == MULT
14380 || aarch64_shift_p (GET_CODE (new_op0)))
14382 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14383 speed);
14384 return true;
14387 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14389 if (speed)
14391 if (VECTOR_MODE_P (mode))
14393 /* Vector ADD. */
14394 *cost += extra_cost->vect.alu;
14396 else if (GET_MODE_CLASS (mode) == MODE_INT)
14398 /* ADD. */
14399 *cost += extra_cost->alu.arith;
14401 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14403 /* FADD. */
14404 *cost += extra_cost->fp[mode == DFmode].addsub;
14407 return true;
14410 case BSWAP:
14411 *cost = COSTS_N_INSNS (1);
14413 if (speed)
14415 if (VECTOR_MODE_P (mode))
14416 *cost += extra_cost->vect.alu;
14417 else
14418 *cost += extra_cost->alu.rev;
14420 return false;
14422 case IOR:
14423 if (aarch_rev16_p (x))
14425 *cost = COSTS_N_INSNS (1);
14427 if (speed)
14429 if (VECTOR_MODE_P (mode))
14430 *cost += extra_cost->vect.alu;
14431 else
14432 *cost += extra_cost->alu.rev;
14434 return true;
14437 if (aarch64_extr_rtx_p (x, &op0, &op1))
14439 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14440 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14441 if (speed)
14442 *cost += extra_cost->alu.shift;
14444 return true;
14446 /* Fall through. */
14447 case XOR:
14448 case AND:
14449 cost_logic:
14450 op0 = XEXP (x, 0);
14451 op1 = XEXP (x, 1);
14453 if (VECTOR_MODE_P (mode))
14455 if (speed)
14456 *cost += extra_cost->vect.alu;
14457 return true;
14460 if (code == AND
14461 && GET_CODE (op0) == MULT
14462 && CONST_INT_P (XEXP (op0, 1))
14463 && CONST_INT_P (op1)
14464 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14465 INTVAL (op1)) != 0)
14467 /* This is a UBFM/SBFM. */
14468 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14469 if (speed)
14470 *cost += extra_cost->alu.bfx;
14471 return true;
14474 if (is_int_mode (mode, &int_mode))
14476 if (CONST_INT_P (op1))
14478 /* We have a mask + shift version of a UBFIZ
14479 i.e. the *andim_ashift<mode>_bfiz pattern. */
14480 if (GET_CODE (op0) == ASHIFT
14481 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14482 XEXP (op0, 1)))
14484 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14485 (enum rtx_code) code, 0, speed);
14486 if (speed)
14487 *cost += extra_cost->alu.bfx;
14489 return true;
14491 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14493 	      /* We possibly get the immediate for free; this is not
14494 		 modelled.  */
14495 *cost += rtx_cost (op0, int_mode,
14496 (enum rtx_code) code, 0, speed);
14497 if (speed)
14498 *cost += extra_cost->alu.logical;
14500 return true;
14503 else
14505 rtx new_op0 = op0;
14507 /* Handle ORN, EON, or BIC. */
14508 if (GET_CODE (op0) == NOT)
14509 op0 = XEXP (op0, 0);
14511 new_op0 = aarch64_strip_shift (op0);
14513 /* If we had a shift on op0 then this is a logical-shift-
14514 by-register/immediate operation. Otherwise, this is just
14515 a logical operation. */
14516 if (speed)
14518 if (new_op0 != op0)
14520 /* Shift by immediate. */
14521 if (CONST_INT_P (XEXP (op0, 1)))
14522 *cost += extra_cost->alu.log_shift;
14523 else
14524 *cost += extra_cost->alu.log_shift_reg;
14526 else
14527 *cost += extra_cost->alu.logical;
14530 /* In both cases we want to cost both operands. */
14531 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14532 0, speed);
14533 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14534 1, speed);
14536 return true;
14539 return false;
14541 case NOT:
14542 x = XEXP (x, 0);
14543 op0 = aarch64_strip_shift (x);
14545 if (VECTOR_MODE_P (mode))
14547 /* Vector NOT. */
14548 *cost += extra_cost->vect.alu;
14549 return false;
14552 /* MVN-shifted-reg. */
14553 if (op0 != x)
14555 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14557 if (speed)
14558 *cost += extra_cost->alu.log_shift;
14560 return true;
14562 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14563 Handle the second form here taking care that 'a' in the above can
14564 be a shift. */
14565 else if (GET_CODE (op0) == XOR)
14567 rtx newop0 = XEXP (op0, 0);
14568 rtx newop1 = XEXP (op0, 1);
14569 rtx op0_stripped = aarch64_strip_shift (newop0);
14571 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14572 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14574 if (speed)
14576 if (op0_stripped != newop0)
14577 *cost += extra_cost->alu.log_shift;
14578 else
14579 *cost += extra_cost->alu.logical;
14582 return true;
14584 /* MVN. */
14585 if (speed)
14586 *cost += extra_cost->alu.logical;
14588 return false;
14590 case ZERO_EXTEND:
14592 op0 = XEXP (x, 0);
14593 /* If a value is written in SI mode, then zero extended to DI
14594 mode, the operation will in general be free as a write to
14595 a 'w' register implicitly zeroes the upper bits of an 'x'
14596 register. However, if this is
14598 (set (reg) (zero_extend (reg)))
14600 we must cost the explicit register move. */
14601 if (mode == DImode
14602 && GET_MODE (op0) == SImode)
14604 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14606 /* If OP_COST is non-zero, then the cost of the zero extend
14607 is effectively the cost of the inner operation. Otherwise
14608 we have a MOV instruction and we take the cost from the MOV
14609 itself. This is true independently of whether we are
14610 optimizing for space or time. */
14611 if (op_cost)
14612 *cost = op_cost;
14614 return true;
14616 else if (MEM_P (op0))
14618 /* All loads can zero extend to any size for free. */
14619 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14620 return true;
14623 op0 = aarch64_extend_bitfield_pattern_p (x);
14624 if (op0)
14626 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14627 if (speed)
14628 *cost += extra_cost->alu.bfx;
14629 return true;
14632 if (speed)
14634 if (VECTOR_MODE_P (mode))
14636 /* UMOV. */
14637 *cost += extra_cost->vect.alu;
14639 else
14641 /* We generate an AND instead of UXTB/UXTH. */
14642 *cost += extra_cost->alu.logical;
14645 return false;
14647 case SIGN_EXTEND:
14648 if (MEM_P (XEXP (x, 0)))
14650 /* LDRSH. */
14651 if (speed)
14653 rtx address = XEXP (XEXP (x, 0), 0);
14654 *cost += extra_cost->ldst.load_sign_extend;
14656 *cost +=
14657 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14658 0, speed));
14660 return true;
14663 op0 = aarch64_extend_bitfield_pattern_p (x);
14664 if (op0)
14666 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14667 if (speed)
14668 *cost += extra_cost->alu.bfx;
14669 return true;
14672 if (speed)
14674 if (VECTOR_MODE_P (mode))
14675 *cost += extra_cost->vect.alu;
14676 else
14677 *cost += extra_cost->alu.extend;
14679 return false;
14681 case ROTATE:
14682 case ROTATERT:
14683 case LSHIFTRT:
14684 case ASHIFTRT:
14685 case ASHIFT:
14686 op0 = XEXP (x, 0);
14687 op1 = XEXP (x, 1);
14689 if (CONST_INT_P (op1))
14691 if (speed)
14693 if (VECTOR_MODE_P (mode))
14695 /* Vector shift (immediate). */
14696 *cost += extra_cost->vect.alu;
14698 else
14700 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
14701 These are all aliases. */
14702 *cost += extra_cost->alu.shift;
14706 /* We can incorporate zero/sign extend for free. */
14707 if (GET_CODE (op0) == ZERO_EXTEND
14708 || GET_CODE (op0) == SIGN_EXTEND)
14709 op0 = XEXP (op0, 0);
14711 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14712 return true;
14714 else
14716 if (VECTOR_MODE_P (mode))
14718 if (speed)
14719 /* Vector shift (register). */
14720 *cost += extra_cost->vect.alu;
14722 else
14724 if (speed)
14725 /* LSLV, ASRV. */
14726 *cost += extra_cost->alu.shift_reg;
14728 /* The register shift amount may be in a shorter mode expressed
14729 as a lowpart SUBREG. For costing purposes just look inside. */
14730 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
14731 op1 = SUBREG_REG (op1);
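/* Masking the shift amount with the mode size minus one is redundant
   on AArch64, since the variable shift instructions (LSLV, ASRV and
   friends) already use the amount modulo the data size; for example
   (ashift:DI x (and:DI y (const_int 63))) should cost the same as
   (ashift:DI x y).  */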
14732 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14733 && CONST_INT_P (XEXP (op1, 1))
14734 && known_eq (INTVAL (XEXP (op1, 1)),
14735 GET_MODE_BITSIZE (mode) - 1))
14737 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14738 /* We already demanded XEXP (op1, 0) to be REG_P, so
14739 don't recurse into it. */
14740 return true;
14743 return false; /* All arguments need to be in registers. */
14746 case SYMBOL_REF:
14748 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14749 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14751 /* LDR. */
14752 if (speed)
14753 *cost += extra_cost->ldst.load;
14755 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14756 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14758 /* ADRP, followed by ADD. */
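/* As a sketch, a small-code-model address of SYM is expected to be
   materialized as:
     adrp x0, sym
     add  x0, x0, :lo12:sym
   hence one extra instruction over the baseline and two arithmetic
   cost units when optimizing for speed.  */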
14759 *cost += COSTS_N_INSNS (1);
14760 if (speed)
14761 *cost += 2 * extra_cost->alu.arith;
14763 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14764 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14766 /* ADR. */
14767 if (speed)
14768 *cost += extra_cost->alu.arith;
14771 if (flag_pic)
14773 /* One extra load instruction, after accessing the GOT. */
14774 *cost += COSTS_N_INSNS (1);
14775 if (speed)
14776 *cost += extra_cost->ldst.load;
14778 return true;
14780 case HIGH:
14781 case LO_SUM:
14782 /* ADRP/ADD (immediate). */
14783 if (speed)
14784 *cost += extra_cost->alu.arith;
14785 return true;
14787 case ZERO_EXTRACT:
14788 case SIGN_EXTRACT:
14789 /* UBFX/SBFX. */
14790 if (speed)
14792 if (VECTOR_MODE_P (mode))
14793 *cost += extra_cost->vect.alu;
14794 else
14795 *cost += extra_cost->alu.bfx;
14798 /* We can trust that the immediates used will be correct (there
14799 are no by-register forms), so we need only cost op0. */
14800 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14801 return true;
14803 case MULT:
14804 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14805 /* aarch64_rtx_mult_cost always handles recursion to its
14806 operands. */
14807 return true;
14809 case MOD:
14810 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14811 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
14812 an unconditional negate. This case should only ever be reached through
14813 the set_smod_pow2_cheap check in expmed.cc. */
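/* As an illustration of the expansion being costed (the code itself
   is generated by expmed.cc, not here), x % 4 for signed w0 might
   look like:
     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi
   i.e. four instructions: one NEGS, two ANDs and a CSNEG.  */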
14814 if (CONST_INT_P (XEXP (x, 1))
14815 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14816 && (mode == SImode || mode == DImode))
14818 /* We expand to 4 instructions. Reset the baseline. */
14819 *cost = COSTS_N_INSNS (4);
14821 if (speed)
14822 *cost += 2 * extra_cost->alu.logical
14823 + 2 * extra_cost->alu.arith;
14825 return true;
14828 /* Fall-through. */
14829 case UMOD:
14830 if (speed)
14832 /* Slightly prefer UMOD over SMOD. */
14833 if (VECTOR_MODE_P (mode))
14834 *cost += extra_cost->vect.alu;
14835 else if (GET_MODE_CLASS (mode) == MODE_INT)
14836 *cost += (extra_cost->mult[mode == DImode].add
14837 + extra_cost->mult[mode == DImode].idiv
14838 + (code == MOD ? 1 : 0));
14840 return false; /* All arguments need to be in registers. */
14842 case DIV:
14843 case UDIV:
14844 case SQRT:
14845 if (speed)
14847 if (VECTOR_MODE_P (mode))
14848 *cost += extra_cost->vect.alu;
14849 else if (GET_MODE_CLASS (mode) == MODE_INT)
14850 /* There is no integer SQRT, so only DIV and UDIV can get
14851 here. */
14852 *cost += (extra_cost->mult[mode == DImode].idiv
14853 /* Slightly prefer UDIV over SDIV. */
14854 + (code == DIV ? 1 : 0));
14855 else
14856 *cost += extra_cost->fp[mode == DFmode].div;
14858 return false; /* All arguments need to be in registers. */
14860 case IF_THEN_ELSE:
14861 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14862 XEXP (x, 2), cost, speed);
14864 case EQ:
14865 case NE:
14866 case GT:
14867 case GTU:
14868 case LT:
14869 case LTU:
14870 case GE:
14871 case GEU:
14872 case LE:
14873 case LEU:
14875 return false; /* All arguments must be in registers. */
14877 case FMA:
14878 op0 = XEXP (x, 0);
14879 op1 = XEXP (x, 1);
14880 op2 = XEXP (x, 2);
14882 if (speed)
14884 if (VECTOR_MODE_P (mode))
14885 *cost += extra_cost->vect.alu;
14886 else
14887 *cost += extra_cost->fp[mode == DFmode].fma;
14890 /* FMSUB, FNMADD, and FNMSUB are free. */
14891 if (GET_CODE (op0) == NEG)
14892 op0 = XEXP (op0, 0);
14894 if (GET_CODE (op2) == NEG)
14895 op2 = XEXP (op2, 0);
14897 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14898 and the by-element operand as operand 0. */
14899 if (GET_CODE (op1) == NEG)
14900 op1 = XEXP (op1, 0);
14902 /* Catch vector-by-element operations. The by-element operand can
14903 either be (vec_duplicate (vec_select (x))) or just
14904 (vec_select (x)), depending on whether we are multiplying by
14905 a vector or a scalar.
14907 Canonicalization is not very good in these cases: FMA4 will put the
14908 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
14909 if (GET_CODE (op0) == VEC_DUPLICATE)
14910 op0 = XEXP (op0, 0);
14911 else if (GET_CODE (op1) == VEC_DUPLICATE)
14912 op1 = XEXP (op1, 0);
14914 if (GET_CODE (op0) == VEC_SELECT)
14915 op0 = XEXP (op0, 0);
14916 else if (GET_CODE (op1) == VEC_SELECT)
14917 op1 = XEXP (op1, 0);
14919 /* If the remaining parameters are not registers,
14920 get the cost to put them into registers. */
14921 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14922 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14923 *cost += rtx_cost (op2, mode, FMA, 2, speed);
14924 return true;
14926 case FLOAT:
14927 case UNSIGNED_FLOAT:
14928 if (speed)
14929 *cost += extra_cost->fp[mode == DFmode].fromint;
14930 return false;
14932 case FLOAT_EXTEND:
14933 if (speed)
14935 if (VECTOR_MODE_P (mode))
14937 /* Vector extend. */
14938 *cost += extra_cost->vect.alu;
14940 else
14941 *cost += extra_cost->fp[mode == DFmode].widen;
14943 return false;
14945 case FLOAT_TRUNCATE:
14946 if (speed)
14948 if (VECTOR_MODE_P (mode))
14950 /* Vector conversion. */
14951 *cost += extra_cost->vect.alu;
14953 else
14954 *cost += extra_cost->fp[mode == DFmode].narrow;
14956 return false;
14958 case FIX:
14959 case UNSIGNED_FIX:
14960 x = XEXP (x, 0);
14961 /* Strip the rounding part. They will all be implemented
14962 by the fcvt* family of instructions anyway. */
14963 if (GET_CODE (x) == UNSPEC)
14965 unsigned int uns_code = XINT (x, 1);
14967 if (uns_code == UNSPEC_FRINTA
14968 || uns_code == UNSPEC_FRINTM
14969 || uns_code == UNSPEC_FRINTN
14970 || uns_code == UNSPEC_FRINTP
14971 || uns_code == UNSPEC_FRINTZ)
14972 x = XVECEXP (x, 0, 0);
14975 if (speed)
14977 if (VECTOR_MODE_P (mode))
14978 *cost += extra_cost->vect.alu;
14979 else
14980 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
14983 /* We can combine fmul by a power of 2 followed by a fcvt into a single
14984 fixed-point fcvt. */
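/* For instance (an illustrative case), (fix:SI (mult:SF x 65536.0))
   can be emitted as a single "fcvtzs w0, s0, #16", so only the inner
   multiplication operand needs to be costed.  */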
14985 if (GET_CODE (x) == MULT
14986 && ((VECTOR_MODE_P (mode)
14987 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
14988 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
14990 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
14991 0, speed);
14992 return true;
14995 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
14996 return true;
14998 case ABS:
14999 if (VECTOR_MODE_P (mode))
15001 /* ABS (vector). */
15002 if (speed)
15003 *cost += extra_cost->vect.alu;
15005 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15007 op0 = XEXP (x, 0);
15009 /* FABD, which is analogous to FADD. */
15010 if (GET_CODE (op0) == MINUS)
15012 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15013 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15014 if (speed)
15015 *cost += extra_cost->fp[mode == DFmode].addsub;
15017 return true;
15019 /* Simple FABS is analogous to FNEG. */
15020 if (speed)
15021 *cost += extra_cost->fp[mode == DFmode].neg;
15023 else
15025 /* Integer ABS will either be split into
15026 two arithmetic instructions or will be an ABS
15027 (scalar), which we don't model. */
15028 *cost = COSTS_N_INSNS (2);
15029 if (speed)
15030 *cost += 2 * extra_cost->alu.arith;
15032 return false;
15034 case SMAX:
15035 case SMIN:
15036 if (speed)
15038 if (VECTOR_MODE_P (mode))
15039 *cost += extra_cost->vect.alu;
15040 else
15042 /* FMAXNM/FMINNM/FMAX/FMIN.
15043 TODO: This may not be accurate for all implementations, but
15044 we do not model this in the cost tables. */
15045 *cost += extra_cost->fp[mode == DFmode].addsub;
15048 return false;
15050 case UNSPEC:
15051 /* The floating point round to integer frint* instructions. */
15052 if (aarch64_frint_unspec_p (XINT (x, 1)))
15054 if (speed)
15055 *cost += extra_cost->fp[mode == DFmode].roundint;
15057 return false;
15060 if (XINT (x, 1) == UNSPEC_RBIT)
15062 if (speed)
15063 *cost += extra_cost->alu.rev;
15065 return false;
15067 break;
15069 case TRUNCATE:
15071 /* Decompose <su>muldi3_highpart. */
15072 if (/* (truncate:DI */
15073 mode == DImode
15074 /* (lshiftrt:TI */
15075 && GET_MODE (XEXP (x, 0)) == TImode
15076 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15077 /* (mult:TI */
15078 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15079 /* (ANY_EXTEND:TI (reg:DI))
15080 (ANY_EXTEND:TI (reg:DI))) */
15081 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15082 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15083 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15084 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15085 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15086 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15087 /* (const_int 64) */
15088 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15089 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15091 /* UMULH/SMULH. */
15092 if (speed)
15093 *cost += extra_cost->mult[mode == DImode].extend;
15094 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15095 mode, MULT, 0, speed);
15096 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15097 mode, MULT, 1, speed);
15098 return true;
15100 break;
15101 case CONST_VECTOR:
15103 /* Load using MOVI/MVNI. */
15104 if (aarch64_simd_valid_immediate (x, NULL))
15105 *cost = extra_cost->vect.movi;
15106 else /* Load using constant pool. */
15107 *cost = extra_cost->ldst.load;
15108 break;
15110 case VEC_CONCAT:
15111 /* Depending on the operation, this is either a DUP or an INS.
15112 For now, keep the default costing. */
15113 break;
15114 case VEC_DUPLICATE:
15115 /* Load using a DUP. */
15116 *cost = extra_cost->vect.dup;
15117 return false;
15118 case VEC_SELECT:
15120 rtx op0 = XEXP (x, 0);
15121 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15123 /* Cost a lowpart selection as free, a highpart as a DUP, anything else as an extract. */
15124 rtx op1 = XEXP (x, 1);
15125 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15127 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15128 *cost = extra_cost->vect.dup;
15129 else
15130 *cost = extra_cost->vect.extract;
15131 return true;
15133 default:
15134 break;
15137 if (dump_file
15138 && flag_aarch64_verbose_cost)
15139 fprintf (dump_file,
15140 "\nFailed to cost RTX. Assuming default cost.\n");
15142 return true;
15145 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15146 calculated for X. This cost is stored in *COST. Returns true
15147 if the total cost of X was calculated. */
15148 static bool
15149 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15150 int param, int *cost, bool speed)
15152 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15154 if (dump_file
15155 && flag_aarch64_verbose_cost)
15157 print_rtl_single (dump_file, x);
15158 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15159 speed ? "Hot" : "Cold",
15160 *cost, result ? "final" : "partial");
15163 return result;
15166 static int
15167 aarch64_register_move_cost (machine_mode mode,
15168 reg_class_t from_i, reg_class_t to_i)
15170 enum reg_class from = (enum reg_class) from_i;
15171 enum reg_class to = (enum reg_class) to_i;
15172 const struct cpu_regmove_cost *regmove_cost
15173 = aarch64_tune_params.regmove_cost;
15175 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
15176 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
15177 || to == STUB_REGS)
15178 to = GENERAL_REGS;
15180 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
15181 || from == STUB_REGS)
15182 from = GENERAL_REGS;
15184 /* Make RDFFR very expensive. In particular, if we know that the FFR
15185 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15186 as a way of obtaining a PTRUE. */
15187 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15188 && hard_reg_set_subset_p (reg_class_contents[from_i],
15189 reg_class_contents[FFR_REGS]))
15190 return 80;
15192 /* Moving between GPR and stack cost is the same as GP2GP. */
15193 if ((from == GENERAL_REGS && to == STACK_REG)
15194 || (to == GENERAL_REGS && from == STACK_REG))
15195 return regmove_cost->GP2GP;
15197 /* To/From the stack register, we move via the gprs. */
15198 if (to == STACK_REG || from == STACK_REG)
15199 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15200 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15202 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15203 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15204 && known_eq (GET_MODE_SIZE (mode), 16))
15206 /* 128-bit operations on general registers require 2 instructions. */
15207 if (from == GENERAL_REGS && to == GENERAL_REGS)
15208 return regmove_cost->GP2GP * 2;
15209 else if (from == GENERAL_REGS)
15210 return regmove_cost->GP2FP * 2;
15211 else if (to == GENERAL_REGS)
15212 return regmove_cost->FP2GP * 2;
15214 /* When AdvSIMD instructions are disabled it is not possible to move
15215 a 128-bit value directly between Q registers. This is handled in
15216 secondary reload. A general register is used as a scratch to move
15217 the upper DI value and the lower DI value is moved directly,
15218 hence the cost is the sum of three moves. */
15219 if (! TARGET_SIMD)
15220 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15222 return regmove_cost->FP2FP;
15225 if (from == GENERAL_REGS && to == GENERAL_REGS)
15226 return regmove_cost->GP2GP;
15227 else if (from == GENERAL_REGS)
15228 return regmove_cost->GP2FP;
15229 else if (to == GENERAL_REGS)
15230 return regmove_cost->FP2GP;
15232 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15234 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15235 The cost must be greater than 2 units to indicate that direct
15236 moves aren't possible. */
15237 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15238 + aarch64_tune_params.memmov_cost.store_fp);
15239 return MIN (CEIL (per_vector, 2), 4);
15242 return regmove_cost->FP2FP;
15245 /* Implements TARGET_MEMORY_MOVE_COST. */
15246 static int
15247 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15249 enum reg_class rclass = (enum reg_class) rclass_i;
15250 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15251 ? reg_classes_intersect_p (rclass, PR_REGS)
15252 : reg_class_subset_p (rclass, PR_REGS))
15253 return (in
15254 ? aarch64_tune_params.memmov_cost.load_pred
15255 : aarch64_tune_params.memmov_cost.store_pred);
15257 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15258 ? reg_classes_intersect_p (rclass, FP_REGS)
15259 : reg_class_subset_p (rclass, FP_REGS))
15260 return (in
15261 ? aarch64_tune_params.memmov_cost.load_fp
15262 : aarch64_tune_params.memmov_cost.store_fp);
15264 return (in
15265 ? aarch64_tune_params.memmov_cost.load_int
15266 : aarch64_tune_params.memmov_cost.store_int);
15269 /* Implement TARGET_INIT_BUILTINS. */
15270 static void
15271 aarch64_init_builtins ()
15273 aarch64_general_init_builtins ();
15274 aarch64_sve::init_builtins ();
15275 #ifdef SUBTARGET_INIT_BUILTINS
15276 SUBTARGET_INIT_BUILTINS;
15277 #endif
15280 /* Implement TARGET_FOLD_BUILTIN. */
15281 static tree
15282 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15284 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15285 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15286 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15287 switch (code & AARCH64_BUILTIN_CLASS)
15289 case AARCH64_BUILTIN_GENERAL:
15290 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15292 case AARCH64_BUILTIN_SVE:
15293 return NULL_TREE;
15295 gcc_unreachable ();
15298 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15299 static bool
15300 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15302 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15303 tree fndecl = gimple_call_fndecl (stmt);
15304 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15305 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15306 gimple *new_stmt = NULL;
15307 switch (code & AARCH64_BUILTIN_CLASS)
15309 case AARCH64_BUILTIN_GENERAL:
15310 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15311 break;
15313 case AARCH64_BUILTIN_SVE:
15314 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15315 break;
15318 if (!new_stmt)
15319 return false;
15321 gsi_replace (gsi, new_stmt, false);
15322 return true;
15325 /* Implement TARGET_EXPAND_BUILTIN. */
15326 static rtx
15327 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15329 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15330 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15331 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15332 switch (code & AARCH64_BUILTIN_CLASS)
15334 case AARCH64_BUILTIN_GENERAL:
15335 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15337 case AARCH64_BUILTIN_SVE:
15338 return aarch64_sve::expand_builtin (subcode, exp, target);
15340 gcc_unreachable ();
15343 /* Implement TARGET_BUILTIN_DECL. */
15344 static tree
15345 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15347 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15348 switch (code & AARCH64_BUILTIN_CLASS)
15350 case AARCH64_BUILTIN_GENERAL:
15351 return aarch64_general_builtin_decl (subcode, initialize_p);
15353 case AARCH64_BUILTIN_SVE:
15354 return aarch64_sve::builtin_decl (subcode, initialize_p);
15356 gcc_unreachable ();
15359 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15360 to optimize 1.0/sqrt. */
15362 static bool
15363 use_rsqrt_p (machine_mode mode)
15365 return (!flag_trapping_math
15366 && flag_unsafe_math_optimizations
15367 && ((aarch64_tune_params.approx_modes->recip_sqrt
15368 & AARCH64_APPROX_MODE (mode))
15369 || flag_mrecip_low_precision_sqrt));
15372 /* Function to decide when to use the approximate reciprocal square root
15373 builtin. */
15375 static tree
15376 aarch64_builtin_reciprocal (tree fndecl)
15378 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15380 if (!use_rsqrt_p (mode))
15381 return NULL_TREE;
15382 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15383 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15384 switch (code & AARCH64_BUILTIN_CLASS)
15386 case AARCH64_BUILTIN_GENERAL:
15387 return aarch64_general_builtin_rsqrt (subcode);
15389 case AARCH64_BUILTIN_SVE:
15390 return NULL_TREE;
15392 gcc_unreachable ();
15395 /* Emit code to perform the floating-point operation:
15397 DST = SRC1 * SRC2
15399 where all three operands are already known to be registers.
15400 If the operation is an SVE one, PTRUE is a suitable all-true
15401 predicate. */
15403 static void
15404 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15406 if (ptrue)
15407 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15408 dst, ptrue, src1, src2,
15409 gen_int_mode (SVE_RELAXED_GP, SImode)));
15410 else
15411 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15414 /* Emit instruction sequence to compute either the approximate square root
15415 or its approximate reciprocal, depending on the flag RECP, and return
15416 whether the sequence was emitted or not. */
15418 bool
15419 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15421 machine_mode mode = GET_MODE (dst);
15423 if (GET_MODE_INNER (mode) == HFmode)
15425 gcc_assert (!recp);
15426 return false;
15429 if (!recp)
15431 if (!(flag_mlow_precision_sqrt
15432 || (aarch64_tune_params.approx_modes->sqrt
15433 & AARCH64_APPROX_MODE (mode))))
15434 return false;
15436 if (!flag_finite_math_only
15437 || flag_trapping_math
15438 || !flag_unsafe_math_optimizations
15439 || optimize_function_for_size_p (cfun))
15440 return false;
15442 else
15443 /* Caller assumes we cannot fail. */
15444 gcc_assert (use_rsqrt_p (mode));
15446 rtx pg = NULL_RTX;
15447 if (aarch64_sve_mode_p (mode))
15448 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15449 machine_mode mmsk = (VECTOR_MODE_P (mode)
15450 ? related_int_vector_mode (mode).require ()
15451 : int_mode_for_mode (mode).require ());
15452 rtx xmsk = NULL_RTX;
15453 if (!recp)
15455 /* When calculating the approximate square root, compare the
15456 argument with 0.0 and create a mask. */
15457 rtx zero = CONST0_RTX (mode);
15458 if (pg)
15460 xmsk = gen_reg_rtx (GET_MODE (pg));
15461 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15462 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15463 xmsk, pg, hint, src, zero));
15465 else
15467 xmsk = gen_reg_rtx (mmsk);
15468 emit_insn (gen_rtx_SET (xmsk,
15469 gen_rtx_NEG (mmsk,
15470 gen_rtx_EQ (mmsk, src, zero))));
15474 /* Estimate the approximate reciprocal square root. */
15475 rtx xdst = gen_reg_rtx (mode);
15476 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15478 /* Iterate over the series twice for SF and thrice for DF. */
15479 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15481 /* Optionally iterate over the series one time fewer for faster
15482 performance, at the cost of some accuracy. */
15483 if ((recp && flag_mrecip_low_precision_sqrt)
15484 || (!recp && flag_mlow_precision_sqrt))
15485 iterations--;
15487 /* Iterate over the series to calculate the approximate reciprocal square
15488 root. */
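/* Each FRSQRTE/FRSQRTS pair below performs one Newton-Raphson step
   for 1/sqrt(src): given an estimate x, FRSQRTS (a, b) computes
   (3 - a * b) / 2, so x * FRSQRTS (src, x * x) refines x.  Each step
   roughly doubles the number of accurate bits, which is why SF uses
   two steps and DF three (fewer with the low-precision flags).  */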
15489 rtx x1 = gen_reg_rtx (mode);
15490 while (iterations--)
15492 rtx x2 = gen_reg_rtx (mode);
15493 aarch64_emit_mult (x2, pg, xdst, xdst);
15495 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15497 if (iterations > 0)
15498 aarch64_emit_mult (xdst, pg, xdst, x1);
15501 if (!recp)
15503 if (pg)
15504 /* Multiply nonzero source values by the corresponding intermediate
15505 result elements, so that the final calculation is the approximate
15506 square root rather than its reciprocal. Select a zero result for
15507 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15508 otherwise. */
15509 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15510 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15511 else
15513 /* Qualify the approximate reciprocal square root when the
15514 argument is 0.0 by squashing the intermediate result to 0.0. */
15515 rtx xtmp = gen_reg_rtx (mmsk);
15516 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15517 gen_rtx_SUBREG (mmsk, xdst, 0)));
15518 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15520 /* Calculate the approximate square root. */
15521 aarch64_emit_mult (xdst, pg, xdst, src);
15525 /* Finalize the approximation. */
15526 aarch64_emit_mult (dst, pg, xdst, x1);
15528 return true;
15531 /* Emit the instruction sequence to compute the approximation for the division
15532 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
15534 bool
15535 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15537 machine_mode mode = GET_MODE (quo);
15539 if (GET_MODE_INNER (mode) == HFmode)
15540 return false;
15542 bool use_approx_division_p = (flag_mlow_precision_div
15543 || (aarch64_tune_params.approx_modes->division
15544 & AARCH64_APPROX_MODE (mode)));
15546 if (!flag_finite_math_only
15547 || flag_trapping_math
15548 || !flag_unsafe_math_optimizations
15549 || optimize_function_for_size_p (cfun)
15550 || !use_approx_division_p)
15551 return false;
15553 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15554 return false;
15556 rtx pg = NULL_RTX;
15557 if (aarch64_sve_mode_p (mode))
15558 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15560 /* Estimate the approximate reciprocal. */
15561 rtx xrcp = gen_reg_rtx (mode);
15562 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15564 /* Iterate over the series twice for SF and thrice for DF. */
15565 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15567 /* Optionally iterate over the series fewer times for faster performance,
15568 at the cost of some accuracy. The default is 2 for DF and 1 for SF. */
15569 if (flag_mlow_precision_div)
15570 iterations = (GET_MODE_INNER (mode) == DFmode
15571 ? aarch64_double_recp_precision
15572 : aarch64_float_recp_precision);
15574 /* Iterate over the series to calculate the approximate reciprocal. */
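/* Each FRECPE/FRECPS pair below performs one Newton-Raphson step for
   1/den: given an estimate x, FRECPS (a, b) computes 2 - a * b, so
   x * FRECPS (den, x) refines x, again roughly doubling the number
   of accurate bits per step.  */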
15575 rtx xtmp = gen_reg_rtx (mode);
15576 while (iterations--)
15578 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15580 if (iterations > 0)
15581 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15584 if (num != CONST1_RTX (mode))
15586 /* As the approximate reciprocal of DEN is already calculated, only
15587 calculate the approximate division when NUM is not 1.0. */
15588 rtx xnum = force_reg (mode, num);
15589 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15592 /* Finalize the approximation. */
15593 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15594 return true;
15597 /* Return the number of instructions that can be issued per cycle. */
15598 static int
15599 aarch64_sched_issue_rate (void)
15601 return aarch64_tune_params.issue_rate;
15604 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15605 static int
15606 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15608 if (DEBUG_INSN_P (insn))
15609 return more;
15611 rtx_code code = GET_CODE (PATTERN (insn));
15612 if (code == USE || code == CLOBBER)
15613 return more;
15615 if (get_attr_type (insn) == TYPE_NO_INSN)
15616 return more;
15618 return more - 1;
15621 static int
15622 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15624 int issue_rate = aarch64_sched_issue_rate ();
15626 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15630 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15631 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15632 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15634 static int
15635 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15636 int ready_index)
15638 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15642 /* Vectorizer cost model target hooks. */
15644 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15645 return the decl that should be recorded. Return null otherwise. */
15646 tree
15647 aarch64_vector_load_decl (tree addr)
15649 if (TREE_CODE (addr) != ADDR_EXPR)
15650 return NULL_TREE;
15651 tree base = get_base_address (TREE_OPERAND (addr, 0));
15652 if (TREE_CODE (base) != VAR_DECL)
15653 return NULL_TREE;
15654 return base;
15657 /* Return true if STMT_INFO accesses a decl that is known to be the
15658 argument to a vld1 in the same function. */
15659 static bool
15660 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15662 if (!cfun->machine->vector_load_decls)
15663 return false;
15664 auto dr = STMT_VINFO_DATA_REF (stmt_info);
15665 if (!dr)
15666 return false;
15667 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15668 return decl && cfun->machine->vector_load_decls->contains (decl);
15671 /* Information about how the CPU would issue the scalar, Advanced SIMD
15672 or SVE version of a vector loop, using the scheme defined by the
15673 aarch64_base_vec_issue_info hierarchy of structures. */
15674 class aarch64_vec_op_count
15676 public:
15677 aarch64_vec_op_count () = default;
15678 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15679 unsigned int = 1);
15681 unsigned int vec_flags () const { return m_vec_flags; }
15682 unsigned int vf_factor () const { return m_vf_factor; }
15684 const aarch64_base_vec_issue_info *base_issue_info () const;
15685 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15686 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15688 fractional_cost rename_cycles_per_iter () const;
15689 fractional_cost min_nonpred_cycles_per_iter () const;
15690 fractional_cost min_pred_cycles_per_iter () const;
15691 fractional_cost min_cycles_per_iter () const;
15693 void dump () const;
15695 /* The number of individual "general" operations. See the comments
15696 in aarch64_base_vec_issue_info for details. */
15697 unsigned int general_ops = 0;
15699 /* The number of load and store operations, under the same scheme
15700 as above. */
15701 unsigned int loads = 0;
15702 unsigned int stores = 0;
15704 /* The minimum number of cycles needed to execute all loop-carried
15705 operations, which in the vector code become associated with
15706 reductions. */
15707 unsigned int reduction_latency = 0;
15709 /* The number of individual predicate operations. See the comments
15710 in aarch64_sve_vec_issue_info for details. */
15711 unsigned int pred_ops = 0;
15713 private:
15714 /* The issue information for the core. */
15715 const aarch64_vec_issue_info *m_issue_info = nullptr;
15717 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15718 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15719 Advanced SIMD code.
15720 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15721 SVE code. */
15722 unsigned int m_vec_flags = 0;
15724 /* Assume that, when the code is executing on the core described
15725 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15726 times more data than the vectorizer anticipates.
15728 This is only ever different from 1 for SVE. It allows us to consider
15729 what would happen on a 256-bit SVE target even when the -mtune
15730 parameters say that the “likely” SVE length is 128 bits. */
15731 unsigned int m_vf_factor = 1;
15734 aarch64_vec_op_count::
15735 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15736 unsigned int vec_flags, unsigned int vf_factor)
15737 : m_issue_info (issue_info),
15738 m_vec_flags (vec_flags),
15739 m_vf_factor (vf_factor)
15743 /* Return the base issue information (i.e. the parts that make sense
15744 for both scalar and vector code). Return null if we have no issue
15745 information. */
15746 const aarch64_base_vec_issue_info *
15747 aarch64_vec_op_count::base_issue_info () const
15749 if (auto *ret = simd_issue_info ())
15750 return ret;
15751 return m_issue_info->scalar;
15754 /* If the structure describes vector code and we have associated issue
15755 information, return that issue information, otherwise return null. */
15756 const aarch64_simd_vec_issue_info *
15757 aarch64_vec_op_count::simd_issue_info () const
15759 if (auto *ret = sve_issue_info ())
15760 return ret;
15761 if (m_vec_flags)
15762 return m_issue_info->advsimd;
15763 return nullptr;
15766 /* If the structure describes SVE code and we have associated issue
15767 information, return that issue information, otherwise return null. */
15768 const aarch64_sve_vec_issue_info *
15769 aarch64_vec_op_count::sve_issue_info () const
15771 if (m_vec_flags & VEC_ANY_SVE)
15772 return m_issue_info->sve;
15773 return nullptr;
15776 /* Estimate the minimum number of cycles per iteration needed to rename
15777 the instructions.
15779 ??? For now this is done inline rather than via cost tables, since it
15780 isn't clear how it should be parameterized for the general case. */
15781 fractional_cost
15782 aarch64_vec_op_count::rename_cycles_per_iter () const
15784 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15785 || sve_issue_info () == &neoversen2_sve_issue_info
15786 || sve_issue_info () == &neoversev2_sve_issue_info)
15787 /* + 1 for an addition. We've already counted a general op for each
15788 store, so we don't need to account for stores separately. The branch
15789 reads no registers and so does not need to be counted either.
15791 ??? This value is very much on the pessimistic side, but seems to work
15792 pretty well in practice. */
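/* The denominator of 5 below models an assumed rename bandwidth of
   roughly five operations per cycle for these cores; this is an
   assumption baked into the heuristic rather than a documented
   figure.  */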
15793 return { general_ops + loads + pred_ops + 1, 5 };
15795 return 0;
15798 /* Like min_cycles_per_iter, but excluding predicate operations. */
15799 fractional_cost
15800 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15802 auto *issue_info = base_issue_info ();
15804 fractional_cost cycles = MAX (reduction_latency, 1);
15805 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15806 cycles = std::max (cycles, { loads + stores,
15807 issue_info->loads_stores_per_cycle });
15808 cycles = std::max (cycles, { general_ops,
15809 issue_info->general_ops_per_cycle });
15810 cycles = std::max (cycles, rename_cycles_per_iter ());
15811 return cycles;
15814 /* Like min_cycles_per_iter, but including only the predicate operations. */
15815 fractional_cost
15816 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15818 if (auto *issue_info = sve_issue_info ())
15819 return { pred_ops, issue_info->pred_ops_per_cycle };
15820 return 0;
15823 /* Estimate the minimum number of cycles needed to issue the operations.
15824 This is a very simplistic model! */
15825 fractional_cost
15826 aarch64_vec_op_count::min_cycles_per_iter () const
15828 return std::max (min_nonpred_cycles_per_iter (),
15829 min_pred_cycles_per_iter ());
15832 /* Dump information about the structure. */
15833 void
15834 aarch64_vec_op_count::dump () const
15836 dump_printf_loc (MSG_NOTE, vect_location,
15837 " load operations = %d\n", loads);
15838 dump_printf_loc (MSG_NOTE, vect_location,
15839 " store operations = %d\n", stores);
15840 dump_printf_loc (MSG_NOTE, vect_location,
15841 " general operations = %d\n", general_ops);
15842 if (sve_issue_info ())
15843 dump_printf_loc (MSG_NOTE, vect_location,
15844 " predicate operations = %d\n", pred_ops);
15845 dump_printf_loc (MSG_NOTE, vect_location,
15846 " reduction latency = %d\n", reduction_latency);
15847 if (auto rcpi = rename_cycles_per_iter ())
15848 dump_printf_loc (MSG_NOTE, vect_location,
15849 " estimated cycles per iteration to rename = %f\n",
15850 rcpi.as_double ());
15851 if (auto pred_cpi = min_pred_cycles_per_iter ())
15853 dump_printf_loc (MSG_NOTE, vect_location,
15854 " estimated min cycles per iteration"
15855 " without predication = %f\n",
15856 min_nonpred_cycles_per_iter ().as_double ());
15857 dump_printf_loc (MSG_NOTE, vect_location,
15858 " estimated min cycles per iteration"
15859 " for predication = %f\n", pred_cpi.as_double ());
15861 if (auto cpi = min_cycles_per_iter ())
15862 dump_printf_loc (MSG_NOTE, vect_location,
15863 " estimated min cycles per iteration = %f\n",
15864 cpi.as_double ());
15867 /* Information about vector code that we're in the process of costing. */
15868 class aarch64_vector_costs : public vector_costs
15870 public:
15871 aarch64_vector_costs (vec_info *, bool);
15873 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15874 stmt_vec_info stmt_info, slp_tree, tree vectype,
15875 int misalign,
15876 vect_cost_model_location where) override;
15877 void finish_cost (const vector_costs *) override;
15878 bool better_main_loop_than_p (const vector_costs *other) const override;
15880 private:
15881 void record_potential_advsimd_unrolling (loop_vec_info);
15882 void analyze_loop_vinfo (loop_vec_info);
15883 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15884 aarch64_vec_op_count *);
15885 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15886 fractional_cost, unsigned int,
15887 unsigned int *, bool *);
15888 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15889 unsigned int);
15890 bool prefer_unrolled_loop () const;
15891 unsigned int determine_suggested_unroll_factor ();
15893 /* True if we have performed one-time initialization based on the
15894 vec_info. */
15895 bool m_analyzed_vinfo = false;
15897 /* This loop uses an average operation that is not supported by SVE, but is
15898 supported by Advanced SIMD and SVE2. */
15899 bool m_has_avg = false;
15901 /* True if the vector body contains a store to a decl and if the
15902 function is known to have a vld1 from the same decl.
15904 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
15905 initializing a vector is:
15907 float f[4] = { elts };
15908 float32x4_t x = vld1q_f32(f);
15910 We should strongly prefer vectorization of the initialization of f,
15911 so that the store to f and the load back can be optimized away,
15912 leaving a vectorization of { elts }. */
15913 bool m_stores_to_vector_load_decl = false;
15915 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15916 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15917 SIMD code.
15918 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15919 unsigned int m_vec_flags = 0;
15921 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15922 This means that code such as:
15924 a[0] = x;
15925 a[1] = x;
15927 will be costed as two scalar instructions and two vector instructions
15928 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15929 wins if the costs are equal, because of the fact that the vector costs
15930 include constant initializations whereas the scalar costs don't.
15931 We would therefore tend to vectorize the code above, even though
15932 the scalar version can use a single STP.
15934 We should eventually fix this and model LDP and STP in the main costs;
15935 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15936 Until then, we look specifically for code that does nothing more than
15937 STP-like operations. We cost them on that basis in addition to the
15938 normal latency-based costs.
15940 If the scalar or vector code could be a sequence of STPs +
15941 initialization, this variable counts the cost of the sequence,
15942 with 2 units per instruction. The variable is ~0U for other
15943 kinds of code. */
15944 unsigned int m_stp_sequence_cost = 0;
15946 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15947 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15948 situations, we try to predict whether an Advanced SIMD implementation
15949 of the loop could be completely unrolled and become straight-line code.
15950 If so, it is generally better to use the Advanced SIMD version rather
15951 than length-agnostic SVE, since the SVE loop would execute an unknown
15952 number of times and so could not be completely unrolled in the same way.
15954 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15955 number of Advanced SIMD loop iterations that would be unrolled and
15956 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15957 in the unrolled loop. Both values are zero if we're not applying
15958 the heuristic. */
15959 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15960 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15962 /* If we're vectorizing a loop that executes a constant number of times,
15963 this variable gives the number of times that the vector loop would
15964 iterate, otherwise it is zero. */
15965 uint64_t m_num_vector_iterations = 0;
15967 /* Used only when vectorizing loops. Estimates the number and kind of
15968 operations that would be needed by one iteration of the scalar
15969 or vector loop. There is one entry for each tuning option of
15970 interest. */
15971 auto_vec<aarch64_vec_op_count, 2> m_ops;
15974 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15975 bool costing_for_scalar)
15976 : vector_costs (vinfo, costing_for_scalar),
15977 m_vec_flags (costing_for_scalar ? 0
15978 : aarch64_classify_vector_mode (vinfo->vector_mode))
15980 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15982 m_ops.quick_push ({ issue_info, m_vec_flags });
15983 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15985 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15986 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15987 vf_factor });
15992 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15993 vector_costs *
15994 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15996 return new aarch64_vector_costs (vinfo, costing_for_scalar);
15999 /* Return true if the current CPU should use the new costs defined
16000 in GCC 11. This should be removed for GCC 12 and above, with the
16001 costs applying to all CPUs instead. */
16002 static bool
16003 aarch64_use_new_vector_costs_p ()
16005 return (aarch64_tune_params.extra_tuning_flags
16006 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16009 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16010 static const simd_vec_cost *
16011 aarch64_simd_vec_costs (tree vectype)
16013 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16014 if (vectype != NULL
16015 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16016 && costs->sve != NULL)
16017 return costs->sve;
16018 return costs->advsimd;
16021 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16022 static const simd_vec_cost *
16023 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16025 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16026 if ((flags & VEC_ANY_SVE) && costs->sve)
16027 return costs->sve;
16028 return costs->advsimd;
16031 /* If STMT_INFO is a memory reference, return the scalar memory type,
16032 otherwise return null. */
16033 static tree
16034 aarch64_dr_type (stmt_vec_info stmt_info)
16036 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16037 return TREE_TYPE (DR_REF (dr));
16038 return NULL_TREE;
16041 /* Decide whether to use the unrolling heuristic described above
16042 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16043 describes the loop that we're vectorizing. */
16044 void
16045 aarch64_vector_costs::
16046 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16048 /* The heuristic only makes sense on targets that have the same
16049 vector throughput for SVE and Advanced SIMD. */
16050 if (!(aarch64_tune_params.extra_tuning_flags
16051 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16052 return;
16054 /* We only want to apply the heuristic if LOOP_VINFO is being
16055 vectorized for SVE. */
16056 if (!(m_vec_flags & VEC_ANY_SVE))
16057 return;
16059 /* Check whether it is possible in principle to use Advanced SIMD
16060 instead. */
16061 if (aarch64_autovec_preference == 2)
16062 return;
16064 /* We don't want to apply the heuristic to outer loops, since it's
16065 harder to track two levels of unrolling. */
16066 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16067 return;
16069 /* Only handle cases in which the number of Advanced SIMD iterations
16070 would be known at compile time but the number of SVE iterations
16071 would not. */
16072 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16073 || aarch64_sve_vg.is_constant ())
16074 return;
16076 /* Guess how many times the Advanced SIMD loop would iterate and make
16077 sure that it is within the complete unrolling limit. Even if the
16078 number of iterations is small enough, the number of statements might
16079 not be, which is why we need to estimate the number of statements too. */
16080 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16081 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16082 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16083 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16084 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16085 return;
16087 /* Record that we're applying the heuristic and should try to estimate
16088 the number of statements in the Advanced SIMD loop. */
16089 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
16092 /* Do one-time initialization of the aarch64_vector_costs given that we're
16093 costing the loop vectorization described by LOOP_VINFO. */
16094 void
16095 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16097 /* Record the number of times that the vector loop would execute,
16098 if known. */
16099 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16100 auto scalar_niters = max_stmt_executions_int (loop);
16101 if (scalar_niters >= 0)
16103 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16104 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16105 m_num_vector_iterations = scalar_niters / vf;
16106 else
16107 m_num_vector_iterations = CEIL (scalar_niters, vf);
16110 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16111 heuristic described above m_unrolled_advsimd_niters. */
16112 record_potential_advsimd_unrolling (loop_vinfo);
16114 /* Record the issue information for any SVE WHILE instructions that the
16115 loop needs. */
16116 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16118 unsigned int num_masks = 0;
16119 rgroup_controls *rgm;
16120 unsigned int num_vectors_m1;
16121 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
16122 if (rgm->type)
16123 num_masks += num_vectors_m1 + 1;
16124 for (auto &ops : m_ops)
16125 if (auto *issue = ops.sve_issue_info ())
16126 ops.pred_ops += num_masks * issue->while_pred_ops;
16130 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16131 static int
16132 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16133 tree vectype,
16134 int misalign ATTRIBUTE_UNUSED)
16136 unsigned elements;
16137 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16138 bool fp = false;
16140 if (vectype != NULL)
16141 fp = FLOAT_TYPE_P (vectype);
16143 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16145 switch (type_of_cost)
16147 case scalar_stmt:
16148 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16150 case scalar_load:
16151 return costs->scalar_load_cost;
16153 case scalar_store:
16154 return costs->scalar_store_cost;
16156 case vector_stmt:
16157 return fp ? simd_costs->fp_stmt_cost
16158 : simd_costs->int_stmt_cost;
16160 case vector_load:
16161 return simd_costs->align_load_cost;
16163 case vector_store:
16164 return simd_costs->store_cost;
16166 case vec_to_scalar:
16167 return simd_costs->vec_to_scalar_cost;
16169 case scalar_to_vec:
16170 return simd_costs->scalar_to_vec_cost;
16172 case unaligned_load:
16173 case vector_gather_load:
16174 return simd_costs->unalign_load_cost;
16176 case unaligned_store:
16177 case vector_scatter_store:
16178 return simd_costs->unalign_store_cost;
16180 case cond_branch_taken:
16181 return costs->cond_taken_branch_cost;
16183 case cond_branch_not_taken:
16184 return costs->cond_not_taken_branch_cost;
16186 case vec_perm:
16187 return simd_costs->permute_cost;
16189 case vec_promote_demote:
16190 return fp ? simd_costs->fp_stmt_cost
16191 : simd_costs->int_stmt_cost;
16193 case vec_construct:
16194 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16195 return elements / 2 + 1;
16197 default:
16198 gcc_unreachable ();
16202 /* Return true if an access of kind KIND for STMT_INFO represents one
16203 vector of an LD[234] or ST[234] operation. Return the total number of
16204 vectors (2, 3 or 4) if so, otherwise return a value outside that range. */
16205 static int
16206 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16208 if ((kind == vector_load
16209 || kind == unaligned_load
16210 || kind == vector_store
16211 || kind == unaligned_store)
16212 && STMT_VINFO_DATA_REF (stmt_info))
16214 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16215 if (stmt_info
16216 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16217 return DR_GROUP_SIZE (stmt_info);
16219 return 0;
16222 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16223 vectors would produce a series of LDP or STP operations. KIND is the
16224 kind of statement that STMT_INFO represents. */
16225 static bool
16226 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16227 stmt_vec_info stmt_info)
16229 switch (kind)
16231 case vector_load:
16232 case vector_store:
16233 case unaligned_load:
16234 case unaligned_store:
16235 break;
16237 default:
16238 return false;
16241 if (aarch64_tune_params.extra_tuning_flags
16242 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16243 return false;
16245 return is_gimple_assign (stmt_info->stmt);
16248 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16249 or multiply-subtract sequence that might be suitable for fusing into a
16250 single instruction. If VEC_FLAGS is zero, analyze the operation as
16251 a scalar one, otherwise analyze it as an operation on vectors with those
16252 VEC_* flags. */
16253 static bool
16254 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16255 unsigned int vec_flags)
16257 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16258 if (!assign)
16259 return false;
16260 tree_code code = gimple_assign_rhs_code (assign);
16261 if (code != PLUS_EXPR && code != MINUS_EXPR)
16262 return false;
16264 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
16265 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
16266 return false;
16268 for (int i = 1; i < 3; ++i)
16270 tree rhs = gimple_op (assign, i);
16271 /* ??? Should we try to check for a single use as well? */
16272 if (TREE_CODE (rhs) != SSA_NAME)
16273 continue;
16275 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16276 if (!def_stmt_info
16277 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16278 continue;
16279 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16280 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16281 continue;
16283 if (vec_flags & VEC_ADVSIMD)
16285 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16286 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16287 only supports MLA forms, so will require a move if the result
16288 cannot be tied to the accumulator. The most important case in
16289 which this is true is when the accumulator input is invariant. */
16290 rhs = gimple_op (assign, 3 - i);
16291 if (TREE_CODE (rhs) != SSA_NAME)
16292 return false;
16293 def_stmt_info = vinfo->lookup_def (rhs);
16294 if (!def_stmt_info
16295 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16296 return false;
16299 return true;
16301 return false;
16304 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16305 in-loop reduction that SVE supports directly, return its latency in cycles,
16306 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16307 instructions. */
16308 static unsigned int
16309 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16310 stmt_vec_info stmt_info,
16311 const sve_vec_cost *sve_costs)
16313 switch (vect_reduc_type (vinfo, stmt_info))
16315 case EXTRACT_LAST_REDUCTION:
16316 return sve_costs->clast_cost;
16318 case FOLD_LEFT_REDUCTION:
16319 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16321 case E_HFmode:
16322 case E_BFmode:
16323 return sve_costs->fadda_f16_cost;
16325 case E_SFmode:
16326 return sve_costs->fadda_f32_cost;
16328 case E_DFmode:
16329 return sve_costs->fadda_f64_cost;
16331 default:
16332 break;
16334 break;
16337 return 0;
16340 /* STMT_INFO describes a loop-carried operation in the original scalar code
16341 that we are considering implementing as a reduction. Return one of the
16342 following values, depending on VEC_FLAGS:
16344 - If VEC_FLAGS is zero, return the loop carry latency of the original
16345 scalar operation.
16347 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16348 Advanced SIMD implementation.
16350 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16351 SVE implementation. */
16352 static unsigned int
16353 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16354 unsigned int vec_flags)
16356 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16357 const sve_vec_cost *sve_costs = nullptr;
16358 if (vec_flags & VEC_ANY_SVE)
16359 sve_costs = aarch64_tune_params.vec_costs->sve;
16361 /* If the caller is asking for the SVE latency, check for forms of reduction
16362 that only SVE can handle directly. */
16363 if (sve_costs)
16365 unsigned int latency
16366 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16367 if (latency)
16368 return latency;
16371 /* Handle scalar costs. */
16372 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16373 if (vec_flags == 0)
16375 if (is_float)
16376 return vec_costs->scalar_fp_stmt_cost;
16377 return vec_costs->scalar_int_stmt_cost;
16380 /* Otherwise, the loop body just contains normal integer or FP operations,
16381 with a vector reduction outside the loop. */
16382 const simd_vec_cost *simd_costs
16383 = aarch64_simd_vec_costs_for_flags (vec_flags);
16384 if (is_float)
16385 return simd_costs->fp_stmt_cost;
16386 return simd_costs->int_stmt_cost;
16389 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16390 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16391 try to subdivide the target-independent categorization provided by KIND
16392 to get a more accurate cost. */
16393 static fractional_cost
16394 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16395 stmt_vec_info stmt_info,
16396 fractional_cost stmt_cost)
16398 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16399 the extension with the load. */
16400 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16401 return 0;
16403 return stmt_cost;
16406 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16407 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16408 when vectorized would operate on vector type VECTYPE. Try to subdivide
16409 the target-independent categorization provided by KIND to get a more
16410 accurate cost. WHERE specifies where the cost associated with KIND
16411 occurs. */
16412 static fractional_cost
16413 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16414 stmt_vec_info stmt_info, tree vectype,
16415 enum vect_cost_model_location where,
16416 fractional_cost stmt_cost)
16418 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16419 const sve_vec_cost *sve_costs = nullptr;
16420 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16421 sve_costs = aarch64_tune_params.vec_costs->sve;
16423 /* It's generally better to avoid costing inductions, since the induction
16424 will usually be hidden by other operations. This is particularly true
16425 for things like COND_REDUCTIONS. */
16426 if (is_a<gphi *> (stmt_info->stmt))
16427 return 0;
16429 /* Detect cases in which vec_to_scalar is describing the extraction of a
16430 vector element in preparation for a scalar store. The store itself is
16431 costed separately. */
16432 if (vect_is_store_elt_extraction (kind, stmt_info))
16433 return simd_costs->store_elt_extra_cost;
16435 /* Detect SVE gather loads, which are costed as a single scalar_load
16436 for each element. We therefore need to divide the full-instruction
16437 cost by the number of elements in the vector. */
16438 if (kind == scalar_load
16439 && sve_costs
16440 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16442 unsigned int nunits = vect_nunits_for_cost (vectype);
16443 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16444 return { sve_costs->gather_load_x64_cost, nunits };
16445 return { sve_costs->gather_load_x32_cost, nunits };
16448 /* Detect cases in which a scalar_store is really storing one element
16449 in a scatter operation. */
16450 if (kind == scalar_store
16451 && sve_costs
16452 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16453 return sve_costs->scatter_store_elt_cost;
16455 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16456 if (kind == vec_to_scalar
16457 && where == vect_body
16458 && sve_costs)
16460 unsigned int latency
16461 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16462 if (latency)
16463 return latency;
16466 /* Detect cases in which vec_to_scalar represents a single reduction
16467 instruction like FADDP or MAXV. */
16468 if (kind == vec_to_scalar
16469 && where == vect_epilogue
16470 && vect_is_reduction (stmt_info))
16471 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16473 case E_QImode:
16474 return simd_costs->reduc_i8_cost;
16476 case E_HImode:
16477 return simd_costs->reduc_i16_cost;
16479 case E_SImode:
16480 return simd_costs->reduc_i32_cost;
16482 case E_DImode:
16483 return simd_costs->reduc_i64_cost;
16485 case E_HFmode:
16486 case E_BFmode:
16487 return simd_costs->reduc_f16_cost;
16489 case E_SFmode:
16490 return simd_costs->reduc_f32_cost;
16492 case E_DFmode:
16493 return simd_costs->reduc_f64_cost;
16495 default:
16496 break;
16499 /* Otherwise stick with the original categorization. */
16500 return stmt_cost;
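/* Hypothetical illustration of the gather-load costing above: an SVE
   gather of four 32-bit elements is costed as one scalar_load per
   element at gather_load_x32_cost / 4 each, so a tuning value of 12
   would give a fractional cost of 3 per element.  */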
16503 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16504 for STMT_INFO, which has cost kind KIND and which when vectorized would
16505 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16506 targets. */
16507 static fractional_cost
16508 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16509 stmt_vec_info stmt_info, tree vectype,
16510 fractional_cost stmt_cost)
16512 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16513 vector register size or number of units. Integer promotions of this
16514 type therefore map to SXT[BHW] or UXT[BHW].
16516 Most loads have extending forms that can do the sign or zero extension
16517 on the fly. Optimistically assume that a load followed by an extension
16518 will fold to this form during combine, and that the extension therefore
16519 comes for free. */
16520 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16521 stmt_cost = 0;
16523 /* For similar reasons, vector_stmt integer truncations are a no-op,
16524 because we can just ignore the unused upper bits of the source. */
16525 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16526 stmt_cost = 0;
16528 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16529 but there are no equivalent instructions for SVE. This means that
16530 (all other things being equal) 128-bit SVE needs twice as many load
16531 and store instructions as Advanced SIMD in order to process vector pairs.
16533 Also, scalar code can often use LDP and STP to access pairs of values,
16534 so it is too simplistic to say that one SVE load or store replaces
16535 VF scalar loads and stores.
16537 Ideally we would account for this in the scalar and Advanced SIMD
16538 costs by making suitable load/store pairs as cheap as a single
16539 load/store. However, that would be a very invasive change and in
16540 practice it tends to stress other parts of the cost model too much.
16541 E.g. stores of scalar constants currently count just a store,
16542 whereas stores of vector constants count a store and a vec_init.
16543 This is an artificial distinction for AArch64, where stores of
16544 nonzero scalar constants need the same kind of register invariant
16545 as vector stores.
16547 An alternative would be to double the cost of any SVE loads and stores
16548 that could be paired in Advanced SIMD (and possibly also paired in
16549 scalar code). But this tends to stress other parts of the cost model
16550 in the same way. It also means that we can fall back to Advanced SIMD
16551 even if full-loop predication would have been useful.
16553 Here we go for a more conservative version: double the costs of SVE
16554 loads and stores if one iteration of the scalar loop processes enough
16555 elements for it to use a whole number of Advanced SIMD LDP or STP
16556 instructions. This makes it very likely that the VF would be 1 for
16557 Advanced SIMD, and so no epilogue should be needed. */
16558 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16560 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16561 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16562 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16563 if (multiple_p (count * elt_bits, 256)
16564 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16565 stmt_cost *= 2;
16568 return stmt_cost;
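/* Worked example of the LDP/STP heuristic above (sizes only, no real
   tuning data): a grouped access of four 64-bit elements covers
   4 * 64 == 256 bits per scalar iteration, i.e. a whole number of
   Advanced SIMD LDP/STP pairs, so the SVE load/store cost is doubled.
   A group of three 32-bit elements (96 bits) is not a multiple of
   256 bits and its cost is left unchanged.  */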
16571 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16572 and which when vectorized would operate on vector type VECTYPE. Add the
16573 cost of any embedded operations. */
16574 static fractional_cost
16575 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16576 tree vectype, fractional_cost stmt_cost)
16578 if (vectype)
16580 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16582 /* Detect cases in which a vector load or store represents an
16583 LD[234] or ST[234] instruction. */
16584 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16586 case 2:
16587 stmt_cost += simd_costs->ld2_st2_permute_cost;
16588 break;
16590 case 3:
16591 stmt_cost += simd_costs->ld3_st3_permute_cost;
16592 break;
16594 case 4:
16595 stmt_cost += simd_costs->ld4_st4_permute_cost;
16596 break;
16599 if (kind == vector_stmt || kind == vec_to_scalar)
16600 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16602 if (FLOAT_TYPE_P (cmp_type))
16603 stmt_cost += simd_costs->fp_stmt_cost;
16604 else
16605 stmt_cost += simd_costs->int_stmt_cost;
16609 if (kind == scalar_stmt)
16610 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16612 if (FLOAT_TYPE_P (cmp_type))
16613 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16614 else
16615 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16618 return stmt_cost;
16621 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16622 and they describe an operation in the body of a vector loop. Record issue
16623 information relating to the vector operation in OPS. */
16624 void
16625 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16626 stmt_vec_info stmt_info,
16627 aarch64_vec_op_count *ops)
16629 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16630 if (!base_issue)
16631 return;
16632 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16633 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16635 /* Calculate the minimum cycles per iteration imposed by a reduction
16636 operation. */
16637 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16638 && vect_is_reduction (stmt_info))
16640 unsigned int base
16641 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16643 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16644 that's not yet the case. */
16645 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16648 /* Assume that multiply-adds will become a single operation. */
16649 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16650 return;
16652 /* Count the basic operation cost associated with KIND. */
16653 switch (kind)
16655 case cond_branch_taken:
16656 case cond_branch_not_taken:
16657 case vector_gather_load:
16658 case vector_scatter_store:
16659 /* We currently don't expect these to be used in a loop body. */
16660 break;
16662 case vec_perm:
16663 case vec_promote_demote:
16664 case vec_construct:
16665 case vec_to_scalar:
16666 case scalar_to_vec:
16667 case vector_stmt:
16668 case scalar_stmt:
16669 ops->general_ops += count;
16670 break;
16672 case scalar_load:
16673 case vector_load:
16674 case unaligned_load:
16675 ops->loads += count;
16676 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16677 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16678 break;
16680 case vector_store:
16681 case unaligned_store:
16682 case scalar_store:
16683 ops->stores += count;
16684 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16685 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16686 break;
16689 /* Add any embedded comparison operations. */
16690 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16691 && vect_embedded_comparison_type (stmt_info))
16692 ops->general_ops += count;
16694 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16695 have only accounted for one. */
16696 if ((kind == vector_stmt || kind == vec_to_scalar)
16697 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16698 ops->general_ops += count;
16700 /* Count the predicate operations needed by an SVE comparison. */
16701 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16702 if (tree type = vect_comparison_type (stmt_info))
16704 unsigned int base = (FLOAT_TYPE_P (type)
16705 ? sve_issue->fp_cmp_pred_ops
16706 : sve_issue->int_cmp_pred_ops);
16707 ops->pred_ops += base * count;
16710 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16711 if (simd_issue)
16712 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16714 case 2:
16715 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16716 break;
16718 case 3:
16719 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16720 break;
16722 case 4:
16723 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16724 break;
16727 /* Add any overhead associated with gather loads and scatter stores. */
16728 if (sve_issue
16729 && (kind == scalar_load || kind == scalar_store)
16730 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16732 unsigned int pairs = CEIL (count, 2);
16733 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16734 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
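/* Illustrative example of the gather/scatter accounting above: costing
   COUNT == 4 gathered scalar_loads adds 4 to ops->loads and then, with
   CEIL (4, 2) == 2 pairs, adds twice the per-pair predicate and general
   operation counts from the SVE issue info.  */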
16738 /* Return true if STMT_INFO contains a memory access and if the constant
16739 component of the memory address is aligned to SIZE bytes. */
16740 static bool
16741 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16742 poly_uint64 size)
16744 if (!STMT_VINFO_DATA_REF (stmt_info))
16745 return false;
16747 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16748 stmt_info = first_stmt;
16749 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16750 /* Needed for gathers & scatters, for example. */
16751 if (!constant_offset)
16752 return false;
16754 return multiple_p (wi::to_poly_offset (constant_offset), size);
16757 /* Check if a scalar or vector stmt could be part of a region of code
16758 that does nothing more than store values to memory, in the scalar
16759 case using STP. Return the cost of the stmt if so, counting 2 for
16760 one instruction. Return ~0U otherwise.
16762 The arguments are a subset of those passed to add_stmt_cost. */
16763 unsigned int
16764 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16765 stmt_vec_info stmt_info, tree vectype)
16767 /* Code that stores vector constants uses a vector_load to create
16768 the constant. We don't apply the heuristic to that case for two
16769 main reasons:
16771 - At the moment, STPs are only formed via peephole2, and the
16772 constant scalar moves would often come between STRs and so
16773 prevent STP formation.
16775 - The scalar code also has to load the constant somehow, and that
16776 isn't costed. */
16777 switch (kind)
16779 case scalar_to_vec:
16780 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16781 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16783 case vec_construct:
16784 if (FLOAT_TYPE_P (vectype))
16785 /* Count 1 insn for the maximum number of FP->SIMD INS
16786 instructions. */
16787 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16789 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16790 maximum number of GPR->SIMD INS instructions. */
16791 return vect_nunits_for_cost (vectype) * 4 * count;
16793 case vector_store:
16794 case unaligned_store:
16795 /* Count 1 insn per vector if we can't form STP Q pairs. */
16796 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16797 return count * 2;
16798 if (aarch64_tune_params.extra_tuning_flags
16799 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16800 return count * 2;
16802 if (stmt_info)
16804 /* Assume we won't be able to use STP if the constant offset
16805 component of the address is misaligned. ??? This could be
16806 removed if we formed STP pairs earlier, rather than relying
16807 on peephole2. */
16808 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16809 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16810 return count * 2;
16812 return CEIL (count, 2) * 2;
16814 case scalar_store:
16815 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16817 /* Check for a mode in which STP pairs can be formed. */
16818 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16819 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16820 return ~0U;
16822 /* Assume we won't be able to use STP if the constant offset
16823 component of the address is misaligned. ??? This could be
16824 removed if we formed STP pairs earlier, rather than relying
16825 on peephole2. */
16826 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16827 return ~0U;
16829 return count;
16831 default:
16832 return ~0U;
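/* Purely illustrative application of the heuristic above: two 64-bit
   scalar stores with aligned constant offsets are costed as 2 in total,
   i.e. a single STP, while on the vector side each GPR->SIMD dup is
   costed as 4 (2 instructions) and each FPR->SIMD dup as 2, so scalar
   STP sequences fed from general registers tend to compare well.  */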
16836 unsigned
16837 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16838 stmt_vec_info stmt_info, slp_tree,
16839 tree vectype, int misalign,
16840 vect_cost_model_location where)
16842 fractional_cost stmt_cost
16843 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16845 bool in_inner_loop_p = (where == vect_body
16846 && stmt_info
16847 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16849 /* Do one-time initialization based on the vinfo. */
16850 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16851 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16853 if (loop_vinfo)
16854 analyze_loop_vinfo (loop_vinfo);
16856 m_analyzed_vinfo = true;
16859 /* Apply the heuristic described above m_stp_sequence_cost. */
16860 if (m_stp_sequence_cost != ~0U)
16862 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16863 stmt_info, vectype);
16864 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16867 /* Try to get a more accurate cost by looking at STMT_INFO instead
16868 of just looking at KIND. */
16869 if (stmt_info && aarch64_use_new_vector_costs_p ())
16871 /* If we scalarize a strided store, the vectorizer costs one
16872 vec_to_scalar for each element. However, we can store the first
16873 element using an FP store without a separate extract step. */
16874 if (vect_is_store_elt_extraction (kind, stmt_info))
16875 count -= 1;
16877 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16878 stmt_info, stmt_cost);
16880 if (vectype && m_vec_flags)
16881 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16882 stmt_info, vectype,
16883 where, stmt_cost);
16886 /* Do any SVE-specific adjustments to the cost. */
16887 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16888 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16889 vectype, stmt_cost);
16891 if (stmt_info && aarch64_use_new_vector_costs_p ())
16893 /* Account for any extra "embedded" costs that apply additively
16894 to the base cost calculated above. */
16895 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16896 stmt_cost);
16898 /* If we're recording a nonzero vector loop body cost for the
16899 innermost loop, also estimate the operations that would need
16900 to be issued by all relevant implementations of the loop. */
16901 if (loop_vinfo
16902 && (m_costing_for_scalar || where == vect_body)
16903 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16904 && stmt_cost != 0)
16905 for (auto &ops : m_ops)
16906 count_ops (count, kind, stmt_info, &ops);
16908 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16909 estimate the number of statements in the unrolled Advanced SIMD
16910 loop. For simplicity, we assume that one iteration of the
16911 Advanced SIMD loop would need the same number of statements
16912 as one iteration of the SVE loop. */
16913 if (where == vect_body && m_unrolled_advsimd_niters)
16914 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16916 /* Detect the use of an averaging operation. */
16917 gimple *stmt = stmt_info->stmt;
16918 if (is_gimple_call (stmt)
16919 && gimple_call_internal_p (stmt))
16921 switch (gimple_call_internal_fn (stmt))
16923 case IFN_AVG_FLOOR:
16924 case IFN_AVG_CEIL:
16925 m_has_avg = true;
16926 default:
16927 break;
16932 /* If the statement stores to a decl that is known to be the argument
16933 to a vld1 in the same function, ignore the store for costing purposes.
16934 See the comment above m_stores_to_vector_load_decl for more details. */
16935 if (stmt_info
16936 && (kind == vector_store || kind == unaligned_store)
16937 && aarch64_accesses_vector_load_decl_p (stmt_info))
16939 stmt_cost = 0;
16940 m_stores_to_vector_load_decl = true;
16943 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16946 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16947 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16948 says that we should prefer the Advanced SIMD loop. */
16949 bool
16950 aarch64_vector_costs::prefer_unrolled_loop () const
16952 if (!m_unrolled_advsimd_stmts)
16953 return false;
16955 if (dump_enabled_p ())
16956 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16957 " unrolled Advanced SIMD loop = "
16958 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
16959 m_unrolled_advsimd_stmts);
16961 /* The balance here is tricky. On the one hand, we can't be sure whether
16962 the code is vectorizable with Advanced SIMD or not. However, even if
16963 it isn't vectorizable with Advanced SIMD, there's a possibility that
16964 the scalar code could also be unrolled. Some of the code might then
16965 benefit from SLP, or from using LDP and STP. We therefore apply
16966 the heuristic regardless of can_use_advsimd_p. */
16967 return (m_unrolled_advsimd_stmts
16968 && (m_unrolled_advsimd_stmts
16969 <= (unsigned int) param_max_completely_peeled_insns));
16972 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
16973 how fast the SVE code can be issued and compare it to the equivalent value
16974 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
16975 also compare it to the issue rate of Advanced SIMD code
16976 (ADVSIMD_CYCLES_PER_ITER).
16978 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16979 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16980 is true if we think the loop body is too expensive. */
16982 fractional_cost
16983 aarch64_vector_costs::
16984 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16985 fractional_cost scalar_cycles_per_iter,
16986 unsigned int orig_body_cost, unsigned int *body_cost,
16987 bool *should_disparage)
16989 if (dump_enabled_p ())
16990 ops->dump ();
16992 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16993 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16995 /* If the scalar version of the loop could issue at least as
16996 quickly as the predicate parts of the SVE loop, make the SVE loop
16997 prohibitively expensive. In this case vectorization is adding an
16998 overhead that the original scalar code didn't have.
17000 This is mostly intended to detect cases in which WHILELOs dominate
17001 for very tight loops, which is something that normal latency-based
17002 costs would not model. Adding this kind of cliffedge would be
17003 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17004 code in the caller handles that case in a more conservative way. */
17005 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17006 if (scalar_cycles_per_iter < sve_estimate)
17008 unsigned int min_cost
17009 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17010 if (*body_cost < min_cost)
17012 if (dump_enabled_p ())
17013 dump_printf_loc (MSG_NOTE, vect_location,
17014 "Increasing body cost to %d because the"
17015 " scalar code could issue within the limit"
17016 " imposed by predicate operations\n",
17017 min_cost);
17018 *body_cost = min_cost;
17019 *should_disparage = true;
17023 return sve_cycles_per_iter;
17026 unsigned int
17027 aarch64_vector_costs::determine_suggested_unroll_factor ()
17029 bool sve = m_vec_flags & VEC_ANY_SVE;
17030 /* If we are trying to unroll an Advanced SIMD main loop that contains
17031 an averaging operation that we do not support with SVE and we might use a
17032 predicated epilogue, we need to be conservative and block unrolling as
17033 this might lead to a less optimal loop for the first and only epilogue
17034 using the original loop's vectorization factor.
17035 TODO: Remove this constraint when we add support for multiple epilogue
17036 vectorization. */
17037 if (!sve && !TARGET_SVE2 && m_has_avg)
17038 return 1;
17040 unsigned int max_unroll_factor = 1;
17041 for (auto vec_ops : m_ops)
17043 aarch64_simd_vec_issue_info const *vec_issue
17044 = vec_ops.simd_issue_info ();
17045 if (!vec_issue)
17046 return 1;
17047 /* Limit the unroll factor to a value adjustable by the user; the default
17048 value is 4. */
17049 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17050 unsigned int factor
17051 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17052 unsigned int temp;
17054 /* Sanity check, this should never happen. */
17055 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17056 return 1;
17058 /* Check stores. */
17059 if (vec_ops.stores > 0)
17061 temp = CEIL (factor * vec_issue->stores_per_cycle,
17062 vec_ops.stores);
17063 unroll_factor = MIN (unroll_factor, temp);
17066 /* Check loads + stores. */
17067 if (vec_ops.loads > 0)
17069 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17070 vec_ops.loads + vec_ops.stores);
17071 unroll_factor = MIN (unroll_factor, temp);
17074 /* Check general ops. */
17075 if (vec_ops.general_ops > 0)
17077 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17078 vec_ops.general_ops);
17079 unroll_factor = MIN (unroll_factor, temp);
17081 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17084 /* Make sure unroll factor is power of 2. */
17085 return 1 << ceil_log2 (max_unroll_factor);
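/* A hypothetical example of the calculation above: with
   aarch64_vect_unroll_limit == 4, a reduction latency of 2, 2 stores
   against stores_per_cycle == 2 and 6 general ops against
   general_ops_per_cycle == 4, the bounds are CEIL (2 * 2, 2) == 2 and
   CEIL (2 * 4, 6) == 2, so the suggested unroll factor is 2 (already
   a power of two).  */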
17088 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17089 and return the new cost. */
17090 unsigned int
17091 aarch64_vector_costs::
17092 adjust_body_cost (loop_vec_info loop_vinfo,
17093 const aarch64_vector_costs *scalar_costs,
17094 unsigned int body_cost)
17096 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17097 return body_cost;
17099 const auto &scalar_ops = scalar_costs->m_ops[0];
17100 const auto &vector_ops = m_ops[0];
17101 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17102 unsigned int orig_body_cost = body_cost;
17103 bool should_disparage = false;
17105 if (dump_enabled_p ())
17106 dump_printf_loc (MSG_NOTE, vect_location,
17107 "Original vector body cost = %d\n", body_cost);
17109 fractional_cost scalar_cycles_per_iter
17110 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17112 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17114 if (dump_enabled_p ())
17116 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17117 dump_printf_loc (MSG_NOTE, vect_location,
17118 "Vector loop iterates at most %wd times\n",
17119 m_num_vector_iterations);
17120 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17121 scalar_ops.dump ();
17122 dump_printf_loc (MSG_NOTE, vect_location,
17123 " estimated cycles per vector iteration"
17124 " (for VF %d) = %f\n",
17125 estimated_vf, scalar_cycles_per_iter.as_double ());
17128 if (vector_ops.sve_issue_info ())
17130 if (dump_enabled_p ())
17131 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17132 vector_cycles_per_iter
17133 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17134 orig_body_cost, &body_cost, &should_disparage);
17136 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17138 /* Also take Neoverse V1 tuning into account, doubling the
17139 scalar and Advanced SIMD estimates to account for the
17140 doubling in SVE vector length. */
17141 if (dump_enabled_p ())
17142 dump_printf_loc (MSG_NOTE, vect_location,
17143 "Neoverse V1 estimate:\n");
17144 auto vf_factor = m_ops[1].vf_factor ();
17145 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17146 orig_body_cost, &body_cost, &should_disparage);
17149 else
17151 if (dump_enabled_p ())
17153 dump_printf_loc (MSG_NOTE, vect_location,
17154 "Vector issue estimate:\n");
17155 vector_ops.dump ();
17159 /* Decide whether to stick to latency-based costs or whether to try to
17160 take issue rates into account. */
17161 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17162 if (m_vec_flags & VEC_ANY_SVE)
17163 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17165 if (m_num_vector_iterations >= 1
17166 && m_num_vector_iterations < threshold)
17168 if (dump_enabled_p ())
17169 dump_printf_loc (MSG_NOTE, vect_location,
17170 "Low iteration count, so using pure latency"
17171 " costs\n");
17173 /* Increase the cost of the vector code if it looks like the scalar code
17174 could issue more quickly. These values are only rough estimates,
17175 so minor differences should only result in minor changes. */
17176 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17178 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17179 scalar_cycles_per_iter);
17180 if (dump_enabled_p ())
17181 dump_printf_loc (MSG_NOTE, vect_location,
17182 "Increasing body cost to %d because scalar code"
17183 " would issue more quickly\n", body_cost);
17185 /* In general, it's expected that the proposed vector code would be able
17186 to issue more quickly than the original scalar code. This should
17187 already be reflected to some extent in the latency-based costs.
17189 However, the latency-based costs effectively assume that the scalar
17190 code and the vector code execute serially, which tends to underplay
17191 one important case: if the real (non-serialized) execution time of
17192 a scalar iteration is dominated by loop-carried dependencies,
17193 and if the vector code is able to reduce both the length of
17194 the loop-carried dependencies *and* the number of cycles needed
17195 to issue the code in general, we can be more confident that the
17196 vector code is an improvement, even if adding the other (non-loop-carried)
17197 latencies tends to hide this saving. We therefore reduce the cost of the
17198 vector loop body in proportion to the saving. */
17199 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17200 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17201 && scalar_cycles_per_iter > vector_cycles_per_iter
17202 && !should_disparage)
17204 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17205 scalar_cycles_per_iter);
17206 if (dump_enabled_p ())
17207 dump_printf_loc (MSG_NOTE, vect_location,
17208 "Decreasing body cost to %d to account for smaller"
17209 " reduction latency\n", body_cost);
17212 return body_cost;
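/* Illustrative numbers only: if the scalar code is estimated at 2 cycles
   per vector-equivalent iteration and the vector code at 3, the branch
   above scales the body cost by 3/2, e.g. from 20 to 30, making the
   vector loop proportionally less attractive.  */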
17215 void
17216 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17218 auto *scalar_costs
17219 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17220 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17221 if (loop_vinfo
17222 && m_vec_flags
17223 && aarch64_use_new_vector_costs_p ())
17225 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17226 m_costs[vect_body]);
17227 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17230 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17231 the scalar code in the event of a tie, since there is more chance
17232 of scalar code being optimized with surrounding operations.
17234 In addition, if the vector body is a simple store to a decl that
17235 is elsewhere loaded using vld1, strongly prefer the vector form,
17236 to the extent of giving the prologue a zero cost. See the comment
17237 above m_stores_to_vector_load_decl for details. */
17238 if (!loop_vinfo
17239 && scalar_costs
17240 && m_stp_sequence_cost != ~0U)
17242 if (m_stores_to_vector_load_decl)
17243 m_costs[vect_prologue] = 0;
17244 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17245 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17248 vector_costs::finish_cost (scalar_costs);
17251 bool
17252 aarch64_vector_costs::
17253 better_main_loop_than_p (const vector_costs *uncast_other) const
17255 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17257 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17258 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17260 if (dump_enabled_p ())
17261 dump_printf_loc (MSG_NOTE, vect_location,
17262 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17263 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17264 vect_vf_for_cost (this_loop_vinfo),
17265 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17266 vect_vf_for_cost (other_loop_vinfo));
17268 /* Apply the unrolling heuristic described above
17269 m_unrolled_advsimd_niters. */
17270 if (bool (m_unrolled_advsimd_stmts)
17271 != bool (other->m_unrolled_advsimd_stmts))
17273 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17274 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17275 if (this_prefer_unrolled != other_prefer_unrolled)
17277 if (dump_enabled_p ())
17278 dump_printf_loc (MSG_NOTE, vect_location,
17279 "Preferring Advanced SIMD loop because"
17280 " it can be unrolled\n");
17281 return other_prefer_unrolled;
17285 for (unsigned int i = 0; i < m_ops.length (); ++i)
17287 if (dump_enabled_p ())
17289 if (i)
17290 dump_printf_loc (MSG_NOTE, vect_location,
17291 "Reconsidering with subtuning %d\n", i);
17292 dump_printf_loc (MSG_NOTE, vect_location,
17293 "Issue info for %s loop:\n",
17294 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17295 this->m_ops[i].dump ();
17296 dump_printf_loc (MSG_NOTE, vect_location,
17297 "Issue info for %s loop:\n",
17298 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17299 other->m_ops[i].dump ();
17302 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17303 * this->m_ops[i].vf_factor ());
17304 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17305 * other->m_ops[i].vf_factor ());
17307 /* If it appears that one loop could process the same amount of data
17308 in fewer cycles, prefer that loop over the other one. */
17309 fractional_cost this_cost
17310 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17311 fractional_cost other_cost
17312 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17313 if (dump_enabled_p ())
17315 dump_printf_loc (MSG_NOTE, vect_location,
17316 "Weighted cycles per iteration of %s loop ~= %f\n",
17317 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17318 this_cost.as_double ());
17319 dump_printf_loc (MSG_NOTE, vect_location,
17320 "Weighted cycles per iteration of %s loop ~= %f\n",
17321 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17322 other_cost.as_double ());
17324 if (this_cost != other_cost)
17326 if (dump_enabled_p ())
17327 dump_printf_loc (MSG_NOTE, vect_location,
17328 "Preferring loop with lower cycles"
17329 " per iteration\n");
17330 return this_cost < other_cost;
17333 /* If the issue rate of SVE code is limited by predicate operations
17334 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17335 and if Advanced SIMD code could issue within the limit imposed
17336 by the predicate operations, the predicate operations are adding an
17337 overhead that the original code didn't have and so we should prefer
17338 the Advanced SIMD version. */
17339 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17340 const aarch64_vec_op_count &b) -> bool
17342 if (a.pred_ops == 0
17343 && (b.min_pred_cycles_per_iter ()
17344 > b.min_nonpred_cycles_per_iter ()))
17346 if (dump_enabled_p ())
17347 dump_printf_loc (MSG_NOTE, vect_location,
17348 "Preferring Advanced SIMD loop since"
17349 " SVE loop is predicate-limited\n");
17350 return true;
17352 return false;
17354 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17355 return true;
17356 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17357 return false;
17360 return vector_costs::better_main_loop_than_p (other);
17363 static void initialize_aarch64_code_model (struct gcc_options *);
17365 /* Parse the TO_PARSE string and put the architecture struct that it
17366 selects into RES and the architectural features into ISA_FLAGS.
17367 Return an aarch_parse_opt_result describing the parse result.
17368 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17369 When the TO_PARSE string contains an invalid extension,
17370 a copy of the string is created and stored to INVALID_EXTENSION. */
17372 static enum aarch_parse_opt_result
17373 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17374 aarch64_feature_flags *isa_flags,
17375 std::string *invalid_extension)
17377 const char *ext;
17378 const struct processor *arch;
17379 size_t len;
17381 ext = strchr (to_parse, '+');
17383 if (ext != NULL)
17384 len = ext - to_parse;
17385 else
17386 len = strlen (to_parse);
17388 if (len == 0)
17389 return AARCH_PARSE_MISSING_ARG;
17392 /* Loop through the list of supported ARCHes to find a match. */
17393 for (arch = all_architectures; arch->name != NULL; arch++)
17395 if (strlen (arch->name) == len
17396 && strncmp (arch->name, to_parse, len) == 0)
17398 auto isa_temp = arch->flags;
17400 if (ext != NULL)
17402 /* TO_PARSE string contains at least one extension. */
17403 enum aarch_parse_opt_result ext_res
17404 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17406 if (ext_res != AARCH_PARSE_OK)
17407 return ext_res;
17409 /* Extension parsing was successful. Confirm the result
17410 arch and ISA flags. */
17411 *res = arch;
17412 *isa_flags = isa_temp;
17413 return AARCH_PARSE_OK;
17417 /* ARCH name not found in list. */
17418 return AARCH_PARSE_INVALID_ARG;
17421 /* Parse the TO_PARSE string and put the result tuning in RES and the
17422 architecture flags in ISA_FLAGS. Return an aarch_parse_opt_result
17423 describing the parse result. If there is an error parsing, RES and
17424 ISA_FLAGS are left unchanged.
17425 When the TO_PARSE string contains an invalid extension,
17426 a copy of the string is created and stored to INVALID_EXTENSION. */
17428 static enum aarch_parse_opt_result
17429 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17430 aarch64_feature_flags *isa_flags,
17431 std::string *invalid_extension)
17433 const char *ext;
17434 const struct processor *cpu;
17435 size_t len;
17437 ext = strchr (to_parse, '+');
17439 if (ext != NULL)
17440 len = ext - to_parse;
17441 else
17442 len = strlen (to_parse);
17444 if (len == 0)
17445 return AARCH_PARSE_MISSING_ARG;
17448 /* Loop through the list of supported CPUs to find a match. */
17449 for (cpu = all_cores; cpu->name != NULL; cpu++)
17451 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17453 auto isa_temp = cpu->flags;
17455 if (ext != NULL)
17457 /* TO_PARSE string contains at least one extension. */
17458 enum aarch_parse_opt_result ext_res
17459 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17461 if (ext_res != AARCH_PARSE_OK)
17462 return ext_res;
17464 /* Extension parsing was successful. Confirm the result
17465 cpu and ISA flags. */
17466 *res = cpu;
17467 *isa_flags = isa_temp;
17468 return AARCH_PARSE_OK;
17472 /* CPU name not found in list. */
17473 return AARCH_PARSE_INVALID_ARG;
17476 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17477 Return an aarch_parse_opt_result describing the parse result.
17478 If the parsing fails, RES does not change. */
17480 static enum aarch_parse_opt_result
17481 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17483 const struct processor *cpu;
17485 /* Loop through the list of supported CPUs to find a match. */
17486 for (cpu = all_cores; cpu->name != NULL; cpu++)
17488 if (strcmp (cpu->name, to_parse) == 0)
17490 *res = cpu;
17491 return AARCH_PARSE_OK;
17495 /* CPU name not found in list. */
17496 return AARCH_PARSE_INVALID_ARG;
17499 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17500 described in FLAG. If it is, return the index bit for that fusion type.
17501 If not, error (printing OPTION_NAME) and return zero. */
17503 static unsigned int
17504 aarch64_parse_one_option_token (const char *token,
17505 size_t length,
17506 const struct aarch64_flag_desc *flag,
17507 const char *option_name)
17509 for (; flag->name != NULL; flag++)
17511 if (length == strlen (flag->name)
17512 && !strncmp (flag->name, token, length))
17513 return flag->flag;
17516 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17517 return 0;
17520 /* Parse OPTION which is a comma-separated list of flags to enable.
17521 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17522 default state we inherit from the CPU tuning structures. OPTION_NAME
17523 gives the top-level option we are parsing in the -moverride string,
17524 for use in error messages. */
17526 static unsigned int
17527 aarch64_parse_boolean_options (const char *option,
17528 const struct aarch64_flag_desc *flags,
17529 unsigned int initial_state,
17530 const char *option_name)
17532 const char separator = '.';
17533 const char* specs = option;
17534 const char* ntoken = option;
17535 unsigned int found_flags = initial_state;
17537 while ((ntoken = strchr (specs, separator)))
17539 size_t token_length = ntoken - specs;
17540 unsigned token_ops = aarch64_parse_one_option_token (specs,
17541 token_length,
17542 flags,
17543 option_name);
17544 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17545 in the token stream, reset the supported operations. So:
17547 adrp+add.cmp+branch.none.adrp+add
17549 would turn on only adrp+add fusion. */
17550 if (!token_ops)
17551 found_flags = 0;
17553 found_flags |= token_ops;
17554 specs = ++ntoken;
17558 /* The string ended with a separator or was empty; report an error. */
17558 if (!(*specs))
17560 error ("%qs string ill-formed", option_name);
17561 return 0;
17564 /* We still have one more token to parse. */
17565 size_t token_length = strlen (specs);
17566 unsigned token_ops = aarch64_parse_one_option_token (specs,
17567 token_length,
17568 flags,
17569 option_name);
17570 if (!token_ops)
17571 found_flags = 0;
17573 found_flags |= token_ops;
17574 return found_flags;
17577 /* Support for overriding instruction fusion. */
17579 static void
17580 aarch64_parse_fuse_string (const char *fuse_string,
17581 struct tune_params *tune)
17583 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17584 aarch64_fusible_pairs,
17585 tune->fusible_ops,
17586 "fuse=");
17589 /* Support for overriding other tuning flags. */
17591 static void
17592 aarch64_parse_tune_string (const char *tune_string,
17593 struct tune_params *tune)
17595 tune->extra_tuning_flags
17596 = aarch64_parse_boolean_options (tune_string,
17597 aarch64_tuning_flags,
17598 tune->extra_tuning_flags,
17599 "tune=");
17602 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17603 Accept the valid SVE vector widths allowed by
17604 aarch64_sve_vector_bits_enum and use it to override sve_width
17605 in TUNE. */
17607 static void
17608 aarch64_parse_sve_width_string (const char *tune_string,
17609 struct tune_params *tune)
17611 int width = -1;
17613 int n = sscanf (tune_string, "%d", &width);
17614 if (n == EOF)
17616 error ("invalid format for %<sve_width%>");
17617 return;
17619 switch (width)
17621 case SVE_128:
17622 case SVE_256:
17623 case SVE_512:
17624 case SVE_1024:
17625 case SVE_2048:
17626 break;
17627 default:
17628 error ("invalid %<sve_width%> value: %d", width);
17630 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17633 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
17634 we understand. If it is, extract the option string and hand it off to
17635 the appropriate function. */
17637 void
17638 aarch64_parse_one_override_token (const char* token,
17639 size_t length,
17640 struct tune_params *tune)
17642 const struct aarch64_tuning_override_function *fn
17643 = aarch64_tuning_override_functions;
17645 const char *option_part = strchr (token, '=');
17646 if (!option_part)
17648 error ("tuning string missing in option (%s)", token);
17649 return;
17652 /* Get the length of the option name. */
17653 length = option_part - token;
17654 /* Skip the '=' to get to the option string. */
17655 option_part++;
17657 for (; fn->name != NULL; fn++)
17659 if (!strncmp (fn->name, token, length))
17661 fn->parse_override (option_part, tune);
17662 return;
17666 error ("unknown tuning option (%s)", token);
17667 return;
17670 /* Validate and clamp the TLS size for the code model selected in OPTS. */
17672 static void
17673 initialize_aarch64_tls_size (struct gcc_options *opts)
17675 if (aarch64_tls_size == 0)
17676 aarch64_tls_size = 24;
17678 switch (opts->x_aarch64_cmodel_var)
17680 case AARCH64_CMODEL_TINY:
17681 /* Both the default and maximum TLS size allowed under tiny are 1M, which
17682 needs two instructions to address, so we clamp the size to 24. */
17683 if (aarch64_tls_size > 24)
17684 aarch64_tls_size = 24;
17685 break;
17686 case AARCH64_CMODEL_SMALL:
17687 /* The maximum TLS size allowed under small is 4G. */
17688 if (aarch64_tls_size > 32)
17689 aarch64_tls_size = 32;
17690 break;
17691 case AARCH64_CMODEL_LARGE:
17692 /* The maximum TLS size allowed under large is 16E.
17693 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
17694 if (aarch64_tls_size > 48)
17695 aarch64_tls_size = 48;
17696 break;
17697 default:
17698 gcc_unreachable ();
17701 return;
17704 /* Return the CPU corresponding to the enum CPU. */
17706 static const struct processor *
17707 aarch64_get_tune_cpu (enum aarch64_processor cpu)
17709 gcc_assert (cpu != aarch64_none);
17711 return &all_cores[cpu];
17714 /* Return the architecture corresponding to the enum ARCH. */
17716 static const struct processor *
17717 aarch64_get_arch (enum aarch64_arch arch)
17719 gcc_assert (arch != aarch64_no_arch);
17721 return &all_architectures[arch];
17724 /* Parse STRING looking for options in the format:
17725 string :: option:string
17726 option :: name=substring
17727 name :: {a-z}
17728 substring :: defined by option. */
17730 static void
17731 aarch64_parse_override_string (const char* input_string,
17732 struct tune_params* tune)
17734 const char separator = ':';
17735 size_t string_length = strlen (input_string) + 1;
17736 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17737 char *string = string_root;
17738 strncpy (string, input_string, string_length);
17739 string[string_length - 1] = '\0';
17741 char* ntoken = string;
17743 while ((ntoken = strchr (string, separator)))
17745 size_t token_length = ntoken - string;
17746 /* NUL-terminate this substring so it can be treated as a standalone string. */
17747 *ntoken = '\0';
17748 aarch64_parse_one_override_token (string, token_length, tune);
17749 string = ++ntoken;
17752 /* One last option to parse. */
17753 aarch64_parse_one_override_token (string, strlen (string), tune);
17754 free (string_root);
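/* For example (values illustrative), an override string such as

     sve_width=256:fuse=adrp+add

   is split at each ':' and each name=value token is passed to
   aarch64_parse_one_override_token, which dispatches on the name
   ("sve_width", "fuse", ...) to the handlers above.  */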
17757 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17758 are best for a generic target with the currently-enabled architecture
17759 extensions. */
17760 static void
17761 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17763 /* Neoverse V1 is the only core that is known to benefit from
17764 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17765 point enabling it for SVE2 and above. */
17766 if (TARGET_SVE2)
17767 current_tune.extra_tuning_flags
17768 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17771 static void
17772 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17774 if (accepted_branch_protection_string)
17776 opts->x_aarch64_branch_protection_string
17777 = xstrdup (accepted_branch_protection_string);
17780 /* PR 70044: We have to be careful about being called multiple times for the
17781 same function. This means all changes should be repeatable. */
17783 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17784 Disable the frame pointer flag so the mid-end will not use a frame
17785 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17786 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17787 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17788 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17789 if (opts->x_flag_omit_frame_pointer == 0)
17790 opts->x_flag_omit_frame_pointer = 2;
17792 /* If not optimizing for size, set the default
17793 alignment to what the target wants. */
17794 if (!opts->x_optimize_size)
17796 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17797 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17798 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17799 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17800 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17801 opts->x_str_align_functions = aarch64_tune_params.function_align;
17804 /* We default to no pc-relative literal loads. */
17806 aarch64_pcrelative_literal_loads = false;
17808 /* If -mpc-relative-literal-loads is set on the command line, this
17809 implies that the user asked for PC relative literal loads. */
17810 if (opts->x_pcrelative_literal_loads == 1)
17811 aarch64_pcrelative_literal_loads = true;
17813 /* In the tiny memory model it makes no sense to disallow PC relative
17814 literal pool loads. */
17815 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17816 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17817 aarch64_pcrelative_literal_loads = true;
17819 /* When enabling the lower precision Newton series for the square root, also
17820 enable it for the reciprocal square root, since the latter is an
17821 intermediary step for the former. */
17822 if (flag_mlow_precision_sqrt)
17823 flag_mrecip_low_precision_sqrt = true;
17826 /* 'Unpack' the internal tuning structs and update the options
17827 in OPTS. The caller must have set up selected_tune and selected_arch
17828 as all the other target-specific codegen decisions are
17829 derived from them. */
17831 void
17832 aarch64_override_options_internal (struct gcc_options *opts)
17834 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17835 aarch64_tune_flags = tune->flags;
17836 aarch64_tune = tune->sched_core;
17837 /* Make a copy of the tuning parameters attached to the core, which
17838 we may later overwrite. */
17839 aarch64_tune_params = *(tune->tune);
17840 if (tune->tune == &generic_tunings)
17841 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17843 if (opts->x_aarch64_override_tune_string)
17844 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17845 &aarch64_tune_params);
17847 /* This target defaults to strict volatile bitfields. */
17848 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17849 opts->x_flag_strict_volatile_bitfields = 1;
17851 if (aarch64_stack_protector_guard == SSP_GLOBAL
17852 && opts->x_aarch64_stack_protector_guard_offset_str)
17854 error ("incompatible options %<-mstack-protector-guard=global%> and "
17855 "%<-mstack-protector-guard-offset=%s%>",
17856 aarch64_stack_protector_guard_offset_str);
17859 if (aarch64_stack_protector_guard == SSP_SYSREG
17860 && !(opts->x_aarch64_stack_protector_guard_offset_str
17861 && opts->x_aarch64_stack_protector_guard_reg_str))
17863 error ("both %<-mstack-protector-guard-offset%> and "
17864 "%<-mstack-protector-guard-reg%> must be used "
17865 "with %<-mstack-protector-guard=sysreg%>");
17868 if (opts->x_aarch64_stack_protector_guard_reg_str)
17870 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17871 error ("specify a system register with a small string length");
17874 if (opts->x_aarch64_stack_protector_guard_offset_str)
17876 char *end;
17877 const char *str = aarch64_stack_protector_guard_offset_str;
17878 errno = 0;
17879 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17880 if (!*str || *end || errno)
17881 error ("%qs is not a valid offset in %qs", str,
17882 "-mstack-protector-guard-offset=");
17883 aarch64_stack_protector_guard_offset = offs;
17886 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17887 && !fixed_regs[R18_REGNUM])
17888 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17890 initialize_aarch64_code_model (opts);
17891 initialize_aarch64_tls_size (opts);
17893 int queue_depth = 0;
17894 switch (aarch64_tune_params.autoprefetcher_model)
17896 case tune_params::AUTOPREFETCHER_OFF:
17897 queue_depth = -1;
17898 break;
17899 case tune_params::AUTOPREFETCHER_WEAK:
17900 queue_depth = 0;
17901 break;
17902 case tune_params::AUTOPREFETCHER_STRONG:
17903 queue_depth = max_insn_queue_index + 1;
17904 break;
17905 default:
17906 gcc_unreachable ();
17909 /* We don't mind passing in global_options_set here as we don't use
17910 the *options_set structs anyway. */
17911 SET_OPTION_IF_UNSET (opts, &global_options_set,
17912 param_sched_autopref_queue_depth, queue_depth);
17914 /* If using Advanced SIMD only for autovectorization disable SVE vector costs
17915 comparison. */
17916 if (aarch64_autovec_preference == 1)
17917 SET_OPTION_IF_UNSET (opts, &global_options_set,
17918 aarch64_sve_compare_costs, 0);
17920 /* Set up parameters to be used in the prefetching algorithm. Do not
17921 override the defaults unless we are tuning for a core we have
17922 researched values for. */
17923 if (aarch64_tune_params.prefetch->num_slots > 0)
17924 SET_OPTION_IF_UNSET (opts, &global_options_set,
17925 param_simultaneous_prefetches,
17926 aarch64_tune_params.prefetch->num_slots);
17927 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17928 SET_OPTION_IF_UNSET (opts, &global_options_set,
17929 param_l1_cache_size,
17930 aarch64_tune_params.prefetch->l1_cache_size);
17931 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17932 SET_OPTION_IF_UNSET (opts, &global_options_set,
17933 param_l1_cache_line_size,
17934 aarch64_tune_params.prefetch->l1_cache_line_size);
17936 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17938 SET_OPTION_IF_UNSET (opts, &global_options_set,
17939 param_destruct_interfere_size,
17940 aarch64_tune_params.prefetch->l1_cache_line_size);
17941 SET_OPTION_IF_UNSET (opts, &global_options_set,
17942 param_construct_interfere_size,
17943 aarch64_tune_params.prefetch->l1_cache_line_size);
17945 else
17947 /* For a generic AArch64 target, cover the current range of cache line
17948 sizes. */
17949 SET_OPTION_IF_UNSET (opts, &global_options_set,
17950 param_destruct_interfere_size,
17951 256);
17952 SET_OPTION_IF_UNSET (opts, &global_options_set,
17953 param_construct_interfere_size,
17954 64);
17957 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17958 SET_OPTION_IF_UNSET (opts, &global_options_set,
17959 param_l2_cache_size,
17960 aarch64_tune_params.prefetch->l2_cache_size);
17961 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17962 SET_OPTION_IF_UNSET (opts, &global_options_set,
17963 param_prefetch_dynamic_strides, 0);
17964 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17965 SET_OPTION_IF_UNSET (opts, &global_options_set,
17966 param_prefetch_minimum_stride,
17967 aarch64_tune_params.prefetch->minimum_stride);
17969 /* Use the alternative scheduling-pressure algorithm by default. */
17970 SET_OPTION_IF_UNSET (opts, &global_options_set,
17971 param_sched_pressure_algorithm,
17972 SCHED_PRESSURE_MODEL);
17974 /* Validate the guard size. */
17975 int guard_size = param_stack_clash_protection_guard_size;
17977 if (guard_size != 12 && guard_size != 16)
17978 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17979 "size. Given value %d (%llu KB) is out of range",
17980 guard_size, (1ULL << guard_size) / 1024ULL);
17982 /* Enforce that interval is the same size as size so the mid-end does the
17983 right thing. */
17984 SET_OPTION_IF_UNSET (opts, &global_options_set,
17985 param_stack_clash_protection_probe_interval,
17986 guard_size);
17988   /* The maybe_set calls won't update the value if the user has explicitly set
17989      one, which means we need to validate that the probing interval and guard
17990      size are equal.  */
17991 int probe_interval
17992 = param_stack_clash_protection_probe_interval;
17993 if (guard_size != probe_interval)
17994 error ("stack clash guard size %<%d%> must be equal to probing interval "
17995 "%<%d%>", guard_size, probe_interval);
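  /* Worked example (illustrative): -mstack-clash-protection-guard-size=16
     gives a (1 << 16)-byte (64 KB) guard, and the probing interval defaults
     to the same value.  Explicitly passing
     --param=stack-clash-protection-probe-interval=12 alongside that guard
     size trips the error above, since the two must match.  */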
17997   /* Enable software prefetching at the specified optimization level for
17998      CPUs that have prefetch.  Lower the optimization level threshold by 1
17999      when profiling is enabled.  */
18000 if (opts->x_flag_prefetch_loop_arrays < 0
18001 && !opts->x_optimize_size
18002 && aarch64_tune_params.prefetch->default_opt_level >= 0
18003 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18004 opts->x_flag_prefetch_loop_arrays = 1;
18006 aarch64_override_options_after_change_1 (opts);
18009 /* Print a hint with a suggestion for a core or architecture name that
18010    most closely resembles what the user passed in STR.  ARCH is true if
18011    the user is asking for an architecture name and false if they are
18012    asking for a core name.  */
18014 static void
18015 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
18017 auto_vec<const char *> candidates;
18018 const struct processor *entry = arch ? all_architectures : all_cores;
18019 for (; entry->name != NULL; entry++)
18020 candidates.safe_push (entry->name);
18022 #ifdef HAVE_LOCAL_CPU_DETECT
18023   /* Also add "native" as a possible value.  */
18024 if (arch)
18025 candidates.safe_push ("native");
18026 #endif
18028 char *s;
18029 const char *hint = candidates_list_and_hint (str, s, candidates);
18030 if (hint)
18031 inform (input_location, "valid arguments are: %s;"
18032 " did you mean %qs?", s, hint);
18033 else
18034 inform (input_location, "valid arguments are: %s", s);
18036 XDELETEVEC (s);
18039 /* Print a hint with a suggestion for a core name that most closely resembles
18040 what the user passed in STR. */
18042 inline static void
18043 aarch64_print_hint_for_core (const char *str)
18045 aarch64_print_hint_for_core_or_arch (str, false);
18048 /* Print a hint with a suggestion for an architecture name that most closely
18049 resembles what the user passed in STR. */
18051 inline static void
18052 aarch64_print_hint_for_arch (const char *str)
18054 aarch64_print_hint_for_core_or_arch (str, true);
18058 /* Print a hint with a suggestion for an extension name
18059 that most closely resembles what the user passed in STR. */
18061 void
18062 aarch64_print_hint_for_extensions (const std::string &str)
18064 auto_vec<const char *> candidates;
18065 aarch64_get_all_extension_candidates (&candidates);
18066 char *s;
18067 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18068 if (hint)
18069 inform (input_location, "valid arguments are: %s;"
18070 " did you mean %qs?", s, hint);
18071 else
18072 inform (input_location, "valid arguments are: %s", s);
18074 XDELETEVEC (s);
18077 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18078    specified in STR and throw errors if appropriate.  Put the results, if
18079    they are valid, in RES and ISA_FLAGS.  Return whether the option is
18080 valid. */
18082 static bool
18083 aarch64_validate_mcpu (const char *str, const struct processor **res,
18084 aarch64_feature_flags *isa_flags)
18086 std::string invalid_extension;
18087 enum aarch_parse_opt_result parse_res
18088 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18090 if (parse_res == AARCH_PARSE_OK)
18091 return true;
18093 switch (parse_res)
18095 case AARCH_PARSE_MISSING_ARG:
18096 error ("missing cpu name in %<-mcpu=%s%>", str);
18097 break;
18098 case AARCH_PARSE_INVALID_ARG:
18099 error ("unknown value %qs for %<-mcpu%>", str);
18100 aarch64_print_hint_for_core (str);
18101 /* A common user error is confusing -march and -mcpu.
18102 If the -mcpu string matches a known architecture then suggest
18103 -march=. */
18104 parse_res = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18105 if (parse_res == AARCH_PARSE_OK)
18106 inform (input_location, "did you mean %<-march=%s%>?", str);
18107 break;
18108 case AARCH_PARSE_INVALID_FEATURE:
18109 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18110 invalid_extension.c_str (), str);
18111 aarch64_print_hint_for_extensions (invalid_extension);
18112 break;
18113 default:
18114 gcc_unreachable ();
18117 return false;
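/* For instance, a user who passes -mcpu=armv8.2-a (an architecture name
   rather than a CPU name) gets the "unknown value" error above, and because
   the string does parse as an architecture, the follow-up hint
   "did you mean -march=armv8.2-a?" is emitted.  */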
18120 /* Straight line speculation indicators. */
18121 enum aarch64_sls_hardening_type
18123 SLS_NONE = 0,
18124 SLS_RETBR = 1,
18125 SLS_BLR = 2,
18126 SLS_ALL = 3,
18128 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18130 /* Return whether we should mitigate Straight Line Speculation for the RET
18131 and BR instructions. */
18132 bool
18133 aarch64_harden_sls_retbr_p (void)
18135 return aarch64_sls_hardening & SLS_RETBR;
18138 /* Return whether we should mitigate Straight Line Speculation for the BLR
18139 instruction. */
18140 bool
18141 aarch64_harden_sls_blr_p (void)
18143 return aarch64_sls_hardening & SLS_BLR;
18146 /* As yet we only allow setting these options globally; in the future we may
18147    allow setting them per function.  */
18148 static void
18149 aarch64_validate_sls_mitigation (const char *const_str)
18151 char *token_save = NULL;
18152 char *str = NULL;
18154 if (strcmp (const_str, "none") == 0)
18156 aarch64_sls_hardening = SLS_NONE;
18157 return;
18159 if (strcmp (const_str, "all") == 0)
18161 aarch64_sls_hardening = SLS_ALL;
18162 return;
18165 char *str_root = xstrdup (const_str);
18166 str = strtok_r (str_root, ",", &token_save);
18167 if (!str)
18168 error ("invalid argument given to %<-mharden-sls=%>");
18170 int temp = SLS_NONE;
18171 while (str)
18173 if (strcmp (str, "blr") == 0)
18174 temp |= SLS_BLR;
18175 else if (strcmp (str, "retbr") == 0)
18176 temp |= SLS_RETBR;
18177 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18179 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18180 break;
18182 else
18184 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18185 break;
18187 str = strtok_r (NULL, ",", &token_save);
18189 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18190 free (str_root);
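/* For example, -mharden-sls=retbr,blr sets both SLS_RETBR and SLS_BLR,
   which is equivalent to -mharden-sls=all, so both
   aarch64_harden_sls_retbr_p and aarch64_harden_sls_blr_p then return
   true.  */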
18193 /* Validate a command-line -march option. Parse the arch and extensions
18194 (if any) specified in STR and throw errors if appropriate. Put the
18195 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18196 option is valid. */
18198 static bool
18199 aarch64_validate_march (const char *str, const struct processor **res,
18200 aarch64_feature_flags *isa_flags)
18202 std::string invalid_extension;
18203 enum aarch_parse_opt_result parse_res
18204 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18206 if (parse_res == AARCH_PARSE_OK)
18207 return true;
18209 switch (parse_res)
18211 case AARCH_PARSE_MISSING_ARG:
18212 error ("missing arch name in %<-march=%s%>", str);
18213 break;
18214 case AARCH_PARSE_INVALID_ARG:
18215 error ("unknown value %qs for %<-march%>", str);
18216 aarch64_print_hint_for_arch (str);
18217 /* A common user error is confusing -march and -mcpu.
18218 If the -march string matches a known CPU suggest -mcpu. */
18219 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18220 if (parse_res == AARCH_PARSE_OK)
18221 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18222 break;
18223 case AARCH_PARSE_INVALID_FEATURE:
18224 error ("invalid feature modifier %qs in %<-march=%s%>",
18225 invalid_extension.c_str (), str);
18226 aarch64_print_hint_for_extensions (invalid_extension);
18227 break;
18228 default:
18229 gcc_unreachable ();
18232 return false;
18235 /* Validate a command-line -mtune option. Parse the cpu
18236 specified in STR and throw errors if appropriate. Put the
18237 result, if it is valid, in RES. Return whether the option is
18238 valid. */
18240 static bool
18241 aarch64_validate_mtune (const char *str, const struct processor **res)
18243 enum aarch_parse_opt_result parse_res
18244 = aarch64_parse_tune (str, res);
18246 if (parse_res == AARCH_PARSE_OK)
18247 return true;
18249 switch (parse_res)
18251 case AARCH_PARSE_MISSING_ARG:
18252 error ("missing cpu name in %<-mtune=%s%>", str);
18253 break;
18254 case AARCH_PARSE_INVALID_ARG:
18255 error ("unknown value %qs for %<-mtune%>", str);
18256 aarch64_print_hint_for_core (str);
18257 break;
18258 default:
18259 gcc_unreachable ();
18261 return false;
18264 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18266 static poly_uint16
18267 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18269 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18270 on big-endian targets, so we would need to forbid subregs that convert
18271 from one to the other. By default a reinterpret sequence would then
18272 involve a store to memory in one mode and a load back in the other.
18273 Even if we optimize that sequence using reverse instructions,
18274 it would still be a significant potential overhead.
18276 For now, it seems better to generate length-agnostic code for that
18277 case instead. */
18278 if (value == SVE_SCALABLE
18279 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18280 return poly_uint16 (2, 2);
18281 else
18282 return (int) value / 64;
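/* Worked example: -msve-vector-bits=256 gives VALUE == 256, so the VG
   (number of 64-bit granules) is 256 / 64 = 4.  -msve-vector-bits=scalable,
   and 128-bit SVE on big-endian targets, instead yield the length-agnostic
   poly_uint16 (2, 2).  */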
18285 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18286 aarch64_isa_flags accordingly. */
18288 void
18289 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18291 aarch64_set_asm_isa_flags (&global_options, flags);
18294 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18295 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18296 tuning structs. In particular it must set selected_tune and
18297 aarch64_asm_isa_flags that define the available ISA features and tuning
18298 decisions. It must also set selected_arch as this will be used to
18299 output the .arch asm tags for each function. */
18301 static void
18302 aarch64_override_options (void)
18304 aarch64_feature_flags cpu_isa = 0;
18305 aarch64_feature_flags arch_isa = 0;
18306 aarch64_set_asm_isa_flags (0);
18308 const struct processor *cpu = NULL;
18309 const struct processor *arch = NULL;
18310 const struct processor *tune = NULL;
18312 if (aarch64_harden_sls_string)
18313 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18315 if (aarch64_branch_protection_string)
18316 aarch_validate_mbranch_protection (aarch64_branch_protection_string);
18318 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18319 If either of -march or -mtune is given, they override their
18320 respective component of -mcpu. */
18321 if (aarch64_cpu_string)
18322 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18324 if (aarch64_arch_string)
18325 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18327 if (aarch64_tune_string)
18328 aarch64_validate_mtune (aarch64_tune_string, &tune);
18330 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18331 SUBTARGET_OVERRIDE_OPTIONS;
18332 #endif
18334 if (cpu && arch)
18336 /* If both -mcpu and -march are specified, warn if they are not
18337 architecturally compatible and prefer the -march ISA flags. */
18338 if (arch->arch != cpu->arch)
18340 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18341 aarch64_cpu_string,
18342 aarch64_arch_string);
18345 selected_arch = arch->arch;
18346 aarch64_set_asm_isa_flags (arch_isa);
18348 else if (cpu)
18350 selected_arch = cpu->arch;
18351 aarch64_set_asm_isa_flags (cpu_isa);
18353 else if (arch)
18355 cpu = &all_cores[arch->ident];
18356 selected_arch = arch->arch;
18357 aarch64_set_asm_isa_flags (arch_isa);
18359 else
18361 /* No -mcpu or -march specified, so use the default CPU. */
18362 cpu = &all_cores[TARGET_CPU_DEFAULT];
18363 selected_arch = cpu->arch;
18364 aarch64_set_asm_isa_flags (cpu->flags);
18367 selected_tune = tune ? tune->ident : cpu->ident;
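  /* As an illustration of the precedence rules above: with
     "-mcpu=neoverse-n1 -march=armv8.6-a" the ISA flags and selected_arch
     come from -march (after the conflict warning, since neoverse-n1 is an
     Armv8.2-A based core), while selected_tune still comes from the -mcpu
     core because no -mtune was given.  */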
18369 if (aarch_enable_bti == 2)
18371 #ifdef TARGET_ENABLE_BTI
18372 aarch_enable_bti = 1;
18373 #else
18374 aarch_enable_bti = 0;
18375 #endif
18378 /* Return address signing is currently not supported for ILP32 targets. For
18379 LP64 targets use the configured option in the absence of a command-line
18380 option for -mbranch-protection. */
18381 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18383 #ifdef TARGET_ENABLE_PAC_RET
18384 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18385 #else
18386 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18387 #endif
18390 #ifndef HAVE_AS_MABI_OPTION
18391 /* The compiler may have been configured with 2.23.* binutils, which does
18392 not have support for ILP32. */
18393 if (TARGET_ILP32)
18394 error ("assembler does not support %<-mabi=ilp32%>");
18395 #endif
18397 /* Convert -msve-vector-bits to a VG count. */
18398 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18400 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
18401 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18403 /* The pass to insert speculation tracking runs before
18404 shrink-wrapping and the latter does not know how to update the
18405 tracking status. So disable it in this case. */
18406 if (aarch64_track_speculation)
18407 flag_shrink_wrap = 0;
18409 aarch64_override_options_internal (&global_options);
18411 /* Save these options as the default ones in case we push and pop them later
18412 while processing functions with potential target attributes. */
18413 target_option_default_node = target_option_current_node
18414 = build_target_option_node (&global_options, &global_options_set);
18417 /* Implement targetm.override_options_after_change. */
18419 static void
18420 aarch64_override_options_after_change (void)
18422 aarch64_override_options_after_change_1 (&global_options);
18425 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18426 static char *
18427 aarch64_offload_options (void)
18429 if (TARGET_ILP32)
18430 return xstrdup ("-foffload-abi=ilp32");
18431 else
18432 return xstrdup ("-foffload-abi=lp64");
18435 static struct machine_function *
18436 aarch64_init_machine_status (void)
18438 struct machine_function *machine;
18439 machine = ggc_cleared_alloc<machine_function> ();
18440 return machine;
18443 void
18444 aarch64_init_expanders (void)
18446 init_machine_status = aarch64_init_machine_status;
18449 /* Initialize the code model from OPTS, adjusting it for PIC and diagnosing combinations that are not supported.  */
18450 static void
18451 initialize_aarch64_code_model (struct gcc_options *opts)
18453 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18454 switch (opts->x_aarch64_cmodel_var)
18456 case AARCH64_CMODEL_TINY:
18457 if (opts->x_flag_pic)
18458 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18459 break;
18460 case AARCH64_CMODEL_SMALL:
18461 if (opts->x_flag_pic)
18463 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18464 aarch64_cmodel = (flag_pic == 2
18465 ? AARCH64_CMODEL_SMALL_PIC
18466 : AARCH64_CMODEL_SMALL_SPIC);
18467 #else
18468 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18469 #endif
18471 break;
18472 case AARCH64_CMODEL_LARGE:
18473 if (opts->x_flag_pic)
18474 sorry ("code model %qs with %<-f%s%>", "large",
18475 opts->x_flag_pic > 1 ? "PIC" : "pic");
18476 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18477 sorry ("code model %qs not supported in ilp32 mode", "large");
18478 break;
18479 case AARCH64_CMODEL_TINY_PIC:
18480 case AARCH64_CMODEL_SMALL_PIC:
18481 case AARCH64_CMODEL_SMALL_SPIC:
18482 gcc_unreachable ();
18486 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18487 using the information saved in PTR. */
18489 static void
18490 aarch64_option_restore (struct gcc_options *opts,
18491 struct gcc_options * /* opts_set */,
18492 struct cl_target_option * /* ptr */)
18494 aarch64_override_options_internal (opts);
18497 /* Implement TARGET_OPTION_PRINT. */
18499 static void
18500 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18502 const struct processor *cpu
18503 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18504 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18505 std::string extension
18506 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
18507 arch->flags);
18509 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18510 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18511 arch->name, extension.c_str ());
18514 static GTY(()) tree aarch64_previous_fndecl;
18516 void
18517 aarch64_reset_previous_fndecl (void)
18519 aarch64_previous_fndecl = NULL;
18522 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18523 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18524 make sure optab availability predicates are recomputed when necessary. */
18526 void
18527 aarch64_save_restore_target_globals (tree new_tree)
18529 if (TREE_TARGET_GLOBALS (new_tree))
18530 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18531 else if (new_tree == target_option_default_node)
18532 restore_target_globals (&default_target_globals);
18533 else
18534 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18537 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18538 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18539 of the function, if such exists. This function may be called multiple
18540 times on a single function so use aarch64_previous_fndecl to avoid
18541 setting up identical state. */
18543 static void
18544 aarch64_set_current_function (tree fndecl)
18546 if (!fndecl || fndecl == aarch64_previous_fndecl)
18547 return;
18549 tree old_tree = (aarch64_previous_fndecl
18550 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18551 : NULL_TREE);
18553 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18555 /* If current function has no attributes but the previous one did,
18556 use the default node. */
18557 if (!new_tree && old_tree)
18558 new_tree = target_option_default_node;
18560 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18561 the default have been handled by aarch64_save_restore_target_globals from
18562 aarch64_pragma_target_parse. */
18563 if (old_tree == new_tree)
18564 return;
18566 aarch64_previous_fndecl = fndecl;
18568 /* First set the target options. */
18569 cl_target_option_restore (&global_options, &global_options_set,
18570 TREE_TARGET_OPTION (new_tree));
18572 aarch64_save_restore_target_globals (new_tree);
18575 /* Enum describing the various ways we can handle attributes.
18576 In many cases we can reuse the generic option handling machinery. */
18578 enum aarch64_attr_opt_type
18580 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18581 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18582 aarch64_attr_enum, /* Attribute sets an enum variable. */
18583 aarch64_attr_custom /* Attribute requires a custom handling function. */
18586 /* All the information needed to handle a target attribute.
18587 NAME is the name of the attribute.
18588 ATTR_TYPE specifies the type of behavior of the attribute as described
18589 in the definition of enum aarch64_attr_opt_type.
18590 ALLOW_NEG is true if the attribute supports a "no-" form.
18591    HANDLER is the function that takes the attribute string as an argument.
18592    It is needed only when ATTR_TYPE is aarch64_attr_custom.
18593 OPT_NUM is the enum specifying the option that the attribute modifies.
18594 This is needed for attributes that mirror the behavior of a command-line
18595    option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18596 aarch64_attr_enum. */
18598 struct aarch64_attribute_info
18600 const char *name;
18601 enum aarch64_attr_opt_type attr_type;
18602 bool allow_neg;
18603 bool (*handler) (const char *);
18604 enum opt_code opt_num;
18607 /* Handle the ARCH_STR argument to the arch= target attribute. */
18609 static bool
18610 aarch64_handle_attr_arch (const char *str)
18612 const struct processor *tmp_arch = NULL;
18613 std::string invalid_extension;
18614 aarch64_feature_flags tmp_flags;
18615 enum aarch_parse_opt_result parse_res
18616 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
18618 if (parse_res == AARCH_PARSE_OK)
18620 gcc_assert (tmp_arch);
18621 selected_arch = tmp_arch->arch;
18622 aarch64_set_asm_isa_flags (tmp_flags);
18623 return true;
18626 switch (parse_res)
18628 case AARCH_PARSE_MISSING_ARG:
18629 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18630 break;
18631 case AARCH_PARSE_INVALID_ARG:
18632 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18633 aarch64_print_hint_for_arch (str);
18634 break;
18635 case AARCH_PARSE_INVALID_FEATURE:
18636 error ("invalid feature modifier %s of value %qs in "
18637 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18638 aarch64_print_hint_for_extensions (invalid_extension);
18639 break;
18640 default:
18641 gcc_unreachable ();
18644 return false;
18647 /* Handle the argument CPU_STR to the cpu= target attribute. */
18649 static bool
18650 aarch64_handle_attr_cpu (const char *str)
18652 const struct processor *tmp_cpu = NULL;
18653 std::string invalid_extension;
18654 aarch64_feature_flags tmp_flags;
18655 enum aarch_parse_opt_result parse_res
18656 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
18658 if (parse_res == AARCH_PARSE_OK)
18660 gcc_assert (tmp_cpu);
18661 selected_tune = tmp_cpu->ident;
18662 selected_arch = tmp_cpu->arch;
18663 aarch64_set_asm_isa_flags (tmp_flags);
18664 return true;
18667 switch (parse_res)
18669 case AARCH_PARSE_MISSING_ARG:
18670 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18671 break;
18672 case AARCH_PARSE_INVALID_ARG:
18673 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18674 aarch64_print_hint_for_core (str);
18675 break;
18676 case AARCH_PARSE_INVALID_FEATURE:
18677 error ("invalid feature modifier %qs of value %qs in "
18678 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18679 aarch64_print_hint_for_extensions (invalid_extension);
18680 break;
18681 default:
18682 gcc_unreachable ();
18685 return false;
18688 /* Handle the argument STR to the branch-protection= attribute. */
18690 static bool
18691 aarch64_handle_attr_branch_protection (const char* str)
18693 char *err_str = (char *) xmalloc (strlen (str) + 1);
18694 enum aarch_parse_opt_result res = aarch_parse_branch_protection (str,
18695 &err_str);
18696 bool success = false;
18697 switch (res)
18699 case AARCH_PARSE_MISSING_ARG:
18700 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18701 " attribute");
18702 break;
18703 case AARCH_PARSE_INVALID_ARG:
18704 error ("invalid protection type %qs in %<target(\"branch-protection"
18705 "=\")%> pragma or attribute", err_str);
18706 break;
18707 case AARCH_PARSE_OK:
18708 success = true;
18709 /* Fall through. */
18710 case AARCH_PARSE_INVALID_FEATURE:
18711 break;
18712 default:
18713 gcc_unreachable ();
18715 free (err_str);
18716 return success;
18719 /* Handle the argument STR to the tune= target attribute. */
18721 static bool
18722 aarch64_handle_attr_tune (const char *str)
18724 const struct processor *tmp_tune = NULL;
18725 enum aarch_parse_opt_result parse_res
18726 = aarch64_parse_tune (str, &tmp_tune);
18728 if (parse_res == AARCH_PARSE_OK)
18730 gcc_assert (tmp_tune);
18731 selected_tune = tmp_tune->ident;
18732 return true;
18735 switch (parse_res)
18737 case AARCH_PARSE_INVALID_ARG:
18738 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18739 aarch64_print_hint_for_core (str);
18740 break;
18741 default:
18742 gcc_unreachable ();
18745 return false;
18748 /* Parse an architecture extension target attribute string specified in STR,
18749    for example "+fp+nosimd".  Show any errors if needed.  Return TRUE
18750    if successful.  Update aarch64_asm_isa_flags to reflect the ISA features
18751    modified.  */
18753 static bool
18754 aarch64_handle_attr_isa_flags (char *str)
18756 enum aarch_parse_opt_result parse_res;
18757 auto isa_flags = aarch64_asm_isa_flags;
18759 /* We allow "+nothing" in the beginning to clear out all architectural
18760 features if the user wants to handpick specific features. */
18761 if (strncmp ("+nothing", str, 8) == 0)
18763 isa_flags = 0;
18764 str += 8;
18767 std::string invalid_extension;
18768 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18770 if (parse_res == AARCH_PARSE_OK)
18772 aarch64_set_asm_isa_flags (isa_flags);
18773 return true;
18776 switch (parse_res)
18778 case AARCH_PARSE_MISSING_ARG:
18779 error ("missing value in %<target()%> pragma or attribute");
18780 break;
18782 case AARCH_PARSE_INVALID_FEATURE:
18783 error ("invalid feature modifier %qs of value %qs in "
18784 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18785 break;
18787 default:
18788 gcc_unreachable ();
18791 return false;
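/* For example, the attribute string "+nothing+fp" first clears every
   architectural feature and then re-enables just "fp" (plus anything "fp"
   itself implies), whereas plain "+fp" adds "fp" on top of the current
   aarch64_asm_isa_flags.  */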
18794 /* The target attributes that we support. On top of these we also support just
18795 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18796 handled explicitly in aarch64_process_one_target_attr. */
18798 static const struct aarch64_attribute_info aarch64_attributes[] =
18800 { "general-regs-only", aarch64_attr_mask, false, NULL,
18801 OPT_mgeneral_regs_only },
18802 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18803 OPT_mfix_cortex_a53_835769 },
18804 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18805 OPT_mfix_cortex_a53_843419 },
18806 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18807 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18808 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18809 OPT_momit_leaf_frame_pointer },
18810 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18811 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18812 OPT_march_ },
18813 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18814 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18815 OPT_mtune_ },
18816 { "branch-protection", aarch64_attr_custom, false,
18817 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18818 { "sign-return-address", aarch64_attr_enum, false, NULL,
18819 OPT_msign_return_address_ },
18820 { "outline-atomics", aarch64_attr_bool, true, NULL,
18821 OPT_moutline_atomics},
18822 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
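/* Illustrative use of the table above (hypothetical user code, not part of
   this file):

     __attribute__ ((target ("arch=armv8.2-a+crypto,no-omit-leaf-frame-pointer")))
     int foo (int x);

   "arch=..." is routed to aarch64_handle_attr_arch, while
   "no-omit-leaf-frame-pointer" uses the generic aarch64_attr_bool handling
   with its "no-" prefix.  */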
18825 /* Parse ARG_STR which contains the definition of one target attribute.
18826 Show appropriate errors if any or return true if the attribute is valid. */
18828 static bool
18829 aarch64_process_one_target_attr (char *arg_str)
18831 bool invert = false;
18833 size_t len = strlen (arg_str);
18835 if (len == 0)
18837 error ("malformed %<target()%> pragma or attribute");
18838 return false;
18841 char *str_to_check = (char *) alloca (len + 1);
18842 strcpy (str_to_check, arg_str);
18844 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18845 It is easier to detect and handle it explicitly here rather than going
18846 through the machinery for the rest of the target attributes in this
18847 function. */
18848 if (*str_to_check == '+')
18849 return aarch64_handle_attr_isa_flags (str_to_check);
18851 if (len > 3 && startswith (str_to_check, "no-"))
18853 invert = true;
18854 str_to_check += 3;
18856 char *arg = strchr (str_to_check, '=');
18858 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18859 and point ARG to "foo". */
18860 if (arg)
18862 *arg = '\0';
18863 arg++;
18865 const struct aarch64_attribute_info *p_attr;
18866 bool found = false;
18867 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18869 /* If the names don't match up, or the user has given an argument
18870 to an attribute that doesn't accept one, or didn't give an argument
18871 to an attribute that expects one, fail to match. */
18872 if (strcmp (str_to_check, p_attr->name) != 0)
18873 continue;
18875 found = true;
18876 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18877 || p_attr->attr_type == aarch64_attr_enum;
18879 if (attr_need_arg_p ^ (arg != NULL))
18881 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18882 return false;
18885 /* If the name matches but the attribute does not allow "no-" versions
18886 then we can't match. */
18887 if (invert && !p_attr->allow_neg)
18889 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18890 return false;
18893 switch (p_attr->attr_type)
18895 /* Has a custom handler registered.
18896 For example, cpu=, arch=, tune=. */
18897 case aarch64_attr_custom:
18898 gcc_assert (p_attr->handler);
18899 if (!p_attr->handler (arg))
18900 return false;
18901 break;
18903 /* Either set or unset a boolean option. */
18904 case aarch64_attr_bool:
18906 struct cl_decoded_option decoded;
18908 generate_option (p_attr->opt_num, NULL, !invert,
18909 CL_TARGET, &decoded);
18910 aarch64_handle_option (&global_options, &global_options_set,
18911 &decoded, input_location);
18912 break;
18914 /* Set or unset a bit in the target_flags. aarch64_handle_option
18915 should know what mask to apply given the option number. */
18916 case aarch64_attr_mask:
18918 struct cl_decoded_option decoded;
18919 /* We only need to specify the option number.
18920 aarch64_handle_option will know which mask to apply. */
18921 decoded.opt_index = p_attr->opt_num;
18922 decoded.value = !invert;
18923 aarch64_handle_option (&global_options, &global_options_set,
18924 &decoded, input_location);
18925 break;
18927 /* Use the option setting machinery to set an option to an enum. */
18928 case aarch64_attr_enum:
18930 gcc_assert (arg);
18931 bool valid;
18932 int value;
18933 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
18934 &value, CL_TARGET);
18935 if (valid)
18937 set_option (&global_options, NULL, p_attr->opt_num, value,
18938 NULL, DK_UNSPECIFIED, input_location,
18939 global_dc);
18941 else
18943 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
18945 break;
18947 default:
18948 gcc_unreachable ();
18952 /* If we reached here we either have found an attribute and validated
18953 it or didn't match any. If we matched an attribute but its arguments
18954 were malformed we will have returned false already. */
18955 return found;
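/* For example, processing the single attribute "no-strict-align" sets
   INVERT, strips the "no-" prefix, matches the aarch64_attr_mask entry for
   "strict-align" (which allows negation) and hands the option off to
   aarch64_handle_option with value 0, clearing the corresponding bit in
   target_flags.  */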
18958 /* Count how many times the character C appears in
18959 NULL-terminated string STR. */
18961 static unsigned int
18962 num_occurences_in_str (char c, char *str)
18964 unsigned int res = 0;
18965 while (*str != '\0')
18967 if (*str == c)
18968 res++;
18970 str++;
18973 return res;
18976 /* Parse the tree in ARGS that contains the target attribute information
18977 and update the global target options space. */
18979 bool
18980 aarch64_process_target_attr (tree args)
18982 if (TREE_CODE (args) == TREE_LIST)
18986 tree head = TREE_VALUE (args);
18987 if (head)
18989 if (!aarch64_process_target_attr (head))
18990 return false;
18992 args = TREE_CHAIN (args);
18993 } while (args);
18995 return true;
18998 if (TREE_CODE (args) != STRING_CST)
19000 error ("attribute %<target%> argument not a string");
19001 return false;
19004 size_t len = strlen (TREE_STRING_POINTER (args));
19005 char *str_to_check = (char *) alloca (len + 1);
19006 strcpy (str_to_check, TREE_STRING_POINTER (args));
19008 if (len == 0)
19010 error ("malformed %<target()%> pragma or attribute");
19011 return false;
19014   /* Used to catch empty strings between commas, e.g.
19015      attribute ((target ("attr1,,attr2"))).  */
19016 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19018 /* Handle multiple target attributes separated by ','. */
19019 char *token = strtok_r (str_to_check, ",", &str_to_check);
19021 unsigned int num_attrs = 0;
19022 while (token)
19024 num_attrs++;
19025 if (!aarch64_process_one_target_attr (token))
19027 /* Check if token is possibly an arch extension without
19028 leading '+'. */
19029 aarch64_feature_flags isa_temp = 0;
19030 auto with_plus = std::string ("+") + token;
19031 enum aarch_parse_opt_result ext_res
19032 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19034 if (ext_res == AARCH_PARSE_OK)
19035 error ("arch extension %<%s%> should be prefixed by %<+%>",
19036 token);
19037 else
19038 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19039 return false;
19042 token = strtok_r (NULL, ",", &str_to_check);
19045 if (num_attrs != num_commas + 1)
19047 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19048 return false;
19051 return true;
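/* Worked example: for attribute ((target ("arch=armv8-a,,tune=cortex-a53")))
   strtok_r produces only two tokens because it skips the empty string, but
   NUM_COMMAS is 2, so NUM_ATTRS (2) != NUM_COMMAS + 1 (3) and the
   "malformed" error above is reported.  */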
19054 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19055 process attribute ((target ("..."))). */
19057 static bool
19058 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19060 struct cl_target_option cur_target;
19061 bool ret;
19062 tree old_optimize;
19063 tree new_target, new_optimize;
19064 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19066 /* If what we're processing is the current pragma string then the
19067 target option node is already stored in target_option_current_node
19068 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19069 having to re-parse the string. This is especially useful to keep
19070 arm_neon.h compile times down since that header contains a lot
19071 of intrinsics enclosed in pragmas. */
19072 if (!existing_target && args == current_target_pragma)
19074 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19075 return true;
19077 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19079 old_optimize
19080 = build_optimization_node (&global_options, &global_options_set);
19081 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19083 /* If the function changed the optimization levels as well as setting
19084 target options, start with the optimizations specified. */
19085 if (func_optimize && func_optimize != old_optimize)
19086 cl_optimization_restore (&global_options, &global_options_set,
19087 TREE_OPTIMIZATION (func_optimize));
19089 /* Save the current target options to restore at the end. */
19090 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19092 /* If fndecl already has some target attributes applied to it, unpack
19093 them so that we add this attribute on top of them, rather than
19094 overwriting them. */
19095 if (existing_target)
19097 struct cl_target_option *existing_options
19098 = TREE_TARGET_OPTION (existing_target);
19100 if (existing_options)
19101 cl_target_option_restore (&global_options, &global_options_set,
19102 existing_options);
19104 else
19105 cl_target_option_restore (&global_options, &global_options_set,
19106 TREE_TARGET_OPTION (target_option_current_node));
19108 ret = aarch64_process_target_attr (args);
19110 /* Set up any additional state. */
19111 if (ret)
19113 aarch64_override_options_internal (&global_options);
19114 new_target = build_target_option_node (&global_options,
19115 &global_options_set);
19117 else
19118 new_target = NULL;
19120 new_optimize = build_optimization_node (&global_options,
19121 &global_options_set);
19123 if (fndecl && ret)
19125 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19127 if (old_optimize != new_optimize)
19128 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19131 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19133 if (old_optimize != new_optimize)
19134 cl_optimization_restore (&global_options, &global_options_set,
19135 TREE_OPTIMIZATION (old_optimize));
19136 return ret;
19139 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
19140 tri-bool options (yes, no, don't care) and the default value is
19141    DEF, return true if inlining the callee is acceptable.  */
19143 static bool
19144 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
19145 int dont_care, int def)
19147 /* If the callee doesn't care, always allow inlining. */
19148 if (callee == dont_care)
19149 return true;
19151 /* If the caller doesn't care, always allow inlining. */
19152 if (caller == dont_care)
19153 return true;
19155 /* Otherwise, allow inlining if either the callee and caller values
19156 agree, or if the callee is using the default value. */
19157 return (callee == caller || callee == def);
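/* For instance, if the caller was built with -mfix-cortex-a53-835769
   (value 1), the callee with -mno-fix-cortex-a53-835769 (value 0) and the
   default DEF happens to be 1, none of the three checks succeed, so this
   returns false and aarch64_can_inline_p rejects the inlining.  */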
19160 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
19161 to inline CALLEE into CALLER based on target-specific info.
19162 Make sure that the caller and callee have compatible architectural
19163 features. Then go through the other possible target attributes
19164 and see if they can block inlining. Try not to reject always_inline
19165 callees unless they are incompatible architecturally. */
19167 static bool
19168 aarch64_can_inline_p (tree caller, tree callee)
19170 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
19171 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
19173 struct cl_target_option *caller_opts
19174 = TREE_TARGET_OPTION (caller_tree ? caller_tree
19175 : target_option_default_node);
19177 struct cl_target_option *callee_opts
19178 = TREE_TARGET_OPTION (callee_tree ? callee_tree
19179 : target_option_default_node);
19181 /* Callee's ISA flags should be a subset of the caller's. */
19182 if ((caller_opts->x_aarch64_asm_isa_flags
19183 & callee_opts->x_aarch64_asm_isa_flags)
19184 != callee_opts->x_aarch64_asm_isa_flags)
19185 return false;
19186 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
19187 != callee_opts->x_aarch64_isa_flags)
19188 return false;
19190 /* Allow non-strict aligned functions inlining into strict
19191 aligned ones. */
19192 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
19193 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
19194 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
19195 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
19196 return false;
19198 bool always_inline = lookup_attribute ("always_inline",
19199 DECL_ATTRIBUTES (callee));
19201 /* If the architectural features match up and the callee is always_inline
19202 then the other attributes don't matter. */
19203 if (always_inline)
19204 return true;
19206 if (caller_opts->x_aarch64_cmodel_var
19207 != callee_opts->x_aarch64_cmodel_var)
19208 return false;
19210 if (caller_opts->x_aarch64_tls_dialect
19211 != callee_opts->x_aarch64_tls_dialect)
19212 return false;
19214 /* Honour explicit requests to workaround errata. */
19215 if (!aarch64_tribools_ok_for_inlining_p (
19216 caller_opts->x_aarch64_fix_a53_err835769,
19217 callee_opts->x_aarch64_fix_a53_err835769,
19218 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19219 return false;
19221 if (!aarch64_tribools_ok_for_inlining_p (
19222 caller_opts->x_aarch64_fix_a53_err843419,
19223 callee_opts->x_aarch64_fix_a53_err843419,
19224 2, TARGET_FIX_ERR_A53_843419))
19225 return false;
19227 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19228      caller and callee and they don't match up, reject inlining.  */
19229 if (!aarch64_tribools_ok_for_inlining_p (
19230 caller_opts->x_flag_omit_leaf_frame_pointer,
19231 callee_opts->x_flag_omit_leaf_frame_pointer,
19232 2, 1))
19233 return false;
19235 /* If the callee has specific tuning overrides, respect them. */
19236 if (callee_opts->x_aarch64_override_tune_string != NULL
19237 && caller_opts->x_aarch64_override_tune_string == NULL)
19238 return false;
19240 /* If the user specified tuning override strings for the
19241 caller and callee and they don't match up, reject inlining.
19242      We just do a string compare here; we don't analyze the meaning
19243 of the string, as it would be too costly for little gain. */
19244 if (callee_opts->x_aarch64_override_tune_string
19245 && caller_opts->x_aarch64_override_tune_string
19246 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19247 caller_opts->x_aarch64_override_tune_string) != 0))
19248 return false;
19250 return true;
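/* For example, a callee declared with __attribute__ ((target ("+sve")))
   cannot be inlined into a caller compiled without SVE, because the
   callee's ISA flags are not a subset of the caller's; inlining a plain
   callee into an SVE-enabled caller is fine as far as this check is
   concerned.  */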
19253 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
19254    hasn't been initialized already.  */
19256 unsigned int
19257 aarch64_tlsdesc_abi_id ()
19259 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19260 if (!tlsdesc_abi.initialized_p ())
19262 HARD_REG_SET full_reg_clobbers;
19263 CLEAR_HARD_REG_SET (full_reg_clobbers);
19264 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19265 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19266 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19267 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19268 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19270 return tlsdesc_abi.id ();
19273 /* Return true if SYMBOL_REF X binds locally. */
19275 static bool
19276 aarch64_symbol_binds_local_p (const_rtx x)
19278 return (SYMBOL_REF_DECL (x)
19279 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19280 : SYMBOL_REF_LOCAL_P (x));
19283 /* Return true if SYMBOL_REF X is thread-local.  */
19284 static bool
19285 aarch64_tls_symbol_p (rtx x)
19287 if (! TARGET_HAVE_TLS)
19288 return false;
19290 x = strip_salt (x);
19291 if (!SYMBOL_REF_P (x))
19292 return false;
19294 return SYMBOL_REF_TLS_MODEL (x) != 0;
19297 /* Classify a TLS symbol into one of the TLS kinds. */
19298 enum aarch64_symbol_type
19299 aarch64_classify_tls_symbol (rtx x)
19301 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19303 switch (tls_kind)
19305 case TLS_MODEL_GLOBAL_DYNAMIC:
19306 case TLS_MODEL_LOCAL_DYNAMIC:
19307 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19309 case TLS_MODEL_INITIAL_EXEC:
19310 switch (aarch64_cmodel)
19312 case AARCH64_CMODEL_TINY:
19313 case AARCH64_CMODEL_TINY_PIC:
19314 return SYMBOL_TINY_TLSIE;
19315 default:
19316 return SYMBOL_SMALL_TLSIE;
19319 case TLS_MODEL_LOCAL_EXEC:
19320 if (aarch64_tls_size == 12)
19321 return SYMBOL_TLSLE12;
19322 else if (aarch64_tls_size == 24)
19323 return SYMBOL_TLSLE24;
19324 else if (aarch64_tls_size == 32)
19325 return SYMBOL_TLSLE32;
19326 else if (aarch64_tls_size == 48)
19327 return SYMBOL_TLSLE48;
19328 else
19329 gcc_unreachable ();
19331 case TLS_MODEL_EMULATED:
19332 case TLS_MODEL_NONE:
19333 return SYMBOL_FORCE_TO_MEM;
19335 default:
19336 gcc_unreachable ();
19340 /* Return the correct method for accessing X + OFFSET, where X is either
19341 a SYMBOL_REF or LABEL_REF. */
19343 enum aarch64_symbol_type
19344 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19346 x = strip_salt (x);
19348 if (LABEL_REF_P (x))
19350 switch (aarch64_cmodel)
19352 case AARCH64_CMODEL_LARGE:
19353 return SYMBOL_FORCE_TO_MEM;
19355 case AARCH64_CMODEL_TINY_PIC:
19356 case AARCH64_CMODEL_TINY:
19357 return SYMBOL_TINY_ABSOLUTE;
19359 case AARCH64_CMODEL_SMALL_SPIC:
19360 case AARCH64_CMODEL_SMALL_PIC:
19361 case AARCH64_CMODEL_SMALL:
19362 return SYMBOL_SMALL_ABSOLUTE;
19364 default:
19365 gcc_unreachable ();
19369 if (SYMBOL_REF_P (x))
19371 if (aarch64_tls_symbol_p (x))
19372 return aarch64_classify_tls_symbol (x);
19374 switch (aarch64_cmodel)
19376 case AARCH64_CMODEL_TINY_PIC:
19377 case AARCH64_CMODEL_TINY:
19378 /* With -fPIC non-local symbols use the GOT. For orthogonality
19379 always use the GOT for extern weak symbols. */
19380 if ((flag_pic || SYMBOL_REF_WEAK (x))
19381 && !aarch64_symbol_binds_local_p (x))
19382 return SYMBOL_TINY_GOT;
19384 /* When we retrieve symbol + offset address, we have to make sure
19385 the offset does not cause overflow of the final address. But
19386 we have no way of knowing the address of symbol at compile time
19387 so we can't accurately say if the distance between the PC and
19388 	 symbol + offset is outside the addressable range of +/-1MB in the
19389 TINY code model. So we limit the maximum offset to +/-64KB and
19390 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19391 If offset_within_block_p is true we allow larger offsets. */
19392 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19393 || offset_within_block_p (x, offset)))
19394 return SYMBOL_FORCE_TO_MEM;
19396 return SYMBOL_TINY_ABSOLUTE;
19399 case AARCH64_CMODEL_SMALL_SPIC:
19400 case AARCH64_CMODEL_SMALL_PIC:
19401 case AARCH64_CMODEL_SMALL:
19402 if ((flag_pic || SYMBOL_REF_WEAK (x))
19403 && !aarch64_symbol_binds_local_p (x))
19404 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19405 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19407 /* Same reasoning as the tiny code model, but the offset cap here is
19408 1MB, allowing +/-3.9GB for the offset to the symbol. */
19409 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19410 || offset_within_block_p (x, offset)))
19411 return SYMBOL_FORCE_TO_MEM;
19413 return SYMBOL_SMALL_ABSOLUTE;
19415 case AARCH64_CMODEL_LARGE:
19416 /* This is alright even in PIC code as the constant
19417 pool reference is always PC relative and within
19418 the same translation unit. */
19419 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19420 return SYMBOL_SMALL_ABSOLUTE;
19421 else
19422 return SYMBOL_FORCE_TO_MEM;
19424 default:
19425 gcc_unreachable ();
19429 /* By default push everything into the constant pool. */
19430 return SYMBOL_FORCE_TO_MEM;
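/* Worked example for the small code model: for a non-TLS symbol that binds
   locally, a reference to sym + 0x800 stays SYMBOL_SMALL_ABSOLUTE, but
   sym + 0x200000 (2 MB) is outside the +/-1MB offset cap and, unless the
   offset is known to stay within sym's own block, is classified as
   SYMBOL_FORCE_TO_MEM.  */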
19433 bool
19434 aarch64_constant_address_p (rtx x)
19436 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19439 bool
19440 aarch64_legitimate_pic_operand_p (rtx x)
19442 poly_int64 offset;
19443 x = strip_offset_and_salt (x, &offset);
19444 if (SYMBOL_REF_P (x))
19445 return false;
19447 return true;
19450 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19451 that should be rematerialized rather than spilled. */
19453 static bool
19454 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19456 /* Support CSE and rematerialization of common constants. */
19457 if (CONST_INT_P (x)
19458 || CONST_DOUBLE_P (x))
19459 return true;
19461 /* Only accept variable-length vector constants if they can be
19462 handled directly.
19464 ??? It would be possible (but complex) to handle rematerialization
19465 of other constants via secondary reloads. */
19466 if (!GET_MODE_SIZE (mode).is_constant ())
19467 return aarch64_simd_valid_immediate (x, NULL);
19469 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19470 least be forced to memory and loaded from there. */
19471 if (CONST_VECTOR_P (x))
19472 return !targetm.cannot_force_const_mem (mode, x);
19474 /* Do not allow vector struct mode constants for Advanced SIMD.
19475 We could support 0 and -1 easily, but they need support in
19476 aarch64-simd.md. */
19477 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19478 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19479 return false;
19481 if (GET_CODE (x) == HIGH)
19482 x = XEXP (x, 0);
19484 /* Accept polynomial constants that can be calculated by using the
19485 destination of a move as the sole temporary. Constants that
19486 require a second temporary cannot be rematerialized (they can't be
19487 forced to memory and also aren't legitimate constants). */
19488 poly_int64 offset;
19489 if (poly_int_rtx_p (x, &offset))
19490 return aarch64_offset_temporaries (false, offset) <= 1;
19492 /* If an offset is being added to something else, we need to allow the
19493 base to be moved into the destination register, meaning that there
19494 are no free temporaries for the offset. */
19495 x = strip_offset_and_salt (x, &offset);
19496 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19497 return false;
19499 /* Do not allow const (plus (anchor_symbol, const_int)). */
19500 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19501 return false;
19503 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19504 so spilling them is better than rematerialization. */
19505 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19506 return true;
19508 /* Label references are always constant. */
19509 if (LABEL_REF_P (x))
19510 return true;
19512 return false;
19516 aarch64_load_tp (rtx target)
19518 if (!target
19519 || GET_MODE (target) != Pmode
19520 || !register_operand (target, Pmode))
19521 target = gen_reg_rtx (Pmode);
19523 /* Can return in any reg. */
19524 emit_insn (gen_aarch64_load_tp_hard (target));
19525 return target;
19528 /* On AAPCS systems, this is the "struct __va_list". */
19529 static GTY(()) tree va_list_type;
19531 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19532 Return the type to use as __builtin_va_list.
19534 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19536 struct __va_list
19538 void *__stack;
19539 void *__gr_top;
19540 void *__vr_top;
19541 int __gr_offs;
19542 int __vr_offs;
19543 }; */
19545 static tree
19546 aarch64_build_builtin_va_list (void)
19548 tree va_list_name;
19549 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19551 /* Create the type. */
19552 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19553 /* Give it the required name. */
19554 va_list_name = build_decl (BUILTINS_LOCATION,
19555 TYPE_DECL,
19556 get_identifier ("__va_list"),
19557 va_list_type);
19558 DECL_ARTIFICIAL (va_list_name) = 1;
19559 TYPE_NAME (va_list_type) = va_list_name;
19560 TYPE_STUB_DECL (va_list_type) = va_list_name;
19562 /* Create the fields. */
19563 f_stack = build_decl (BUILTINS_LOCATION,
19564 FIELD_DECL, get_identifier ("__stack"),
19565 ptr_type_node);
19566 f_grtop = build_decl (BUILTINS_LOCATION,
19567 FIELD_DECL, get_identifier ("__gr_top"),
19568 ptr_type_node);
19569 f_vrtop = build_decl (BUILTINS_LOCATION,
19570 FIELD_DECL, get_identifier ("__vr_top"),
19571 ptr_type_node);
19572 f_groff = build_decl (BUILTINS_LOCATION,
19573 FIELD_DECL, get_identifier ("__gr_offs"),
19574 integer_type_node);
19575 f_vroff = build_decl (BUILTINS_LOCATION,
19576 FIELD_DECL, get_identifier ("__vr_offs"),
19577 integer_type_node);
19579 /* Tell tree-stdarg pass about our internal offset fields.
19580      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19581      purposes, to identify whether the code is updating the va_list internal
19582      offset fields in an irregular way.  */
19583 va_list_gpr_counter_field = f_groff;
19584 va_list_fpr_counter_field = f_vroff;
19586 DECL_ARTIFICIAL (f_stack) = 1;
19587 DECL_ARTIFICIAL (f_grtop) = 1;
19588 DECL_ARTIFICIAL (f_vrtop) = 1;
19589 DECL_ARTIFICIAL (f_groff) = 1;
19590 DECL_ARTIFICIAL (f_vroff) = 1;
19592 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19593 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19594 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19595 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19596 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19598 TYPE_FIELDS (va_list_type) = f_stack;
19599 DECL_CHAIN (f_stack) = f_grtop;
19600 DECL_CHAIN (f_grtop) = f_vrtop;
19601 DECL_CHAIN (f_vrtop) = f_groff;
19602 DECL_CHAIN (f_groff) = f_vroff;
19604 /* Compute its layout. */
19605 layout_type (va_list_type);
19607 return va_list_type;
19610 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19611 static void
19612 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19614 const CUMULATIVE_ARGS *cum;
19615 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19616 tree stack, grtop, vrtop, groff, vroff;
19617 tree t;
19618 int gr_save_area_size = cfun->va_list_gpr_size;
19619 int vr_save_area_size = cfun->va_list_fpr_size;
19620 int vr_offset;
19622 cum = &crtl->args.info;
19623 if (cfun->va_list_gpr_size)
19624 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19625 cfun->va_list_gpr_size);
19626 if (cfun->va_list_fpr_size)
19627 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19628 * UNITS_PER_VREG, cfun->va_list_fpr_size);
19630 if (!TARGET_FLOAT)
19632 gcc_assert (cum->aapcs_nvrn == 0);
19633 vr_save_area_size = 0;
19636 f_stack = TYPE_FIELDS (va_list_type_node);
19637 f_grtop = DECL_CHAIN (f_stack);
19638 f_vrtop = DECL_CHAIN (f_grtop);
19639 f_groff = DECL_CHAIN (f_vrtop);
19640 f_vroff = DECL_CHAIN (f_groff);
19642 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19643 NULL_TREE);
19644 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19645 NULL_TREE);
19646 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19647 NULL_TREE);
19648 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19649 NULL_TREE);
19650 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19651 NULL_TREE);
19653 /* Emit code to initialize STACK, which points to the next varargs stack
19654 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19655 by named arguments. STACK is 8-byte aligned. */
19656 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19657 if (cum->aapcs_stack_size > 0)
19658 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19659 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19660 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19662 /* Emit code to initialize GRTOP, the top of the GR save area.
19663 virtual_incoming_args_rtx should have been 16 byte aligned. */
19664 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19665 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19666 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19668 /* Emit code to initialize VRTOP, the top of the VR save area.
19669 This address is gr_save_area_bytes below GRTOP, rounded
19670 down to the next 16-byte boundary. */
19671 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19672 vr_offset = ROUND_UP (gr_save_area_size,
19673 STACK_BOUNDARY / BITS_PER_UNIT);
19675 if (vr_offset)
19676 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19677 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19678 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19680 /* Emit code to initialize GROFF, the offset from GRTOP of the
19681 next GPR argument. */
19682 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19683 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19684 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19686   /* Likewise emit code to initialize VROFF, the offset from VRTOP
19687 of the next VR argument. */
19688 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19689 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19690 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
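/* Worked example (illustrative, assuming TARGET_FLOAT and the default
   full-size save areas): for "int f (int n, ...)" one GPR is used by the
   named argument, so __gr_offs is initialized to -(8 - 1) * 8 = -56 and,
   with no named FP arguments, __vr_offs to -8 * 16 = -128; __vr_top then
   sits ROUND_UP (56, 16) = 64 bytes below __gr_top.  */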
19693 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19695 static tree
19696 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19697 gimple_seq *post_p ATTRIBUTE_UNUSED)
19699 tree addr;
19700 bool indirect_p;
19701 bool is_ha; /* is HFA or HVA. */
19702 bool dw_align; /* double-word align. */
19703 machine_mode ag_mode = VOIDmode;
19704 int nregs;
19705 machine_mode mode;
19707 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19708 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19709 HOST_WIDE_INT size, rsize, adjust, align;
19710 tree t, u, cond1, cond2;
19712 indirect_p = pass_va_arg_by_reference (type);
19713 if (indirect_p)
19714 type = build_pointer_type (type);
19716 mode = TYPE_MODE (type);
19718 f_stack = TYPE_FIELDS (va_list_type_node);
19719 f_grtop = DECL_CHAIN (f_stack);
19720 f_vrtop = DECL_CHAIN (f_grtop);
19721 f_groff = DECL_CHAIN (f_vrtop);
19722 f_vroff = DECL_CHAIN (f_groff);
19724 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19725 f_stack, NULL_TREE);
19726 size = int_size_in_bytes (type);
19728 unsigned int abi_break;
19729 unsigned int abi_break_packed;
19730 align
19731 = aarch64_function_arg_alignment (mode, type, &abi_break, &abi_break_packed)
19732 / BITS_PER_UNIT;
19734 dw_align = false;
19735 adjust = 0;
19736 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19737 &is_ha, false))
19739 /* No frontends can create types with variable-sized modes, so we
19740 shouldn't be asked to pass or return them. */
19741 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19743 /* TYPE passed in fp/simd registers. */
19744 if (!TARGET_FLOAT)
19745 aarch64_err_no_fpadvsimd (mode);
19747 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19748 unshare_expr (valist), f_vrtop, NULL_TREE);
19749 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19750 unshare_expr (valist), f_vroff, NULL_TREE);
19752 rsize = nregs * UNITS_PER_VREG;
19754 if (is_ha)
19756 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19757 adjust = UNITS_PER_VREG - ag_size;
19759 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19760 && size < UNITS_PER_VREG)
19762 adjust = UNITS_PER_VREG - size;
19765 else
19767 /* TYPE passed in general registers. */
19768 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19769 unshare_expr (valist), f_grtop, NULL_TREE);
19770 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19771 unshare_expr (valist), f_groff, NULL_TREE);
19772 rsize = ROUND_UP (size, UNITS_PER_WORD);
19773 nregs = rsize / UNITS_PER_WORD;
19775 if (align <= 8 && abi_break_packed && warn_psabi)
19776 inform (input_location, "parameter passing for argument of type "
19777 "%qT changed in GCC 13.1", type);
19779 if (align > 8)
19781 if (abi_break && warn_psabi)
19782 inform (input_location, "parameter passing for argument of type "
19783 "%qT changed in GCC 9.1", type);
19784 dw_align = true;
19787 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19788 && size < UNITS_PER_WORD)
19790 adjust = UNITS_PER_WORD - size;
19794 /* Get a local temporary for the field value. */
19795 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19797 /* Emit code to branch if off >= 0. */
19798 t = build2 (GE_EXPR, boolean_type_node, off,
19799 build_int_cst (TREE_TYPE (off), 0));
19800 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19802 if (dw_align)
19804 /* Emit: offs = (offs + 15) & -16. */
19805 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19806 build_int_cst (TREE_TYPE (off), 15));
19807 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19808 build_int_cst (TREE_TYPE (off), -16));
19809 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19811 else
19812 roundup = NULL;
19814 /* Update ap.__[g|v]r_offs */
19815 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19816 build_int_cst (TREE_TYPE (off), rsize));
19817 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19819 /* String up. */
19820 if (roundup)
19821 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19823 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19824 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19825 build_int_cst (TREE_TYPE (f_off), 0));
19826 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19828 /* String up: make sure the assignment happens before the use. */
19829 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19830 COND_EXPR_ELSE (cond1) = t;
19832 /* Prepare the trees handling the argument that is passed on the stack;
19833 the top-level node will be stored in ON_STACK. */
19834 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19835 if (align > 8)
19837 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19838 t = fold_build_pointer_plus_hwi (arg, 15);
19839 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19840 build_int_cst (TREE_TYPE (t), -16));
19841 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19843 else
19844 roundup = NULL;
19845 /* Advance ap.__stack */
19846 t = fold_build_pointer_plus_hwi (arg, size + 7);
19847 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19848 build_int_cst (TREE_TYPE (t), -8));
19849 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19850 /* String up roundup and advance. */
19851 if (roundup)
19852 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19853 /* String up with arg */
19854 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19855 /* Big-endianness related address adjustment. */
19856 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19857 && size < UNITS_PER_WORD)
19859 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19860 size_int (UNITS_PER_WORD - size));
19861 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19864 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19865 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19867 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19868 t = off;
19869 if (adjust)
19870 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19871 build_int_cst (TREE_TYPE (off), adjust));
19873 t = fold_convert (sizetype, t);
19874 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19876 if (is_ha)
19878 /* type ha; // treat as "struct {ftype field[n];}"
19879 ... [computing offs]
19880 for (i = 0; i < nregs; ++i, offs += 16)
19881 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19882 return ha; */
19883 int i;
19884 tree tmp_ha, field_t, field_ptr_t;
19886 /* Declare a local variable. */
19887 tmp_ha = create_tmp_var_raw (type, "ha");
19888 gimple_add_tmp_var (tmp_ha);
19890 /* Establish the base type. */
19891 switch (ag_mode)
19893 case E_SFmode:
19894 field_t = float_type_node;
19895 field_ptr_t = float_ptr_type_node;
19896 break;
19897 case E_DFmode:
19898 field_t = double_type_node;
19899 field_ptr_t = double_ptr_type_node;
19900 break;
19901 case E_TFmode:
19902 field_t = long_double_type_node;
19903 field_ptr_t = long_double_ptr_type_node;
19904 break;
19905 case E_SDmode:
19906 field_t = dfloat32_type_node;
19907 field_ptr_t = build_pointer_type (dfloat32_type_node);
19908 break;
19909 case E_DDmode:
19910 field_t = dfloat64_type_node;
19911 field_ptr_t = build_pointer_type (dfloat64_type_node);
19912 break;
19913 case E_TDmode:
19914 field_t = dfloat128_type_node;
19915 field_ptr_t = build_pointer_type (dfloat128_type_node);
19916 break;
19917 case E_HFmode:
19918 field_t = aarch64_fp16_type_node;
19919 field_ptr_t = aarch64_fp16_ptr_type_node;
19920 break;
19921 case E_BFmode:
19922 field_t = bfloat16_type_node;
19923 field_ptr_t = aarch64_bf16_ptr_type_node;
19924 break;
19925 case E_V2SImode:
19926 case E_V4SImode:
19928 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
19929 field_t = build_vector_type_for_mode (innertype, ag_mode);
19930 field_ptr_t = build_pointer_type (field_t);
19932 break;
19933 default:
19934 gcc_assert (0);
19937 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
19938 TREE_ADDRESSABLE (tmp_ha) = 1;
19939 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
19940 addr = t;
19941 t = fold_convert (field_ptr_t, addr);
19942 t = build2 (MODIFY_EXPR, field_t,
19943 build1 (INDIRECT_REF, field_t, tmp_ha),
19944 build1 (INDIRECT_REF, field_t, t));
19946 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
19947 for (i = 1; i < nregs; ++i)
19949 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
19950 u = fold_convert (field_ptr_t, addr);
19951 u = build2 (MODIFY_EXPR, field_t,
19952 build2 (MEM_REF, field_t, tmp_ha,
19953 build_int_cst (field_ptr_t,
19954 (i *
19955 int_size_in_bytes (field_t)))),
19956 build1 (INDIRECT_REF, field_t, u));
19957 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
19960 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
19961 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
19964 COND_EXPR_ELSE (cond2) = t;
19965 addr = fold_convert (build_pointer_type (type), cond1);
19966 addr = build_va_arg_indirect_ref (addr);
19968 if (indirect_p)
19969 addr = build_va_arg_indirect_ref (addr);
19971 return addr;
19974 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
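/* The hook below dumps the unnamed argument registers that are still
   available to a variadic function into a register save area just
   below virtual_incoming_args_rtx.  As an illustration, if two GP
   argument registers (x6, x7) and one FP/SIMD register (q7) remain
   unnamed, the save area is laid out (higher addresses at the top) as:

	+-------------------+ <-- virtual_incoming_args_rtx (__gr_top)
	| x7                |
	| x6                |   GP save area, 8 bytes per register
	+-------------------+ <-- 16-byte aligned boundary (__vr_top)
	| q7                |   FP/SIMD save area, 16 bytes per register
	+-------------------+

   __gr_top and __vr_top in the va_list then point at the top of the
   respective areas.  */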
19976 static void
19977 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
19978 const function_arg_info &arg,
19979 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
19981 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
19982 CUMULATIVE_ARGS local_cum;
19983 int gr_saved = cfun->va_list_gpr_size;
19984 int vr_saved = cfun->va_list_fpr_size;
19986 /* The caller has advanced CUM up to, but not beyond, the last named
19987 argument. Advance a local copy of CUM past the last "real" named
19988 argument, to find out how many registers are left over. */
19989 local_cum = *cum;
19990 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
19991 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
19993 /* Find out how many registers we need to save.
19994 Honor tree-stdarg analysis results. */
19995 if (cfun->va_list_gpr_size)
19996 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
19997 cfun->va_list_gpr_size / UNITS_PER_WORD);
19998 if (cfun->va_list_fpr_size)
19999 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
20000 cfun->va_list_fpr_size / UNITS_PER_VREG);
20002 if (!TARGET_FLOAT)
20004 gcc_assert (local_cum.aapcs_nvrn == 0);
20005 vr_saved = 0;
20008 if (!no_rtl)
20010 if (gr_saved > 0)
20012 rtx ptr, mem;
20014 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
20015 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
20016 - gr_saved * UNITS_PER_WORD);
20017 mem = gen_frame_mem (BLKmode, ptr);
20018 set_mem_alias_set (mem, get_varargs_alias_set ());
20020 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
20021 mem, gr_saved);
20023 if (vr_saved > 0)
20025 /* We can't use move_block_from_reg, because it will use
20026 the wrong mode, storing D regs only. */
20027 machine_mode mode = TImode;
20028 int off, i, vr_start;
20030 /* Set OFF to the offset from virtual_incoming_args_rtx of
20031 the first vector register. The VR save area lies below
20032 the GR one, and is aligned to 16 bytes. */
20033 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
20034 STACK_BOUNDARY / BITS_PER_UNIT);
20035 off -= vr_saved * UNITS_PER_VREG;
20037 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
20038 for (i = 0; i < vr_saved; ++i)
20040 rtx ptr, mem;
20042 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
20043 mem = gen_frame_mem (mode, ptr);
20044 set_mem_alias_set (mem, get_varargs_alias_set ());
20045 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
20046 off += UNITS_PER_VREG;
20051 /* We don't save the size into *PRETEND_SIZE because we want to avoid
20052 any complication of having crtl->args.pretend_args_size changed. */
20053 cfun->machine->frame.saved_varargs_size
20054 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
20055 STACK_BOUNDARY / BITS_PER_UNIT)
20056 + vr_saved * UNITS_PER_VREG);
20059 static void
20060 aarch64_conditional_register_usage (void)
20062 int i;
20063 if (!TARGET_FLOAT)
20065 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
20067 fixed_regs[i] = 1;
20068 call_used_regs[i] = 1;
20069 CLEAR_HARD_REG_BIT (operand_reg_set, i);
20072 if (!TARGET_SVE)
20073 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
20075 fixed_regs[i] = 1;
20076 call_used_regs[i] = 1;
20079 /* Only allow the FFR and FFRT to be accessed via special patterns. */
20080 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
20081 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
20083 /* When tracking speculation, we need a couple of call-clobbered registers
20084 to track the speculation state. It would be nice to just use
20085 IP0 and IP1, but currently there are numerous places that just
20086 assume these registers are free for other uses (e.g. pointer
20087 authentication). */
20088 if (aarch64_track_speculation)
20090 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
20091 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
20092 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20093 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20097 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
20099 bool
20100 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
20102 /* For records we're passed a FIELD_DECL, for arrays we're passed
20103 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
20104 const_tree type = TREE_TYPE (field_or_array);
20106 /* Assign BLKmode to anything that contains multiple SVE predicates.
20107 For structures, the "multiple" case is indicated by MODE being
20108 VOIDmode. */
20109 unsigned int num_zr, num_pr;
20110 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
20112 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
20113 return !simple_cst_equal (TYPE_SIZE (field_or_array),
20114 TYPE_SIZE (type));
20115 return mode == VOIDmode;
20118 return default_member_type_forces_blk (field_or_array, mode);
20121 /* Bitmasks that indicate whether earlier versions of GCC would have
20122 taken a different path through the ABI logic. This should result in
20123 a -Wpsabi warning if the earlier path led to a different ABI decision.
20125 WARN_PSABI_EMPTY_CXX17_BASE
20126 Indicates that the type includes an artificial empty C++17 base field
20127 that, prior to GCC 10.1, would prevent the type from being treated as
20128 a HFA or HVA. See PR94383 for details.
20130 WARN_PSABI_NO_UNIQUE_ADDRESS
20131 Indicates that the type includes an empty [[no_unique_address]] field
20132 that, prior to GCC 10.1, would prevent the type from being treated as
20133 a HFA or HVA. */
20134 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
20135 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
20136 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
20138 /* Walk down the type tree of TYPE counting consecutive base elements.
20139 If *MODEP is VOIDmode, then set it to the first valid floating point
20140 type. If a non-floating point type is found, or if a floating point
20141 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
20142 otherwise return the count in the sub-tree.
20144 The WARN_PSABI_FLAGS argument allows the caller to check whether this
20145 function has changed its behavior relative to earlier versions of GCC.
20146 Normally the argument should be nonnull and point to a zero-initialized
20147 variable. The function then records whether the ABI decision might
20148 be affected by a known fix to the ABI logic, setting the associated
20149 WARN_PSABI_* bits if so.
20151 When the argument is instead a null pointer, the function tries to
20152 simulate the behavior of GCC before all such ABI fixes were made.
20153 This is useful to check whether the function returns something
20154 different after the ABI fixes. */
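/* For example, given

     struct quat { float w, x, y, z; };

   the walk finds four consecutive SFmode elements, so the function
   returns 4 with *MODEP set to SFmode and the type is treated as a
   homogeneous floating-point aggregate.  A structure that mixes a
   float with a double, or that contains any non-floating-point
   member, makes the function return -1.  */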
20155 static int
20156 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
20157 unsigned int *warn_psabi_flags)
20159 machine_mode mode;
20160 HOST_WIDE_INT size;
20162 if (aarch64_sve::builtin_type_p (type))
20163 return -1;
20165 switch (TREE_CODE (type))
20167 case REAL_TYPE:
20168 mode = TYPE_MODE (type);
20169 if (mode != DFmode && mode != SFmode
20170 && mode != TFmode && mode != HFmode
20171 && mode != SDmode && mode != DDmode && mode != TDmode)
20172 return -1;
20174 if (*modep == VOIDmode)
20175 *modep = mode;
20177 if (*modep == mode)
20178 return 1;
20180 break;
20182 case COMPLEX_TYPE:
20183 mode = TYPE_MODE (TREE_TYPE (type));
20184 if (mode != DFmode && mode != SFmode
20185 && mode != TFmode && mode != HFmode)
20186 return -1;
20188 if (*modep == VOIDmode)
20189 *modep = mode;
20191 if (*modep == mode)
20192 return 2;
20194 break;
20196 case VECTOR_TYPE:
20197 /* Use V2SImode and V4SImode as representatives of all 64-bit
20198 and 128-bit vector types. */
20199 size = int_size_in_bytes (type);
20200 switch (size)
20202 case 8:
20203 mode = V2SImode;
20204 break;
20205 case 16:
20206 mode = V4SImode;
20207 break;
20208 default:
20209 return -1;
20212 if (*modep == VOIDmode)
20213 *modep = mode;
20215 /* Vector modes are considered to be opaque: two vectors are
20216 equivalent for the purposes of being homogeneous aggregates
20217 if they are the same size. */
20218 if (*modep == mode)
20219 return 1;
20221 break;
20223 case ARRAY_TYPE:
20225 int count;
20226 tree index = TYPE_DOMAIN (type);
20228 /* Can't handle incomplete types nor sizes that are not
20229 fixed. */
20230 if (!COMPLETE_TYPE_P (type)
20231 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20232 return -1;
20234 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20235 warn_psabi_flags);
20236 if (count == -1
20237 || !index
20238 || !TYPE_MAX_VALUE (index)
20239 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20240 || !TYPE_MIN_VALUE (index)
20241 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20242 || count < 0)
20243 return -1;
20245 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20246 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20248 /* There must be no padding. */
20249 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20250 count * GET_MODE_BITSIZE (*modep)))
20251 return -1;
20253 return count;
20256 case RECORD_TYPE:
20258 int count = 0;
20259 int sub_count;
20260 tree field;
20262 /* Can't handle incomplete types nor sizes that are not
20263 fixed. */
20264 if (!COMPLETE_TYPE_P (type)
20265 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20266 return -1;
20268 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20270 if (TREE_CODE (field) != FIELD_DECL)
20271 continue;
20273 if (DECL_FIELD_ABI_IGNORED (field))
20275 /* See whether this is something that earlier versions of
20276 GCC failed to ignore. */
20277 unsigned int flag;
20278 if (lookup_attribute ("no_unique_address",
20279 DECL_ATTRIBUTES (field)))
20280 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20281 else if (cxx17_empty_base_field_p (field))
20282 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20283 else
20284 /* No compatibility problem. */
20285 continue;
20287 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20288 if (warn_psabi_flags)
20290 *warn_psabi_flags |= flag;
20291 continue;
20294 /* A zero-width bitfield may affect layout in some
20295 circumstances, but adds no members. The determination
20296 of whether or not a type is an HFA is performed after
20297 layout is complete, so if the type still looks like an
20298 HFA afterwards, it is still classed as one. This is
20299 potentially an ABI break for the hard-float ABI. */
20300 else if (DECL_BIT_FIELD (field)
20301 && integer_zerop (DECL_SIZE (field)))
20303 /* Prior to GCC-12 these fields were stripped early,
20304 hiding them from the back-end entirely and
20305 resulting in the correct behaviour for argument
20306 passing. Simulate that old behaviour without
20307 generating a warning. */
20308 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20309 continue;
20310 if (warn_psabi_flags)
20312 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20313 continue;
20317 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20318 warn_psabi_flags);
20319 if (sub_count < 0)
20320 return -1;
20321 count += sub_count;
20324 /* There must be no padding. */
20325 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20326 count * GET_MODE_BITSIZE (*modep)))
20327 return -1;
20329 return count;
20332 case UNION_TYPE:
20333 case QUAL_UNION_TYPE:
20335 /* These aren't very interesting except in a degenerate case. */
20336 int count = 0;
20337 int sub_count;
20338 tree field;
20340 /* Can't handle incomplete types nor sizes that are not
20341 fixed. */
20342 if (!COMPLETE_TYPE_P (type)
20343 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20344 return -1;
20346 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20348 if (TREE_CODE (field) != FIELD_DECL)
20349 continue;
20351 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20352 warn_psabi_flags);
20353 if (sub_count < 0)
20354 return -1;
20355 count = count > sub_count ? count : sub_count;
20358 /* There must be no padding. */
20359 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20360 count * GET_MODE_BITSIZE (*modep)))
20361 return -1;
20363 return count;
20366 default:
20367 break;
20370 return -1;
20373 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20374 type as described in AAPCS64 \S 4.1.2.
20376 See the comment above aarch64_composite_type_p for the notes on MODE. */
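/* For example, the arm_neon.h types int32x2_t (V2SImode, 8 bytes) and
   float32x4_t (V4SFmode, 16 bytes) are short vectors in this sense,
   whereas SVE vector types such as svint32_t are not.  */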
20378 static bool
20379 aarch64_short_vector_p (const_tree type,
20380 machine_mode mode)
20382 poly_int64 size = -1;
20384 if (type && TREE_CODE (type) == VECTOR_TYPE)
20386 if (aarch64_sve::builtin_type_p (type))
20387 return false;
20388 size = int_size_in_bytes (type);
20390 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20391 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20393 /* The containing "else if" is too loose: it means that we look at TYPE
20394 if the type is a vector type (good), but that we otherwise ignore TYPE
20395 and look only at the mode. This is wrong because the type describes
20396 the language-level information whereas the mode is purely an internal
20397 GCC concept. We can therefore reach here for types that are not
20398 vectors in the AAPCS64 sense.
20400 We can't "fix" that for the traditional Advanced SIMD vector modes
20401 without breaking backwards compatibility. However, there's no such
20402 baggage for the structure modes, which were introduced in GCC 12. */
20403 if (aarch64_advsimd_struct_mode_p (mode))
20404 return false;
20406 /* For similar reasons, rely only on the type, not the mode, when
20407 processing SVE types. */
20408 if (type && aarch64_some_values_include_pst_objects_p (type))
20409 /* Leave later code to report an error if SVE is disabled. */
20410 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20411 else
20412 size = GET_MODE_SIZE (mode);
20414 if (known_eq (size, 8) || known_eq (size, 16))
20416 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20417 they are being treated as scalable AAPCS64 types. */
20418 gcc_assert (!aarch64_sve_mode_p (mode)
20419 && !aarch64_advsimd_struct_mode_p (mode));
20420 return true;
20422 return false;
20425 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20426 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20427 array types. The C99 floating-point complex types are also considered
20428 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20429 types, which are GCC extensions and out of the scope of AAPCS64, are
20430 treated as composite types here as well.
20432 Note that MODE itself is not sufficient in determining whether a type
20433 is such a composite type or not. This is because
20434 stor-layout.cc:compute_record_mode may have already changed the MODE
20435 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20436 structure with only one field may have its MODE set to the mode of the
20437 field. Also an integer mode whose size matches the size of the
20438 RECORD_TYPE type may be used to substitute the original mode
20439 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20440 solely relied on. */
20442 static bool
20443 aarch64_composite_type_p (const_tree type,
20444 machine_mode mode)
20446 if (aarch64_short_vector_p (type, mode))
20447 return false;
20449 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20450 return true;
20452 if (mode == BLKmode
20453 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20454 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20455 return true;
20457 return false;
20460 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20461 shall be passed or returned in simd/fp register(s) (providing these
20462 parameter passing registers are available).
20464 Upon successful return, *COUNT returns the number of needed registers,
20465 *BASE_MODE returns the mode of the individual register and when IS_HA
20466 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20467 floating-point aggregate or a homogeneous short-vector aggregate.
20469 SILENT_P is true if the function should refrain from reporting any
20470 diagnostics. This should only be used if the caller is certain that
20471 any ABI decisions would eventually come through this function with
20472 SILENT_P set to false. */
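/* For example:
     - float and double arguments are trivially candidates, with
       *COUNT == 1;
     - _Complex double is passed in two D registers, so *COUNT == 2,
       *BASE_MODE == DFmode and *IS_HA is set;
     - struct { double d[3]; } is a homogeneous floating-point
       aggregate: *COUNT == 3, *BASE_MODE == DFmode, *IS_HA is set;
     - struct { double d[5]; } exceeds HA_MAX_NUM_FLDS (4) and is not
       a candidate.  */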
20474 static bool
20475 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20476 const_tree type,
20477 machine_mode *base_mode,
20478 int *count,
20479 bool *is_ha,
20480 bool silent_p)
20482 if (is_ha != NULL) *is_ha = false;
20484 machine_mode new_mode = VOIDmode;
20485 bool composite_p = aarch64_composite_type_p (type, mode);
20487 if ((!composite_p
20488 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20489 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
20490 || aarch64_short_vector_p (type, mode))
20492 *count = 1;
20493 new_mode = mode;
20495 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20497 if (is_ha != NULL) *is_ha = true;
20498 *count = 2;
20499 new_mode = GET_MODE_INNER (mode);
20501 else if (type && composite_p)
20503 unsigned int warn_psabi_flags = 0;
20504 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20505 &warn_psabi_flags);
20506 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20508 static unsigned last_reported_type_uid;
20509 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20510 int alt;
20511 if (!silent_p
20512 && warn_psabi
20513 && warn_psabi_flags
20514 && uid != last_reported_type_uid
20515 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20516 != ag_count))
20518 const char *url10
20519 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20520 const char *url12
20521 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20522 gcc_assert (alt == -1);
20523 last_reported_type_uid = uid;
20524 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20525 qualification. */
20526 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20527 inform (input_location, "parameter passing for argument of "
20528 "type %qT with %<[[no_unique_address]]%> members "
20529 "changed %{in GCC 10.1%}",
20530 TYPE_MAIN_VARIANT (type), url10);
20531 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20532 inform (input_location, "parameter passing for argument of "
20533 "type %qT when C++17 is enabled changed to match "
20534 "C++14 %{in GCC 10.1%}",
20535 TYPE_MAIN_VARIANT (type), url10);
20536 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20537 inform (input_location, "parameter passing for argument of "
20538 "type %qT changed %{in GCC 12.1%}",
20539 TYPE_MAIN_VARIANT (type), url12);
20542 if (is_ha != NULL) *is_ha = true;
20543 *count = ag_count;
20545 else
20546 return false;
20548 else
20549 return false;
20551 gcc_assert (!aarch64_sve_mode_p (new_mode));
20552 *base_mode = new_mode;
20553 return true;
20556 /* Implement TARGET_STRUCT_VALUE_RTX. */
20558 static rtx
20559 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20560 int incoming ATTRIBUTE_UNUSED)
20562 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20565 /* Implements target hook vector_mode_supported_p. */
20566 static bool
20567 aarch64_vector_mode_supported_p (machine_mode mode)
20569 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20570 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20573 /* Return the full-width SVE vector mode for element mode MODE, if one
20574 exists. */
20575 opt_machine_mode
20576 aarch64_full_sve_mode (scalar_mode mode)
20578 switch (mode)
20580 case E_DFmode:
20581 return VNx2DFmode;
20582 case E_SFmode:
20583 return VNx4SFmode;
20584 case E_HFmode:
20585 return VNx8HFmode;
20586 case E_BFmode:
20587 return VNx8BFmode;
20588 case E_DImode:
20589 return VNx2DImode;
20590 case E_SImode:
20591 return VNx4SImode;
20592 case E_HImode:
20593 return VNx8HImode;
20594 case E_QImode:
20595 return VNx16QImode;
20596 default:
20597 return opt_machine_mode ();
20601 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20602 if it exists. */
20603 opt_machine_mode
20604 aarch64_vq_mode (scalar_mode mode)
20606 switch (mode)
20608 case E_DFmode:
20609 return V2DFmode;
20610 case E_SFmode:
20611 return V4SFmode;
20612 case E_HFmode:
20613 return V8HFmode;
20614 case E_BFmode:
20615 return V8BFmode;
20616 case E_SImode:
20617 return V4SImode;
20618 case E_HImode:
20619 return V8HImode;
20620 case E_QImode:
20621 return V16QImode;
20622 case E_DImode:
20623 return V2DImode;
20624 default:
20625 return opt_machine_mode ();
20629 /* Return appropriate SIMD container
20630 for MODE within a vector of WIDTH bits. */
20631 static machine_mode
20632 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20634 if (TARGET_SVE
20635 && maybe_ne (width, 128)
20636 && known_eq (width, BITS_PER_SVE_VECTOR))
20637 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20639 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20640 if (TARGET_SIMD)
20642 if (known_eq (width, 128))
20643 return aarch64_vq_mode (mode).else_mode (word_mode);
20644 else
20645 switch (mode)
20647 case E_SFmode:
20648 return V2SFmode;
20649 case E_HFmode:
20650 return V4HFmode;
20651 case E_BFmode:
20652 return V4BFmode;
20653 case E_SImode:
20654 return V2SImode;
20655 case E_HImode:
20656 return V4HImode;
20657 case E_QImode:
20658 return V8QImode;
20659 default:
20660 break;
20663 return word_mode;
20666 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20667 and return whether the SVE mode should be preferred over the
20668 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20669 static bool
20670 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20672 /* Take into account the aarch64-autovec-preference param if non-zero. */
20673 bool only_asimd_p = aarch64_autovec_preference == 1;
20674 bool only_sve_p = aarch64_autovec_preference == 2;
20676 if (only_asimd_p)
20677 return false;
20678 if (only_sve_p)
20679 return true;
20681 /* The preference in case of a tie in costs. */
20682 bool prefer_asimd = aarch64_autovec_preference == 3;
20683 bool prefer_sve = aarch64_autovec_preference == 4;
20685 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20686 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20687 /* If the CPU information does not have an SVE width registered, use the
20688 generic poly_int comparison that prefers SVE. If a preference is
20689 explicitly requested, avoid this path. */
20690 if (aarch64_tune_params.sve_width == SVE_SCALABLE
20691 && !prefer_asimd
20692 && !prefer_sve)
20693 return maybe_gt (nunits_sve, nunits_asimd);
20695 /* Otherwise estimate the runtime width of the modes involved. */
20696 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20697 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20699 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20700 is clearly wider. */
20701 if (prefer_sve)
20702 return est_sve >= est_asimd;
20703 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20704 is clearly wider. */
20705 if (prefer_asimd)
20706 return est_sve > est_asimd;
20708 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20709 return est_sve > est_asimd;
20712 /* Return 128-bit container as the preferred SIMD mode for MODE. */
20713 static machine_mode
20714 aarch64_preferred_simd_mode (scalar_mode mode)
20716 /* Take into account explicit auto-vectorization ISA preferences through
20717 aarch64_cmp_autovec_modes. */
20718 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20719 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20720 if (TARGET_SIMD)
20721 return aarch64_vq_mode (mode).else_mode (word_mode);
20722 return word_mode;
20725 /* Return a list of possible vector sizes for the vectorizer
20726 to iterate over. */
20727 static unsigned int
20728 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20730 static const machine_mode sve_modes[] = {
20731 /* Try using full vectors for all element types. */
20732 VNx16QImode,
20734 /* Try using 16-bit containers for 8-bit elements and full vectors
20735 for wider elements. */
20736 VNx8QImode,
20738 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20739 full vectors for wider elements. */
20740 VNx4QImode,
20742 /* Try using 64-bit containers for all element types. */
20743 VNx2QImode
20746 static const machine_mode advsimd_modes[] = {
20747 /* Try using 128-bit vectors for all element types. */
20748 V16QImode,
20750 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20751 for wider elements. */
20752 V8QImode,
20754 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20755 for wider elements.
20757 TODO: We could support a limited form of V4QImode too, so that
20758 we use 32-bit vectors for 8-bit elements. */
20759 V4HImode,
20761 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20762 for 64-bit elements.
20764 TODO: We could similarly support limited forms of V2QImode and V2HImode
20765 for this case. */
20766 V2SImode
20769 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20770 This is because:
20772 - If we can't use N-byte Advanced SIMD vectors then the placement
20773 doesn't matter; we'll just continue as though the Advanced SIMD
20774 entry didn't exist.
20776 - If an SVE main loop with N bytes ends up being cheaper than an
20777 Advanced SIMD main loop with N bytes then by default we'll replace
20778 the Advanced SIMD version with the SVE one.
20780 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20781 than an SVE main loop with N bytes then by default we'll try to
20782 use the SVE loop to vectorize the epilogue instead. */
20784 bool only_asimd_p = aarch64_autovec_preference == 1;
20785 bool only_sve_p = aarch64_autovec_preference == 2;
20787 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20788 unsigned int advsimd_i = 0;
20790 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20792 if (sve_i < ARRAY_SIZE (sve_modes)
20793 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20794 advsimd_modes[advsimd_i]))
20795 modes->safe_push (sve_modes[sve_i++]);
20796 else
20797 modes->safe_push (advsimd_modes[advsimd_i++]);
20799 while (sve_i < ARRAY_SIZE (sve_modes))
20800 modes->safe_push (sve_modes[sve_i++]);
20802 unsigned int flags = 0;
20803 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20804 can compare SVE against Advanced SIMD and so that we can compare
20805 multiple SVE vectorization approaches against each other. There's
20806 not really any point doing this for Advanced SIMD only, since the
20807 first mode that works should always be the best. */
20808 if (TARGET_SVE && aarch64_sve_compare_costs)
20809 flags |= VECT_COMPARE_COSTS;
20810 return flags;
20813 /* Implement TARGET_MANGLE_TYPE. */
20815 static const char *
20816 aarch64_mangle_type (const_tree type)
20818 /* The AArch64 ABI documents say that "__va_list" has to be
20819 mangled as if it is in the "std" namespace. */
20820 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20821 return "St9__va_list";
20823 /* Half-precision floating point types. */
20824 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20826 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
20827 return NULL;
20828 if (TYPE_MODE (type) == BFmode)
20829 return "u6__bf16";
20830 else
20831 return "Dh";
20834 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20835 builtin types. */
20836 if (TYPE_NAME (type) != NULL)
20838 const char *res;
20839 if ((res = aarch64_general_mangle_builtin_type (type))
20840 || (res = aarch64_sve::mangle_builtin_type (type)))
20841 return res;
20844 /* Use the default mangling. */
20845 return NULL;
20848 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20850 static bool
20851 aarch64_verify_type_context (location_t loc, type_context_kind context,
20852 const_tree type, bool silent_p)
20854 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20857 /* Find the first rtx_insn before insn that will generate an assembly
20858 instruction. */
20860 static rtx_insn *
20861 aarch64_prev_real_insn (rtx_insn *insn)
20863 if (!insn)
20864 return NULL;
20866 do
20868 insn = prev_real_insn (insn);
20870 while (insn && recog_memoized (insn) < 0);
20872 return insn;
20875 static bool
20876 is_madd_op (enum attr_type t1)
20878 unsigned int i;
20879 /* A number of these may be AArch32 only. */
20880 enum attr_type mlatypes[] = {
20881 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20882 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20883 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20886 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
20888 if (t1 == mlatypes[i])
20889 return true;
20892 return false;
20895 /* Check if there is a register dependency between a load and the insn
20896 for which we hold recog_data. */
20898 static bool
20899 dep_between_memop_and_curr (rtx memop)
20901 rtx load_reg;
20902 int opno;
20904 gcc_assert (GET_CODE (memop) == SET);
20906 if (!REG_P (SET_DEST (memop)))
20907 return false;
20909 load_reg = SET_DEST (memop);
20910 for (opno = 1; opno < recog_data.n_operands; opno++)
20912 rtx operand = recog_data.operand[opno];
20913 if (REG_P (operand)
20914 && reg_overlap_mentioned_p (load_reg, operand))
20915 return true;
20918 return false;
20922 /* When working around the Cortex-A53 erratum 835769,
20923 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
20924 instruction and has a preceding memory instruction such that a NOP
20925 should be inserted between them. */
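/* For example, when compiling with -mfix-cortex-a53-835769 a sequence
   that would otherwise be emitted as

	ldr	x1, [x2]
	madd	x3, x4, x5, x6

   is instead emitted as

	ldr	x1, [x2]
	nop	// between mem op and mult-accumulate
	madd	x3, x4, x5, x6

   because the load and the 64-bit multiply-accumulate would otherwise
   be adjacent with no register dependency between them.  */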
20927 bool
20928 aarch64_madd_needs_nop (rtx_insn* insn)
20930 enum attr_type attr_type;
20931 rtx_insn *prev;
20932 rtx body;
20934 if (!TARGET_FIX_ERR_A53_835769)
20935 return false;
20937 if (!INSN_P (insn) || recog_memoized (insn) < 0)
20938 return false;
20940 attr_type = get_attr_type (insn);
20941 if (!is_madd_op (attr_type))
20942 return false;
20944 prev = aarch64_prev_real_insn (insn);
20945 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
20946 Restore recog state to INSN to avoid state corruption. */
20947 extract_constrain_insn_cached (insn);
20949 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
20950 return false;
20952 body = single_set (prev);
20954 /* If the previous insn is a memory op and there is no dependency between
20955 it and the DImode madd, emit a NOP between them. If body is NULL then we
20956 have a complex memory operation, probably a load/store pair.
20957 Be conservative for now and emit a NOP. */
20958 if (GET_MODE (recog_data.operand[0]) == DImode
20959 && (!body || !dep_between_memop_and_curr (body)))
20960 return true;
20962 return false;
20967 /* Implement FINAL_PRESCAN_INSN. */
20969 void
20970 aarch64_final_prescan_insn (rtx_insn *insn)
20972 if (aarch64_madd_needs_nop (insn))
20973 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
20977 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
20978 instruction. */
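/* For example, INDEX Z0.S, #-16, #15 uses the extreme immediate values;
   a base or step outside [-16, 15] has to be moved into a scalar
   register first.  */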
20980 bool
20981 aarch64_sve_index_immediate_p (rtx base_or_step)
20983 return (CONST_INT_P (base_or_step)
20984 && IN_RANGE (INTVAL (base_or_step), -16, 15));
20987 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
20988 when applied to mode MODE. Negate X first if NEGATE_P is true. */
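/* The instructions accept an unsigned 8-bit immediate, optionally
   shifted left by 8, so after any negation the accepted values are
   0-255 and the multiples of 256 from 256 to 65280.  */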
20990 bool
20991 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
20993 rtx elt = unwrap_const_vec_duplicate (x);
20994 if (!CONST_INT_P (elt))
20995 return false;
20997 HOST_WIDE_INT val = INTVAL (elt);
20998 if (negate_p)
20999 val = -val;
21000 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
21002 if (val & 0xff)
21003 return IN_RANGE (val, 0, 0xff);
21004 return IN_RANGE (val, 0, 0xff00);
21007 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
21008 instructions when applied to mode MODE. Negate X first if NEGATE_P
21009 is true. */
21011 bool
21012 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
21014 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
21015 return false;
21017 /* After the optional negation, the immediate must be nonnegative.
21018 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
21019 instead of SQADD Zn.B, Zn.B, #129. */
21020 rtx elt = unwrap_const_vec_duplicate (x);
21021 return negate_p == (INTVAL (elt) < 0);
21024 /* Return true if X is a valid immediate operand for an SVE logical
21025 instruction such as AND. */
21027 bool
21028 aarch64_sve_bitmask_immediate_p (rtx x)
21030 rtx elt;
21032 return (const_vec_duplicate_p (x, &elt)
21033 && CONST_INT_P (elt)
21034 && aarch64_bitmask_imm (INTVAL (elt),
21035 GET_MODE_INNER (GET_MODE (x))));
21038 /* Return true if X is a valid immediate for the SVE DUP and CPY
21039 instructions. */
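/* DUP and CPY take a signed 8-bit immediate, optionally shifted left
   by 8, so the accepted values are -128 to 127 and the multiples of
   256 from -32768 to 32512.  */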
21041 bool
21042 aarch64_sve_dup_immediate_p (rtx x)
21044 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
21045 if (!CONST_INT_P (x))
21046 return false;
21048 HOST_WIDE_INT val = INTVAL (x);
21049 if (val & 0xff)
21050 return IN_RANGE (val, -0x80, 0x7f);
21051 return IN_RANGE (val, -0x8000, 0x7f00);
21054 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
21055 SIGNED_P says whether the operand is signed rather than unsigned. */
21057 bool
21058 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
21060 x = unwrap_const_vec_duplicate (x);
21061 return (CONST_INT_P (x)
21062 && (signed_p
21063 ? IN_RANGE (INTVAL (x), -16, 15)
21064 : IN_RANGE (INTVAL (x), 0, 127)));
21067 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21068 instruction. Negate X first if NEGATE_P is true. */
21070 bool
21071 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
21073 rtx elt;
21074 REAL_VALUE_TYPE r;
21076 if (!const_vec_duplicate_p (x, &elt)
21077 || !CONST_DOUBLE_P (elt))
21078 return false;
21080 r = *CONST_DOUBLE_REAL_VALUE (elt);
21082 if (negate_p)
21083 r = real_value_negate (&r);
21085 if (real_equal (&r, &dconst1))
21086 return true;
21087 if (real_equal (&r, &dconsthalf))
21088 return true;
21089 return false;
21092 /* Return true if X is a valid immediate operand for an SVE FMUL
21093 instruction. */
21095 bool
21096 aarch64_sve_float_mul_immediate_p (rtx x)
21098 rtx elt;
21100 return (const_vec_duplicate_p (x, &elt)
21101 && CONST_DOUBLE_P (elt)
21102 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
21103 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
21106 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
21107 for the Advanced SIMD operation described by WHICH and INSN. If INFO
21108 is nonnull, use it to describe valid immediates. */
21109 static bool
21110 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
21111 simd_immediate_info *info,
21112 enum simd_immediate_check which,
21113 simd_immediate_info::insn_type insn)
21115 /* Try a 4-byte immediate with LSL. */
21116 for (unsigned int shift = 0; shift < 32; shift += 8)
21117 if ((val32 & (0xff << shift)) == val32)
21119 if (info)
21120 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21121 simd_immediate_info::LSL, shift);
21122 return true;
21125 /* Try a 2-byte immediate with LSL. */
21126 unsigned int imm16 = val32 & 0xffff;
21127 if (imm16 == (val32 >> 16))
21128 for (unsigned int shift = 0; shift < 16; shift += 8)
21129 if ((imm16 & (0xff << shift)) == imm16)
21131 if (info)
21132 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
21133 simd_immediate_info::LSL, shift);
21134 return true;
21137 /* Try a 4-byte immediate with MSL, except for cases that MVN
21138 can handle. */
21139 if (which == AARCH64_CHECK_MOV)
21140 for (unsigned int shift = 8; shift < 24; shift += 8)
21142 unsigned int low = (1 << shift) - 1;
21143 if (((val32 & (0xff << shift)) | low) == val32)
21145 if (info)
21146 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21147 simd_immediate_info::MSL, shift);
21148 return true;
21152 return false;
21155 /* Return true if replicating VAL64 is a valid immediate for the
21156 Advanced SIMD operation described by WHICH. If INFO is nonnull,
21157 use it to describe valid immediates. */
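/* For example, 0x0000004300000043 is handled as a 32-bit MOVI of 0x43,
   0x4300430043004300 as a 16-bit MOVI of 0x43 shifted left by 8, and a
   value whose individual bytes are all either 0x00 or 0xff (such as
   0x00ff00ffff0000ff) as the 64-bit byte-mask form of MOVI.  */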
21158 static bool
21159 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
21160 simd_immediate_info *info,
21161 enum simd_immediate_check which)
21163 unsigned int val32 = val64 & 0xffffffff;
21164 unsigned int val16 = val64 & 0xffff;
21165 unsigned int val8 = val64 & 0xff;
21167 if (val32 == (val64 >> 32))
21169 if ((which & AARCH64_CHECK_ORR) != 0
21170 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
21171 simd_immediate_info::MOV))
21172 return true;
21174 if ((which & AARCH64_CHECK_BIC) != 0
21175 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
21176 simd_immediate_info::MVN))
21177 return true;
21179 /* Try using a replicated byte. */
21180 if (which == AARCH64_CHECK_MOV
21181 && val16 == (val32 >> 16)
21182 && val8 == (val16 >> 8))
21184 if (info)
21185 *info = simd_immediate_info (QImode, val8);
21186 return true;
21190 /* Try using a bit-to-bytemask. */
21191 if (which == AARCH64_CHECK_MOV)
21193 unsigned int i;
21194 for (i = 0; i < 64; i += 8)
21196 unsigned char byte = (val64 >> i) & 0xff;
21197 if (byte != 0 && byte != 0xff)
21198 break;
21200 if (i == 64)
21202 if (info)
21203 *info = simd_immediate_info (DImode, val64);
21204 return true;
21207 return false;
21210 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21211 instruction. If INFO is nonnull, use it to describe valid immediates. */
21213 static bool
21214 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21215 simd_immediate_info *info)
21217 scalar_int_mode mode = DImode;
21218 unsigned int val32 = val64 & 0xffffffff;
21219 if (val32 == (val64 >> 32))
21221 mode = SImode;
21222 unsigned int val16 = val32 & 0xffff;
21223 if (val16 == (val32 >> 16))
21225 mode = HImode;
21226 unsigned int val8 = val16 & 0xff;
21227 if (val8 == (val16 >> 8))
21228 mode = QImode;
21231 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21232 if (IN_RANGE (val, -0x80, 0x7f))
21234 /* DUP with no shift. */
21235 if (info)
21236 *info = simd_immediate_info (mode, val);
21237 return true;
21239 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21241 /* DUP with LSL #8. */
21242 if (info)
21243 *info = simd_immediate_info (mode, val);
21244 return true;
21246 if (aarch64_bitmask_imm (val64, mode))
21248 /* DUPM. */
21249 if (info)
21250 *info = simd_immediate_info (mode, val);
21251 return true;
21253 return false;
21256 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21258 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21260 where PATTERN is the svpattern as a CONST_INT and where ZERO
21261 is a zero constant of the required PTRUE mode (which can have
21262 fewer elements than X's mode, if zero bits are significant).
21264 If so, and if INFO is nonnull, describe the immediate in INFO. */
21265 bool
21266 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21268 if (GET_CODE (x) != CONST)
21269 return false;
21271 x = XEXP (x, 0);
21272 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21273 return false;
21275 if (info)
21277 aarch64_svpattern pattern
21278 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21279 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21280 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21281 *info = simd_immediate_info (int_mode, pattern);
21283 return true;
21286 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21287 it to describe valid immediates. */
21289 static bool
21290 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21292 if (aarch64_sve_ptrue_svpattern_p (x, info))
21293 return true;
21295 if (x == CONST0_RTX (GET_MODE (x)))
21297 if (info)
21298 *info = simd_immediate_info (DImode, 0);
21299 return true;
21302 /* Analyze the value as a VNx16BImode. This should be relatively
21303 efficient, since rtx_vector_builder has enough built-in capacity
21304 to store all VLA predicate constants without needing the heap. */
21305 rtx_vector_builder builder;
21306 if (!aarch64_get_sve_pred_bits (builder, x))
21307 return false;
21309 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21310 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21312 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21313 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21314 if (pattern != AARCH64_NUM_SVPATTERNS)
21316 if (info)
21318 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21319 *info = simd_immediate_info (int_mode, pattern);
21321 return true;
21324 return false;
21327 /* Return true if OP is a valid SIMD immediate for the operation
21328 described by WHICH. If INFO is nonnull, use it to describe valid
21329 immediates. */
21330 bool
21331 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21332 enum simd_immediate_check which)
21334 machine_mode mode = GET_MODE (op);
21335 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21336 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21337 return false;
21339 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
21340 return false;
21342 if (vec_flags & VEC_SVE_PRED)
21343 return aarch64_sve_pred_valid_immediate (op, info);
21345 scalar_mode elt_mode = GET_MODE_INNER (mode);
21346 rtx base, step;
21347 unsigned int n_elts;
21348 if (CONST_VECTOR_P (op)
21349 && CONST_VECTOR_DUPLICATE_P (op))
21350 n_elts = CONST_VECTOR_NPATTERNS (op);
21351 else if ((vec_flags & VEC_SVE_DATA)
21352 && const_vec_series_p (op, &base, &step))
21354 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21355 if (!aarch64_sve_index_immediate_p (base)
21356 || !aarch64_sve_index_immediate_p (step))
21357 return false;
21359 if (info)
21361 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21362 should yield two integer values per 128-bit block, meaning
21363 that we need to treat it in the same way as V2DI and then
21364 ignore the upper 32 bits of each element. */
21365 elt_mode = aarch64_sve_container_int_mode (mode);
21366 *info = simd_immediate_info (elt_mode, base, step);
21368 return true;
21370 else if (CONST_VECTOR_P (op)
21371 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21372 /* N_ELTS set above. */;
21373 else
21374 return false;
21376 scalar_float_mode elt_float_mode;
21377 if (n_elts == 1
21378 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21380 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21381 if (aarch64_float_const_zero_rtx_p (elt)
21382 || aarch64_float_const_representable_p (elt))
21384 if (info)
21385 *info = simd_immediate_info (elt_float_mode, elt);
21386 return true;
21390 /* If all elements in an SVE vector have the same value, we have a free
21391 choice between using the element mode and using the container mode.
21392 Using the element mode means that unused parts of the vector are
21393 duplicates of the used elements, while using the container mode means
21394 that the unused parts are an extension of the used elements. Using the
21395 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21396 for its container mode VNx4SI while 0x00000101 isn't.
21398 If not all elements in an SVE vector have the same value, we need the
21399 transition from one element to the next to occur at container boundaries.
21400 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21401 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21402 scalar_int_mode elt_int_mode;
21403 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21404 elt_int_mode = aarch64_sve_container_int_mode (mode);
21405 else
21406 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21408 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21409 if (elt_size > 8)
21410 return false;
21412 /* Expand the vector constant out into a byte vector, with the least
21413 significant byte of the register first. */
21414 auto_vec<unsigned char, 16> bytes;
21415 bytes.reserve (n_elts * elt_size);
21416 for (unsigned int i = 0; i < n_elts; i++)
21418 /* The vector is provided in gcc endian-neutral fashion.
21419 For aarch64_be Advanced SIMD, it must be laid out in the vector
21420 register in reverse order. */
21421 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21422 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21424 if (elt_mode != elt_int_mode)
21425 elt = gen_lowpart (elt_int_mode, elt);
21427 if (!CONST_INT_P (elt))
21428 return false;
21430 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21431 for (unsigned int byte = 0; byte < elt_size; byte++)
21433 bytes.quick_push (elt_val & 0xff);
21434 elt_val >>= BITS_PER_UNIT;
21438 /* The immediate must repeat every eight bytes. */
21439 unsigned int nbytes = bytes.length ();
21440 for (unsigned i = 8; i < nbytes; ++i)
21441 if (bytes[i] != bytes[i - 8])
21442 return false;
21444 /* Get the repeating 8-byte value as an integer. No endian correction
21445 is needed here because bytes is already in lsb-first order. */
21446 unsigned HOST_WIDE_INT val64 = 0;
21447 for (unsigned int i = 0; i < 8; i++)
21448 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21449 << (i * BITS_PER_UNIT));
21451 if (vec_flags & VEC_SVE_DATA)
21452 return aarch64_sve_valid_immediate (val64, info);
21453 else
21454 return aarch64_advsimd_valid_immediate (val64, info, which);
21457 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21458 has a step in the range of INDEX. Return the index expression if so,
21459 otherwise return null. */
21460 rtx
21461 aarch64_check_zero_based_sve_index_immediate (rtx x)
21463 rtx base, step;
21464 if (const_vec_series_p (x, &base, &step)
21465 && base == const0_rtx
21466 && aarch64_sve_index_immediate_p (step))
21467 return step;
21468 return NULL_RTX;
21471 /* Check if immediate shift constants are within range. */
21472 bool
21473 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21475 x = unwrap_const_vec_duplicate (x);
21476 if (!CONST_INT_P (x))
21477 return false;
21478 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21479 if (left)
21480 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21481 else
21482 return IN_RANGE (INTVAL (x), 1, bit_width);
21485 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21486 operation of width WIDTH at bit position POS. */
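/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000, i.e.
   the bits selected by a zero_extract of one byte starting at bit 16.  */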
21488 rtx
21489 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21491 gcc_assert (CONST_INT_P (width));
21492 gcc_assert (CONST_INT_P (pos));
21494 unsigned HOST_WIDE_INT mask
21495 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21496 return GEN_INT (mask << UINTVAL (pos));
21499 bool
21500 aarch64_mov_operand_p (rtx x, machine_mode mode)
21502 if (GET_CODE (x) == HIGH
21503 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21504 return true;
21506 if (CONST_INT_P (x))
21507 return true;
21509 if (VECTOR_MODE_P (GET_MODE (x)))
21511 /* Require predicate constants to be VNx16BI before RA, so that we
21512 force everything to have a canonical form. */
21513 if (!lra_in_progress
21514 && !reload_completed
21515 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21516 && GET_MODE (x) != VNx16BImode)
21517 return false;
21519 return aarch64_simd_valid_immediate (x, NULL);
21522 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21523 x = strip_salt (x);
21525 /* GOT accesses are valid moves. */
21526 if (SYMBOL_REF_P (x)
21527 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21528 return true;
21530 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21531 return true;
21533 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21534 return true;
21536 return aarch64_classify_symbolic_expression (x)
21537 == SYMBOL_TINY_ABSOLUTE;
21540 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21541 the constant creation. */
21543 rtx
21544 aarch64_gen_shareable_zero (machine_mode mode)
21546 machine_mode zmode = V4SImode;
21547 rtx tmp = gen_reg_rtx (zmode);
21548 emit_move_insn (tmp, CONST0_RTX (zmode));
21549 return lowpart_subreg (mode, tmp, zmode);
21552 /* Return a const_int vector of VAL. */
21553 rtx
21554 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21556 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21557 return gen_const_vec_duplicate (mode, c);
21560 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21562 bool
21563 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21565 machine_mode vmode;
21567 vmode = aarch64_simd_container_mode (mode, 64);
21568 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21569 return aarch64_simd_valid_immediate (op_v, NULL);
21572 /* Construct and return a PARALLEL RTX vector with elements numbering the
21573 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21574 the vector - from the perspective of the architecture. This does not
21575 line up with GCC's perspective on lane numbers, so we end up with
21576 different masks depending on our target endian-ness. The diagram
21577 below may help. We must draw the distinction when building masks
21578 which select one half of the vector. An instruction selecting
21579 architectural low-lanes for a big-endian target, must be described using
21580 a mask selecting GCC high-lanes.
21582 Big-Endian Little-Endian
21584 GCC 0 1 2 3 3 2 1 0
21585 | x | x | x | x | | x | x | x | x |
21586 Architecture 3 2 1 0 3 2 1 0
21588 Low Mask: { 2, 3 } { 0, 1 }
21589 High Mask: { 0, 1 } { 2, 3 }
21591 MODE Is the mode of the vector and NUNITS is the number of units in it. */
21593 rtx
21594 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21596 rtvec v = rtvec_alloc (nunits / 2);
21597 int high_base = nunits / 2;
21598 int low_base = 0;
21599 int base;
21600 rtx t1;
21601 int i;
21603 if (BYTES_BIG_ENDIAN)
21604 base = high ? low_base : high_base;
21605 else
21606 base = high ? high_base : low_base;
21608 for (i = 0; i < nunits / 2; i++)
21609 RTVEC_ELT (v, i) = GEN_INT (base + i);
21611 t1 = gen_rtx_PARALLEL (mode, v);
21612 return t1;
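/* Editorial sketch, not part of the original source: concrete results of the
   helper above for V4SImode (NUNITS == 4), matching the diagram in the
   function comment:

     aarch64_simd_vect_par_cnst_half (V4SImode, 4, false)
       little-endian: (parallel [0 1])     big-endian: (parallel [2 3])
     aarch64_simd_vect_par_cnst_half (V4SImode, 4, true)
       little-endian: (parallel [2 3])     big-endian: (parallel [0 1])  */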
21615 /* Check OP for validity as a PARALLEL RTX vector with elements
21616 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21617 from the perspective of the architecture. See the diagram above
21618 aarch64_simd_vect_par_cnst_half for more details. */
21620 bool
21621 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21622 bool high)
21624 int nelts;
21625 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21626 return false;
21628 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21629 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21630 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21631 int i = 0;
21633 if (count_op != count_ideal)
21634 return false;
21636 for (i = 0; i < count_ideal; i++)
21638 rtx elt_op = XVECEXP (op, 0, i);
21639 rtx elt_ideal = XVECEXP (ideal, 0, i);
21641 if (!CONST_INT_P (elt_op)
21642 || INTVAL (elt_ideal) != INTVAL (elt_op))
21643 return false;
21645 return true;
21648 /* Return a PARALLEL containing NELTS elements, with element I equal
21649 to BASE + I * STEP. */
21652 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21654 rtvec vec = rtvec_alloc (nelts);
21655 for (unsigned int i = 0; i < nelts; ++i)
21656 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21657 return gen_rtx_PARALLEL (VOIDmode, vec);
21660 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21661 series with step STEP. */
21663 bool
21664 aarch64_stepped_int_parallel_p (rtx op, int step)
21666 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21667 return false;
21669 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21670 for (int i = 1; i < XVECLEN (op, 0); ++i)
21671 if (!CONST_INT_P (XVECEXP (op, 0, i))
21672 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21673 return false;
21675 return true;
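/* Editorial sketch, not part of the original source: the two helpers above
   are duals of one another.  For example,

     rtx par = aarch64_gen_stepped_int_parallel (4, 1, 2);

   builds (parallel [1 3 5 7]), for which
   aarch64_stepped_int_parallel_p (par, 2) is true and
   aarch64_stepped_int_parallel_p (par, 1) is false.  */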
21678 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21679 HIGH (exclusive). */
21680 void
21681 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21682 const_tree exp)
21684 HOST_WIDE_INT lane;
21685 gcc_assert (CONST_INT_P (operand));
21686 lane = INTVAL (operand);
21688 if (lane < low || lane >= high)
21690 if (exp)
21691 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21692 lane, low, high - 1);
21693 else
21694 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21698 /* Perform endian correction on lane number N, which indexes a vector
21699 of mode MODE, and return the result as an SImode rtx. */
21702 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21704 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
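/* Editorial sketch, not part of the original source, assuming ENDIAN_LANE_N
   mirrors the lane index on big-endian targets: for V4SImode,

     aarch64_endian_lane_rtx (V4SImode, 1)

   yields (const_int 1) on little-endian and (const_int 2) on big-endian, so
   the same architectural lane is addressed in both cases.  */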
21707 /* Return TRUE if OP is a valid vector addressing mode. */
21709 bool
21710 aarch64_simd_mem_operand_p (rtx op)
21712 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21713 || REG_P (XEXP (op, 0)));
21716 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21718 bool
21719 aarch64_sve_ld1r_operand_p (rtx op)
21721 struct aarch64_address_info addr;
21722 scalar_mode mode;
21724 return (MEM_P (op)
21725 && is_a <scalar_mode> (GET_MODE (op), &mode)
21726 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21727 && addr.type == ADDRESS_REG_IMM
21728 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21731 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21732 where the size of the read data is specified by `mode` and the size of the
21733    vector elements is specified by `elem_mode`.  */
21734 bool
21735 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21736 scalar_mode elem_mode)
21738 struct aarch64_address_info addr;
21739 if (!MEM_P (op)
21740 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21741 return false;
21743 if (addr.type == ADDRESS_REG_IMM)
21744 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21746 if (addr.type == ADDRESS_REG_REG)
21747 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21749 return false;
21752 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21753 bool
21754 aarch64_sve_ld1rq_operand_p (rtx op)
21756 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21757 GET_MODE_INNER (GET_MODE (op)));
21760 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21761 accessing a vector where the element size is specified by `elem_mode`. */
21762 bool
21763 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21765 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21768 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21769 bool
21770 aarch64_sve_ldff1_operand_p (rtx op)
21772 if (!MEM_P (op))
21773 return false;
21775 struct aarch64_address_info addr;
21776 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21777 return false;
21779 if (addr.type == ADDRESS_REG_IMM)
21780 return known_eq (addr.const_offset, 0);
21782 return addr.type == ADDRESS_REG_REG;
21785 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21786 bool
21787 aarch64_sve_ldnf1_operand_p (rtx op)
21789 struct aarch64_address_info addr;
21791 return (MEM_P (op)
21792 && aarch64_classify_address (&addr, XEXP (op, 0),
21793 GET_MODE (op), false)
21794 && addr.type == ADDRESS_REG_IMM);
21797 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21798 The conditions for STR are the same. */
21799 bool
21800 aarch64_sve_ldr_operand_p (rtx op)
21802 struct aarch64_address_info addr;
21804 return (MEM_P (op)
21805 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21806 false, ADDR_QUERY_ANY)
21807 && addr.type == ADDRESS_REG_IMM);
21810 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21811 addressing memory of mode MODE. */
21812 bool
21813 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21815 struct aarch64_address_info addr;
21816 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21817 return false;
21819 if (addr.type == ADDRESS_REG_IMM)
21820 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21822 return addr.type == ADDRESS_REG_REG;
21825 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21826 We need to be able to access the individual pieces, so the range
21827 is different from LD[234] and ST[234]. */
21828 bool
21829 aarch64_sve_struct_memory_operand_p (rtx op)
21831 if (!MEM_P (op))
21832 return false;
21834 machine_mode mode = GET_MODE (op);
21835 struct aarch64_address_info addr;
21836 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21837 ADDR_QUERY_ANY)
21838 || addr.type != ADDRESS_REG_IMM)
21839 return false;
21841 poly_int64 first = addr.const_offset;
21842 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21843 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21844 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21847 /* Emit a register copy from operand to operand, taking care not to
21848 early-clobber source registers in the process.
21850 COUNT is the number of components into which the copy needs to be
21851 decomposed. */
21852 void
21853 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21854 unsigned int count)
21856 unsigned int i;
21857 int rdest = REGNO (operands[0]);
21858 int rsrc = REGNO (operands[1]);
21860 if (!reg_overlap_mentioned_p (operands[0], operands[1])
21861 || rdest < rsrc)
21862 for (i = 0; i < count; i++)
21863 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21864 gen_rtx_REG (mode, rsrc + i));
21865 else
21866 for (i = 0; i < count; i++)
21867 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21868 gen_rtx_REG (mode, rsrc + count - i - 1));
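/* Editorial sketch, not part of the original source: why the copy order above
   matters.  Moving a two-register value from V8-V9 to the overlapping pair
   V9-V10 (COUNT == 2) must copy the higher-numbered register first,

     mov v10.16b, v9.16b
     mov v9.16b, v8.16b

   otherwise the first move would clobber V9 before it is read.  Copies to a
   lower-numbered destination keep the ascending order.  */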
21871 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21872 one of VSTRUCT modes: OI, CI, or XI. */
21874 aarch64_simd_attr_length_rglist (machine_mode mode)
21876 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21877 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21880 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21881 alignment of a vector to 128 bits. SVE predicates have an alignment of
21882 16 bits. */
21883 static HOST_WIDE_INT
21884 aarch64_simd_vector_alignment (const_tree type)
21886 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21887 be set for non-predicate vectors of booleans. Modes are the most
21888 direct way we have of identifying real SVE predicate types. */
21889 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21890 return 16;
21891 widest_int min_size
21892 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21893 return wi::umin (min_size, 128).to_uhwi ();
21896 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21897 static poly_uint64
21898 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21900 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21902 /* If the length of the vector is a fixed power of 2, try to align
21903 to that length, otherwise don't try to align at all. */
21904 HOST_WIDE_INT result;
21905 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21906 || !pow2p_hwi (result))
21907 result = TYPE_ALIGN (TREE_TYPE (type));
21908 return result;
21910 return TYPE_ALIGN (type);
21913 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21914 static bool
21915 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21917 if (is_packed)
21918 return false;
21920 /* For fixed-length vectors, check that the vectorizer will aim for
21921 full-vector alignment. This isn't true for generic GCC vectors
21922 that are wider than the ABI maximum of 128 bits. */
21923 poly_uint64 preferred_alignment =
21924 aarch64_vectorize_preferred_vector_alignment (type);
21925 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
21926 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
21927 preferred_alignment))
21928 return false;
21930 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
21931 return true;
21934 /* Return true if the vector misalignment factor is supported by the
21935 target. */
21936 static bool
21937 aarch64_builtin_support_vector_misalignment (machine_mode mode,
21938 const_tree type, int misalignment,
21939 bool is_packed)
21941 if (TARGET_SIMD && STRICT_ALIGNMENT)
21943 /* Return if movmisalign pattern is not supported for this mode. */
21944 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
21945 return false;
21947 /* Misalignment factor is unknown at compile time. */
21948 if (misalignment == -1)
21949 return false;
21951 return default_builtin_support_vector_misalignment (mode, type, misalignment,
21952 is_packed);
21955 /* If VALS is a vector constant that can be loaded into a register
21956 using DUP, generate instructions to do so and return an RTX to
21957 assign to the register. Otherwise return NULL_RTX. */
21958 static rtx
21959 aarch64_simd_dup_constant (rtx vals)
21961 machine_mode mode = GET_MODE (vals);
21962 machine_mode inner_mode = GET_MODE_INNER (mode);
21963 rtx x;
21965 if (!const_vec_duplicate_p (vals, &x))
21966 return NULL_RTX;
21968 /* We can load this constant by using DUP and a constant in a
21969 single ARM register. This will be cheaper than a vector
21970 load. */
21971 x = force_reg (inner_mode, x);
21972 return gen_vec_duplicate (mode, x);
21976 /* Generate code to load VALS, which is a PARALLEL containing only
21977 constants (for vec_init) or CONST_VECTOR, efficiently into a
21978 register. Returns an RTX to copy into the register, or NULL_RTX
21979 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
21980 static rtx
21981 aarch64_simd_make_constant (rtx vals)
21983 machine_mode mode = GET_MODE (vals);
21984 rtx const_dup;
21985 rtx const_vec = NULL_RTX;
21986 int n_const = 0;
21987 int i;
21989 if (CONST_VECTOR_P (vals))
21990 const_vec = vals;
21991 else if (GET_CODE (vals) == PARALLEL)
21993 /* A CONST_VECTOR must contain only CONST_INTs and
21994 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
21995 Only store valid constants in a CONST_VECTOR. */
21996 int n_elts = XVECLEN (vals, 0);
21997 for (i = 0; i < n_elts; ++i)
21999 rtx x = XVECEXP (vals, 0, i);
22000 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22001 n_const++;
22003 if (n_const == n_elts)
22004 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
22006 else
22007 gcc_unreachable ();
22009 if (const_vec != NULL_RTX
22010 && aarch64_simd_valid_immediate (const_vec, NULL))
22011 /* Load using MOVI/MVNI. */
22012 return const_vec;
22013 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
22014 /* Loaded using DUP. */
22015 return const_dup;
22016 else if (const_vec != NULL_RTX)
22017 /* Load from constant pool. We cannot take advantage of single-cycle
22018 LD1 because we need a PC-relative addressing mode. */
22019 return const_vec;
22020 else
22021 /* A PARALLEL containing something not valid inside CONST_VECTOR.
22022 We cannot construct an initializer. */
22023 return NULL_RTX;
22026 /* Expand a vector initialisation sequence, such that TARGET is
22027 initialised to contain VALS. */
22029 void
22030 aarch64_expand_vector_init (rtx target, rtx vals)
22032 machine_mode mode = GET_MODE (target);
22033 scalar_mode inner_mode = GET_MODE_INNER (mode);
22034 /* The number of vector elements. */
22035 int n_elts = XVECLEN (vals, 0);
22036 /* The number of vector elements which are not constant. */
22037 int n_var = 0;
22038 rtx any_const = NULL_RTX;
22039 /* The first element of vals. */
22040 rtx v0 = XVECEXP (vals, 0, 0);
22041 bool all_same = true;
22043 /* This is a special vec_init<M><N> where N is not an element mode but a
22044 vector mode with half the elements of M. We expect to find two entries
22045     vector mode with half the elements of M.  We expect to find two entries
22046 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
22048 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
22049 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
22050 && known_eq (GET_MODE_SIZE (mode),
22051 2 * GET_MODE_SIZE (narrow_mode)));
22052 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
22053 XVECEXP (vals, 0, 0),
22054 XVECEXP (vals, 0, 1)));
22055 return;
22058 /* Count the number of variable elements to initialise. */
22059 for (int i = 0; i < n_elts; ++i)
22061 rtx x = XVECEXP (vals, 0, i);
22062 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
22063 ++n_var;
22064 else
22065 any_const = x;
22067 all_same &= rtx_equal_p (x, v0);
22070 /* No variable elements, hand off to aarch64_simd_make_constant which knows
22071 how best to handle this. */
22072 if (n_var == 0)
22074 rtx constant = aarch64_simd_make_constant (vals);
22075 if (constant != NULL_RTX)
22077 emit_move_insn (target, constant);
22078 return;
22082 /* Splat a single non-constant element if we can. */
22083 if (all_same)
22085 rtx x = force_reg (inner_mode, v0);
22086 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22087 return;
22090  /* Check for the interleaving case.
22091     For example, if the initializer is (int16x8_t) {x, y, x, y, x, y, x, y},
22092     generate the following code:
22093       dup v0.h, x
22094       dup v1.h, y
22095       zip1 v0.h, v0.h, v1.h
22096     for a "large enough" initializer.  */
22098 if (n_elts >= 8)
22100 int i;
22101 for (i = 2; i < n_elts; i++)
22102 if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
22103 break;
22105 if (i == n_elts)
22107 machine_mode mode = GET_MODE (target);
22108 rtx dest[2];
22110 for (int i = 0; i < 2; i++)
22112 rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
22113 dest[i] = force_reg (mode, x);
22116 rtvec v = gen_rtvec (2, dest[0], dest[1]);
22117 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22118 return;
22122 enum insn_code icode = optab_handler (vec_set_optab, mode);
22123 gcc_assert (icode != CODE_FOR_nothing);
22125 /* If there are only variable elements, try to optimize
22126 the insertion using dup for the most common element
22127 followed by insertions. */
22129 /* The algorithm will fill matches[*][0] with the earliest matching element,
22130 and matches[X][1] with the count of duplicate elements (if X is the
22131 earliest element which has duplicates). */
22133 if (n_var == n_elts && n_elts <= 16)
22135 int matches[16][2] = {0};
22136 for (int i = 0; i < n_elts; i++)
22138 for (int j = 0; j <= i; j++)
22140 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
22142 matches[i][0] = j;
22143 matches[j][1]++;
22144 break;
22148 int maxelement = 0;
22149 int maxv = 0;
22150 for (int i = 0; i < n_elts; i++)
22151 if (matches[i][1] > maxv)
22153 maxelement = i;
22154 maxv = matches[i][1];
22157 /* Create a duplicate of the most common element, unless all elements
22158 are equally useless to us, in which case just immediately set the
22159 vector register using the first element. */
22161 if (maxv == 1)
22163 /* For vectors of two 64-bit elements, we can do even better. */
22164 if (n_elts == 2
22165 && (inner_mode == E_DImode
22166 || inner_mode == E_DFmode))
22169 rtx x0 = XVECEXP (vals, 0, 0);
22170 rtx x1 = XVECEXP (vals, 0, 1);
22171 /* Combine can pick up this case, but handling it directly
22172 here leaves clearer RTL.
22174 This is load_pair_lanes<mode>, and also gives us a clean-up
22175 for store_pair_lanes<mode>. */
22176 if (memory_operand (x0, inner_mode)
22177 && memory_operand (x1, inner_mode)
22178 && aarch64_mergeable_load_pair_p (mode, x0, x1))
22180 rtx t;
22181 if (inner_mode == DFmode)
22182 t = gen_load_pair_lanesdf (target, x0, x1);
22183 else
22184 t = gen_load_pair_lanesdi (target, x0, x1);
22185 emit_insn (t);
22186 return;
22189 /* The subreg-move sequence below will move into lane zero of the
22190 vector register. For big-endian we want that position to hold
22191 the last element of VALS. */
22192 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
22193 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22194 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
22196 else
22198 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22199 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22202 /* Insert the rest. */
22203 for (int i = 0; i < n_elts; i++)
22205 rtx x = XVECEXP (vals, 0, i);
22206 if (matches[i][0] == maxelement)
22207 continue;
22208 x = force_reg (inner_mode, x);
22209 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22211 return;
22214 /* Initialise a vector which is part-variable. We want to first try
22215 to build those lanes which are constant in the most efficient way we
22216 can. */
22217 if (n_var != n_elts)
22219 rtx copy = copy_rtx (vals);
22221 /* Load constant part of vector. We really don't care what goes into the
22222 parts we will overwrite, but we're more likely to be able to load the
22223 constant efficiently if it has fewer, larger, repeating parts
22224 (see aarch64_simd_valid_immediate). */
22225 for (int i = 0; i < n_elts; i++)
22227 rtx x = XVECEXP (vals, 0, i);
22228 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22229 continue;
22230 rtx subst = any_const;
22231 for (int bit = n_elts / 2; bit > 0; bit /= 2)
22233 /* Look in the copied vector, as more elements are const. */
22234 rtx test = XVECEXP (copy, 0, i ^ bit);
22235 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
22237 subst = test;
22238 break;
22241 XVECEXP (copy, 0, i) = subst;
22243 aarch64_expand_vector_init (target, copy);
22246 /* Insert the variable lanes directly. */
22247 for (int i = 0; i < n_elts; i++)
22249 rtx x = XVECEXP (vals, 0, i);
22250 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22251 continue;
22252 x = force_reg (inner_mode, x);
22253 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
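/* Editorial sketch, not part of the original source: two representative paths
   through aarch64_expand_vector_init for V4SImode.

     (parallel [x x x x]) - all_same: x is forced into a register and
       broadcast with DUP.
     (parallel [x 1 2 3]) - part-variable: a copy of VALS with lane 0
       replaced by a nearby constant ({2, 1, 2, 3}) is expanded first, then
       x is inserted into lane 0 via the vec_set pattern (INS).  */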
22257 /* Emit RTL corresponding to:
22258 insr TARGET, ELEM. */
22260 static void
22261 emit_insr (rtx target, rtx elem)
22263 machine_mode mode = GET_MODE (target);
22264 scalar_mode elem_mode = GET_MODE_INNER (mode);
22265 elem = force_reg (elem_mode, elem);
22267 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22268 gcc_assert (icode != CODE_FOR_nothing);
22269 emit_insn (GEN_FCN (icode) (target, target, elem));
22272 /* Subroutine of aarch64_sve_expand_vector_init for handling
22273 trailing constants.
22274 This function works as follows:
22275 (a) Create a new vector consisting of trailing constants.
22276 (b) Initialize TARGET with the constant vector using emit_move_insn.
22277 (c) Insert remaining elements in TARGET using insr.
22278     NELTS is the total number of elements in the original vector, while
22279     NELTS_REQD is the number of elements that are actually
22280     significant.
22282     ??? The heuristic used is to do the above only if the number of constants
22283     is at least half the total number of elements.  May need fine-tuning.  */
22285 static bool
22286 aarch64_sve_expand_vector_init_handle_trailing_constants
22287 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22289 machine_mode mode = GET_MODE (target);
22290 scalar_mode elem_mode = GET_MODE_INNER (mode);
22291 int n_trailing_constants = 0;
22293 for (int i = nelts_reqd - 1;
22294 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22295 i--)
22296 n_trailing_constants++;
22298 if (n_trailing_constants >= nelts_reqd / 2)
22300 /* Try to use the natural pattern of BUILDER to extend the trailing
22301 constant elements to a full vector. Replace any variables in the
22302 extra elements with zeros.
22304 ??? It would be better if the builders supported "don't care"
22305 elements, with the builder filling in whichever elements
22306 give the most compact encoding. */
22307 rtx_vector_builder v (mode, nelts, 1);
22308 for (int i = 0; i < nelts; i++)
22310 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22311 if (!valid_for_const_vector_p (elem_mode, x))
22312 x = CONST0_RTX (elem_mode);
22313 v.quick_push (x);
22315 rtx const_vec = v.build ();
22316 emit_move_insn (target, const_vec);
22318 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22319 emit_insr (target, builder.elt (i));
22321 return true;
22324 return false;
22327 /* Subroutine of aarch64_sve_expand_vector_init.
22328 Works as follows:
22329 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22330 (b) Skip trailing elements from BUILDER, which are the same as
22331 element NELTS_REQD - 1.
22332 (c) Insert earlier elements in reverse order in TARGET using insr. */
22334 static void
22335 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22336 const rtx_vector_builder &builder,
22337 int nelts_reqd)
22339 machine_mode mode = GET_MODE (target);
22340 scalar_mode elem_mode = GET_MODE_INNER (mode);
22342 struct expand_operand ops[2];
22343 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22344 gcc_assert (icode != CODE_FOR_nothing);
22346 create_output_operand (&ops[0], target, mode);
22347 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22348 expand_insn (icode, 2, ops);
22350 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22351 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22352 emit_insr (target, builder.elt (i));
22355 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22356 when all trailing elements of builder are same.
22357 This works as follows:
22358 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22359 (b) Insert remaining elements in TARGET using insr.
22361     ??? The heuristic used is to do the above if the number of identical
22362     trailing elements is at least 3/4 of the total number of elements,
22363     loosely based on the heuristic from mostly_zeros_p.  May need fine-tuning.  */
22365 static bool
22366 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22367 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22369 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22370 if (ndups >= (3 * nelts_reqd) / 4)
22372 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22373 nelts_reqd - ndups + 1);
22374 return true;
22377 return false;
22380 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22381 of elements in BUILDER.
22383 The function tries to initialize TARGET from BUILDER if it fits one
22384 of the special cases outlined below.
22386 Failing that, the function divides BUILDER into two sub-vectors:
22387 v_even = even elements of BUILDER;
22388 v_odd = odd elements of BUILDER;
22390 and recursively calls itself with v_even and v_odd.
22392 if (recursive call succeeded for v_even or v_odd)
22393 TARGET = zip (v_even, v_odd)
22395 The function returns true if it managed to build TARGET from BUILDER
22396 with one of the special cases, false otherwise.
22398 Example: {a, 1, b, 2, c, 3, d, 4}
22400 The vector gets divided into:
22401 v_even = {a, b, c, d}
22402 v_odd = {1, 2, 3, 4}
22404     aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22405     initializes tmp2 from the constant vector v_odd using emit_move_insn.
22407     aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22408     4 distinct variable elements, so we construct tmp1 from v_even using insr:
22409 tmp1 = dup(d)
22410 insr tmp1, c
22411 insr tmp1, b
22412 insr tmp1, a
22414 And finally:
22415 TARGET = zip (tmp1, tmp2)
22416 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22418 static bool
22419 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22420 int nelts, int nelts_reqd)
22422 machine_mode mode = GET_MODE (target);
22424 /* Case 1: Vector contains trailing constants. */
22426 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22427 (target, builder, nelts, nelts_reqd))
22428 return true;
22430 /* Case 2: Vector contains leading constants. */
22432 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22433 for (int i = 0; i < nelts_reqd; i++)
22434 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22435 rev_builder.finalize ();
22437 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22438 (target, rev_builder, nelts, nelts_reqd))
22440 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22441 return true;
22444 /* Case 3: Vector contains trailing same element. */
22446 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22447 (target, builder, nelts_reqd))
22448 return true;
22450 /* Case 4: Vector contains leading same element. */
22452 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22453 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22455 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22456 return true;
22459 /* Avoid recursing below 4-elements.
22460 ??? The threshold 4 may need fine-tuning. */
22462 if (nelts_reqd <= 4)
22463 return false;
22465 rtx_vector_builder v_even (mode, nelts, 1);
22466 rtx_vector_builder v_odd (mode, nelts, 1);
22468 for (int i = 0; i < nelts * 2; i += 2)
22470 v_even.quick_push (builder.elt (i));
22471 v_odd.quick_push (builder.elt (i + 1));
22474 v_even.finalize ();
22475 v_odd.finalize ();
22477 rtx tmp1 = gen_reg_rtx (mode);
22478 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22479 nelts, nelts_reqd / 2);
22481 rtx tmp2 = gen_reg_rtx (mode);
22482 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22483 nelts, nelts_reqd / 2);
22485 if (!did_even_p && !did_odd_p)
22486 return false;
22488 /* Initialize v_even and v_odd using INSR if it didn't match any of the
22489 special cases and zip v_even, v_odd. */
22491 if (!did_even_p)
22492 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22494 if (!did_odd_p)
22495 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22497 rtvec v = gen_rtvec (2, tmp1, tmp2);
22498 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22499 return true;
22502 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22504 void
22505 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22507 machine_mode mode = GET_MODE (target);
22508 int nelts = XVECLEN (vals, 0);
22510 rtx_vector_builder v (mode, nelts, 1);
22511 for (int i = 0; i < nelts; i++)
22512 v.quick_push (XVECEXP (vals, 0, i));
22513 v.finalize ();
22515   /* If neither sub-vector of v could be initialized specially,
22516 then use INSR to insert all elements from v into TARGET.
22517 ??? This might not be optimal for vectors with large
22518 initializers like 16-element or above.
22519 For nelts < 4, it probably isn't useful to handle specially. */
22521 if (nelts < 4
22522 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22523 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22526 /* Check whether VALUE is a vector constant in which every element
22527 is either a power of 2 or a negated power of 2. If so, return
22528 a constant vector of log2s, and flip CODE between PLUS and MINUS
22529 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
22531 static rtx
22532 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22534 if (!CONST_VECTOR_P (value))
22535 return NULL_RTX;
22537 rtx_vector_builder builder;
22538 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22539 return NULL_RTX;
22541 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22542 /* 1 if the result of the multiplication must be negated,
22543 0 if it mustn't, or -1 if we don't yet care. */
22544 int negate = -1;
22545 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22546 for (unsigned int i = 0; i < encoded_nelts; ++i)
22548 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22549 if (!CONST_SCALAR_INT_P (elt))
22550 return NULL_RTX;
22551 rtx_mode_t val (elt, int_mode);
22552 wide_int pow2 = wi::neg (val);
22553 if (val != pow2)
22555 /* It matters whether we negate or not. Make that choice,
22556 and make sure that it's consistent with previous elements. */
22557 if (negate == !wi::neg_p (val))
22558 return NULL_RTX;
22559 negate = wi::neg_p (val);
22560 if (!negate)
22561 pow2 = val;
22563 /* POW2 is now the value that we want to be a power of 2. */
22564 int shift = wi::exact_log2 (pow2);
22565 if (shift < 0)
22566 return NULL_RTX;
22567 builder.quick_push (gen_int_mode (shift, int_mode));
22569 if (negate == -1)
22570 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22571 code = PLUS;
22572 else if (negate == 1)
22573 code = code == PLUS ? MINUS : PLUS;
22574 return builder.build ();
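/* Editorial sketch, not part of the original source: worked examples for the
   conversion above, with CODE initially PLUS (i.e. a + b * c):

     c = {4, 4, 4, 4}      ->  {2, 2, 2, 2}, CODE stays PLUS,
                               so a + b * 4 becomes a + (b << 2)
     c = {-8, -8, -8, -8}  ->  {3, 3, 3, 3}, CODE becomes MINUS,
                               so a + b * -8 becomes a - (b << 3)
     c = {4, -8, 4, -8}    ->  NULL_RTX (inconsistent signs).  */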
22577 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22578 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22579 operands array, in the same order as for fma_optab. Return true if
22580 the function emitted all the necessary instructions, false if the caller
22581 should generate the pattern normally with the new OPERANDS array. */
22583 bool
22584 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22586 machine_mode mode = GET_MODE (operands[0]);
22587 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22589 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22590 NULL_RTX, true, OPTAB_DIRECT);
22591 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22592 operands[3], product, operands[0], true,
22593 OPTAB_DIRECT);
22594 return true;
22596 operands[2] = force_reg (mode, operands[2]);
22597 return false;
22600 /* Likewise, but for a conditional pattern. */
22602 bool
22603 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22605 machine_mode mode = GET_MODE (operands[0]);
22606 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22608 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22609 NULL_RTX, true, OPTAB_DIRECT);
22610 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22611 operands[4], product, operands[5]));
22612 return true;
22614 operands[3] = force_reg (mode, operands[3]);
22615 return false;
22618 static unsigned HOST_WIDE_INT
22619 aarch64_shift_truncation_mask (machine_mode mode)
22621 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22622 return 0;
22623 return GET_MODE_UNIT_BITSIZE (mode) - 1;
22626 /* Select a format to encode pointers in exception handling data. */
22628 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22630 int type;
22631 switch (aarch64_cmodel)
22633 case AARCH64_CMODEL_TINY:
22634 case AARCH64_CMODEL_TINY_PIC:
22635 case AARCH64_CMODEL_SMALL:
22636 case AARCH64_CMODEL_SMALL_PIC:
22637 case AARCH64_CMODEL_SMALL_SPIC:
22638 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22639 for everything. */
22640 type = DW_EH_PE_sdata4;
22641 break;
22642 default:
22643 /* No assumptions here. 8-byte relocs required. */
22644 type = DW_EH_PE_sdata8;
22645 break;
22647 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22650 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22652 static void
22653 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22655 if (TREE_CODE (decl) == FUNCTION_DECL)
22657 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22658 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22660 fprintf (stream, "\t.variant_pcs\t");
22661 assemble_name (stream, name);
22662 fprintf (stream, "\n");
22667 /* The last .arch and .tune assembly strings that we printed. */
22668 static std::string aarch64_last_printed_arch_string;
22669 static std::string aarch64_last_printed_tune_string;
22671 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22672 by the function fndecl. */
22674 void
22675 aarch64_declare_function_name (FILE *stream, const char* name,
22676 tree fndecl)
22678 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22680 struct cl_target_option *targ_options;
22681 if (target_parts)
22682 targ_options = TREE_TARGET_OPTION (target_parts);
22683 else
22684 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22685 gcc_assert (targ_options);
22687 const struct processor *this_arch
22688 = aarch64_get_arch (targ_options->x_selected_arch);
22690 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
22691 std::string extension
22692 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22693 this_arch->flags);
22694 /* Only update the assembler .arch string if it is distinct from the last
22695 such string we printed. */
22696 std::string to_print = this_arch->name + extension;
22697 if (to_print != aarch64_last_printed_arch_string)
22699 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22700 aarch64_last_printed_arch_string = to_print;
22703   /* Print the cpu name we're tuning for in the comments; it might be
22704 useful to readers of the generated asm. Do it only when it changes
22705 from function to function and verbose assembly is requested. */
22706 const struct processor *this_tune
22707 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
22709 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22711 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22712 this_tune->name);
22713 aarch64_last_printed_tune_string = this_tune->name;
22716 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22718 /* Don't forget the type directive for ELF. */
22719 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22720 ASM_OUTPUT_LABEL (stream, name);
22722 cfun->machine->label_is_assembled = true;
22725 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
22727 void
22728 aarch64_print_patchable_function_entry (FILE *file,
22729 unsigned HOST_WIDE_INT patch_area_size,
22730 bool record_p)
22732 if (!cfun->machine->label_is_assembled)
22734 /* Emit the patching area before the entry label, if any. */
22735 default_print_patchable_function_entry (file, patch_area_size,
22736 record_p);
22737 return;
22740 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
22741 GEN_INT (record_p));
22742 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
22744 if (!aarch_bti_enabled ()
22745 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
22747 /* Emit the patchable_area at the beginning of the function. */
22748 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
22749 INSN_ADDRESSES_NEW (insn, -1);
22750 return;
22753 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22754 if (!insn
22755 || !INSN_P (insn)
22756 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
22757 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
22759 /* Emit a BTI_C. */
22760 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
22763 /* Emit the patchable_area after BTI_C. */
22764 insn = emit_insn_after (pa, insn);
22765 INSN_ADDRESSES_NEW (insn, -1);
22768 /* Output patchable area. */
22770 void
22771 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
22773 default_print_patchable_function_entry (asm_out_file, patch_area_size,
22774 record_p);
22777 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22779 void
22780 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22782 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22783 const char *value = IDENTIFIER_POINTER (target);
22784 aarch64_asm_output_variant_pcs (stream, decl, name);
22785 ASM_OUTPUT_DEF (stream, name, value);
22788 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22789 function symbol references. */
22791 void
22792 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22794 default_elf_asm_output_external (stream, decl, name);
22795 aarch64_asm_output_variant_pcs (stream, decl, name);
22798 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22799 Used to output the .cfi_b_key_frame directive when signing the current
22800 function with the B key. */
22802 void
22803 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22805 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22806 && aarch_ra_sign_key == AARCH_KEY_B)
22807 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22810 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22812 static void
22813 aarch64_start_file (void)
22815 struct cl_target_option *default_options
22816 = TREE_TARGET_OPTION (target_option_default_node);
22818 const struct processor *default_arch
22819 = aarch64_get_arch (default_options->x_selected_arch);
22820 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
22821 std::string extension
22822 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22823 default_arch->flags);
22825 aarch64_last_printed_arch_string = default_arch->name + extension;
22826 aarch64_last_printed_tune_string = "";
22827 asm_fprintf (asm_out_file, "\t.arch %s\n",
22828 aarch64_last_printed_arch_string.c_str ());
22830 default_file_start ();
22833 /* Emit load exclusive. */
22835 static void
22836 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22837 rtx mem, rtx model_rtx)
22839 if (mode == TImode)
22840 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22841 gen_highpart (DImode, rval),
22842 mem, model_rtx));
22843 else
22844 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22847 /* Emit store exclusive. */
22849 static void
22850 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22851 rtx mem, rtx rval, rtx model_rtx)
22853 if (mode == TImode)
22854 emit_insn (gen_aarch64_store_exclusive_pair
22855 (bval, mem, operand_subword (rval, 0, 0, TImode),
22856 operand_subword (rval, 1, 0, TImode), model_rtx));
22857 else
22858 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22861 /* Emit jump insn INSN and mark it as unlikely to be taken.  */
22863 static void
22864 aarch64_emit_unlikely_jump (rtx insn)
22866 rtx_insn *jump = emit_jump_insn (insn);
22867 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22870 /* We store the names of the various atomic helpers in a 5x5 array.
22871 Return the libcall function given MODE, MODEL and NAMES. */
22874 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
22875 const atomic_ool_names *names)
22877 memmodel model = memmodel_from_int (INTVAL (model_rtx));
22878 int mode_idx, model_idx;
22880 switch (mode)
22882 case E_QImode:
22883 mode_idx = 0;
22884 break;
22885 case E_HImode:
22886 mode_idx = 1;
22887 break;
22888 case E_SImode:
22889 mode_idx = 2;
22890 break;
22891 case E_DImode:
22892 mode_idx = 3;
22893 break;
22894 case E_TImode:
22895 mode_idx = 4;
22896 break;
22897 default:
22898 gcc_unreachable ();
22901 switch (model)
22903 case MEMMODEL_RELAXED:
22904 model_idx = 0;
22905 break;
22906 case MEMMODEL_CONSUME:
22907 case MEMMODEL_ACQUIRE:
22908 model_idx = 1;
22909 break;
22910 case MEMMODEL_RELEASE:
22911 model_idx = 2;
22912 break;
22913 case MEMMODEL_ACQ_REL:
22914 case MEMMODEL_SEQ_CST:
22915 model_idx = 3;
22916 break;
22917 case MEMMODEL_SYNC_ACQUIRE:
22918 case MEMMODEL_SYNC_RELEASE:
22919 case MEMMODEL_SYNC_SEQ_CST:
22920 model_idx = 4;
22921 break;
22922 default:
22923 gcc_unreachable ();
22926 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
22927 VISIBILITY_HIDDEN);
22930 #define DEF0(B, N) \
22931 { "__aarch64_" #B #N "_relax", \
22932 "__aarch64_" #B #N "_acq", \
22933 "__aarch64_" #B #N "_rel", \
22934 "__aarch64_" #B #N "_acq_rel", \
22935 "__aarch64_" #B #N "_sync" }
22937 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
22938 { NULL, NULL, NULL, NULL }
22939 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
22941 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
22942 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
22943 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
22944 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
22945 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
22946 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
22948 #undef DEF0
22949 #undef DEF4
22950 #undef DEF5
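/* Editorial sketch, not part of the original source: how the tables above are
   indexed.  For a 4-byte compare-and-swap with acquire semantics,

     aarch64_atomic_ool_func (SImode, GEN_INT (MEMMODEL_ACQUIRE),
			      &aarch64_ool_cas_names)

   selects mode_idx 2 and model_idx 1, i.e. the libcall "__aarch64_cas4_acq".
   The DEF4 tables leave the 16-byte row empty because only CAS has a 16-byte
   out-of-line helper.  */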
22952 /* Expand a compare and swap pattern. */
22954 void
22955 aarch64_expand_compare_and_swap (rtx operands[])
22957 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
22958 machine_mode mode, r_mode;
22960 bval = operands[0];
22961 rval = operands[1];
22962 mem = operands[2];
22963 oldval = operands[3];
22964 newval = operands[4];
22965 is_weak = operands[5];
22966 mod_s = operands[6];
22967 mod_f = operands[7];
22968 mode = GET_MODE (mem);
22970 /* Normally the succ memory model must be stronger than fail, but in the
22971 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
22972 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
22973 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
22974 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
22975 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
22977 r_mode = mode;
22978 if (mode == QImode || mode == HImode)
22980 r_mode = SImode;
22981 rval = gen_reg_rtx (r_mode);
22984 if (TARGET_LSE)
22986 /* The CAS insn requires oldval and rval overlap, but we need to
22987 have a copy of oldval saved across the operation to tell if
22988 the operation is successful. */
22989 if (reg_overlap_mentioned_p (rval, oldval))
22990 rval = copy_to_mode_reg (r_mode, oldval);
22991 else
22992 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
22994 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
22995 newval, mod_s));
22996 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22998 else if (TARGET_OUTLINE_ATOMICS)
23000 /* Oldval must satisfy compare afterward. */
23001 if (!aarch64_plus_operand (oldval, mode))
23002 oldval = force_reg (mode, oldval);
23003 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
23004 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
23005 oldval, mode, newval, mode,
23006 XEXP (mem, 0), Pmode);
23007 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23009 else
23011 /* The oldval predicate varies by mode. Test it and force to reg. */
23012 insn_code code = code_for_aarch64_compare_and_swap (mode);
23013 if (!insn_data[code].operand[2].predicate (oldval, mode))
23014 oldval = force_reg (mode, oldval);
23016 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
23017 is_weak, mod_s, mod_f));
23018 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
23021 if (r_mode != mode)
23022 rval = gen_lowpart (mode, rval);
23023 emit_move_insn (operands[1], rval);
23025 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
23026 emit_insn (gen_rtx_SET (bval, x));
23029 /* Emit a barrier appropriate for memory model MODEL at the end of a
23030    sequence implementing an atomic operation.  */
23032 static void
23033 aarch64_emit_post_barrier (enum memmodel model)
23035 const enum memmodel base_model = memmodel_base (model);
23037 if (is_mm_sync (model)
23038 && (base_model == MEMMODEL_ACQUIRE
23039 || base_model == MEMMODEL_ACQ_REL
23040 || base_model == MEMMODEL_SEQ_CST))
23042 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
23046 /* Split a compare and swap pattern. */
23048 void
23049 aarch64_split_compare_and_swap (rtx operands[])
23051 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23052 gcc_assert (epilogue_completed);
23054 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
23055 machine_mode mode;
23056 bool is_weak;
23057 rtx_code_label *label1, *label2;
23058 enum memmodel model;
23060 rval = operands[0];
23061 mem = operands[1];
23062 oldval = operands[2];
23063 newval = operands[3];
23064 is_weak = (operands[4] != const0_rtx);
23065 model_rtx = operands[5];
23066 scratch = operands[7];
23067 mode = GET_MODE (mem);
23068 model = memmodel_from_int (INTVAL (model_rtx));
23070 /* When OLDVAL is zero and we want the strong version we can emit a tighter
23071 loop:
23072 .label1:
23073 LD[A]XR rval, [mem]
23074 CBNZ rval, .label2
23075 ST[L]XR scratch, newval, [mem]
23076 CBNZ scratch, .label1
23077 .label2:
23078 CMP rval, 0. */
23079 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
23080 oldval == const0_rtx && mode != TImode);
23082 label1 = NULL;
23083 if (!is_weak)
23085 label1 = gen_label_rtx ();
23086 emit_label (label1);
23088 label2 = gen_label_rtx ();
23090 /* The initial load can be relaxed for a __sync operation since a final
23091 barrier will be emitted to stop code hoisting. */
23092 if (is_mm_sync (model))
23093 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
23094 else
23095 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
23097 if (strong_zero_p)
23098 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
23099 else
23101 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23102 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
23104 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23105 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
23106 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23108 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
23110 if (!is_weak)
23112 if (aarch64_track_speculation)
23114 /* Emit an explicit compare instruction, so that we can correctly
23115 track the condition codes. */
23116 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23117 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23119 else
23120 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
23122 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23123 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
23124 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23126 else
23127 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23129 emit_label (label2);
23131   /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
23132 to set the condition flags. If this is not used it will be removed by
23133 later passes. */
23134 if (strong_zero_p)
23135 aarch64_gen_compare_reg (NE, rval, const0_rtx);
23137 /* Emit any final barrier needed for a __sync operation. */
23138 if (is_mm_sync (model))
23139 aarch64_emit_post_barrier (model);
23142 /* Split an atomic operation. */
23144 void
23145 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
23146 rtx value, rtx model_rtx, rtx cond)
23148 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23149 gcc_assert (epilogue_completed);
23151 machine_mode mode = GET_MODE (mem);
23152 machine_mode wmode = (mode == DImode ? DImode : SImode);
23153 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
23154 const bool is_sync = is_mm_sync (model);
23155 rtx_code_label *label;
23156 rtx x;
23158 /* Split the atomic operation into a sequence. */
23159 label = gen_label_rtx ();
23160 emit_label (label);
23162 if (new_out)
23163 new_out = gen_lowpart (wmode, new_out);
23164 if (old_out)
23165 old_out = gen_lowpart (wmode, old_out);
23166 else
23167 old_out = new_out;
23168 value = simplify_gen_subreg (wmode, value, mode, 0);
23170 /* The initial load can be relaxed for a __sync operation since a final
23171 barrier will be emitted to stop code hoisting. */
23172 if (is_sync)
23173 aarch64_emit_load_exclusive (mode, old_out, mem,
23174 GEN_INT (MEMMODEL_RELAXED));
23175 else
23176 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
23178 switch (code)
23180 case SET:
23181 new_out = value;
23182 break;
23184 case NOT:
23185 x = gen_rtx_AND (wmode, old_out, value);
23186 emit_insn (gen_rtx_SET (new_out, x));
23187 x = gen_rtx_NOT (wmode, new_out);
23188 emit_insn (gen_rtx_SET (new_out, x));
23189 break;
23191 case MINUS:
23192 if (CONST_INT_P (value))
23194 value = GEN_INT (-UINTVAL (value));
23195 code = PLUS;
23197 /* Fall through. */
23199 default:
23200 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
23201 emit_insn (gen_rtx_SET (new_out, x));
23202 break;
23205 aarch64_emit_store_exclusive (mode, cond, mem,
23206 gen_lowpart (mode, new_out), model_rtx);
23208 if (aarch64_track_speculation)
23210 /* Emit an explicit compare instruction, so that we can correctly
23211 track the condition codes. */
23212 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
23213 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23215 else
23216 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
23218 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23219 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
23220 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23222 /* Emit any final barrier needed for a __sync operation. */
23223 if (is_sync)
23224 aarch64_emit_post_barrier (model);
23227 static void
23228 aarch64_init_libfuncs (void)
23230 /* Half-precision float operations. The compiler handles all operations
23231 with NULL libfuncs by converting to SFmode. */
23233 /* Conversions. */
23234 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
23235 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
23237 /* Arithmetic. */
23238 set_optab_libfunc (add_optab, HFmode, NULL);
23239 set_optab_libfunc (sdiv_optab, HFmode, NULL);
23240 set_optab_libfunc (smul_optab, HFmode, NULL);
23241 set_optab_libfunc (neg_optab, HFmode, NULL);
23242 set_optab_libfunc (sub_optab, HFmode, NULL);
23244 /* Comparisons. */
23245 set_optab_libfunc (eq_optab, HFmode, NULL);
23246 set_optab_libfunc (ne_optab, HFmode, NULL);
23247 set_optab_libfunc (lt_optab, HFmode, NULL);
23248 set_optab_libfunc (le_optab, HFmode, NULL);
23249 set_optab_libfunc (ge_optab, HFmode, NULL);
23250 set_optab_libfunc (gt_optab, HFmode, NULL);
23251 set_optab_libfunc (unord_optab, HFmode, NULL);
23254 /* Target hook for c_mode_for_suffix. */
23255 static machine_mode
23256 aarch64_c_mode_for_suffix (char suffix)
23258 if (suffix == 'q')
23259 return TFmode;
23261 return VOIDmode;
23264 /* We can only represent floating point constants which will fit in
23265 "quarter-precision" values. These values are characterised by
23266    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
23269 (-1)^s * (n/16) * 2^r
23271 Where:
23272 's' is the sign bit.
23273 'n' is an integer in the range 16 <= n <= 31.
23274 'r' is an integer in the range -3 <= r <= 4. */
23276 /* Return true iff X can be represented by a quarter-precision
23277 floating point immediate operand X. Note, we cannot represent 0.0. */
23278 bool
23279 aarch64_float_const_representable_p (rtx x)
23281 /* This represents our current view of how many bits
23282 make up the mantissa. */
23283 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23284 int exponent;
23285 unsigned HOST_WIDE_INT mantissa, mask;
23286 REAL_VALUE_TYPE r, m;
23287 bool fail;
23289 x = unwrap_const_vec_duplicate (x);
23290 if (!CONST_DOUBLE_P (x))
23291 return false;
23293 if (GET_MODE (x) == VOIDmode
23294 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23295 return false;
23297 r = *CONST_DOUBLE_REAL_VALUE (x);
23299 /* We cannot represent infinities, NaNs or +/-zero. We won't
23300 know if we have +zero until we analyse the mantissa, but we
23301 can reject the other invalid values. */
23302 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23303 || REAL_VALUE_MINUS_ZERO (r))
23304 return false;
23306 /* Extract exponent. */
23307 r = real_value_abs (&r);
23308 exponent = REAL_EXP (&r);
23310 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23311 highest (sign) bit, with a fixed binary point at bit point_pos.
23312      The low HOST_WIDE_INT of W holds the low part of the mantissa, the
           high HOST_WIDE_INT the high part.
23313 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23314 bits for the mantissa, this can fail (low bits will be lost). */
23315 real_ldexp (&m, &r, point_pos - exponent);
23316 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23318 /* If the low part of the mantissa has bits set we cannot represent
23319 the value. */
23320 if (w.ulow () != 0)
23321 return false;
23322 /* We have rejected the lower HOST_WIDE_INT, so update our
23323 understanding of how many bits lie in the mantissa and
23324 look only at the high HOST_WIDE_INT. */
23325 mantissa = w.elt (1);
23326 point_pos -= HOST_BITS_PER_WIDE_INT;
23328 /* We can only represent values with a mantissa of the form 1.xxxx. */
23329 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23330 if ((mantissa & mask) != 0)
23331 return false;
23333 /* Having filtered unrepresentable values, we may now remove all
23334 but the highest 5 bits. */
23335 mantissa >>= point_pos - 5;
23337 /* We cannot represent the value 0.0, so reject it. This is handled
23338 elsewhere. */
23339 if (mantissa == 0)
23340 return false;
23342 /* Then, as bit 4 is always set, we can mask it off, leaving
23343 the mantissa in the range [0, 15]. */
23344 mantissa &= ~(1 << 4);
23345 gcc_assert (mantissa <= 15);
23347   /* GCC internally does not use IEEE754-like encoding, where normalized
23348      significands are in the range [1, 2); GCC uses [0.5, 1) (see real.cc).
23349 Our mantissa values are shifted 4 places to the left relative to
23350 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23351 by 5 places to correct for GCC's representation. */
23352 exponent = 5 - exponent;
23354 return (exponent >= 0 && exponent <= 7);
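/* Editorial sketch, not part of the original source: examples for the
   quarter-precision form (-1)^s * (n/16) * 2^r described above.

     2.5   == (20/16) * 2^1   - representable (n = 20, r = 1)
     0.125 == (16/16) * 2^-3  - representable (n = 16, r = -3)
     0.1                      - not representable (no exact n/16 * 2^r form)
     0.0                      - rejected; handled elsewhere.  */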
23357 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23358 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23359 output MOVI/MVNI, ORR or BIC immediate. */
23360 char*
23361 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23362 enum simd_immediate_check which)
23364 bool is_valid;
23365 static char templ[40];
23366 const char *mnemonic;
23367 const char *shift_op;
23368 unsigned int lane_count = 0;
23369 char element_char;
23371 struct simd_immediate_info info;
23373 /* This will return true to show const_vector is legal for use as either
23374      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23375 It will also update INFO to show how the immediate should be generated.
23376 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23377 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23378 gcc_assert (is_valid);
23380 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23381 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23383 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23385 gcc_assert (info.insn == simd_immediate_info::MOV
23386 && info.u.mov.shift == 0);
23387 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23388 move immediate path. */
23389 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23390 info.u.mov.value = GEN_INT (0);
23391 else
23393 const unsigned int buf_size = 20;
23394 char float_buf[buf_size] = {'\0'};
23395 real_to_decimal_for_mode (float_buf,
23396 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23397 buf_size, buf_size, 1, info.elt_mode);
23399 if (lane_count == 1)
23400 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23401 else
23402 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23403 lane_count, element_char, float_buf);
23404 return templ;
23408 gcc_assert (CONST_INT_P (info.u.mov.value));
23410 if (which == AARCH64_CHECK_MOV)
23412 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23413 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23414 ? "msl" : "lsl");
23415 if (lane_count == 1)
23416 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23417 mnemonic, UINTVAL (info.u.mov.value));
23418 else if (info.u.mov.shift)
23419 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23420 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23421 element_char, UINTVAL (info.u.mov.value), shift_op,
23422 info.u.mov.shift);
23423 else
23424 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23425 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23426 element_char, UINTVAL (info.u.mov.value));
23428 else
23430 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23431 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23432 if (info.u.mov.shift)
23433 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23434 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23435 element_char, UINTVAL (info.u.mov.value), "lsl",
23436 info.u.mov.shift);
23437 else
23438 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23439 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23440 element_char, UINTVAL (info.u.mov.value));
23442 return templ;
23445 char*
23446 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23449 /* If a floating-point number was passed and we want to use it in an
23450 integer mode, do the conversion to integer. */
23451 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23453 unsigned HOST_WIDE_INT ival;
23454 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23455 gcc_unreachable ();
23456 immediate = gen_int_mode (ival, mode);
23459 machine_mode vmode;
23460 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
23461 use a 128-bit vector mode. */
23462 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23464 vmode = aarch64_simd_container_mode (mode, width);
23465 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23466 return aarch64_output_simd_mov_immediate (v_op, width);
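/* A hypothetical host-side sketch of the reinterpretation step above,
   which simply reuses the bit pattern of the floating-point constant as
   an integer.  The example_* helper is illustrative only (not part of
   GCC) and assumes an 8-byte IEEE double.  */
#if 0
#include <cstdint>
#include <cstring>

static uint64_t
example_float_bits_as_int (double x)
{
  uint64_t bits;
  std::memcpy (&bits, &x, sizeof (bits));  /* well-defined type pun */
  return bits;
}
#endif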
23469 /* Return the output string to use for moving immediate CONST_VECTOR
23470 into an SVE register. */
23472 char *
23473 aarch64_output_sve_mov_immediate (rtx const_vector)
23475 static char templ[40];
23476 struct simd_immediate_info info;
23477 char element_char;
23479 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23480 gcc_assert (is_valid);
23482 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23484 machine_mode vec_mode = GET_MODE (const_vector);
23485 if (aarch64_sve_pred_mode_p (vec_mode))
23487 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23488 if (info.insn == simd_immediate_info::MOV)
23490 gcc_assert (info.u.mov.value == const0_rtx);
23491 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23493 else
23495 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23496 unsigned int total_bytes;
23497 if (info.u.pattern == AARCH64_SV_ALL
23498 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23499 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23500 total_bytes / GET_MODE_SIZE (info.elt_mode));
23501 else
23502 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23503 svpattern_token (info.u.pattern));
23505 return buf;
23508 if (info.insn == simd_immediate_info::INDEX)
23510 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23511 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23512 element_char, INTVAL (info.u.index.base),
23513 INTVAL (info.u.index.step));
23514 return templ;
23517 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23519 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23520 info.u.mov.value = GEN_INT (0);
23521 else
23523 const int buf_size = 20;
23524 char float_buf[buf_size] = {};
23525 real_to_decimal_for_mode (float_buf,
23526 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23527 buf_size, buf_size, 1, info.elt_mode);
23529 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23530 element_char, float_buf);
23531 return templ;
23535 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23536 element_char, INTVAL (info.u.mov.value));
23537 return templ;
23540 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
23541 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23542 pattern. */
23544 char *
23545 aarch64_output_sve_ptrues (rtx const_unspec)
23547 static char templ[40];
23549 struct simd_immediate_info info;
23550 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23551 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23553 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23554 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23555 svpattern_token (info.u.pattern));
23556 return templ;
23559 /* Split operands into moves from op[1] + op[2] into op[0]. */
23561 void
23562 aarch64_split_combinev16qi (rtx operands[3])
23564 unsigned int dest = REGNO (operands[0]);
23565 unsigned int src1 = REGNO (operands[1]);
23566 unsigned int src2 = REGNO (operands[2]);
23567 machine_mode halfmode = GET_MODE (operands[1]);
23568 unsigned int halfregs = REG_NREGS (operands[1]);
23569 rtx destlo, desthi;
23571 gcc_assert (halfmode == V16QImode);
23573 if (src1 == dest && src2 == dest + halfregs)
23575 /* No-op move. Can't split to nothing; emit something. */
23576 emit_note (NOTE_INSN_DELETED);
23577 return;
23580 /* Preserve register attributes for variable tracking. */
23581 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23582 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23583 GET_MODE_SIZE (halfmode));
23585 /* Special case of reversed high/low parts. */
23586 if (reg_overlap_mentioned_p (operands[2], destlo)
23587 && reg_overlap_mentioned_p (operands[1], desthi))
23589 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23590 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23591 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23593 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23595 /* Try to avoid unnecessary moves if part of the result
23596 is in the right place already. */
23597 if (src1 != dest)
23598 emit_move_insn (destlo, operands[1]);
23599 if (src2 != dest + halfregs)
23600 emit_move_insn (desthi, operands[2]);
23602 else
23604 if (src2 != dest + halfregs)
23605 emit_move_insn (desthi, operands[2]);
23606 if (src1 != dest)
23607 emit_move_insn (destlo, operands[1]);
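/* The reversed high/low case above swaps two vector registers in place
   with three XORs and no scratch register.  A hypothetical scalar sketch
   of the same trick (illustrative only, not part of GCC):  */
#if 0
static void
example_xor_swap (unsigned &a, unsigned &b)
{
  a ^= b;
  b ^= a;  /* b now holds the original a.  */
  a ^= b;  /* a now holds the original b.  */
}
#endif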
23611 /* vec_perm support. */
23613 struct expand_vec_perm_d
23615 rtx target, op0, op1;
23616 vec_perm_indices perm;
23617 machine_mode vmode;
23618 machine_mode op_mode;
23619 unsigned int vec_flags;
23620 unsigned int op_vec_flags;
23621 bool one_vector_p;
23622 bool testing_p;
23625 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23627 /* Generate a variable permutation. */
23629 static void
23630 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23632 machine_mode vmode = GET_MODE (target);
23633 bool one_vector_p = rtx_equal_p (op0, op1);
23635 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23636 gcc_checking_assert (GET_MODE (op0) == vmode);
23637 gcc_checking_assert (GET_MODE (op1) == vmode);
23638 gcc_checking_assert (GET_MODE (sel) == vmode);
23639 gcc_checking_assert (TARGET_SIMD);
23641 if (one_vector_p)
23643 if (vmode == V8QImode)
23645 /* Expand the argument to a V16QI mode by duplicating it. */
23646 rtx pair = gen_reg_rtx (V16QImode);
23647 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23648 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23650 else
23652 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23655 else
23657 rtx pair;
23659 if (vmode == V8QImode)
23661 pair = gen_reg_rtx (V16QImode);
23662 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23663 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23665 else
23667 pair = gen_reg_rtx (V2x16QImode);
23668 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23669 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23674 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23675 NELT is the number of elements in the vector. */
23677 void
23678 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23679 unsigned int nelt)
23681 machine_mode vmode = GET_MODE (target);
23682 bool one_vector_p = rtx_equal_p (op0, op1);
23683 rtx mask;
23685 /* The TBL instruction does not use a modulo index, so we must take care
23686 of that ourselves. */
23687 mask = aarch64_simd_gen_const_vector_dup (vmode,
23688 one_vector_p ? nelt - 1 : 2 * nelt - 1);
23689 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23691 /* For big-endian, we also need to reverse the index within the vector
23692 (but not which vector). */
23693 if (BYTES_BIG_ENDIAN)
23695 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23696 if (!one_vector_p)
23697 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23698 sel = expand_simple_binop (vmode, XOR, sel, mask,
23699 NULL, 0, OPTAB_LIB_WIDEN);
23701 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
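/* A hypothetical scalar model of the index massaging above (illustrative
   only, not part of GCC).  Indices are wrapped with an AND mask so that
   TBL's non-modulo behaviour matches vec_perm semantics, and on
   big-endian the lane index within a vector is reversed with an XOR
   while the choice of vector is preserved.  */
#if 0
static unsigned
example_tbl_index (unsigned sel, unsigned nelt, bool one_vector_p,
                   bool big_endian)
{
  unsigned idx = sel & (one_vector_p ? nelt - 1 : 2 * nelt - 1);
  if (big_endian)
    idx ^= nelt - 1;  /* reverse within the vector only */
  return idx;
}
#endif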
23704 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23706 static void
23707 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23709 emit_insn (gen_rtx_SET (target,
23710 gen_rtx_UNSPEC (GET_MODE (target),
23711 gen_rtvec (2, op0, op1), code)));
23714 /* Expand an SVE vec_perm with the given operands. */
23716 void
23717 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23719 machine_mode data_mode = GET_MODE (target);
23720 machine_mode sel_mode = GET_MODE (sel);
23721 /* Enforced by the pattern condition. */
23722 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23724 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23725 size of the two value vectors, i.e. the upper bits of the indices
23726 are effectively ignored. SVE TBL instead produces 0 for any
23727 out-of-range indices, so we need to modulo all the vec_perm indices
23728 to ensure they are all in range. */
23729 rtx sel_reg = force_reg (sel_mode, sel);
23731 /* Check if the sel only references the first values vector. */
23732 if (CONST_VECTOR_P (sel)
23733 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23735 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23736 return;
23739 /* Check if the two values vectors are the same. */
23740 if (rtx_equal_p (op0, op1))
23742 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23743 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23744 NULL, 0, OPTAB_DIRECT);
23745 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23746 return;
23749 /* Run TBL on each value vector and combine the results. */
23751 rtx res0 = gen_reg_rtx (data_mode);
23752 rtx res1 = gen_reg_rtx (data_mode);
23753 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23754 if (!CONST_VECTOR_P (sel)
23755 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23757 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23758 2 * nunits - 1);
23759 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23760 NULL, 0, OPTAB_DIRECT);
23762 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23763 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23764 NULL, 0, OPTAB_DIRECT);
23765 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23766 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23767 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23768 else
23769 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
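/* A hypothetical fixed-length scalar model of the fallback above
   (illustrative only, not part of GCC).  Because SVE TBL yields 0 for
   out-of-range indices, selecting from the concatenation of two vectors
   can be expressed as TBL (op0, sel) | TBL (op1, sel - nelt): exactly
   one of the two lookups is in range and the other contributes 0.  */
#if 0
static unsigned
example_two_vector_tbl (const unsigned *op0, const unsigned *op1,
                        unsigned nelt, unsigned sel)
{
  sel &= 2 * nelt - 1;                            /* wrap as vec_perm requires */
  unsigned from_op0 = sel < nelt ? op0[sel] : 0;
  unsigned shifted = sel - nelt;                  /* wraps for sel < nelt */
  unsigned from_op1 = shifted < nelt ? op1[shifted] : 0;
  return from_op0 | from_op1;
}
#endif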
23772 /* Recognize patterns suitable for the TRN instructions. */
23773 static bool
23774 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23776 HOST_WIDE_INT odd;
23777 poly_uint64 nelt = d->perm.length ();
23778 rtx out, in0, in1;
23779 machine_mode vmode = d->vmode;
23781 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23782 return false;
23784 /* Note that these are little-endian tests.
23785 We correct for big-endian later. */
23786 if (!d->perm[0].is_constant (&odd)
23787 || (odd != 0 && odd != 1)
23788 || !d->perm.series_p (0, 2, odd, 2)
23789 || !d->perm.series_p (1, 2, nelt + odd, 2))
23790 return false;
23792 /* Success! */
23793 if (d->testing_p)
23794 return true;
23796 in0 = d->op0;
23797 in1 = d->op1;
23798 /* We don't need a big-endian lane correction for SVE; see the comment
23799 at the head of aarch64-sve.md for details. */
23800 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23802 std::swap (in0, in1);
23803 odd = !odd;
23805 out = d->target;
23807 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23808 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23809 return true;
23812 /* Try to re-encode the PERM constant so it combines odd and even elements.
23813 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23814 We retry with this new constant with the full suite of patterns. */
23815 static bool
23816 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23818 expand_vec_perm_d newd;
23819 unsigned HOST_WIDE_INT nelt;
23821 if (d->vec_flags != VEC_ADVSIMD)
23822 return false;
23824 /* Get the new mode. Always twice the size of the inner
23825 and half the elements. */
23826 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23827 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23828 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23829 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23831 if (new_mode == word_mode)
23832 return false;
23834 /* to_constant is safe since this routine is specific to Advanced SIMD
23835 vectors. */
23836 nelt = d->perm.length ().to_constant ();
23838 vec_perm_builder newpermconst;
23839 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23841 /* Convert the perm constant if we can. Require even, odd as the pairs. */
23842 for (unsigned int i = 0; i < nelt; i += 2)
23844 poly_int64 elt0 = d->perm[i];
23845 poly_int64 elt1 = d->perm[i + 1];
23846 poly_int64 newelt;
23847 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23848 return false;
23849 newpermconst.quick_push (newelt.to_constant ());
23851 newpermconst.finalize ();
23853 newd.vmode = new_mode;
23854 newd.vec_flags = VEC_ADVSIMD;
23855 newd.op_mode = newd.vmode;
23856 newd.op_vec_flags = newd.vec_flags;
23857 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23858 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23859 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23860 newd.testing_p = d->testing_p;
23861 newd.one_vector_p = d->one_vector_p;
23863 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23864 return aarch64_expand_vec_perm_const_1 (&newd);
23867 /* Recognize patterns suitable for the UZP instructions. */
23868 static bool
23869 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23871 HOST_WIDE_INT odd;
23872 rtx out, in0, in1;
23873 machine_mode vmode = d->vmode;
23875 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23876 return false;
23878 /* Note that these are little-endian tests.
23879 We correct for big-endian later. */
23880 if (!d->perm[0].is_constant (&odd)
23881 || (odd != 0 && odd != 1)
23882 || !d->perm.series_p (0, 1, odd, 2))
23883 return false;
23885 /* Success! */
23886 if (d->testing_p)
23887 return true;
23889 in0 = d->op0;
23890 in1 = d->op1;
23891 /* We don't need a big-endian lane correction for SVE; see the comment
23892 at the head of aarch64-sve.md for details. */
23893 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23895 std::swap (in0, in1);
23896 odd = !odd;
23898 out = d->target;
23900 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23901 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23902 return true;
23905 /* Recognize patterns suitable for the ZIP instructions. */
23906 static bool
23907 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23909 unsigned int high;
23910 poly_uint64 nelt = d->perm.length ();
23911 rtx out, in0, in1;
23912 machine_mode vmode = d->vmode;
23914 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23915 return false;
23917 /* Note that these are little-endian tests.
23918 We correct for big-endian later. */
23919 poly_uint64 first = d->perm[0];
23920 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
23921 || !d->perm.series_p (0, 2, first, 1)
23922 || !d->perm.series_p (1, 2, first + nelt, 1))
23923 return false;
23924 high = maybe_ne (first, 0U);
23926 /* Success! */
23927 if (d->testing_p)
23928 return true;
23930 in0 = d->op0;
23931 in1 = d->op1;
23932 /* We don't need a big-endian lane correction for SVE; see the comment
23933 at the head of aarch64-sve.md for details. */
23934 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23936 std::swap (in0, in1);
23937 high = !high;
23939 out = d->target;
23941 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23942 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
23943 return true;
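/* Hypothetical generators for the index series matched by the TRN, UZP
   and ZIP recognizers above (illustrative only, not part of GCC), with
   the second input's elements numbered NELT..2*NELT-1.  For NELT == 4:
     trn odd=0  -> { 0, 4, 2, 6 }    trn odd=1  -> { 1, 5, 3, 7 }
     uzp odd=0  -> { 0, 2, 4, 6 }    uzp odd=1  -> { 1, 3, 5, 7 }
     zip high=0 -> { 0, 4, 1, 5 }    zip high=1 -> { 2, 6, 3, 7 }  */
#if 0
static unsigned
example_trn_index (unsigned i, unsigned nelt, unsigned odd)
{
  return (i & 1) ? nelt + odd + i - 1 : odd + i;
}

static unsigned
example_uzp_index (unsigned i, unsigned /* nelt */, unsigned odd)
{
  return odd + 2 * i;
}

static unsigned
example_zip_index (unsigned i, unsigned nelt, unsigned high)
{
  unsigned first = high ? nelt / 2 : 0;
  return (i & 1) ? first + nelt + (i - 1) / 2 : first + i / 2;
}
#endif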
23946 /* Recognize patterns for the EXT insn. */
23948 static bool
23949 aarch64_evpc_ext (struct expand_vec_perm_d *d)
23951 HOST_WIDE_INT location;
23952 rtx offset;
23954 /* The first element always refers to the first vector.
23955 Check if the extracted indices are increasing by one. */
23956 if (d->vec_flags == VEC_SVE_PRED
23957 || !d->perm[0].is_constant (&location)
23958 || !d->perm.series_p (0, 1, location, 1))
23959 return false;
23961 /* Success! */
23962 if (d->testing_p)
23963 return true;
23965 /* The case where (location == 0) is a no-op for both big- and little-endian,
23966 and is removed by the mid-end at optimization levels -O1 and higher.
23968 We don't need a big-endian lane correction for SVE; see the comment
23969 at the head of aarch64-sve.md for details. */
23970 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
23972 /* After setup, we want the high elements of the first vector (stored
23973 at the LSB end of the register), and the low elements of the second
23974 vector (stored at the MSB end of the register). So swap. */
23975 std::swap (d->op0, d->op1);
23976 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
23977 to_constant () is safe since this is restricted to Advanced SIMD
23978 vectors. */
23979 location = d->perm.length ().to_constant () - location;
23982 offset = GEN_INT (location);
23983 emit_set_insn (d->target,
23984 gen_rtx_UNSPEC (d->vmode,
23985 gen_rtvec (3, d->op0, d->op1, offset),
23986 UNSPEC_EXT));
23987 return true;
23990 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
23991 within each 64-bit, 32-bit or 16-bit granule. */
23993 static bool
23994 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
23996 HOST_WIDE_INT diff;
23997 unsigned int i, size, unspec;
23998 machine_mode pred_mode;
24000 if (d->vec_flags == VEC_SVE_PRED
24001 || !d->one_vector_p
24002 || !d->perm[0].is_constant (&diff)
24003 || !diff)
24004 return false;
24006 if (d->vec_flags & VEC_SVE_DATA)
24007 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
24008 else
24009 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
24010 if (size == 64)
24012 unspec = UNSPEC_REV64;
24013 pred_mode = VNx2BImode;
24015 else if (size == 32)
24017 unspec = UNSPEC_REV32;
24018 pred_mode = VNx4BImode;
24020 else if (size == 16)
24022 unspec = UNSPEC_REV16;
24023 pred_mode = VNx8BImode;
24025 else
24026 return false;
24028 unsigned int step = diff + 1;
24029 for (i = 0; i < step; ++i)
24030 if (!d->perm.series_p (i, step, diff - i, step))
24031 return false;
24033 /* Success! */
24034 if (d->testing_p)
24035 return true;
24037 if (d->vec_flags & VEC_SVE_DATA)
24039 rtx pred = aarch64_ptrue_reg (pred_mode);
24040 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
24041 d->target, pred, d->op0));
24042 return true;
24044 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
24045 emit_set_insn (d->target, src);
24046 return true;
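/* The permutation matched above reverses the elements within each
   granule of STEP == diff + 1 elements; e.g. 16-bit elements in 32-bit
   granules (REV32, STEP == 2) give { 1, 0, 3, 2, 5, 4, ... }.  A
   hypothetical index generator (illustrative only, not part of GCC):  */
#if 0
static unsigned
example_rev_local_index (unsigned i, unsigned step)
{
  return (i / step) * step + (step - 1 - i % step);
}
#endif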
24049 /* Recognize patterns for the REV insn, which reverses elements within
24050 a full vector. */
24052 static bool
24053 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
24055 poly_uint64 nelt = d->perm.length ();
24057 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
24058 return false;
24060 if (!d->perm.series_p (0, 1, nelt - 1, -1))
24061 return false;
24063 /* Success! */
24064 if (d->testing_p)
24065 return true;
24067 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
24068 emit_set_insn (d->target, src);
24069 return true;
24072 static bool
24073 aarch64_evpc_dup (struct expand_vec_perm_d *d)
24075 rtx out = d->target;
24076 rtx in0;
24077 HOST_WIDE_INT elt;
24078 machine_mode vmode = d->vmode;
24079 rtx lane;
24081 if (d->vec_flags == VEC_SVE_PRED
24082 || d->perm.encoding ().encoded_nelts () != 1
24083 || !d->perm[0].is_constant (&elt))
24084 return false;
24086 if ((d->vec_flags & VEC_SVE_DATA)
24087 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
24088 return false;
24090 /* Success! */
24091 if (d->testing_p)
24092 return true;
24094 /* The generic preparation in aarch64_expand_vec_perm_const_1
24095 swaps the operand order and the permute indices if it finds
24096 d->perm[0] to be in the second operand. Thus, we can always
24097 use d->op0 and need not do any extra arithmetic to get the
24098 correct lane number. */
24099 in0 = d->op0;
24100 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
24102 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
24103 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
24104 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
24105 return true;
24108 static bool
24109 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
24111 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
24112 machine_mode vmode = d->vmode;
24114 /* Make sure that the indices are constant. */
24115 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
24116 for (unsigned int i = 0; i < encoded_nelts; ++i)
24117 if (!d->perm[i].is_constant ())
24118 return false;
24120 if (d->testing_p)
24121 return true;
24123 /* Generic code will try constant permutation twice. Once with the
24124 original mode and again with the elements lowered to QImode.
24125 So wait and don't do the selector expansion ourselves. */
24126 if (vmode != V8QImode && vmode != V16QImode)
24127 return false;
24129 /* to_constant is safe since this routine is specific to Advanced SIMD
24130 vectors. */
24131 unsigned int nelt = d->perm.length ().to_constant ();
24132 for (unsigned int i = 0; i < nelt; ++i)
24133 /* If big-endian and two vectors we end up with a weird mixed-endian
24134 mode on NEON. Reverse the index within each word but not the word
24135 itself. to_constant is safe because we checked is_constant above. */
24136 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
24137 ? d->perm[i].to_constant () ^ (nelt - 1)
24138 : d->perm[i].to_constant ());
24140 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
24141 sel = force_reg (vmode, sel);
24143 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
24144 return true;
24147 /* Try to implement D using an SVE TBL instruction. */
24149 static bool
24150 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
24152 unsigned HOST_WIDE_INT nelt;
24154 /* Permuting two variable-length vectors could overflow the
24155 index range. */
24156 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
24157 return false;
24159 if (d->testing_p)
24160 return true;
24162 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
24163 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
24164 if (d->one_vector_p)
24165 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
24166 else
24167 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
24168 return true;
24171 /* Try to implement D using SVE dup instruction. */
24173 static bool
24174 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
24176 if (BYTES_BIG_ENDIAN
24177 || !d->one_vector_p
24178 || d->vec_flags != VEC_SVE_DATA
24179 || d->op_vec_flags != VEC_ADVSIMD
24180 || d->perm.encoding ().nelts_per_pattern () != 1
24181 || !known_eq (d->perm.encoding ().npatterns (),
24182 GET_MODE_NUNITS (d->op_mode))
24183 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
24184 return false;
24186 int npatterns = d->perm.encoding ().npatterns ();
24187 for (int i = 0; i < npatterns; i++)
24188 if (!known_eq (d->perm[i], i))
24189 return false;
24191 if (d->testing_p)
24192 return true;
24194 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
24195 return true;
24198 /* Try to implement D using SVE SEL instruction. */
24200 static bool
24201 aarch64_evpc_sel (struct expand_vec_perm_d *d)
24203 machine_mode vmode = d->vmode;
24204 int unit_size = GET_MODE_UNIT_SIZE (vmode);
24206 if (d->vec_flags != VEC_SVE_DATA
24207 || unit_size > 8)
24208 return false;
24210 int n_patterns = d->perm.encoding ().npatterns ();
24211 poly_int64 vec_len = d->perm.length ();
24213 for (int i = 0; i < n_patterns; ++i)
24214 if (!known_eq (d->perm[i], i)
24215 && !known_eq (d->perm[i], vec_len + i))
24216 return false;
24218 for (int i = n_patterns; i < n_patterns * 2; i++)
24219 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
24220 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
24221 return false;
24223 if (d->testing_p)
24224 return true;
24226 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
24228 /* Build a predicate that is true when op0 elements should be used. */
24229 rtx_vector_builder builder (pred_mode, n_patterns, 2);
24230 for (int i = 0; i < n_patterns * 2; i++)
24232 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
24233 : CONST0_RTX (BImode);
24234 builder.quick_push (elem);
24237 rtx const_vec = builder.build ();
24238 rtx pred = force_reg (pred_mode, const_vec);
24239 /* TARGET = PRED ? OP0 : OP1. */
24240 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
24241 return true;
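/* A hypothetical fixed-length model of the blend pattern recognized
   above (illustrative only, not part of GCC).  Each lane I must come
   either from the first input (index I) or from the second
   (index NELT + I); e.g. { 0, 5, 2, 7 } is a valid blend for NELT == 4.
   The real code checks this on the variable-length encoding instead.  */
#if 0
static bool
example_blend_p (const unsigned *perm, unsigned nelt)
{
  for (unsigned i = 0; i < nelt; i++)
    if (perm[i] != i && perm[i] != nelt + i)
      return false;
  return true;
}
#endif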
24244 /* Recognize patterns suitable for the INS instructions. */
24245 static bool
24246 aarch64_evpc_ins (struct expand_vec_perm_d *d)
24248 machine_mode mode = d->vmode;
24249 unsigned HOST_WIDE_INT nelt;
24251 if (d->vec_flags != VEC_ADVSIMD)
24252 return false;
24254 /* to_constant is safe since this routine is specific to Advanced SIMD
24255 vectors. */
24256 nelt = d->perm.length ().to_constant ();
24257 rtx insv = d->op0;
24259 HOST_WIDE_INT idx = -1;
24261 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24263 HOST_WIDE_INT elt;
24264 if (!d->perm[i].is_constant (&elt))
24265 return false;
24266 if (elt == (HOST_WIDE_INT) i)
24267 continue;
24268 if (idx != -1)
24270 idx = -1;
24271 break;
24273 idx = i;
24276 if (idx == -1)
24278 insv = d->op1;
24279 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24281 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24282 continue;
24283 if (idx != -1)
24284 return false;
24285 idx = i;
24288 if (idx == -1)
24289 return false;
24292 if (d->testing_p)
24293 return true;
24295 gcc_assert (idx != -1);
24297 unsigned extractindex = d->perm[idx].to_constant ();
24298 rtx extractv = d->op0;
24299 if (extractindex >= nelt)
24301 extractv = d->op1;
24302 extractindex -= nelt;
24304 gcc_assert (extractindex < nelt);
24306 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24307 expand_operand ops[5];
24308 create_output_operand (&ops[0], d->target, mode);
24309 create_input_operand (&ops[1], insv, mode);
24310 create_integer_operand (&ops[2], 1 << idx);
24311 create_input_operand (&ops[3], extractv, mode);
24312 create_integer_operand (&ops[4], extractindex);
24313 expand_insn (icode, 5, ops);
24315 return true;
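/* A hypothetical fixed-length model of the pattern recognized above
   (illustrative only, not part of GCC).  The permutation must copy one
   input through unchanged except for exactly one lane, which is inserted
   from an arbitrary element; e.g. { 0, 1, 6, 3 } for NELT == 4 inserts
   element 6 into lane 2.  */
#if 0
static bool
example_ins_p (const unsigned *perm, unsigned nelt)
{
  unsigned not_from_op0 = 0, not_from_op1 = 0;
  for (unsigned i = 0; i < nelt; i++)
    {
      if (perm[i] != i)
        not_from_op0++;
      if (perm[i] != i + nelt)
        not_from_op1++;
    }
  /* Exactly one lane deviates from passing one of the inputs through.  */
  return not_from_op0 == 1 || not_from_op1 == 1;
}
#endif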
24318 static bool
24319 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24321 gcc_assert (d->op_mode != E_VOIDmode);
24323 /* The pattern matching functions above are written to look for a small
24324 number to begin the sequence (0, 1, N/2). If we begin with an index
24325 from the second operand, we can swap the operands. */
24326 poly_int64 nelt = d->perm.length ();
24327 if (known_ge (d->perm[0], nelt))
24329 d->perm.rotate_inputs (1);
24330 std::swap (d->op0, d->op1);
24333 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
24334 || d->vec_flags == VEC_SVE_DATA
24335 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24336 || d->vec_flags == VEC_SVE_PRED)
24337 && known_gt (nelt, 1))
24339 if (d->vmode == d->op_mode)
24341 if (aarch64_evpc_rev_local (d))
24342 return true;
24343 else if (aarch64_evpc_rev_global (d))
24344 return true;
24345 else if (aarch64_evpc_ext (d))
24346 return true;
24347 else if (aarch64_evpc_dup (d))
24348 return true;
24349 else if (aarch64_evpc_zip (d))
24350 return true;
24351 else if (aarch64_evpc_uzp (d))
24352 return true;
24353 else if (aarch64_evpc_trn (d))
24354 return true;
24355 else if (aarch64_evpc_sel (d))
24356 return true;
24357 else if (aarch64_evpc_ins (d))
24358 return true;
24359 else if (aarch64_evpc_reencode (d))
24360 return true;
24362 if (d->vec_flags == VEC_SVE_DATA)
24363 return aarch64_evpc_sve_tbl (d);
24364 else if (d->vec_flags == VEC_ADVSIMD)
24365 return aarch64_evpc_tbl (d);
24367 else
24369 if (aarch64_evpc_sve_dup (d))
24370 return true;
24373 return false;
24376 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24378 static bool
24379 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24380 rtx target, rtx op0, rtx op1,
24381 const vec_perm_indices &sel)
24383 struct expand_vec_perm_d d;
24385 /* Check whether the mask can be applied to a single vector. */
24386 if (sel.ninputs () == 1
24387 || (op0 && rtx_equal_p (op0, op1)))
24388 d.one_vector_p = true;
24389 else if (sel.all_from_input_p (0))
24391 d.one_vector_p = true;
24392 op1 = op0;
24394 else if (sel.all_from_input_p (1))
24396 d.one_vector_p = true;
24397 op0 = op1;
24399 else
24400 d.one_vector_p = false;
24402 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24403 sel.nelts_per_input ());
24404 d.vmode = vmode;
24405 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24406 d.op_mode = op_mode;
24407 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
24408 d.target = target;
24409 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
24410 if (op0 == op1)
24411 d.op1 = d.op0;
24412 else
24413 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
24414 d.testing_p = !target;
24416 if (!d.testing_p)
24417 return aarch64_expand_vec_perm_const_1 (&d);
24419 rtx_insn *last = get_last_insn ();
24420 bool ret = aarch64_expand_vec_perm_const_1 (&d);
24421 gcc_assert (last == get_last_insn ());
24423 return ret;
24425 /* Generate a byte permute mask for a register of mode MODE,
24426 which has NUNITS units. */
24429 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24431 /* We have to reverse each vector because we don't have
24432 a permuted load that can reverse-load according to ABI rules. */
24433 rtx mask;
24434 rtvec v = rtvec_alloc (16);
24435 unsigned int i, j;
24436 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24438 gcc_assert (BYTES_BIG_ENDIAN);
24439 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24441 for (i = 0; i < nunits; i++)
24442 for (j = 0; j < usize; j++)
24443 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24444 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24445 return force_reg (V16QImode, mask);
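/* The byte mask built above lists, for each USIZE-byte element, its
   bytes in reverse order, so a vector of 16-bit units yields the byte
   indices { 1, 0, 3, 2, 5, 4, ... }.  A hypothetical scalar sketch of
   the same loop (illustrative only, not part of GCC):  */
#if 0
static void
example_reverse_mask_bytes (unsigned char *mask, unsigned nunits,
                            unsigned usize)
{
  for (unsigned i = 0; i < nunits; i++)
    for (unsigned j = 0; j < usize; j++)
      mask[i * usize + j] = (i + 1) * usize - 1 - j;
}
#endif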
24448 /* Expand an SVE integer comparison using the SVE equivalent of:
24450 (set TARGET (CODE OP0 OP1)). */
24452 void
24453 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24455 machine_mode pred_mode = GET_MODE (target);
24456 machine_mode data_mode = GET_MODE (op0);
24457 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24458 op0, op1);
24459 if (!rtx_equal_p (target, res))
24460 emit_move_insn (target, res);
24463 /* Return the UNSPEC_COND_* code for comparison CODE. */
24465 static unsigned int
24466 aarch64_unspec_cond_code (rtx_code code)
24468 switch (code)
24470 case NE:
24471 return UNSPEC_COND_FCMNE;
24472 case EQ:
24473 return UNSPEC_COND_FCMEQ;
24474 case LT:
24475 return UNSPEC_COND_FCMLT;
24476 case GT:
24477 return UNSPEC_COND_FCMGT;
24478 case LE:
24479 return UNSPEC_COND_FCMLE;
24480 case GE:
24481 return UNSPEC_COND_FCMGE;
24482 case UNORDERED:
24483 return UNSPEC_COND_FCMUO;
24484 default:
24485 gcc_unreachable ();
24489 /* Emit:
24491 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24493 where <X> is the operation associated with comparison CODE.
24494 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24496 static void
24497 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24498 bool known_ptrue_p, rtx op0, rtx op1)
24500 rtx flag = gen_int_mode (known_ptrue_p, SImode);
24501 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24502 gen_rtvec (4, pred, flag, op0, op1),
24503 aarch64_unspec_cond_code (code));
24504 emit_set_insn (target, unspec);
24507 /* Emit the SVE equivalent of:
24509 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24510 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24511 (set TARGET (ior:PRED_MODE TMP1 TMP2))
24513 where <Xi> is the operation associated with comparison CODEi.
24514 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24516 static void
24517 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24518 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24520 machine_mode pred_mode = GET_MODE (pred);
24521 rtx tmp1 = gen_reg_rtx (pred_mode);
24522 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24523 rtx tmp2 = gen_reg_rtx (pred_mode);
24524 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24525 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24528 /* Emit the SVE equivalent of:
24530 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24531 (set TARGET (not TMP))
24533 where <X> is the operation associated with comparison CODE.
24534 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24536 static void
24537 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24538 bool known_ptrue_p, rtx op0, rtx op1)
24540 machine_mode pred_mode = GET_MODE (pred);
24541 rtx tmp = gen_reg_rtx (pred_mode);
24542 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24543 aarch64_emit_unop (target, one_cmpl_optab, tmp);
24546 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24548 (set TARGET (CODE OP0 OP1))
24550 If CAN_INVERT_P is true, the caller can also handle inverted results;
24551 return true if the result is in fact inverted. */
24553 bool
24554 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24555 rtx op0, rtx op1, bool can_invert_p)
24557 machine_mode pred_mode = GET_MODE (target);
24558 machine_mode data_mode = GET_MODE (op0);
24560 rtx ptrue = aarch64_ptrue_reg (pred_mode);
24561 switch (code)
24563 case UNORDERED:
24564 /* UNORDERED has no immediate form. */
24565 op1 = force_reg (data_mode, op1);
24566 /* fall through */
24567 case LT:
24568 case LE:
24569 case GT:
24570 case GE:
24571 case EQ:
24572 case NE:
24574 /* There is native support for the comparison. */
24575 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24576 return false;
24579 case LTGT:
24580 /* This is a trapping operation (LT or GT). */
24581 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24582 return false;
24584 case UNEQ:
24585 if (!flag_trapping_math)
24587 /* This would trap for signaling NaNs. */
24588 op1 = force_reg (data_mode, op1);
24589 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24590 ptrue, true, op0, op1);
24591 return false;
24593 /* fall through */
24594 case UNLT:
24595 case UNLE:
24596 case UNGT:
24597 case UNGE:
24598 if (flag_trapping_math)
24600 /* Work out which elements are ordered. */
24601 rtx ordered = gen_reg_rtx (pred_mode);
24602 op1 = force_reg (data_mode, op1);
24603 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24604 ptrue, true, op0, op1);
24606 /* Test the opposite condition for the ordered elements,
24607 then invert the result. */
24608 if (code == UNEQ)
24609 code = NE;
24610 else
24611 code = reverse_condition_maybe_unordered (code);
24612 if (can_invert_p)
24614 aarch64_emit_sve_fp_cond (target, code,
24615 ordered, false, op0, op1);
24616 return true;
24618 aarch64_emit_sve_invert_fp_cond (target, code,
24619 ordered, false, op0, op1);
24620 return false;
24622 break;
24624 case ORDERED:
24625 /* ORDERED has no immediate form. */
24626 op1 = force_reg (data_mode, op1);
24627 break;
24629 default:
24630 gcc_unreachable ();
24633 /* There is native support for the inverse comparison. */
24634 code = reverse_condition_maybe_unordered (code);
24635 if (can_invert_p)
24637 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24638 return true;
24640 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24641 return false;
24644 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24645 of the data being selected and CMP_MODE is the mode of the values being
24646 compared. */
24648 void
24649 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24650 rtx *ops)
24652 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24653 rtx pred = gen_reg_rtx (pred_mode);
24654 if (FLOAT_MODE_P (cmp_mode))
24656 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24657 ops[4], ops[5], true))
24658 std::swap (ops[1], ops[2]);
24660 else
24661 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24663 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24664 ops[1] = force_reg (data_mode, ops[1]);
24665 /* The "false" value can only be zero if the "true" value is a constant. */
24666 if (register_operand (ops[1], data_mode)
24667 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24668 ops[2] = force_reg (data_mode, ops[2]);
24670 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24671 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24674 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24675 true. However, due to issues with register allocation it is preferable
24676 to avoid tying integer scalar and FP scalar modes. Executing integer
24677 operations in general registers is better than treating them as scalar
24678 vector operations. This reduces latency and avoids redundant int<->FP
24679 moves. So tie modes if they are either the same class, or vector modes
24680 with other vector modes, vector structs or any scalar mode. */
24682 static bool
24683 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24685 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24686 != aarch64_advsimd_partial_struct_mode_p (mode2))
24687 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24688 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24689 return false;
24691 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24692 return true;
24694 /* We specifically want to allow elements of "structure" modes to
24695 be tieable to the structure. This more general condition allows
24696 other rarer situations too. The reason we don't extend this to
24697 predicate modes is that there are no predicate structure modes
24698 nor any specific instructions for extracting part of a predicate
24699 register. */
24700 if (aarch64_vector_data_mode_p (mode1)
24701 && aarch64_vector_data_mode_p (mode2))
24702 return true;
24704 /* Also allow any scalar modes with vectors. */
24705 if (aarch64_vector_mode_supported_p (mode1)
24706 || aarch64_vector_mode_supported_p (mode2))
24707 return true;
24709 return false;
24712 /* Return a new RTX holding the result of moving POINTER forward by
24713 AMOUNT bytes. */
24715 static rtx
24716 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24718 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24720 return adjust_automodify_address (pointer, GET_MODE (pointer),
24721 next, amount);
24724 /* Return a new RTX holding the result of moving POINTER forward by the
24725 size of the mode it points to. */
24727 static rtx
24728 aarch64_progress_pointer (rtx pointer)
24730 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24733 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24734 MODE bytes. */
24736 static void
24737 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24738 machine_mode mode)
24740 /* Handle 256-bit memcpy separately. We do this by making two adjacent
24741 V4SImode memory copies so that we can use Q registers. */
24742 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24744 mode = V4SImode;
24745 rtx reg1 = gen_reg_rtx (mode);
24746 rtx reg2 = gen_reg_rtx (mode);
24747 /* "Cast" the pointers to the correct mode. */
24748 *src = adjust_address (*src, mode, 0);
24749 *dst = adjust_address (*dst, mode, 0);
24750 /* Emit the memcpy. */
24751 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24752 aarch64_progress_pointer (*src)));
24753 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24754 aarch64_progress_pointer (*dst), reg2));
24755 /* Move the pointers forward. */
24756 *src = aarch64_move_pointer (*src, 32);
24757 *dst = aarch64_move_pointer (*dst, 32);
24758 return;
24761 rtx reg = gen_reg_rtx (mode);
24763 /* "Cast" the pointers to the correct mode. */
24764 *src = adjust_address (*src, mode, 0);
24765 *dst = adjust_address (*dst, mode, 0);
24766 /* Emit the memcpy. */
24767 emit_move_insn (reg, *src);
24768 emit_move_insn (*dst, reg);
24769 /* Move the pointers forward. */
24770 *src = aarch64_progress_pointer (*src);
24771 *dst = aarch64_progress_pointer (*dst);
24774 /* Expand a cpymem using the MOPS extension. OPERANDS are taken
24775 from the cpymem pattern. Return true iff we succeeded. */
24776 static bool
24777 aarch64_expand_cpymem_mops (rtx *operands)
24779 if (!TARGET_MOPS)
24780 return false;
24782 /* All three registers are changed by the instruction, so each one
24783 must be a fresh pseudo. */
24784 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24785 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24786 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24787 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24788 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24789 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24791 return true;
24794 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24795 we succeed, otherwise return false, indicating that a libcall to
24796 memcpy should be emitted. */
24798 bool
24799 aarch64_expand_cpymem (rtx *operands)
24801 int mode_bits;
24802 rtx dst = operands[0];
24803 rtx src = operands[1];
24804 rtx base;
24805 machine_mode cur_mode = BLKmode;
24807 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24808 if (!CONST_INT_P (operands[2]))
24809 return aarch64_expand_cpymem_mops (operands);
24811 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24813 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24814 unsigned HOST_WIDE_INT max_copy_size
24815 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24817 bool size_p = optimize_function_for_size_p (cfun);
24819 /* Large constant-sized cpymem should go through MOPS when possible.
24820 It should be a win even for size optimization in the general case.
24821 For speed optimization the choice between MOPS and the SIMD sequence
24822 depends on the size of the copy, rather than number of instructions,
24823 alignment etc. */
24824 if (size > max_copy_size)
24825 return aarch64_expand_cpymem_mops (operands);
24827 int copy_bits = 256;
24829 /* Default to 256-bit LDP/STP on large copies; fall back to 128-bit chunks
24830 for small copies, when SIMD is unavailable, or when 256-bit LDP/STP is slow. */
24831 if (size <= 24
24832 || !TARGET_SIMD
24833 || (aarch64_tune_params.extra_tuning_flags
24834 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24835 copy_bits = 128;
24837 /* Emit an inline load+store sequence and count the number of operations
24838 involved. We use a simple count of just the loads and stores emitted
24839 rather than rtx_insn count as all the pointer adjustments and reg copying
24840 in this function will get optimized away later in the pipeline. */
24841 start_sequence ();
24842 unsigned nops = 0;
24844 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24845 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24847 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24848 src = adjust_automodify_address (src, VOIDmode, base, 0);
24850 /* Convert size to bits to make the rest of the code simpler. */
24851 int n = size * BITS_PER_UNIT;
24853 while (n > 0)
24855 /* Find the largest mode in which to do the copy without over-reading
24856 or over-writing. */
24857 opt_scalar_int_mode mode_iter;
24858 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24859 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24860 cur_mode = mode_iter.require ();
24862 gcc_assert (cur_mode != BLKmode);
24864 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24866 /* Prefer Q-register accesses for the last bytes. */
24867 if (mode_bits == 128 && copy_bits == 256)
24868 cur_mode = V4SImode;
24870 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24871 /* A single block copy is 1 load + 1 store. */
24872 nops += 2;
24873 n -= mode_bits;
24875 /* Emit trailing copies using overlapping unaligned accesses
24876 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
24877 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24879 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
24880 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24881 gcc_assert (n_bits <= mode_bits);
24882 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
24883 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24884 n = n_bits;
24887 rtx_insn *seq = get_insns ();
24888 end_sequence ();
24889 /* The MOPS sequence requires 3 instructions for the memory copy + 1 to
24890 move the constant size into a register. */
24891 unsigned mops_cost = 3 + 1;
24893 /* If MOPS is available at this point we don't consider the libcall as it's
24894 not a win even on code size. At this point only consider MOPS if
24895 optimizing for size. For speed optimizations we will have chosen between
24896 the two based on copy size already. */
24897 if (TARGET_MOPS)
24899 if (size_p && mops_cost < nops)
24900 return aarch64_expand_cpymem_mops (operands);
24901 emit_insn (seq);
24902 return true;
24905 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
24906 arguments + 1 for the call. When MOPS is not available and we're
24907 optimizing for size, a libcall may be preferable. */
24908 unsigned libcall_cost = 4;
24909 if (size_p && libcall_cost < nops)
24910 return false;
24912 emit_insn (seq);
24913 return true;
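/* A hypothetical, simplified model of the chunking performed above for a
   constant-size copy (illustrative only, not part of GCC), ignoring the
   MOPS path and the 256-bit LDP/STP and Q-register preferences.  It
   repeatedly uses the largest power-of-two access that fits and finishes
   with one overlapping access when fewer than COPY_LIMIT_BITS / 2 bits
   remain; a 15-byte copy with a 128-bit limit becomes two 8-byte
   accesses at offsets 0 and 7.  */
#if 0
#include <cstdio>

static void
example_copy_chunks (unsigned size_bytes, unsigned copy_limit_bits)
{
  unsigned offset = 0;
  int n = size_bytes * 8;
  while (n > 0)
    {
      unsigned bits = 8;
      while (bits * 2 <= (unsigned) n && bits * 2 <= copy_limit_bits)
        bits *= 2;
      std::printf ("copy %u bytes at offset %u\n", bits / 8, offset);
      offset += bits / 8;
      n -= bits;
      if (n > 0 && (unsigned) n < copy_limit_bits / 2)
        {
          unsigned last = 8;
          while (last < (unsigned) n)
            last *= 2;                  /* smallest access that covers N */
          offset -= (last - n) / 8;     /* step back so the tail overlaps */
          n = last;
        }
    }
}
#endif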
24916 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
24917 SRC is a register we have created with the duplicated value to be set. */
24918 static void
24919 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
24920 machine_mode mode)
24922 /* If we are storing 128 bits or 256 bits, we can do that straight from
24923 the SIMD register we prepared. */
24924 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24926 mode = GET_MODE (src);
24927 /* "Cast" the *dst to the correct mode. */
24928 *dst = adjust_address (*dst, mode, 0);
24929 /* Emit the memset. */
24930 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
24931 aarch64_progress_pointer (*dst), src));
24933 /* Move the pointers forward. */
24934 *dst = aarch64_move_pointer (*dst, 32);
24935 return;
24937 if (known_eq (GET_MODE_BITSIZE (mode), 128))
24939 /* "Cast" the *dst to the correct mode. */
24940 *dst = adjust_address (*dst, GET_MODE (src), 0);
24941 /* Emit the memset. */
24942 emit_move_insn (*dst, src);
24943 /* Move the pointers forward. */
24944 *dst = aarch64_move_pointer (*dst, 16);
24945 return;
24947 /* For copying less, we have to extract the right amount from src. */
24948 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
24950 /* "Cast" the *dst to the correct mode. */
24951 *dst = adjust_address (*dst, mode, 0);
24952 /* Emit the memset. */
24953 emit_move_insn (*dst, reg);
24954 /* Move the pointer forward. */
24955 *dst = aarch64_progress_pointer (*dst);
24958 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
24959 as for the setmem pattern. Return true iff we succeed. */
24960 static bool
24961 aarch64_expand_setmem_mops (rtx *operands)
24963 if (!TARGET_MOPS)
24964 return false;
24966 /* The first two registers are changed by the instruction, so both
24967 of them must be a fresh pseudo. */
24968 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24969 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24970 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
24971 rtx val = operands[2];
24972 if (val != CONST0_RTX (QImode))
24973 val = force_reg (QImode, val);
24974 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
24975 return true;
24978 /* Expand setmem, as if from a __builtin_memset. Return true if
24979 we succeed, otherwise return false. */
24981 bool
24982 aarch64_expand_setmem (rtx *operands)
24984 int n, mode_bits;
24985 unsigned HOST_WIDE_INT len;
24986 rtx dst = operands[0];
24987 rtx val = operands[2], src;
24988 rtx base;
24989 machine_mode cur_mode = BLKmode, next_mode;
24991 /* If we don't have SIMD registers or the size is variable, use the MOPS
24992 inlined sequence if possible. */
24993 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
24994 return aarch64_expand_setmem_mops (operands);
24996 bool size_p = optimize_function_for_size_p (cfun);
24998 /* Default the maximum to 256 bytes when considering only libcall vs
24999 SIMD broadcast sequence. */
25000 unsigned max_set_size = 256;
25002 len = INTVAL (operands[1]);
25003 if (len > max_set_size && !TARGET_MOPS)
25004 return false;
25006 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
25007 /* The MOPS sequence takes:
25008 3 instructions for the memory storing
25009 + 1 to move the constant size into a reg
25010 + 1 if VAL is a non-zero constant to move into a reg
25011 (zero constants can use XZR directly). */
25012 unsigned mops_cost = 3 + 1 + cst_val;
25013 /* A libcall to memset in the worst case takes 3 instructions to prepare
25014 the arguments + 1 for the call. */
25015 unsigned libcall_cost = 4;
25017 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
25018 when available. */
25019 if (TARGET_MOPS
25020 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
25021 return aarch64_expand_setmem_mops (operands);
25023 /* Attempt a sequence with a vector broadcast followed by stores.
25024 Count the number of operations involved to see if it's worth it
25025 against the alternatives. A simple counter simd_ops on the
25026 algorithmically-relevant operations is used rather than an rtx_insn count
25027 as all the pointer adjustments and mode reinterprets will be optimized
25028 away later. */
25029 start_sequence ();
25030 unsigned simd_ops = 0;
25032 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
25033 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
25035 /* Prepare the val using a DUP/MOVI v0.16B, val. */
25036 src = expand_vector_broadcast (V16QImode, val);
25037 src = force_reg (V16QImode, src);
25038 simd_ops++;
25039 /* Convert len to bits to make the rest of the code simpler. */
25040 n = len * BITS_PER_UNIT;
25042 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
25043 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
25044 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
25045 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
25046 ? GET_MODE_BITSIZE (TImode) : 256;
25048 while (n > 0)
25050 /* Find the largest mode in which to do the copy without
25051 over-writing. */
25052 opt_scalar_int_mode mode_iter;
25053 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
25054 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
25055 cur_mode = mode_iter.require ();
25057 gcc_assert (cur_mode != BLKmode);
25059 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
25060 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
25061 simd_ops++;
25062 n -= mode_bits;
25064 /* Do certain trailing copies as overlapping if it's going to be
25065 cheaper, i.e. fewer instructions. For instance, for a 15-byte copy
25066 it's more efficient to do two overlapping 8-byte copies than
25067 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
25068 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
25070 next_mode = smallest_mode_for_size (n, MODE_INT);
25071 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25072 gcc_assert (n_bits <= mode_bits);
25073 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25074 n = n_bits;
25077 rtx_insn *seq = get_insns ();
25078 end_sequence ();
25080 if (size_p)
25082 /* When optimizing for size, we have 3 options: the SIMD broadcast sequence,
25083 a call to memset, or the MOPS expansion. */
25084 if (TARGET_MOPS
25085 && mops_cost <= libcall_cost
25086 && mops_cost <= simd_ops)
25087 return aarch64_expand_setmem_mops (operands);
25088 /* If MOPS is not available or not shorter, pick a libcall if the SIMD
25089 sequence is too long. */
25090 else if (libcall_cost < simd_ops)
25091 return false;
25092 emit_insn (seq);
25093 return true;
25096 /* At this point the SIMD broadcast sequence is the best choice when
25097 optimizing for speed. */
25098 emit_insn (seq);
25099 return true;
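/* A hypothetical scalar analogue of the V16QImode broadcast prepared
   above (illustrative only, not part of GCC).  Multiplying the byte
   value by 0x0101010101010101 replicates it into every byte of a 64-bit
   word; the vector expansion achieves the same effect with a DUP or
   MOVI on a Q register.  */
#if 0
#include <cstdint>

static uint64_t
example_broadcast_byte (uint8_t val)
{
  return (uint64_t) val * 0x0101010101010101ULL;
}
#endif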
25103 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
25104 SImode stores. Handle the case when the constant has identical
25105 bottom and top halves. This is beneficial when the two stores can be
25106 merged into an STP and we avoid synthesising potentially expensive
25107 immediates twice. Return true if such a split is possible. */
25109 bool
25110 aarch64_split_dimode_const_store (rtx dst, rtx src)
25112 rtx lo = gen_lowpart (SImode, src);
25113 rtx hi = gen_highpart_mode (SImode, DImode, src);
25115 bool size_p = optimize_function_for_size_p (cfun);
25117 if (!rtx_equal_p (lo, hi))
25118 return false;
25120 unsigned int orig_cost
25121 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
25122 unsigned int lo_cost
25123 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
25125 /* We want to transform:
25126 MOV x1, 49370
25127 MOVK x1, 0x140, lsl 16
25128 MOVK x1, 0xc0da, lsl 32
25129 MOVK x1, 0x140, lsl 48
25130 STR x1, [x0]
25131 into:
25132 MOV w1, 49370
25133 MOVK w1, 0x140, lsl 16
25134 STP w1, w1, [x0]
25135 So we want to perform this only when we save two instructions
25136 or more. When optimizing for size, however, accept any code size
25137 savings we can. */
25138 if (size_p && orig_cost <= lo_cost)
25139 return false;
25141 if (!size_p
25142 && (orig_cost <= lo_cost + 1))
25143 return false;
25145 rtx mem_lo = adjust_address (dst, SImode, 0);
25146 if (!aarch64_mem_pair_operand (mem_lo, SImode))
25147 return false;
25149 rtx tmp_reg = gen_reg_rtx (SImode);
25150 aarch64_expand_mov_immediate (tmp_reg, lo);
25151 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
25152 /* Don't emit an explicit store pair as this may not always be profitable.
25153 Let the sched-fusion logic decide whether to merge them. */
25154 emit_move_insn (mem_lo, tmp_reg);
25155 emit_move_insn (mem_hi, tmp_reg);
25157 return true;
25160 /* Generate RTL for a conditional branch with rtx comparison CODE in
25161 mode CC_MODE. The destination of the unlikely conditional branch
25162 is LABEL_REF. */
25164 void
25165 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
25166 rtx label_ref)
25168 rtx x;
25169 x = gen_rtx_fmt_ee (code, VOIDmode,
25170 gen_rtx_REG (cc_mode, CC_REGNUM),
25171 const0_rtx);
25173 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25174 gen_rtx_LABEL_REF (VOIDmode, label_ref),
25175 pc_rtx);
25176 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25179 /* Generate DImode scratch registers for 128-bit (TImode) addition.
25181 OP1 represents the TImode input operand 1
25182 OP2 represents the TImode input operand 2
25183 LOW_DEST represents the low half (DImode) of TImode operand 0
25184 LOW_IN1 represents the low half (DImode) of TImode operand 1
25185 LOW_IN2 represents the low half (DImode) of TImode operand 2
25186 HIGH_DEST represents the high half (DImode) of TImode operand 0
25187 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25188 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
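/* As an illustrative sketch only (the exact patterns live in aarch64.md),
   the TImode addition built on these scratch registers is typically emitted
   as an ADDS on the low DImode halves followed by an ADC on the high halves,
   e.g.
     adds x0, x2, x4
     adc  x1, x3, x5
   where the register choices are purely for illustration. */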
25190 void
25191 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25192 rtx *low_in1, rtx *low_in2,
25193 rtx *high_dest, rtx *high_in1,
25194 rtx *high_in2)
25196 *low_dest = gen_reg_rtx (DImode);
25197 *low_in1 = gen_lowpart (DImode, op1);
25198 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25199 subreg_lowpart_offset (DImode, TImode));
25200 *high_dest = gen_reg_rtx (DImode);
25201 *high_in1 = gen_highpart (DImode, op1);
25202 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25203 subreg_highpart_offset (DImode, TImode));
25206 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25208 This function differs from 'aarch64_addti_scratch_regs' in that
25209 OP1 can be an immediate constant (zero). We must call
25210 subreg_highpart_offset with DImode and TImode arguments, otherwise
25211 VOIDmode will be used for the const_int, which triggers an internal
25212 error from subreg_size_highpart_offset, which does not expect a size of zero.
25214 OP1 represents the TImode input operand 1
25215 OP2 represents the TImode input operand 2
25216 LOW_DEST represents the low half (DImode) of TImode operand 0
25217 LOW_IN1 represents the low half (DImode) of TImode operand 1
25218 LOW_IN2 represents the low half (DImode) of TImode operand 2
25219 HIGH_DEST represents the high half (DImode) of TImode operand 0
25220 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25221 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25224 void
25225 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25226 rtx *low_in1, rtx *low_in2,
25227 rtx *high_dest, rtx *high_in1,
25228 rtx *high_in2)
25230 *low_dest = gen_reg_rtx (DImode);
25231 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
25232 subreg_lowpart_offset (DImode, TImode));
25234 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25235 subreg_lowpart_offset (DImode, TImode));
25236 *high_dest = gen_reg_rtx (DImode);
25238 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
25239 subreg_highpart_offset (DImode, TImode));
25240 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25241 subreg_highpart_offset (DImode, TImode));
25244 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
25246 OP0 represents the TImode destination operand 0
25247 LOW_DEST represents the low half (DImode) of TImode operand 0
25248 LOW_IN1 represents the low half (DImode) of TImode operand 1
25249 LOW_IN2 represents the low half (DImode) of TImode operand 2
25250 HIGH_DEST represents the high half (DImode) of TImode operand 0
25251 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25252 HIGH_IN2 represents the high half (DImode) of TImode operand 2
25253 UNSIGNED_P is true if the operation is being performed on unsigned
25254 values. */
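/* Illustrative sketch only (the md patterns define the exact RTL): the
   subtraction is emitted as a SUBS on the low DImode halves followed by an
   SBCS on the high halves, with the overflow or borrow check keyed off the
   final flags, e.g.
     subs x0, x2, x4
     sbcs x1, x3, x5
   using example registers. */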
25255 void
25256 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
25257 rtx low_in2, rtx high_dest, rtx high_in1,
25258 rtx high_in2, bool unsigned_p)
25260 if (low_in2 == const0_rtx)
25262 low_dest = low_in1;
25263 high_in2 = force_reg (DImode, high_in2);
25264 if (unsigned_p)
25265 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
25266 else
25267 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
25269 else
25271 if (aarch64_plus_immediate (low_in2, DImode))
25272 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25273 GEN_INT (-UINTVAL (low_in2))));
25274 else
25276 low_in2 = force_reg (DImode, low_in2);
25277 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25279 high_in2 = force_reg (DImode, high_in2);
25281 if (unsigned_p)
25282 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25283 else
25284 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25287 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25288 emit_move_insn (gen_highpart (DImode, op0), high_dest);
25292 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
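/* Note: AddressSanitizer forms a shadow address roughly as
   (addr >> 3) + <offset>, so the values below just have to place the shadow
   region in an otherwise unused part of the address space for the given ABI
   (a general note on intent, not a formal specification of the layout). */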
25294 static unsigned HOST_WIDE_INT
25295 aarch64_asan_shadow_offset (void)
25297 if (TARGET_ILP32)
25298 return (HOST_WIDE_INT_1 << 29);
25299 else
25300 return (HOST_WIDE_INT_1 << 36);
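/* Implement the TARGET_GEN_CCMP_FIRST hook (see the #define further below).
   Expand the first comparison of a conditional-compare chain: the preparation
   and compare insns are recorded in *PREP_SEQ and *GEN_SEQ, and the function
   returns an rtx describing the resulting CC-register comparison, or NULL_RTX
   if the comparison cannot be handled. */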
25303 static rtx
25304 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25305 int code, tree treeop0, tree treeop1)
25307 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25308 rtx op0, op1;
25309 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25310 insn_code icode;
25311 struct expand_operand ops[4];
25313 start_sequence ();
25314 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25316 op_mode = GET_MODE (op0);
25317 if (op_mode == VOIDmode)
25318 op_mode = GET_MODE (op1);
25320 switch (op_mode)
25322 case E_QImode:
25323 case E_HImode:
25324 case E_SImode:
25325 cmp_mode = SImode;
25326 icode = CODE_FOR_cmpsi;
25327 break;
25329 case E_DImode:
25330 cmp_mode = DImode;
25331 icode = CODE_FOR_cmpdi;
25332 break;
25334 case E_SFmode:
25335 cmp_mode = SFmode;
25336 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25337 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25338 break;
25340 case E_DFmode:
25341 cmp_mode = DFmode;
25342 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25343 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25344 break;
25346 default:
25347 end_sequence ();
25348 return NULL_RTX;
25351 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25352 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25353 if (!op0 || !op1)
25355 end_sequence ();
25356 return NULL_RTX;
25358 *prep_seq = get_insns ();
25359 end_sequence ();
25361 create_fixed_operand (&ops[0], op0);
25362 create_fixed_operand (&ops[1], op1);
25364 start_sequence ();
25365 if (!maybe_expand_insn (icode, 2, ops))
25367 end_sequence ();
25368 return NULL_RTX;
25370 *gen_seq = get_insns ();
25371 end_sequence ();
25373 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25374 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
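/* Implement the TARGET_GEN_CCMP_NEXT hook (see the #define further below).
   Expand a subsequent comparison in the chain, conditional on the result of
   PREV, using a CCMP (or CCMP-reverse) pattern. BIT_CODE is AND or IOR and
   describes how this comparison combines with PREV. Returns the new
   comparison rtx, or NULL_RTX on failure. */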
25377 static rtx
25378 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25379 int cmp_code, tree treeop0, tree treeop1, int bit_code)
25381 rtx op0, op1, target;
25382 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25383 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25384 insn_code icode;
25385 struct expand_operand ops[6];
25386 int aarch64_cond;
25388 push_to_sequence (*prep_seq);
25389 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25391 op_mode = GET_MODE (op0);
25392 if (op_mode == VOIDmode)
25393 op_mode = GET_MODE (op1);
25395 switch (op_mode)
25397 case E_QImode:
25398 case E_HImode:
25399 case E_SImode:
25400 cmp_mode = SImode;
25401 break;
25403 case E_DImode:
25404 cmp_mode = DImode;
25405 break;
25407 case E_SFmode:
25408 cmp_mode = SFmode;
25409 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25410 break;
25412 case E_DFmode:
25413 cmp_mode = DFmode;
25414 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25415 break;
25417 default:
25418 end_sequence ();
25419 return NULL_RTX;
25422 icode = code_for_ccmp (cc_mode, cmp_mode);
25424 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25425 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25426 if (!op0 || !op1)
25428 end_sequence ();
25429 return NULL_RTX;
25431 *prep_seq = get_insns ();
25432 end_sequence ();
25434 target = gen_rtx_REG (cc_mode, CC_REGNUM);
25435 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25437 if (bit_code != AND)
25439 /* Treat the ccmp patterns as canonical and use them where possible,
25440 but fall back to ccmp_rev patterns if there's no other option. */
25441 rtx_code prev_code = GET_CODE (prev);
25442 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25443 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25444 && !(prev_code == EQ
25445 || prev_code == NE
25446 || prev_code == ORDERED
25447 || prev_code == UNORDERED))
25448 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25449 else
25451 rtx_code code = reverse_condition (prev_code);
25452 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25454 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25457 create_fixed_operand (&ops[0], XEXP (prev, 0));
25458 create_fixed_operand (&ops[1], target);
25459 create_fixed_operand (&ops[2], op0);
25460 create_fixed_operand (&ops[3], op1);
25461 create_fixed_operand (&ops[4], prev);
25462 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25464 push_to_sequence (*gen_seq);
25465 if (!maybe_expand_insn (icode, 6, ops))
25467 end_sequence ();
25468 return NULL_RTX;
25471 *gen_seq = get_insns ();
25472 end_sequence ();
25474 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
25477 #undef TARGET_GEN_CCMP_FIRST
25478 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25480 #undef TARGET_GEN_CCMP_NEXT
25481 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25483 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25484 instruction fusion of some sort. */
25486 static bool
25487 aarch64_macro_fusion_p (void)
25489 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25493 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25494 should be kept together during scheduling. */
25496 static bool
25497 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25499 rtx set_dest;
25500 rtx prev_set = single_set (prev);
25501 rtx curr_set = single_set (curr);
25502 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
25503 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25505 if (!aarch64_macro_fusion_p ())
25506 return false;
25508 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25510 /* We are trying to match:
25511 prev (mov) == (set (reg r0) (const_int imm16))
25512 curr (movk) == (set (zero_extract (reg r0)
25513 (const_int 16)
25514 (const_int 16))
25515 (const_int imm16_1)) */
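/* In assembly terms this keeps immediate-building pairs such as
     mov  w0, #0x1234
     movk w0, #0x5678, lsl #16
   adjacent, so that cores which fuse MOV/MOVK can combine them
   (illustrative operands only). */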
25517 set_dest = SET_DEST (curr_set);
25519 if (GET_CODE (set_dest) == ZERO_EXTRACT
25520 && CONST_INT_P (SET_SRC (curr_set))
25521 && CONST_INT_P (SET_SRC (prev_set))
25522 && CONST_INT_P (XEXP (set_dest, 2))
25523 && INTVAL (XEXP (set_dest, 2)) == 16
25524 && REG_P (XEXP (set_dest, 0))
25525 && REG_P (SET_DEST (prev_set))
25526 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25528 return true;
25532 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25535 /* We're trying to match:
25536 prev (adrp) == (set (reg r1)
25537 (high (symbol_ref ("SYM"))))
25538 curr (add) == (set (reg r0)
25539 (lo_sum (reg r1)
25540 (symbol_ref ("SYM"))))
25541 Note that r0 need not necessarily be the same as r1, especially
25542 during pre-regalloc scheduling. */
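/* I.e. keep address-forming pairs such as
     adrp x1, SYM
     add  x0, x1, :lo12:SYM
   together (illustrative registers only). */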
25544 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25545 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25547 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25548 && REG_P (XEXP (SET_SRC (curr_set), 0))
25549 && REGNO (XEXP (SET_SRC (curr_set), 0))
25550 == REGNO (SET_DEST (prev_set))
25551 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25552 XEXP (SET_SRC (curr_set), 1)))
25553 return true;
25557 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25560 /* We're trying to match:
25561 prev (movk) == (set (zero_extract (reg r0)
25562 (const_int 16)
25563 (const_int 32))
25564 (const_int imm16_1))
25565 curr (movk) == (set (zero_extract (reg r0)
25566 (const_int 16)
25567 (const_int 48))
25568 (const_int imm16_2)) */
25570 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25571 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25572 && REG_P (XEXP (SET_DEST (prev_set), 0))
25573 && REG_P (XEXP (SET_DEST (curr_set), 0))
25574 && REGNO (XEXP (SET_DEST (prev_set), 0))
25575 == REGNO (XEXP (SET_DEST (curr_set), 0))
25576 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25577 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25578 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25579 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25580 && CONST_INT_P (SET_SRC (prev_set))
25581 && CONST_INT_P (SET_SRC (curr_set)))
25582 return true;
25585 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25587 /* We're trying to match:
25588 prev (adrp) == (set (reg r0)
25589 (high (symbol_ref ("SYM"))))
25590 curr (ldr) == (set (reg r1)
25591 (mem (lo_sum (reg r0)
25592 (symbol_ref ("SYM")))))
25594 curr (ldr) == (set (reg r1)
25595 (zero_extend (mem
25596 (lo_sum (reg r0)
25597 (symbol_ref ("SYM")))))) */
25598 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25599 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25601 rtx curr_src = SET_SRC (curr_set);
25603 if (GET_CODE (curr_src) == ZERO_EXTEND)
25604 curr_src = XEXP (curr_src, 0);
25606 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25607 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25608 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25609 == REGNO (SET_DEST (prev_set))
25610 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25611 XEXP (SET_SRC (prev_set), 0)))
25612 return true;
25616 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
25617 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25618 && prev_set && curr_set && any_condjump_p (curr)
25619 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25620 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25621 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25622 return true;
25624 /* Fuse flag-setting ALU instructions and conditional branch. */
25625 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25626 && any_condjump_p (curr))
25628 unsigned int condreg1, condreg2;
25629 rtx cc_reg_1;
25630 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25631 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25633 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25634 && prev
25635 && modified_in_p (cc_reg_1, prev))
25637 enum attr_type prev_type = get_attr_type (prev);
25639 /* FIXME: this misses some cases that are considered simple arithmetic
25640 instructions for ThunderX. Simple shifts are missed here. */
25641 if (prev_type == TYPE_ALUS_SREG
25642 || prev_type == TYPE_ALUS_IMM
25643 || prev_type == TYPE_LOGICS_REG
25644 || prev_type == TYPE_LOGICS_IMM)
25645 return true;
25649 /* Fuse ALU instructions and CBZ/CBNZ. */
25650 if (prev_set
25651 && curr_set
25652 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25653 && any_condjump_p (curr))
25655 /* We're trying to match:
25656 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25657 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25658 (const_int 0))
25659 (label_ref ("SYM"))
25660 (pc)) */
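/* At the assembly level this corresponds to keeping pairs such as
     add  x0, x0, x1
     cbnz x0, .Ltarget
   adjacent (illustrative operands only). */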
25661 if (SET_DEST (curr_set) == (pc_rtx)
25662 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25663 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25664 && REG_P (SET_DEST (prev_set))
25665 && REGNO (SET_DEST (prev_set))
25666 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25668 /* Fuse ALU operations followed by conditional branch instruction. */
25669 switch (get_attr_type (prev))
25671 case TYPE_ALU_IMM:
25672 case TYPE_ALU_SREG:
25673 case TYPE_ADC_REG:
25674 case TYPE_ADC_IMM:
25675 case TYPE_ADCS_REG:
25676 case TYPE_ADCS_IMM:
25677 case TYPE_LOGIC_REG:
25678 case TYPE_LOGIC_IMM:
25679 case TYPE_CSEL:
25680 case TYPE_ADR:
25681 case TYPE_MOV_IMM:
25682 case TYPE_SHIFT_REG:
25683 case TYPE_SHIFT_IMM:
25684 case TYPE_BFM:
25685 case TYPE_RBIT:
25686 case TYPE_REV:
25687 case TYPE_EXTEND:
25688 return true;
25690 default:;
25695 /* Fuse A+B+1 and A-B-1 */
25696 if (simple_sets_p
25697 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
25699 /* We're trying to match:
25700 prev == (set (r0) (plus (r0) (r1)))
25701 curr == (set (r0) (plus (r0) (const_int 1)))
25703 prev == (set (r0) (minus (r0) (r1)))
25704 curr == (set (r0) (plus (r0) (const_int -1))) */
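/* At the assembly level this covers pairs such as
     add x0, x0, x1
     add x0, x0, #1
   and the analogous
     sub x0, x0, x1
     sub x0, x0, #1
   (illustrative operands only; the RTL above represents the second insn's
   constant as +1 or -1). */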
25706 rtx prev_src = SET_SRC (prev_set);
25707 rtx curr_src = SET_SRC (curr_set);
25709 int polarity = 1;
25710 if (GET_CODE (prev_src) == MINUS)
25711 polarity = -1;
25713 if (GET_CODE (curr_src) == PLUS
25714 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
25715 && CONST_INT_P (XEXP (curr_src, 1))
25716 && INTVAL (XEXP (curr_src, 1)) == polarity
25717 && REG_P (XEXP (curr_src, 0))
25718 && REG_P (SET_DEST (prev_set))
25719 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
25720 return true;
25723 return false;
25726 /* Return true iff the instruction fusion described by OP is enabled. */
25728 bool
25729 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25731 return (aarch64_tune_params.fusible_ops & op) != 0;
25734 /* If MEM is in the form [base+offset], extract the two parts
25735 of the address and store them in BASE and OFFSET; otherwise return false
25736 after clearing BASE and OFFSET. */
25738 bool
25739 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25741 rtx addr;
25743 gcc_assert (MEM_P (mem));
25745 addr = XEXP (mem, 0);
25747 if (REG_P (addr))
25749 *base = addr;
25750 *offset = const0_rtx;
25751 return true;
25754 if (GET_CODE (addr) == PLUS
25755 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25757 *base = XEXP (addr, 0);
25758 *offset = XEXP (addr, 1);
25759 return true;
25762 *base = NULL_RTX;
25763 *offset = NULL_RTX;
25765 return false;
25768 /* Types for scheduling fusion. */
25769 enum sched_fusion_type
25771 SCHED_FUSION_NONE = 0,
25772 SCHED_FUSION_LD_SIGN_EXTEND,
25773 SCHED_FUSION_LD_ZERO_EXTEND,
25774 SCHED_FUSION_LD,
25775 SCHED_FUSION_ST,
25776 SCHED_FUSION_NUM
25779 /* If INSN is a load or store whose address is in the form [base+offset],
25780 extract the two parts and store them in BASE and OFFSET. Return the
25781 scheduling fusion type of this INSN. */
25783 static enum sched_fusion_type
25784 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25786 rtx x, dest, src;
25787 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25789 gcc_assert (INSN_P (insn));
25790 x = PATTERN (insn);
25791 if (GET_CODE (x) != SET)
25792 return SCHED_FUSION_NONE;
25794 src = SET_SRC (x);
25795 dest = SET_DEST (x);
25797 machine_mode dest_mode = GET_MODE (dest);
25799 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25800 return SCHED_FUSION_NONE;
25802 if (GET_CODE (src) == SIGN_EXTEND)
25804 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25805 src = XEXP (src, 0);
25806 if (!MEM_P (src) || GET_MODE (src) != SImode)
25807 return SCHED_FUSION_NONE;
25809 else if (GET_CODE (src) == ZERO_EXTEND)
25811 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25812 src = XEXP (src, 0);
25813 if (!MEM_P (src) || GET_MODE (src) != SImode)
25814 return SCHED_FUSION_NONE;
25817 if (MEM_P (src) && REG_P (dest))
25818 extract_base_offset_in_addr (src, base, offset);
25819 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25821 fusion = SCHED_FUSION_ST;
25822 extract_base_offset_in_addr (dest, base, offset);
25824 else
25825 return SCHED_FUSION_NONE;
25827 if (*base == NULL_RTX || *offset == NULL_RTX)
25828 fusion = SCHED_FUSION_NONE;
25830 return fusion;
25833 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25835 Currently we only support fusing ldr and str instructions, so FUSION_PRI
25836 and PRI are only calculated for these instructions. For other instructions,
25837 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
25838 types of instruction fusion can be added by returning different priorities.
25840 It's important that irrelevant instructions get the largest FUSION_PRI. */
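/* For example, two SImode loads from [x0, #8] and [x0, #12] receive the same
   FUSION_PRI (same fusion type and base register) but different PRI, with the
   smaller offset getting the higher priority so it is scheduled first (a
   worked illustration of the scheme described above). */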
25842 static void
25843 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25844 int *fusion_pri, int *pri)
25846 int tmp, off_val;
25847 rtx base, offset;
25848 enum sched_fusion_type fusion;
25850 gcc_assert (INSN_P (insn));
25852 tmp = max_pri - 1;
25853 fusion = fusion_load_store (insn, &base, &offset);
25854 if (fusion == SCHED_FUSION_NONE)
25856 *pri = tmp;
25857 *fusion_pri = tmp;
25858 return;
25861 /* Set FUSION_PRI according to fusion type and base register. */
25862 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25864 /* Calculate PRI. */
25865 tmp /= 2;
25867 /* INSN with smaller offset goes first. */
25868 off_val = (int)(INTVAL (offset));
25869 if (off_val >= 0)
25870 tmp -= (off_val & 0xfffff);
25871 else
25872 tmp += ((- off_val) & 0xfffff);
25874 *pri = tmp;
25875 return;
25878 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25879 Adjust priority of sha1h instructions so they are scheduled before
25880 other SHA1 instructions. */
25882 static int
25883 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
25885 rtx x = PATTERN (insn);
25887 if (GET_CODE (x) == SET)
25889 x = SET_SRC (x);
25891 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
25892 return priority + 10;
25895 return priority;
25898 /* If REVERSED is null, return true if memory reference *MEM2 comes
25899 immediately after memory reference *MEM1. Do not change the references
25900 in this case.
25902 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
25903 if they are, try to make them use constant offsets from the same base
25904 register. Return true on success. When returning true, set *REVERSED
25905 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
25906 static bool
25907 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
25909 if (reversed)
25910 *reversed = false;
25912 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
25913 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
25914 return false;
25916 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
25917 return false;
25919 auto size1 = MEM_SIZE (*mem1);
25920 auto size2 = MEM_SIZE (*mem2);
25922 rtx base1, base2, offset1, offset2;
25923 extract_base_offset_in_addr (*mem1, &base1, &offset1);
25924 extract_base_offset_in_addr (*mem2, &base2, &offset2);
25926 /* Make sure at least one memory is in base+offset form. */
25927 if (!(base1 && offset1) && !(base2 && offset2))
25928 return false;
25930 /* If both mems already use the same base register, just check the
25931 offsets. */
25932 if (base1 && base2 && rtx_equal_p (base1, base2))
25934 if (!offset1 || !offset2)
25935 return false;
25937 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
25938 return true;
25940 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
25942 *reversed = true;
25943 return true;
25946 return false;
25949 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
25950 guarantee that the values are consecutive. */
25951 if (MEM_EXPR (*mem1)
25952 && MEM_EXPR (*mem2)
25953 && MEM_OFFSET_KNOWN_P (*mem1)
25954 && MEM_OFFSET_KNOWN_P (*mem2))
25956 poly_int64 expr_offset1;
25957 poly_int64 expr_offset2;
25958 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
25959 &expr_offset1);
25960 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
25961 &expr_offset2);
25962 if (!expr_base1
25963 || !expr_base2
25964 || !DECL_P (expr_base1)
25965 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
25966 return false;
25968 expr_offset1 += MEM_OFFSET (*mem1);
25969 expr_offset2 += MEM_OFFSET (*mem2);
25971 if (known_eq (expr_offset1 + size1, expr_offset2))
25973 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
25974 *reversed = true;
25975 else
25976 return false;
25978 if (reversed)
25980 if (base2)
25982 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
25983 expr_offset1 - expr_offset2);
25984 *mem1 = replace_equiv_address_nv (*mem1, addr1);
25986 else
25988 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
25989 expr_offset2 - expr_offset1);
25990 *mem2 = replace_equiv_address_nv (*mem2, addr2);
25993 return true;
25996 return false;
25999 /* Return true if MEM1 and MEM2 can be combined into a single access
26000 of mode MODE, with the combined access having the same address as MEM1. */
26002 bool
26003 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
26005 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
26006 return false;
26007 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
26010 /* Given OPERANDS of consecutive load/store, check if we can merge
26011 them into ldp/stp. LOAD is true if they are load instructions.
26012 MODE is the mode of memory operands. */
26014 bool
26015 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
26016 machine_mode mode)
26018 enum reg_class rclass_1, rclass_2;
26019 rtx mem_1, mem_2, reg_1, reg_2;
26021 /* Allow the tuning structure to disable LDP instruction formation
26022 from combining instructions (e.g., in peephole2).
26023 TODO: Implement fine-grained tuning control for LDP and STP:
26024 1. control policies for load and store separately;
26025 2. support the following policies:
26026 - default (use what is in the tuning structure)
26027 - always
26028 - never
26029 - aligned (only if the compiler can prove that the
26030 load will be aligned to 2 * element_size) */
26031 if (load && (aarch64_tune_params.extra_tuning_flags
26032 & AARCH64_EXTRA_TUNE_NO_LDP_COMBINE))
26033 return false;
26035 if (load)
26037 mem_1 = operands[1];
26038 mem_2 = operands[3];
26039 reg_1 = operands[0];
26040 reg_2 = operands[2];
26041 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
26042 if (REGNO (reg_1) == REGNO (reg_2))
26043 return false;
26044 if (reg_overlap_mentioned_p (reg_1, mem_2))
26045 return false;
26047 else
26049 mem_1 = operands[0];
26050 mem_2 = operands[2];
26051 reg_1 = operands[1];
26052 reg_2 = operands[3];
26055 /* The mems cannot be volatile. */
26056 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
26057 return false;
26059 /* If we have SImode and slow unaligned ldp,
26060 check that the alignment is at least 8 bytes. */
26061 if (mode == SImode
26062 && (aarch64_tune_params.extra_tuning_flags
26063 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26064 && !optimize_size
26065 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
26066 return false;
26068 /* Check if the addresses are in the form of [base+offset]. */
26069 bool reversed = false;
26070 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
26071 return false;
26073 /* The operands must be of the same size. */
26074 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
26075 GET_MODE_SIZE (GET_MODE (mem_2))));
26077 /* One of the memory accesses must be a mempair operand.
26078 If it is not the first one, they need to be swapped by the
26079 peephole. */
26080 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
26081 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
26082 return false;
26084 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
26085 rclass_1 = FP_REGS;
26086 else
26087 rclass_1 = GENERAL_REGS;
26089 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
26090 rclass_2 = FP_REGS;
26091 else
26092 rclass_2 = GENERAL_REGS;
26094 /* Check if the registers are of the same class. */
26095 if (rclass_1 != rclass_2)
26096 return false;
26098 return true;
26101 /* Given OPERANDS of consecutive load/store that can be merged,
26102 swap them if they are not in ascending order. */
26103 void
26104 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
26106 int mem_op = load ? 1 : 0;
26107 bool reversed = false;
26108 if (!aarch64_check_consecutive_mems (operands + mem_op,
26109 operands + mem_op + 2, &reversed))
26110 gcc_unreachable ();
26112 if (reversed)
26114 /* Irrespective of whether this is a load or a store,
26115 we do the same swap. */
26116 std::swap (operands[0], operands[2]);
26117 std::swap (operands[1], operands[3]);
26121 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
26122 comparison between the two. */
26124 aarch64_host_wide_int_compare (const void *x, const void *y)
26126 return wi::cmps (* ((const HOST_WIDE_INT *) x),
26127 * ((const HOST_WIDE_INT *) y));
26130 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
26131 other pointing to a REG rtx containing an offset, compare the offsets
26132 of the two pairs.
26134 Return:
26136 1 iff offset (X) > offset (Y)
26137 0 iff offset (X) == offset (Y)
26138 -1 iff offset (X) < offset (Y) */
26140 aarch64_ldrstr_offset_compare (const void *x, const void *y)
26142 const rtx * operands_1 = (const rtx *) x;
26143 const rtx * operands_2 = (const rtx *) y;
26144 rtx mem_1, mem_2, base, offset_1, offset_2;
26146 if (MEM_P (operands_1[0]))
26147 mem_1 = operands_1[0];
26148 else
26149 mem_1 = operands_1[1];
26151 if (MEM_P (operands_2[0]))
26152 mem_2 = operands_2[0];
26153 else
26154 mem_2 = operands_2[1];
26156 /* Extract the offsets. */
26157 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26158 extract_base_offset_in_addr (mem_2, &base, &offset_2);
26160 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
26162 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
26165 /* Given OPERANDS of consecutive load/store, check if we can merge
26166 them into ldp/stp by adjusting the offset. LOAD is true if they
26167 are load instructions. MODE is the mode of memory operands.
26169 Given below consecutive stores:
26171 str w1, [xb, 0x100]
26172 str w1, [xb, 0x104]
26173 str w1, [xb, 0x108]
26174 str w1, [xb, 0x10c]
26176 Though the offsets are out of the range supported by stp, we can
26177 still pair them after adjusting the offset, like:
26179 add scratch, xb, 0x100
26180 stp w1, w1, [scratch]
26181 stp w1, w1, [scratch, 0x8]
26183 The peephole patterns detecting this opportunity should guarantee
26184 the scratch register is available. */
26186 bool
26187 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
26188 machine_mode mode)
26190 const int num_insns = 4;
26191 enum reg_class rclass;
26192 HOST_WIDE_INT offvals[num_insns], msize;
26193 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
26195 if (load)
26197 for (int i = 0; i < num_insns; i++)
26199 reg[i] = operands[2 * i];
26200 mem[i] = operands[2 * i + 1];
26202 gcc_assert (REG_P (reg[i]));
26205 /* Do not attempt to merge the loads if the loads clobber each other. */
26206 for (int i = 0; i < 8; i += 2)
26207 for (int j = i + 2; j < 8; j += 2)
26208 if (reg_overlap_mentioned_p (operands[i], operands[j]))
26209 return false;
26211 else
26212 for (int i = 0; i < num_insns; i++)
26214 mem[i] = operands[2 * i];
26215 reg[i] = operands[2 * i + 1];
26218 /* Skip if memory operand is by itself valid for ldp/stp. */
26219 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
26220 return false;
26222 for (int i = 0; i < num_insns; i++)
26224 /* The mems cannot be volatile. */
26225 if (MEM_VOLATILE_P (mem[i]))
26226 return false;
26228 /* Check if the addresses are in the form of [base+offset]. */
26229 extract_base_offset_in_addr (mem[i], base + i, offset + i);
26230 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
26231 return false;
26234 /* Check if the registers are of the same class. */
26235 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
26236 ? FP_REGS : GENERAL_REGS;
26238 for (int i = 1; i < num_insns; i++)
26239 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
26241 if (rclass != FP_REGS)
26242 return false;
26244 else
26246 if (rclass != GENERAL_REGS)
26247 return false;
26250 /* Only the last register in the order in which they occur
26251 may be clobbered by the load. */
26252 if (rclass == GENERAL_REGS && load)
26253 for (int i = 0; i < num_insns - 1; i++)
26254 if (reg_mentioned_p (reg[i], mem[i]))
26255 return false;
26257 /* Check if the bases are the same. */
26258 for (int i = 0; i < num_insns - 1; i++)
26259 if (!rtx_equal_p (base[i], base[i + 1]))
26260 return false;
26262 for (int i = 0; i < num_insns; i++)
26263 offvals[i] = INTVAL (offset[i]);
26265 msize = GET_MODE_SIZE (mode).to_constant ();
26267 /* Check if the offsets can be put in the right order to do a ldp/stp. */
26268 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
26269 aarch64_host_wide_int_compare);
26271 if (!(offvals[1] == offvals[0] + msize
26272 && offvals[3] == offvals[2] + msize))
26273 return false;
26275 /* Check that offsets are within range of each other. The ldp/stp
26276 instructions have 7-bit immediate offsets, so use 0x80. */
26277 if (offvals[2] - offvals[0] >= msize * 0x80)
26278 return false;
26280 /* The offsets must be aligned with respect to each other. */
26281 if (offvals[0] % msize != offvals[2] % msize)
26282 return false;
26284 /* If we have SImode and slow unaligned ldp,
26285 check that the alignment is at least 8 bytes. */
26286 if (mode == SImode
26287 && (aarch64_tune_params.extra_tuning_flags
26288 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26289 && !optimize_size
26290 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
26291 return false;
26293 return true;
26296 /* Given OPERANDS of consecutive load/store, this function pairs them
26297 into LDP/STP after adjusting the offset. It depends on the fact
26298 that the operands can be sorted so the offsets are correct for STP.
26299 MODE is the mode of memory operands. CODE is the rtl operator
26300 which should be applied to all memory operands, it's SIGN_EXTEND,
26301 ZERO_EXTEND or UNKNOWN. */
26303 bool
26304 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
26305 machine_mode mode, RTX_CODE code)
26307 rtx base, offset_1, offset_3, t1, t2;
26308 rtx mem_1, mem_2, mem_3, mem_4;
26309 rtx temp_operands[8];
26310 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
26311 stp_off_upper_limit, stp_off_lower_limit, msize;
26313 /* We make changes on a copy as we may still bail out. */
26314 for (int i = 0; i < 8; i ++)
26315 temp_operands[i] = operands[i];
26317 /* Sort the operands. Note for cases as below:
26318 [base + 0x310] = A
26319 [base + 0x320] = B
26320 [base + 0x330] = C
26321 [base + 0x320] = D
26322 We need stable sorting, otherwise wrong data may be stored to offset 0x320.
26323 Also note that the dead store in the above case should be optimized away, but
26324 there are no guarantees here. */
26325 gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
26326 aarch64_ldrstr_offset_compare);
26328 /* Copy the memory operands so that if we have to bail for some
26329 reason the original addresses are unchanged. */
26330 if (load)
26332 mem_1 = copy_rtx (temp_operands[1]);
26333 mem_2 = copy_rtx (temp_operands[3]);
26334 mem_3 = copy_rtx (temp_operands[5]);
26335 mem_4 = copy_rtx (temp_operands[7]);
26337 else
26339 mem_1 = copy_rtx (temp_operands[0]);
26340 mem_2 = copy_rtx (temp_operands[2]);
26341 mem_3 = copy_rtx (temp_operands[4]);
26342 mem_4 = copy_rtx (temp_operands[6]);
26343 gcc_assert (code == UNKNOWN);
26346 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26347 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26348 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26349 && offset_3 != NULL_RTX);
26351 /* Adjust offset so it can fit in LDP/STP instruction. */
26352 msize = GET_MODE_SIZE (mode).to_constant ();
26353 stp_off_upper_limit = msize * (0x40 - 1);
26354 stp_off_lower_limit = - msize * 0x40;
26356 off_val_1 = INTVAL (offset_1);
26357 off_val_3 = INTVAL (offset_3);
26359 /* The base offset is optimally halfway between the two STP/LDP offsets. */
26360 if (msize <= 4)
26361 base_off = (off_val_1 + off_val_3) / 2;
26362 else
26363 /* However, due to issues with negative LDP/STP offset generation for
26364 larger modes (DF, DD, DI and vector modes), we must not use negative
26365 addresses smaller than what 9 signed unadjusted bits can store. This
26366 provides the most range in this case. */
26367 base_off = off_val_1;
26369 /* Adjust the base so that it is aligned with the addresses but still
26370 optimal. */
26371 if (base_off % msize != off_val_1 % msize)
26372 /* Fix the offset, bearing in mind we want to make it bigger not
26373 smaller. */
26374 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26375 else if (msize <= 4)
26376 /* The negative range of LDP/STP is one larger than the positive range. */
26377 base_off += msize;
26379 /* Check if base offset is too big or too small. We can attempt to resolve
26380 this issue by setting it to the maximum value and seeing if the offsets
26381 still fit. */
26382 if (base_off >= 0x1000)
26384 base_off = 0x1000 - 1;
26385 /* We must still make sure that the base offset is aligned with respect
26386 to the address. But it may not be made any bigger. */
26387 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26390 /* Likewise for the case where the base is too small. */
26391 if (base_off <= -0x1000)
26393 base_off = -0x1000 + 1;
26394 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26397 /* Offset of the first STP/LDP. */
26398 new_off_1 = off_val_1 - base_off;
26400 /* Offset of the second STP/LDP. */
26401 new_off_3 = off_val_3 - base_off;
26403 /* The offsets must be within the range of the LDP/STP instructions. */
26404 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26405 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26406 return false;
26408 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26409 new_off_1), true);
26410 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26411 new_off_1 + msize), true);
26412 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26413 new_off_3), true);
26414 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26415 new_off_3 + msize), true);
26417 if (!aarch64_mem_pair_operand (mem_1, mode)
26418 || !aarch64_mem_pair_operand (mem_3, mode))
26419 return false;
26421 if (code == ZERO_EXTEND)
26423 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26424 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26425 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26426 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26428 else if (code == SIGN_EXTEND)
26430 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26431 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26432 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26433 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26436 if (load)
26438 operands[0] = temp_operands[0];
26439 operands[1] = mem_1;
26440 operands[2] = temp_operands[2];
26441 operands[3] = mem_2;
26442 operands[4] = temp_operands[4];
26443 operands[5] = mem_3;
26444 operands[6] = temp_operands[6];
26445 operands[7] = mem_4;
26447 else
26449 operands[0] = mem_1;
26450 operands[1] = temp_operands[1];
26451 operands[2] = mem_2;
26452 operands[3] = temp_operands[3];
26453 operands[4] = mem_3;
26454 operands[5] = temp_operands[5];
26455 operands[6] = mem_4;
26456 operands[7] = temp_operands[7];
26459 /* Emit adjusting instruction. */
26460 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26461 /* Emit ldp/stp instructions. */
26462 t1 = gen_rtx_SET (operands[0], operands[1]);
26463 t2 = gen_rtx_SET (operands[2], operands[3]);
26464 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26465 t1 = gen_rtx_SET (operands[4], operands[5]);
26466 t2 = gen_rtx_SET (operands[6], operands[7]);
26467 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26468 return true;
26471 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26472 it isn't worth branching around empty masked ops (including masked
26473 stores). */
26475 static bool
26476 aarch64_empty_mask_is_expensive (unsigned)
26478 return false;
26481 /* Return true if a pseudo register should be created and used to hold
26482 the GOT address for PIC code. */
26484 bool
26485 aarch64_use_pseudo_pic_reg (void)
26487 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26490 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26492 static int
26493 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26495 switch (XINT (x, 1))
26497 case UNSPEC_GOTSMALLPIC:
26498 case UNSPEC_GOTSMALLPIC28K:
26499 case UNSPEC_GOTTINYPIC:
26500 return 0;
26501 default:
26502 break;
26505 return default_unspec_may_trap_p (x, flags);
26509 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
26510 return the log2 of that value. Otherwise return -1. */
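/* For example, 4.0 yields 2 and 1.0 yields 0, while 3.0, 0.5 and negative or
   non-integer values yield -1. */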
26513 aarch64_fpconst_pow_of_2 (rtx x)
26515 const REAL_VALUE_TYPE *r;
26517 if (!CONST_DOUBLE_P (x))
26518 return -1;
26520 r = CONST_DOUBLE_REAL_VALUE (x);
26522 if (REAL_VALUE_NEGATIVE (*r)
26523 || REAL_VALUE_ISNAN (*r)
26524 || REAL_VALUE_ISINF (*r)
26525 || !real_isinteger (r, DFmode))
26526 return -1;
26528 return exact_log2 (real_to_integer (r));
26531 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26532 power of 2 (i.e. 1/2^n), return the exponent n, e.g. for x == (1/2^n)
26533 return n. Otherwise return -1. */
26536 aarch64_fpconst_pow2_recip (rtx x)
26538 REAL_VALUE_TYPE r0;
26540 if (!CONST_DOUBLE_P (x))
26541 return -1;
26543 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26544 if (exact_real_inverse (DFmode, &r0)
26545 && !REAL_VALUE_NEGATIVE (r0))
26547 int ret = exact_log2 (real_to_integer (&r0));
26548 if (ret >= 1 && ret <= 32)
26549 return ret;
26551 return -1;
26554 /* If X is a vector of equal CONST_DOUBLE values and that value is
26555 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26558 aarch64_vec_fpconst_pow_of_2 (rtx x)
26560 int nelts;
26561 if (!CONST_VECTOR_P (x)
26562 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26563 return -1;
26565 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26566 return -1;
26568 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26569 if (firstval <= 0)
26570 return -1;
26572 for (int i = 1; i < nelts; i++)
26573 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26574 return -1;
26576 return firstval;
26579 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26580 to float.
26582 __fp16 always promotes through this hook.
26583 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26584 through the generic excess precision logic rather than here. */
26586 static tree
26587 aarch64_promoted_type (const_tree t)
26589 if (SCALAR_FLOAT_TYPE_P (t)
26590 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26591 return float_type_node;
26593 return NULL_TREE;
26596 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26598 static bool
26599 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26600 optimization_type opt_type)
26602 switch (op)
26604 case rsqrt_optab:
26605 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26607 default:
26608 return true;
26612 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26614 static unsigned int
26615 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26616 int *offset)
26618 /* Polynomial invariant 1 == (VG / 2) - 1. */
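/* For example, with 256-bit SVE vectors VG (the vector length in 64-bit
   granules) is 4, so this indeterminate evaluates to (4 / 2) - 1 = 1. */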
26619 gcc_assert (i == 1);
26620 *factor = 2;
26621 *offset = 1;
26622 return AARCH64_DWARF_VG;
26625 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
26626 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
26628 static bool
26629 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26631 return ((mode == HFmode || mode == BFmode)
26632 ? true
26633 : default_libgcc_floating_mode_supported_p (mode));
26636 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26637 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
26639 static bool
26640 aarch64_scalar_mode_supported_p (scalar_mode mode)
26642 if (DECIMAL_FLOAT_MODE_P (mode))
26643 return default_decimal_float_supported_p ();
26645 return ((mode == HFmode || mode == BFmode)
26646 ? true
26647 : default_scalar_mode_supported_p (mode));
26650 /* Set the value of FLT_EVAL_METHOD.
26651 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26653 0: evaluate all operations and constants, whose semantic type has at
26654 most the range and precision of type float, to the range and
26655 precision of float; evaluate all other operations and constants to
26656 the range and precision of the semantic type;
26658 N, where _FloatN is a supported interchange floating type
26659 evaluate all operations and constants, whose semantic type has at
26660 most the range and precision of _FloatN type, to the range and
26661 precision of the _FloatN type; evaluate all other operations and
26662 constants to the range and precision of the semantic type;
26664 If we have the ARMv8.2-A extensions then we support _Float16 in native
26665 precision, so we should set this to 16. Otherwise, we support the type,
26666 but want to evaluate expressions in float precision, so set this to
26667 0. */
26669 static enum flt_eval_method
26670 aarch64_excess_precision (enum excess_precision_type type)
26672 switch (type)
26674 case EXCESS_PRECISION_TYPE_FAST:
26675 case EXCESS_PRECISION_TYPE_STANDARD:
26676 /* We can calculate either in 16-bit range and precision or
26677 32-bit range and precision. Make that decision based on whether
26678 we have native support for the ARMv8.2-A 16-bit floating-point
26679 instructions or not. */
26680 return (TARGET_FP_F16INST
26681 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26682 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26683 case EXCESS_PRECISION_TYPE_IMPLICIT:
26684 case EXCESS_PRECISION_TYPE_FLOAT16:
26685 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26686 default:
26687 gcc_unreachable ();
26689 return FLT_EVAL_METHOD_UNPREDICTABLE;
26692 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26693 scheduled for speculative execution. Reject the long-running division
26694 and square-root instructions. */
26696 static bool
26697 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26699 switch (get_attr_type (insn))
26701 case TYPE_SDIV:
26702 case TYPE_UDIV:
26703 case TYPE_FDIVS:
26704 case TYPE_FDIVD:
26705 case TYPE_FSQRTS:
26706 case TYPE_FSQRTD:
26707 case TYPE_NEON_FP_SQRT_S:
26708 case TYPE_NEON_FP_SQRT_D:
26709 case TYPE_NEON_FP_SQRT_S_Q:
26710 case TYPE_NEON_FP_SQRT_D_Q:
26711 case TYPE_NEON_FP_DIV_S:
26712 case TYPE_NEON_FP_DIV_D:
26713 case TYPE_NEON_FP_DIV_S_Q:
26714 case TYPE_NEON_FP_DIV_D_Q:
26715 return false;
26716 default:
26717 return true;
26721 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26723 static int
26724 aarch64_compute_pressure_classes (reg_class *classes)
26726 int i = 0;
26727 classes[i++] = GENERAL_REGS;
26728 classes[i++] = FP_REGS;
26729 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26730 registers need to go in PR_LO_REGS at some point during their
26731 lifetime. Splitting it into two halves has the effect of making
26732 all predicates count against PR_LO_REGS, so that we try whenever
26733 possible to restrict the number of live predicates to 8. This
26734 greatly reduces the amount of spilling in certain loops. */
26735 classes[i++] = PR_LO_REGS;
26736 classes[i++] = PR_HI_REGS;
26737 return i;
26740 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26742 static bool
26743 aarch64_can_change_mode_class (machine_mode from,
26744 machine_mode to, reg_class_t)
26746 unsigned int from_flags = aarch64_classify_vector_mode (from);
26747 unsigned int to_flags = aarch64_classify_vector_mode (to);
26749 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26750 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26752 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26753 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26755 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26756 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26758 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26759 | VEC_PARTIAL));
26760 bool from_partial_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT
26761 | VEC_PARTIAL));
26763 /* Don't allow changes between predicate modes and other modes.
26764 Only predicate registers can hold predicate modes and only
26765 non-predicate registers can hold non-predicate modes, so any
26766 attempt to mix them would require a round trip through memory. */
26767 if (from_pred_p != to_pred_p)
26768 return false;
26770 /* Don't allow changes between partial SVE modes and other modes.
26771 The contents of partial SVE modes are distributed evenly across
26772 the register, whereas GCC expects them to be clustered together. */
26773 if (from_partial_sve_p != to_partial_sve_p)
26774 return false;
26776 /* Similarly reject changes between partial SVE modes that have
26777 different patterns of significant and insignificant bits. */
26778 if (from_partial_sve_p
26779 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26780 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26781 return false;
26783 /* If exactly one of the modes is a partial Advanced SIMD structure mode,
26784 only allow the change when neither mode is larger than 64 bits. */
26785 if ((to_partial_advsimd_struct_p ^ from_partial_advsimd_struct_p)
26786 && (known_gt (GET_MODE_SIZE (to), 8) || known_gt (GET_MODE_SIZE (from), 8)))
26787 return false;
26789 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26791 /* Don't allow changes between SVE modes and other modes that might
26792 be bigger than 128 bits. In particular, OImode, CImode and XImode
26793 divide into 128-bit quantities while SVE modes divide into
26794 BITS_PER_SVE_VECTOR quantities. */
26795 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26796 return false;
26797 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26798 return false;
26801 if (BYTES_BIG_ENDIAN)
26803 /* Don't allow changes between SVE data modes and non-SVE modes.
26804 See the comment at the head of aarch64-sve.md for details. */
26805 if (from_sve_p != to_sve_p)
26806 return false;
26808 /* Don't allow changes in element size: lane 0 of the new vector
26809 would not then be lane 0 of the old vector. See the comment
26810 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26811 description.
26813 In the worst case, this forces a register to be spilled in
26814 one mode and reloaded in the other, which handles the
26815 endianness correctly. */
26816 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26817 return false;
26819 return true;
26822 /* Implement TARGET_EARLY_REMAT_MODES. */
26824 static void
26825 aarch64_select_early_remat_modes (sbitmap modes)
26827 /* SVE values are not normally live across a call, so it should be
26828 worth doing early rematerialization even in VL-specific mode. */
26829 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26830 if (aarch64_sve_mode_p ((machine_mode) i))
26831 bitmap_set_bit (modes, i);
26834 /* Override the default target speculation_safe_value. */
26835 static rtx
26836 aarch64_speculation_safe_value (machine_mode mode,
26837 rtx result, rtx val, rtx failval)
26839 /* Maybe we should warn if falling back to hard barriers. They are
26840 likely to be noticeably more expensive than the alternative below. */
26841 if (!aarch64_track_speculation)
26842 return default_speculation_safe_value (mode, result, val, failval);
26844 if (!REG_P (val))
26845 val = copy_to_mode_reg (mode, val);
26847 if (!aarch64_reg_or_zero (failval, mode))
26848 failval = copy_to_mode_reg (mode, failval);
26850 emit_insn (gen_despeculate_copy (mode, result, val, failval));
26851 return result;
26854 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26855 Look into the tuning structure for an estimate.
26856 KIND specifies the type of requested estimate: min, max or likely.
26857 For cores with a known SVE width all three estimates are the same.
26858 For generic SVE tuning we want to distinguish the maximum estimate from
26859 the minimum and likely ones.
26860 The likely estimate is the same as the minimum in that case to give a
26861 conservative behavior of auto-vectorizing with SVE when it is a win
26862 even for 128-bit SVE.
26863 When SVE width information is available VAL.coeffs[1] is multiplied by
26864 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
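/* Worked example: for a core with sve_width == 256, a poly_int64 value of
   16 + 16x (16 bytes plus 16 for each 128-bit chunk beyond the first) is
   estimated as 16 + 16 * (256 - 128) / 128 = 32. */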
26866 static HOST_WIDE_INT
26867 aarch64_estimated_poly_value (poly_int64 val,
26868 poly_value_estimate_kind kind
26869 = POLY_VALUE_LIKELY)
26871 unsigned int width_source = aarch64_tune_params.sve_width;
26873 /* If there is no core-specific information then the minimum and likely
26874 values are based on 128-bit vectors and the maximum is based on
26875 the architectural maximum of 2048 bits. */
26876 if (width_source == SVE_SCALABLE)
26877 switch (kind)
26879 case POLY_VALUE_MIN:
26880 case POLY_VALUE_LIKELY:
26881 return val.coeffs[0];
26882 case POLY_VALUE_MAX:
26883 return val.coeffs[0] + val.coeffs[1] * 15;
26886 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26887 as likely. This could be made more general if future -mtune options
26888 need it to be. */
26889 if (kind == POLY_VALUE_MAX)
26890 width_source = 1 << floor_log2 (width_source);
26891 else
26892 width_source = least_bit_hwi (width_source);
26894 /* If the core provides width information, use that. */
26895 HOST_WIDE_INT over_128 = width_source - 128;
26896 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
26900 /* Return true for types that could be supported as SIMD return or
26901 argument types. */
26903 static bool
26904 supported_simd_type (tree t)
26906 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
26908 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
26909 return s == 1 || s == 2 || s == 4 || s == 8;
26911 return false;
26914 /* Return true for types that currently are supported as SIMD return
26915 or argument types. */
26917 static bool
26918 currently_supported_simd_type (tree t, tree b)
26920 if (COMPLEX_FLOAT_TYPE_P (t))
26921 return false;
26923 if (TYPE_SIZE (t) != TYPE_SIZE (b))
26924 return false;
26926 return supported_simd_type (t);
26929 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
26931 static int
26932 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
26933 struct cgraph_simd_clone *clonei,
26934 tree base_type, int num,
26935 bool explicit_p)
26937 tree t, ret_type;
26938 unsigned int elt_bits, count;
26939 unsigned HOST_WIDE_INT const_simdlen;
26940 poly_uint64 vec_bits;
26942 if (!TARGET_SIMD)
26943 return 0;
26945 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26946 constant simdlens here. */
26947 if (maybe_ne (clonei->simdlen, 0U)
26948 && clonei->simdlen.is_constant (&const_simdlen)
26949 && (const_simdlen < 2
26950 || const_simdlen > 1024
26951 || (const_simdlen & (const_simdlen - 1)) != 0))
26953 if (explicit_p)
26954 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26955 "unsupported simdlen %wd", const_simdlen);
26956 return 0;
26959 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
26960 if (TREE_CODE (ret_type) != VOID_TYPE
26961 && !currently_supported_simd_type (ret_type, base_type))
26963 if (!explicit_p)
26965 else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
26966 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26967 "GCC does not currently support mixed size types "
26968 "for %<simd%> functions");
26969 else if (supported_simd_type (ret_type))
26970 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26971 "GCC does not currently support return type %qT "
26972 "for %<simd%> functions", ret_type);
26973 else
26974 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26975 "unsupported return type %qT for %<simd%> functions",
26976 ret_type);
26977 return 0;
26980 int i;
26981 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
26982 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
26984 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
26985 t && t != void_list_node; t = TREE_CHAIN (t), i++)
26987 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
26989 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
26990 && !currently_supported_simd_type (arg_type, base_type))
26992 if (!explicit_p)
26994 else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
26995 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26996 "GCC does not currently support mixed size types "
26997 "for %<simd%> functions");
26998 else
26999 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27000 "GCC does not currently support argument type %qT "
27001 "for %<simd%> functions", arg_type);
27002 return 0;
27006 clonei->vecsize_mangle = 'n';
27007 clonei->mask_mode = VOIDmode;
27008 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
27009 if (known_eq (clonei->simdlen, 0U))
27011 count = 2;
27012 vec_bits = (num == 0 ? 64 : 128);
27013 clonei->simdlen = exact_div (vec_bits, elt_bits);
27015 else
27017 count = 1;
27018 vec_bits = clonei->simdlen * elt_bits;
27019       /* For now, SVE simdclones won't produce an illegal simdlen, so only check
27020  const simdlens here.  */
27021 if (clonei->simdlen.is_constant (&const_simdlen)
27022 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
27024 if (explicit_p)
27025 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27026 "GCC does not currently support simdlen %wd for "
27027 "type %qT",
27028 const_simdlen, base_type);
27029 return 0;
27032 clonei->vecsize_int = vec_bits;
27033 clonei->vecsize_float = vec_bits;
27034 return count;
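/* Worked example (hypothetical user code): a declaration such as

       __attribute__ ((simd)) float do_scale (float x);

   reaches this hook with BASE_TYPE float, so elt_bits == 32.  With no
   explicit simdlen two clones are produced: NUM == 0 uses 64-bit vectors
   (simdlen 2) and NUM == 1 uses 128-bit vectors (simdlen 4).  Requesting an
   explicit simdlen of 8 (e.g. via #pragma omp declare simd simdlen(8))
   would give vec_bits == 256 and be rejected with the "GCC does not
   currently support simdlen" warning above.  */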
27037 /* Implement TARGET_SIMD_CLONE_ADJUST. */
27039 static void
27040 aarch64_simd_clone_adjust (struct cgraph_node *node)
27042 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
27043 use the correct ABI. */
27045 tree t = TREE_TYPE (node->decl);
27046 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
27047 TYPE_ATTRIBUTES (t));
27050 /* Implement TARGET_SIMD_CLONE_USABLE. */
27052 static int
27053 aarch64_simd_clone_usable (struct cgraph_node *node)
27055 switch (node->simdclone->vecsize_mangle)
27057 case 'n':
27058 if (!TARGET_SIMD)
27059 return -1;
27060 return 0;
27061 default:
27062 gcc_unreachable ();
27066 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
27068 static int
27069 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
27071 auto check_attr = [&](const char *name) {
27072 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
27073 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
27074 if (!attr1 && !attr2)
27075 return true;
27077 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
27080 if (!check_attr ("aarch64_vector_pcs"))
27081 return 0;
27082 if (!check_attr ("Advanced SIMD type"))
27083 return 0;
27084 if (!check_attr ("SVE type"))
27085 return 0;
27086 if (!check_attr ("SVE sizeless type"))
27087 return 0;
27088 return 1;
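/* For example, given the hypothetical declarations

       void f (void) __attribute__ ((aarch64_vector_pcs));
       void g (void);

   the two function types compare as incompatible here, because
   check_attr ("aarch64_vector_pcs") finds the attribute on only one of
   them; the Advanced SIMD and SVE type attributes are handled the same
   way.  */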
27091 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
27093 static const char *
27094 aarch64_get_multilib_abi_name (void)
27096 if (TARGET_BIG_END)
27097 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
27098 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
27101 /* Implement TARGET_STACK_PROTECT_GUARD.  For a
27102    global-variable-based guard, use the default; otherwise
27103    return a null tree.  */
27104 static tree
27105 aarch64_stack_protect_guard (void)
27107 if (aarch64_stack_protector_guard == SSP_GLOBAL)
27108 return default_stack_protect_guard ();
27110 return NULL_TREE;
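/* For example, with the default -mstack-protector-guard=global this returns
   the usual global guard variable via default_stack_protect_guard, whereas
   with -mstack-protector-guard=sysreg it returns NULL_TREE and the guard
   value is instead loaded from the system register selected by
   -mstack-protector-guard-reg=.  */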
27113 /* Return the diagnostic message string if the binary operation OP is
27114 not permitted on TYPE1 and TYPE2, NULL otherwise. */
27116 static const char *
27117 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
27118 const_tree type2)
27120 if (VECTOR_TYPE_P (type1)
27121 && VECTOR_TYPE_P (type2)
27122 && !TYPE_INDIVISIBLE_P (type1)
27123 && !TYPE_INDIVISIBLE_P (type2)
27124 && (aarch64_sve::builtin_type_p (type1)
27125 != aarch64_sve::builtin_type_p (type2)))
27126 return N_("cannot combine GNU and SVE vectors in a binary operation");
27128 /* Operation allowed. */
27129 return NULL;
27132 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
27133 compiler that we automatically ignore the top byte of our pointers, which
27134 allows using -fsanitize=hwaddress. */
27135 bool
27136 aarch64_can_tag_addresses ()
27138 return !TARGET_ILP32;
27141 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
27142 section at the end if needed. */
27143 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
27144 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
27145 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
27146 void
27147 aarch64_file_end_indicate_exec_stack ()
27149 file_end_indicate_exec_stack ();
27151 unsigned feature_1_and = 0;
27152 if (aarch_bti_enabled ())
27153 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
27155 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
27156 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
27158 if (feature_1_and)
27160 /* Generate .note.gnu.property section. */
27161 switch_to_section (get_section (".note.gnu.property",
27162 SECTION_NOTYPE, NULL));
27164 /* PT_NOTE header: namesz, descsz, type.
27165 namesz = 4 ("GNU\0")
27166 descsz = 16 (Size of the program property array)
27167 [(12 + padding) * Number of array elements]
27168 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
27169 assemble_align (POINTER_SIZE);
27170 assemble_integer (GEN_INT (4), 4, 32, 1);
27171 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
27172 assemble_integer (GEN_INT (5), 4, 32, 1);
27174 /* PT_NOTE name. */
27175 assemble_string ("GNU", 4);
27177 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
27178 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
27179 datasz = 4
27180 data = feature_1_and. */
27181 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
27182 assemble_integer (GEN_INT (4), 4, 32, 1);
27183 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
27185 /* Pad the size of the note to the required alignment. */
27186 assemble_align (POINTER_SIZE);
27189 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
27190 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
27191 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
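/* Sketch of the resulting note contents (assuming both BTI and PAC-RET are
   enabled on an LP64 target): namesz == 4, descsz == ROUND_UP (12, 8) == 16,
   type == 5 (NT_GNU_PROPERTY_TYPE_0), name == "GNU\0", followed by one
   property with pr_type == GNU_PROPERTY_AARCH64_FEATURE_1_AND,
   pr_datasz == 4 and pr_data == 0x3 (BTI | PAC), padded with four bytes to
   the 8-byte alignment the note format requires.  */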
27193 /* Helper function for straight line speculation.
27194 Return what barrier should be emitted for straight line speculation
27195 mitigation.
27196 When not mitigating against straight line speculation this function returns
27197 an empty string.
27198 When mitigating against straight line speculation, use:
27199 * SB when the v8.5-A SB extension is enabled.
27200 * DSB+ISB otherwise. */
27201 const char *
27202 aarch64_sls_barrier (int mitigation_required)
27204 return mitigation_required
27205 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
27206 : "";
27209 static GTY (()) tree aarch64_sls_shared_thunks[30];
27210 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
27211 const char *indirect_symbol_names[30] = {
27212 "__call_indirect_x0",
27213 "__call_indirect_x1",
27214 "__call_indirect_x2",
27215 "__call_indirect_x3",
27216 "__call_indirect_x4",
27217 "__call_indirect_x5",
27218 "__call_indirect_x6",
27219 "__call_indirect_x7",
27220 "__call_indirect_x8",
27221 "__call_indirect_x9",
27222 "__call_indirect_x10",
27223 "__call_indirect_x11",
27224 "__call_indirect_x12",
27225 "__call_indirect_x13",
27226 "__call_indirect_x14",
27227 "__call_indirect_x15",
27228 "", /* "__call_indirect_x16", */
27229 "", /* "__call_indirect_x17", */
27230 "__call_indirect_x18",
27231 "__call_indirect_x19",
27232 "__call_indirect_x20",
27233 "__call_indirect_x21",
27234 "__call_indirect_x22",
27235 "__call_indirect_x23",
27236 "__call_indirect_x24",
27237 "__call_indirect_x25",
27238 "__call_indirect_x26",
27239 "__call_indirect_x27",
27240 "__call_indirect_x28",
27241 "__call_indirect_x29",
27244 /* Function to create a BLR thunk. This thunk is used to mitigate straight
27245 line speculation. Instead of a simple BLR that can be speculated past,
27246 we emit a BL to this thunk, and this thunk contains a BR to the relevant
27247    register.  These thunks have the relevant speculation barriers put after
27248 their indirect branch so that speculation is blocked.
27250 We use such a thunk so the speculation barriers are kept off the
27251 architecturally executed path in order to reduce the performance overhead.
27253 When optimizing for size we use stubs shared by the linked object.
27254 When optimizing for performance we emit stubs for each function in the hope
27255 that the branch predictor can better train on jumps specific for a given
27256 function. */
27257 static rtx
27258 aarch64_sls_create_blr_label (int regnum)
27260 gcc_assert (STUB_REGNUM_P (regnum));
27261 if (optimize_function_for_size_p (cfun))
27263 /* For the thunks shared between different functions in this compilation
27264 unit we use a named symbol -- this is just for users to more easily
27265 understand the generated assembly. */
27266 aarch64_sls_shared_thunks_needed = true;
27267 const char *thunk_name = indirect_symbol_names[regnum];
27268 if (aarch64_sls_shared_thunks[regnum] == NULL)
27270 /* Build a decl representing this function stub and record it for
27271 later. We build a decl here so we can use the GCC machinery for
27272 handling sections automatically (through `get_named_section` and
27273 `make_decl_one_only`). That saves us a lot of trouble handling
27274 the specifics of different output file formats. */
27275 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
27276 get_identifier (thunk_name),
27277 build_function_type_list (void_type_node,
27278 NULL_TREE));
27279 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
27280 NULL_TREE, void_type_node);
27281 TREE_PUBLIC (decl) = 1;
27282 TREE_STATIC (decl) = 1;
27283 DECL_IGNORED_P (decl) = 1;
27284 DECL_ARTIFICIAL (decl) = 1;
27285 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27286 resolve_unique_section (decl, 0, false);
27287 aarch64_sls_shared_thunks[regnum] = decl;
27290 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27293 if (cfun->machine->call_via[regnum] == NULL)
27294 cfun->machine->call_via[regnum]
27295 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27296 return cfun->machine->call_via[regnum];
27299 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27300 aarch64_sls_emit_shared_blr_thunks below. */
27301 static void
27302 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27304 /* Save in x16 and branch to that function so this transformation does
27305 not prevent jumping to `BTI c` instructions. */
27306 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27307 asm_fprintf (out_file, "\tbr\tx16\n");
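/* As an illustration, with -mharden-sls=blr an indirect call through x1 is
   emitted as "bl __call_indirect_x1" (or a BL to a local label when
   optimizing for speed), and the stub produced by this helper is roughly:

       __call_indirect_x1:
               mov     x16, x1
               br      x16
               <speculation barrier: "sb", or "dsb sy" followed by "isb">

   so the barrier sits after the BR, off the architecturally executed
   path.  */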
27310 /* Emit all BLR stubs for this particular function.
27311 Here we emit all the BLR stubs needed for the current function. Since we
27312 emit these stubs in a consecutive block we know there will be no speculation
27313 gadgets between each stub, and hence we only emit a speculation barrier at
27314 the end of the stub sequences.
27316 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27317 void
27318 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27320 if (! aarch64_harden_sls_blr_p ())
27321 return;
27323 bool any_functions_emitted = false;
27324 /* We must save and restore the current function section since this assembly
27325 is emitted at the end of the function. This means it can be emitted *just
27326 after* the cold section of a function. That cold part would be emitted in
27327 a different section. That switch would trigger a `.cfi_endproc` directive
27328 to be emitted in the original section and a `.cfi_startproc` directive to
27329 be emitted in the new section. Switching to the original section without
27330      restoring would mean that the `.cfi_endproc` emitted as the function ends
27331 would happen in a different section -- leaving an unmatched
27332 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27333 in the standard text section. */
27334 section *save_text_section = in_section;
27335 switch_to_section (function_section (current_function_decl));
27336 for (int regnum = 0; regnum < 30; ++regnum)
27338 rtx specu_label = cfun->machine->call_via[regnum];
27339 if (specu_label == NULL)
27340 continue;
27342 targetm.asm_out.print_operand (out_file, specu_label, 0);
27343 asm_fprintf (out_file, ":\n");
27344 aarch64_sls_emit_function_stub (out_file, regnum);
27345 any_functions_emitted = true;
27347 if (any_functions_emitted)
27348     /* We can use the SB barrier here if need be, since this stub will only be used
27349 by the current function, and hence for the current target. */
27350 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27351 switch_to_section (save_text_section);
27354 /* Emit shared BLR stubs for the current compilation unit.
27355 Over the course of compiling this unit we may have converted some BLR
27356 instructions to a BL to a shared stub function. This is where we emit those
27357 stub functions.
27358 This function is for the stubs shared between different functions in this
27359 compilation unit. We share when optimizing for size instead of speed.
27361 This function is called through the TARGET_ASM_FILE_END hook. */
27362 void
27363 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27365 if (! aarch64_sls_shared_thunks_needed)
27366 return;
27368 for (int regnum = 0; regnum < 30; ++regnum)
27370 tree decl = aarch64_sls_shared_thunks[regnum];
27371 if (!decl)
27372 continue;
27374 const char *name = indirect_symbol_names[regnum];
27375 switch_to_section (get_named_section (decl, NULL, 0));
27376 ASM_OUTPUT_ALIGN (out_file, 2);
27377 targetm.asm_out.globalize_label (out_file, name);
27378       /* This only emits a visibility directive if the compiler is configured
27379  for an assembler that can handle visibility directives.  */
27380 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27381 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27382 ASM_OUTPUT_LABEL (out_file, name);
27383 aarch64_sls_emit_function_stub (out_file, regnum);
27384 /* Use the most conservative target to ensure it can always be used by any
27385 function in the translation unit. */
27386 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27387 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27391 /* Implement TARGET_ASM_FILE_END. */
27392 void
27393 aarch64_asm_file_end ()
27395 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27396 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27397 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27398 for FreeBSD) still gets called. */
27399 #ifdef TARGET_ASM_FILE_END
27400 TARGET_ASM_FILE_END ();
27401 #endif
27404 const char *
27405 aarch64_indirect_call_asm (rtx addr)
27407 gcc_assert (REG_P (addr));
27408 if (aarch64_harden_sls_blr_p ())
27410 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27411 output_asm_insn ("bl\t%0", &stub_label);
27413 else
27414 output_asm_insn ("blr\t%0", &addr);
27415 return "";
27418 /* Target-specific selftests. */
27420 #if CHECKING_P
27422 namespace selftest {
27424 /* Selftest for the RTL loader.
27425 Verify that the RTL loader copes with a dump from
27426 print_rtx_function. This is essentially just a test that class
27427 function_reader can handle a real dump, but it also verifies
27428 that lookup_reg_by_dump_name correctly handles hard regs.
27429 The presence of hard reg names in the dump means that the test is
27430 target-specific, hence it is in this file. */
27432 static void
27433 aarch64_test_loading_full_dump ()
27435 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27437 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27439 rtx_insn *insn_1 = get_insn_by_uid (1);
27440 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27442 rtx_insn *insn_15 = get_insn_by_uid (15);
27443 ASSERT_EQ (INSN, GET_CODE (insn_15));
27444 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27446 /* Verify crtl->return_rtx. */
27447 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27448 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27449 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27452 /* Test the fractional_cost class. */
27454 static void
27455 aarch64_test_fractional_cost ()
27457 using cf = fractional_cost;
27459 ASSERT_EQ (cf (0, 20), 0);
27461 ASSERT_EQ (cf (4, 2), 2);
27462 ASSERT_EQ (3, cf (9, 3));
27464 ASSERT_NE (cf (5, 2), 2);
27465 ASSERT_NE (3, cf (8, 3));
27467 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27468 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27469 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27471 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27472 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27473 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27474 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27475 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27476 ASSERT_EQ (3 - cf (10, 3), 0);
27478 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27479 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27481 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27482 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27483 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27484 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27485 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27486 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27487 ASSERT_TRUE (cf (239, 240) < 1);
27488 ASSERT_FALSE (cf (240, 240) < 1);
27489 ASSERT_FALSE (cf (241, 240) < 1);
27490 ASSERT_FALSE (2 < cf (207, 104));
27491 ASSERT_FALSE (2 < cf (208, 104));
27492 ASSERT_TRUE (2 < cf (209, 104));
27494   ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27495   ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27496   ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27497   ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27498   ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27499   ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27500   ASSERT_TRUE (cf (239, 240) <= 1);
27501   ASSERT_TRUE (cf (240, 240) <= 1);
27502   ASSERT_FALSE (cf (241, 240) <= 1);
27503   ASSERT_FALSE (2 <= cf (207, 104));
27504   ASSERT_TRUE (2 <= cf (208, 104));
27505   ASSERT_TRUE (2 <= cf (209, 104));
27507 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27508 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27509 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27510 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27511 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27512 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27513 ASSERT_FALSE (cf (239, 240) >= 1);
27514 ASSERT_TRUE (cf (240, 240) >= 1);
27515 ASSERT_TRUE (cf (241, 240) >= 1);
27516 ASSERT_TRUE (2 >= cf (207, 104));
27517 ASSERT_TRUE (2 >= cf (208, 104));
27518 ASSERT_FALSE (2 >= cf (209, 104));
27520 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27521 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27522 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27523 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27524 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27525 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27526 ASSERT_FALSE (cf (239, 240) > 1);
27527 ASSERT_FALSE (cf (240, 240) > 1);
27528 ASSERT_TRUE (cf (241, 240) > 1);
27529 ASSERT_TRUE (2 > cf (207, 104));
27530 ASSERT_FALSE (2 > cf (208, 104));
27531 ASSERT_FALSE (2 > cf (209, 104));
27533 ASSERT_EQ (cf (1, 2).ceil (), 1);
27534 ASSERT_EQ (cf (11, 7).ceil (), 2);
27535 ASSERT_EQ (cf (20, 1).ceil (), 20);
27536 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27537 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27538 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27539 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27540 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27542 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27545 /* Run all target-specific selftests. */
27547 static void
27548 aarch64_run_selftests (void)
27550 aarch64_test_loading_full_dump ();
27551 aarch64_test_fractional_cost ();
27554 } // namespace selftest
27556 #endif /* #if CHECKING_P */
27558 #undef TARGET_STACK_PROTECT_GUARD
27559 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27561 #undef TARGET_ADDRESS_COST
27562 #define TARGET_ADDRESS_COST aarch64_address_cost
27564 /* This hook determines whether unnamed bitfields affect the alignment
27565 of the containing structure. The hook returns true if the structure
27566 should inherit the alignment requirements of an unnamed bitfield's
27567 type. */
27568 #undef TARGET_ALIGN_ANON_BITFIELD
27569 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27571 #undef TARGET_ASM_ALIGNED_DI_OP
27572 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27574 #undef TARGET_ASM_ALIGNED_HI_OP
27575 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27577 #undef TARGET_ASM_ALIGNED_SI_OP
27578 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27580 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27581 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27582 hook_bool_const_tree_hwi_hwi_const_tree_true
27584 #undef TARGET_ASM_FILE_START
27585 #define TARGET_ASM_FILE_START aarch64_start_file
27587 #undef TARGET_ASM_OUTPUT_MI_THUNK
27588 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27590 #undef TARGET_ASM_SELECT_RTX_SECTION
27591 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27593 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27594 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27596 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27597 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27599 #undef TARGET_BUILD_BUILTIN_VA_LIST
27600 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27602 #undef TARGET_CALLEE_COPIES
27603 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27605 #undef TARGET_CAN_ELIMINATE
27606 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27608 #undef TARGET_CAN_INLINE_P
27609 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27611 #undef TARGET_CANNOT_FORCE_CONST_MEM
27612 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27614 #undef TARGET_CASE_VALUES_THRESHOLD
27615 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27617 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27618 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27620 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27621 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27623 /* Only the least significant bit is used for initialization guard
27624 variables. */
27625 #undef TARGET_CXX_GUARD_MASK_BIT
27626 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27628 #undef TARGET_C_MODE_FOR_SUFFIX
27629 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27631 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27632 #undef TARGET_DEFAULT_TARGET_FLAGS
27633 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27634 #endif
27636 #undef TARGET_CLASS_MAX_NREGS
27637 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27639 #undef TARGET_BUILTIN_DECL
27640 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27642 #undef TARGET_BUILTIN_RECIPROCAL
27643 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27645 #undef TARGET_C_EXCESS_PRECISION
27646 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27648 #undef TARGET_EXPAND_BUILTIN
27649 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27651 #undef TARGET_EXPAND_BUILTIN_VA_START
27652 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27654 #undef TARGET_FOLD_BUILTIN
27655 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27657 #undef TARGET_FUNCTION_ARG
27658 #define TARGET_FUNCTION_ARG aarch64_function_arg
27660 #undef TARGET_FUNCTION_ARG_ADVANCE
27661 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27663 #undef TARGET_FUNCTION_ARG_BOUNDARY
27664 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27666 #undef TARGET_FUNCTION_ARG_PADDING
27667 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27669 #undef TARGET_GET_RAW_RESULT_MODE
27670 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27671 #undef TARGET_GET_RAW_ARG_MODE
27672 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27674 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27675 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27677 #undef TARGET_FUNCTION_VALUE
27678 #define TARGET_FUNCTION_VALUE aarch64_function_value
27680 #undef TARGET_FUNCTION_VALUE_REGNO_P
27681 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27683 #undef TARGET_GIMPLE_FOLD_BUILTIN
27684 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27686 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27687 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27689 #undef TARGET_INIT_BUILTINS
27690 #define TARGET_INIT_BUILTINS aarch64_init_builtins
27692 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27693 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27694 aarch64_ira_change_pseudo_allocno_class
27696 #undef TARGET_LEGITIMATE_ADDRESS_P
27697 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27699 #undef TARGET_LEGITIMATE_CONSTANT_P
27700 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27702 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27703 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27704 aarch64_legitimize_address_displacement
27706 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27707 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27709 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27710 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27711 aarch64_libgcc_floating_mode_supported_p
27713 #undef TARGET_MANGLE_TYPE
27714 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27716 #undef TARGET_INVALID_BINARY_OP
27717 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27719 #undef TARGET_VERIFY_TYPE_CONTEXT
27720 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27722 #undef TARGET_MEMORY_MOVE_COST
27723 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27725 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27726 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27728 #undef TARGET_MUST_PASS_IN_STACK
27729 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27731 /* This target hook should return true if accesses to volatile bitfields
27732 should use the narrowest mode possible. It should return false if these
27733 accesses should use the bitfield container type. */
27734 #undef TARGET_NARROW_VOLATILE_BITFIELD
27735 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27737 #undef TARGET_OPTION_OVERRIDE
27738 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27740 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27741 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27742 aarch64_override_options_after_change
27744 #undef TARGET_OFFLOAD_OPTIONS
27745 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27747 #undef TARGET_OPTION_RESTORE
27748 #define TARGET_OPTION_RESTORE aarch64_option_restore
27750 #undef TARGET_OPTION_PRINT
27751 #define TARGET_OPTION_PRINT aarch64_option_print
27753 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27754 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27756 #undef TARGET_SET_CURRENT_FUNCTION
27757 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27759 #undef TARGET_PASS_BY_REFERENCE
27760 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27762 #undef TARGET_PREFERRED_RELOAD_CLASS
27763 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27765 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27766 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27768 #undef TARGET_DWARF_FRAME_REG_MODE
27769 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
27771 #undef TARGET_PROMOTED_TYPE
27772 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27774 #undef TARGET_SECONDARY_RELOAD
27775 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27777 #undef TARGET_SECONDARY_MEMORY_NEEDED
27778 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
27780 #undef TARGET_SHIFT_TRUNCATION_MASK
27781 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27783 #undef TARGET_SETUP_INCOMING_VARARGS
27784 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27786 #undef TARGET_STRUCT_VALUE_RTX
27787 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27789 #undef TARGET_REGISTER_MOVE_COST
27790 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27792 #undef TARGET_RETURN_IN_MEMORY
27793 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27795 #undef TARGET_RETURN_IN_MSB
27796 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27798 #undef TARGET_RTX_COSTS
27799 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27801 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27802 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27804 #undef TARGET_SCHED_ISSUE_RATE
27805 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27807 #undef TARGET_SCHED_VARIABLE_ISSUE
27808 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27810 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27811 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27812 aarch64_sched_first_cycle_multipass_dfa_lookahead
27814 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27815 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27816 aarch64_first_cycle_multipass_dfa_lookahead_guard
27818 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27819 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27820 aarch64_get_separate_components
27822 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27823 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27824 aarch64_components_for_bb
27826 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27827 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27828 aarch64_disqualify_components
27830 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27831 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27832 aarch64_emit_prologue_components
27834 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27835 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27836 aarch64_emit_epilogue_components
27838 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27839 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27840 aarch64_set_handled_components
27842 #undef TARGET_TRAMPOLINE_INIT
27843 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27845 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27846 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27848 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27849 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27851 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
27852 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
27854 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
27855 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
27856 aarch64_builtin_support_vector_misalignment
27858 #undef TARGET_ARRAY_MODE
27859 #define TARGET_ARRAY_MODE aarch64_array_mode
27861 #undef TARGET_ARRAY_MODE_SUPPORTED_P
27862 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
27864 #undef TARGET_VECTORIZE_CREATE_COSTS
27865 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
27867 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
27868 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
27869 aarch64_builtin_vectorization_cost
27871 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
27872 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
27874 #undef TARGET_VECTORIZE_BUILTINS
27875 #define TARGET_VECTORIZE_BUILTINS
27877 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
27878 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
27879 aarch64_autovectorize_vector_modes
27881 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
27882 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
27883 aarch64_atomic_assign_expand_fenv
27885 /* Section anchor support. */
27887 #undef TARGET_MIN_ANCHOR_OFFSET
27888 #define TARGET_MIN_ANCHOR_OFFSET -256
27890 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
27891 byte offset; we can do much more for larger data types, but have no way
27892 to determine the size of the access. We assume accesses are aligned. */
27893 #undef TARGET_MAX_ANCHOR_OFFSET
27894 #define TARGET_MAX_ANCHOR_OFFSET 4095
27896 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
27897 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
27898 aarch64_vectorize_preferred_div_as_shifts_over_mult
27900 #undef TARGET_VECTOR_ALIGNMENT
27901 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
27903 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
27904 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
27905 aarch64_vectorize_preferred_vector_alignment
27906 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
27907 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
27908 aarch64_simd_vector_alignment_reachable
27910 /* vec_perm support. */
27912 #undef TARGET_VECTORIZE_VEC_PERM_CONST
27913 #define TARGET_VECTORIZE_VEC_PERM_CONST \
27914 aarch64_vectorize_vec_perm_const
27916 #undef TARGET_VECTORIZE_RELATED_MODE
27917 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
27918 #undef TARGET_VECTORIZE_GET_MASK_MODE
27919 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
27920 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
27921 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
27922 aarch64_empty_mask_is_expensive
27923 #undef TARGET_PREFERRED_ELSE_VALUE
27924 #define TARGET_PREFERRED_ELSE_VALUE \
27925 aarch64_preferred_else_value
27927 #undef TARGET_INIT_LIBFUNCS
27928 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
27930 #undef TARGET_FIXED_CONDITION_CODE_REGS
27931 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
27933 #undef TARGET_FLAGS_REGNUM
27934 #define TARGET_FLAGS_REGNUM CC_REGNUM
27936 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
27937 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
27939 #undef TARGET_ASAN_SHADOW_OFFSET
27940 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
27942 #undef TARGET_LEGITIMIZE_ADDRESS
27943 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
27945 #undef TARGET_SCHED_CAN_SPECULATE_INSN
27946 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
27948 #undef TARGET_CAN_USE_DOLOOP_P
27949 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
27951 #undef TARGET_SCHED_ADJUST_PRIORITY
27952 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
27954 #undef TARGET_SCHED_MACRO_FUSION_P
27955 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
27957 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
27958 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
27960 #undef TARGET_SCHED_FUSION_PRIORITY
27961 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
27963 #undef TARGET_UNSPEC_MAY_TRAP_P
27964 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
27966 #undef TARGET_USE_PSEUDO_PIC_REG
27967 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
27969 #undef TARGET_PRINT_OPERAND
27970 #define TARGET_PRINT_OPERAND aarch64_print_operand
27972 #undef TARGET_PRINT_OPERAND_ADDRESS
27973 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
27975 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
27976 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
27978 #undef TARGET_OPTAB_SUPPORTED_P
27979 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
27981 #undef TARGET_OMIT_STRUCT_RETURN_REG
27982 #define TARGET_OMIT_STRUCT_RETURN_REG true
27984 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
27985 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
27986 aarch64_dwarf_poly_indeterminate_value
27988 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
27989 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
27990 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
27992 #undef TARGET_HARD_REGNO_NREGS
27993 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
27994 #undef TARGET_HARD_REGNO_MODE_OK
27995 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
27997 #undef TARGET_MODES_TIEABLE_P
27998 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
28000 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
28001 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
28002 aarch64_hard_regno_call_part_clobbered
28004 #undef TARGET_INSN_CALLEE_ABI
28005 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
28007 #undef TARGET_CONSTANT_ALIGNMENT
28008 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
28010 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
28011 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
28012 aarch64_stack_clash_protection_alloca_probe_range
28014 #undef TARGET_COMPUTE_PRESSURE_CLASSES
28015 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
28017 #undef TARGET_CAN_CHANGE_MODE_CLASS
28018 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
28020 #undef TARGET_SELECT_EARLY_REMAT_MODES
28021 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
28023 #undef TARGET_SPECULATION_SAFE_VALUE
28024 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
28026 #undef TARGET_ESTIMATED_POLY_VALUE
28027 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
28029 #undef TARGET_ATTRIBUTE_TABLE
28030 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
28032 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
28033 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
28034 aarch64_simd_clone_compute_vecsize_and_simdlen
28036 #undef TARGET_SIMD_CLONE_ADJUST
28037 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
28039 #undef TARGET_SIMD_CLONE_USABLE
28040 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
28042 #undef TARGET_COMP_TYPE_ATTRIBUTES
28043 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
28045 #undef TARGET_GET_MULTILIB_ABI_NAME
28046 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
28048 #undef TARGET_FNTYPE_ABI
28049 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
28051 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
28052 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
28054 #if CHECKING_P
28055 #undef TARGET_RUN_TARGET_SELFTESTS
28056 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
28057 #endif /* #if CHECKING_P */
28059 #undef TARGET_ASM_POST_CFI_STARTPROC
28060 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
28062 #undef TARGET_STRICT_ARGUMENT_NAMING
28063 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
28065 #undef TARGET_MD_ASM_ADJUST
28066 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
28068 #undef TARGET_ASM_FILE_END
28069 #define TARGET_ASM_FILE_END aarch64_asm_file_end
28071 #undef TARGET_ASM_FUNCTION_EPILOGUE
28072 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
28074 #undef TARGET_HAVE_SHADOW_CALL_STACK
28075 #define TARGET_HAVE_SHADOW_CALL_STACK true
28077 #undef TARGET_CONST_ANCHOR
28078 #define TARGET_CONST_ANCHOR 0x1000000
28080 struct gcc_target targetm = TARGET_INITIALIZER;
28082 #include "gt-aarch64.h"