PR96463: Optimise svld1rq from vectors for little endian AArch64 targets.
[official-gcc.git] / gcc / config / aarch64 / aarch64.cc
blob: d21e041eccbc755b73703e144cd71559f86dc241
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2022 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
85 /* This file should be included last. */
86 #include "target-def.h"
88 /* Defined for convenience. */
89 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
91 /* Information about a legitimate vector immediate operand. */
92 struct simd_immediate_info
94 enum insn_type { MOV, MVN, INDEX, PTRUE };
95 enum modifier_type { LSL, MSL };
97 simd_immediate_info () {}
98 simd_immediate_info (scalar_float_mode, rtx);
99 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
100 insn_type = MOV, modifier_type = LSL,
101 unsigned int = 0);
102 simd_immediate_info (scalar_mode, rtx, rtx);
103 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
105 /* The mode of the elements. */
106 scalar_mode elt_mode;
108 /* The instruction to use to move the immediate into a vector. */
109 insn_type insn;
111 union
113 /* For MOV and MVN. */
114 struct
116 /* The value of each element. */
117 rtx value;
119 /* The kind of shift modifier to use, and the number of bits to shift.
120 This is (LSL, 0) if no shift is needed. */
121 modifier_type modifier;
122 unsigned int shift;
123 } mov;
125 /* For INDEX. */
126 struct
128 /* The value of the first element and the step to be added for each
129 subsequent element. */
130 rtx base, step;
131 } index;
133 /* For PTRUE. */
134 aarch64_svpattern pattern;
135 } u;
138 /* Construct a floating-point immediate in which each element has mode
139 ELT_MODE_IN and value VALUE_IN. */
140 inline simd_immediate_info
141 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
142 : elt_mode (elt_mode_in), insn (MOV)
144 u.mov.value = value_in;
145 u.mov.modifier = LSL;
146 u.mov.shift = 0;
149 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
150 and value VALUE_IN. The other parameters are as for the structure
151 fields. */
152 inline simd_immediate_info
153 ::simd_immediate_info (scalar_int_mode elt_mode_in,
154 unsigned HOST_WIDE_INT value_in,
155 insn_type insn_in, modifier_type modifier_in,
156 unsigned int shift_in)
157 : elt_mode (elt_mode_in), insn (insn_in)
159 u.mov.value = gen_int_mode (value_in, elt_mode_in);
160 u.mov.modifier = modifier_in;
161 u.mov.shift = shift_in;
164 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
165 and where element I is equal to BASE_IN + I * STEP_IN. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
168 : elt_mode (elt_mode_in), insn (INDEX)
170 u.index.base = base_in;
171 u.index.step = step_in;
174 /* Construct a predicate that controls elements of mode ELT_MODE_IN
175 and has PTRUE pattern PATTERN_IN. */
176 inline simd_immediate_info
177 ::simd_immediate_info (scalar_int_mode elt_mode_in,
178 aarch64_svpattern pattern_in)
179 : elt_mode (elt_mode_in), insn (PTRUE)
181 u.pattern = pattern_in;
184 namespace {
186 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
187 class pure_scalable_type_info
189 public:
190 /* Represents the result of analyzing a type. All values are nonzero,
191 in the possibly forlorn hope that accidental conversions to bool
192 trigger a warning. */
193 enum analysis_result
195 /* The type does not have an ABI identity; i.e. it doesn't contain
196 at least one object whose type is a Fundamental Data Type. */
197 NO_ABI_IDENTITY = 1,
199 /* The type is definitely a Pure Scalable Type. */
200 IS_PST,
202 /* The type is definitely not a Pure Scalable Type. */
203 ISNT_PST,
205 /* It doesn't matter for PCS purposes whether the type is a Pure
206 Scalable Type or not, since the type will be handled the same
207 way regardless.
209 Specifically, this means that if the type is a Pure Scalable Type,
210 there aren't enough argument registers to hold it, and so it will
211 need to be passed or returned in memory. If the type isn't a
212 Pure Scalable Type, it's too big to be passed or returned in core
213 or SIMD&FP registers, and so again will need to go in memory. */
214 DOESNT_MATTER
217 /* Aggregates of 17 bytes or more are normally passed and returned
218 in memory, so aggregates of that size can safely be analyzed as
219 DOESNT_MATTER. We need to be able to collect enough pieces to
220 represent a PST that is smaller than that. Since predicates are
221 2 bytes in size for -msve-vector-bits=128, that means we need to be
222 able to store at least 8 pieces.
224 We also need to be able to store enough pieces to represent
225 a single vector in each vector argument register and a single
226 predicate in each predicate argument register. This means that
227 we need at least 12 pieces. */
228 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
229 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
231 /* Describes one piece of a PST. Each piece is one of:
233 - a single Scalable Vector Type (SVT)
234 - a single Scalable Predicate Type (SPT)
235 - a PST containing 2, 3 or 4 SVTs, with no padding
237 It either represents a single built-in type or a PST formed from
238 multiple homogeneous built-in types. */
239 struct piece
241 rtx get_rtx (unsigned int, unsigned int) const;
243 /* The number of vector and predicate registers that the piece
244 occupies. One of the two is always zero. */
245 unsigned int num_zr;
246 unsigned int num_pr;
248 /* The mode of the registers described above. */
249 machine_mode mode;
251 /* If this piece is formed from multiple homogeneous built-in types,
252 this is the mode of the built-in types, otherwise it is MODE. */
253 machine_mode orig_mode;
255 /* The offset in bytes of the piece from the start of the type. */
256 poly_uint64_pod offset;
259 /* Divides types analyzed as IS_PST into individual pieces. The pieces
260 are in memory order. */
261 auto_vec<piece, MAX_PIECES> pieces;
263 unsigned int num_zr () const;
264 unsigned int num_pr () const;
266 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
268 analysis_result analyze (const_tree);
269 bool analyze_registers (const_tree);
271 private:
272 analysis_result analyze_array (const_tree);
273 analysis_result analyze_record (const_tree);
274 void add_piece (const piece &);
278 /* The current code model. */
279 enum aarch64_code_model aarch64_cmodel;
281 /* The number of 64-bit elements in an SVE vector. */
282 poly_uint16 aarch64_sve_vg;
284 #ifdef HAVE_AS_TLS
285 #undef TARGET_HAVE_TLS
286 #define TARGET_HAVE_TLS 1
287 #endif
289 static bool aarch64_composite_type_p (const_tree, machine_mode);
290 static bool aarch64_return_in_memory_1 (const_tree);
291 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
292 const_tree,
293 machine_mode *, int *,
294 bool *, bool);
295 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
296 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
297 static void aarch64_override_options_after_change (void);
298 static bool aarch64_vector_mode_supported_p (machine_mode);
299 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
300 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
301 const_tree type,
302 int misalignment,
303 bool is_packed);
304 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
305 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
306 aarch64_addr_query_type);
307 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
309 /* The processor for which instructions should be scheduled. */
310 enum aarch64_processor aarch64_tune = cortexa53;
312 /* Mask to specify which instruction scheduling options should be used. */
313 uint64_t aarch64_tune_flags = 0;
315 /* Global flag for PC relative loads. */
316 bool aarch64_pcrelative_literal_loads;
318 /* Global flag for whether frame pointer is enabled. */
319 bool aarch64_use_frame_pointer;
321 #define BRANCH_PROTECT_STR_MAX 255
322 char *accepted_branch_protection_string = NULL;
324 static enum aarch64_parse_opt_result
325 aarch64_parse_branch_protection (const char*, char**);
327 /* Support for command line parsing of boolean flags in the tuning
328 structures. */
329 struct aarch64_flag_desc
331 const char* name;
332 unsigned int flag;
335 #define AARCH64_FUSION_PAIR(name, internal_name) \
336 { name, AARCH64_FUSE_##internal_name },
337 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
339 { "none", AARCH64_FUSE_NOTHING },
340 #include "aarch64-fusion-pairs.def"
341 { "all", AARCH64_FUSE_ALL },
342 { NULL, AARCH64_FUSE_NOTHING }
345 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
346 { name, AARCH64_EXTRA_TUNE_##internal_name },
347 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
349 { "none", AARCH64_EXTRA_TUNE_NONE },
350 #include "aarch64-tuning-flags.def"
351 { "all", AARCH64_EXTRA_TUNE_ALL },
352 { NULL, AARCH64_EXTRA_TUNE_NONE }
355 /* Tuning parameters. */
357 static const struct cpu_addrcost_table generic_addrcost_table =
360 1, /* hi */
361 0, /* si */
362 0, /* di */
363 1, /* ti */
365 0, /* pre_modify */
366 0, /* post_modify */
367 0, /* post_modify_ld3_st3 */
368 0, /* post_modify_ld4_st4 */
369 0, /* register_offset */
370 0, /* register_sextend */
371 0, /* register_zextend */
372 0 /* imm_offset */
375 static const struct cpu_addrcost_table exynosm1_addrcost_table =
378 0, /* hi */
379 0, /* si */
380 0, /* di */
381 2, /* ti */
383 0, /* pre_modify */
384 0, /* post_modify */
385 0, /* post_modify_ld3_st3 */
386 0, /* post_modify_ld4_st4 */
387 1, /* register_offset */
388 1, /* register_sextend */
389 2, /* register_zextend */
390 0, /* imm_offset */
393 static const struct cpu_addrcost_table xgene1_addrcost_table =
396 1, /* hi */
397 0, /* si */
398 0, /* di */
399 1, /* ti */
401 1, /* pre_modify */
402 1, /* post_modify */
403 1, /* post_modify_ld3_st3 */
404 1, /* post_modify_ld4_st4 */
405 0, /* register_offset */
406 1, /* register_sextend */
407 1, /* register_zextend */
408 0, /* imm_offset */
411 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
414 1, /* hi */
415 1, /* si */
416 1, /* di */
417 2, /* ti */
419 0, /* pre_modify */
420 0, /* post_modify */
421 0, /* post_modify_ld3_st3 */
422 0, /* post_modify_ld4_st4 */
423 2, /* register_offset */
424 3, /* register_sextend */
425 3, /* register_zextend */
426 0, /* imm_offset */
429 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
432 1, /* hi */
433 1, /* si */
434 1, /* di */
435 2, /* ti */
437 0, /* pre_modify */
438 0, /* post_modify */
439 0, /* post_modify_ld3_st3 */
440 0, /* post_modify_ld4_st4 */
441 2, /* register_offset */
442 3, /* register_sextend */
443 3, /* register_zextend */
444 0, /* imm_offset */
447 static const struct cpu_addrcost_table tsv110_addrcost_table =
450 1, /* hi */
451 0, /* si */
452 0, /* di */
453 1, /* ti */
455 0, /* pre_modify */
456 0, /* post_modify */
457 0, /* post_modify_ld3_st3 */
458 0, /* post_modify_ld4_st4 */
459 0, /* register_offset */
460 1, /* register_sextend */
461 1, /* register_zextend */
462 0, /* imm_offset */
465 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
468 1, /* hi */
469 1, /* si */
470 1, /* di */
471 2, /* ti */
473 1, /* pre_modify */
474 1, /* post_modify */
475 1, /* post_modify_ld3_st3 */
476 1, /* post_modify_ld4_st4 */
477 3, /* register_offset */
478 3, /* register_sextend */
479 3, /* register_zextend */
480 2, /* imm_offset */
483 static const struct cpu_addrcost_table a64fx_addrcost_table =
486 1, /* hi */
487 1, /* si */
488 1, /* di */
489 2, /* ti */
491 0, /* pre_modify */
492 0, /* post_modify */
493 0, /* post_modify_ld3_st3 */
494 0, /* post_modify_ld4_st4 */
495 2, /* register_offset */
496 3, /* register_sextend */
497 3, /* register_zextend */
498 0, /* imm_offset */
501 static const struct cpu_addrcost_table neoversev1_addrcost_table =
504 1, /* hi */
505 0, /* si */
506 0, /* di */
507 1, /* ti */
509 0, /* pre_modify */
510 0, /* post_modify */
511 3, /* post_modify_ld3_st3 */
512 3, /* post_modify_ld4_st4 */
513 0, /* register_offset */
514 0, /* register_sextend */
515 0, /* register_zextend */
516 0 /* imm_offset */
519 static const struct cpu_addrcost_table neoversen2_addrcost_table =
522 1, /* hi */
523 0, /* si */
524 0, /* di */
525 1, /* ti */
527 0, /* pre_modify */
528 0, /* post_modify */
529 2, /* post_modify_ld3_st3 */
530 2, /* post_modify_ld4_st4 */
531 0, /* register_offset */
532 0, /* register_sextend */
533 0, /* register_zextend */
534 0 /* imm_offset */
537 static const struct cpu_addrcost_table demeter_addrcost_table =
540 1, /* hi */
541 0, /* si */
542 0, /* di */
543 1, /* ti */
545 0, /* pre_modify */
546 0, /* post_modify */
547 2, /* post_modify_ld3_st3 */
548 2, /* post_modify_ld4_st4 */
549 0, /* register_offset */
550 0, /* register_sextend */
551 0, /* register_zextend */
552 0 /* imm_offset */
555 static const struct cpu_regmove_cost generic_regmove_cost =
557 1, /* GP2GP */
558 /* Avoid the use of slow int<->fp moves for spilling by setting
559 their cost higher than memmov_cost. */
560 5, /* GP2FP */
561 5, /* FP2GP */
562 2 /* FP2FP */
565 static const struct cpu_regmove_cost cortexa57_regmove_cost =
567 1, /* GP2GP */
568 /* Avoid the use of slow int<->fp moves for spilling by setting
569 their cost higher than memmov_cost. */
570 5, /* GP2FP */
571 5, /* FP2GP */
572 2 /* FP2FP */
575 static const struct cpu_regmove_cost cortexa53_regmove_cost =
577 1, /* GP2GP */
578 /* Avoid the use of slow int<->fp moves for spilling by setting
579 their cost higher than memmov_cost. */
580 5, /* GP2FP */
581 5, /* FP2GP */
582 2 /* FP2FP */
585 static const struct cpu_regmove_cost exynosm1_regmove_cost =
587 1, /* GP2GP */
588 /* Avoid the use of slow int<->fp moves for spilling by setting
589 their cost higher than memmov_cost (actual, 4 and 9). */
590 9, /* GP2FP */
591 9, /* FP2GP */
592 1 /* FP2FP */
595 static const struct cpu_regmove_cost thunderx_regmove_cost =
597 2, /* GP2GP */
598 2, /* GP2FP */
599 6, /* FP2GP */
600 4 /* FP2FP */
603 static const struct cpu_regmove_cost xgene1_regmove_cost =
605 1, /* GP2GP */
606 /* Avoid the use of slow int<->fp moves for spilling by setting
607 their cost higher than memmov_cost. */
608 8, /* GP2FP */
609 8, /* FP2GP */
610 2 /* FP2FP */
613 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
615 2, /* GP2GP */
616 /* Avoid the use of int<->fp moves for spilling. */
617 6, /* GP2FP */
618 6, /* FP2GP */
619 4 /* FP2FP */
622 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
624 1, /* GP2GP */
625 /* Avoid the use of int<->fp moves for spilling. */
626 5, /* GP2FP */
627 6, /* FP2GP */
628 3, /* FP2FP */
631 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
633 1, /* GP2GP */
634 /* Avoid the use of int<->fp moves for spilling. */
635 4, /* GP2FP */
636 5, /* FP2GP */
637 4 /* FP2FP */
640 static const struct cpu_regmove_cost tsv110_regmove_cost =
642 1, /* GP2GP */
643 /* Avoid the use of slow int<->fp moves for spilling by setting
644 their cost higher than memmov_cost. */
645 2, /* GP2FP */
646 3, /* FP2GP */
647 2 /* FP2FP */
650 static const struct cpu_regmove_cost a64fx_regmove_cost =
652 1, /* GP2GP */
653 /* Avoid the use of slow int<->fp moves for spilling by setting
654 their cost higher than memmov_cost. */
655 5, /* GP2FP */
656 7, /* FP2GP */
657 2 /* FP2FP */
660 static const struct cpu_regmove_cost neoversen2_regmove_cost =
662 1, /* GP2GP */
663 /* Spilling to int<->fp instead of memory is recommended so set
664 realistic costs compared to memmov_cost. */
665 3, /* GP2FP */
666 2, /* FP2GP */
667 2 /* FP2FP */
670 static const struct cpu_regmove_cost neoversev1_regmove_cost =
672 1, /* GP2GP */
673 /* Spilling to int<->fp instead of memory is recommended so set
674 realistic costs compared to memmov_cost. */
675 3, /* GP2FP */
676 2, /* FP2GP */
677 2 /* FP2FP */
680 static const struct cpu_regmove_cost demeter_regmove_cost =
682 1, /* GP2GP */
683 /* Spilling to int<->fp instead of memory is recommended so set
684 realistic costs compared to memmov_cost. */
685 3, /* GP2FP */
686 2, /* FP2GP */
687 2 /* FP2FP */
690 /* Generic costs for Advanced SIMD vector operations. */
691 static const advsimd_vec_cost generic_advsimd_vector_cost =
693 1, /* int_stmt_cost */
694 1, /* fp_stmt_cost */
695 0, /* ld2_st2_permute_cost */
696 0, /* ld3_st3_permute_cost */
697 0, /* ld4_st4_permute_cost */
698 2, /* permute_cost */
699 2, /* reduc_i8_cost */
700 2, /* reduc_i16_cost */
701 2, /* reduc_i32_cost */
702 2, /* reduc_i64_cost */
703 2, /* reduc_f16_cost */
704 2, /* reduc_f32_cost */
705 2, /* reduc_f64_cost */
706 2, /* store_elt_extra_cost */
707 2, /* vec_to_scalar_cost */
708 1, /* scalar_to_vec_cost */
709 1, /* align_load_cost */
710 1, /* unalign_load_cost */
711 1, /* unalign_store_cost */
712 1 /* store_cost */
715 /* Generic costs for SVE vector operations. */
716 static const sve_vec_cost generic_sve_vector_cost =
719 1, /* int_stmt_cost */
720 1, /* fp_stmt_cost */
721 0, /* ld2_st2_permute_cost */
722 0, /* ld3_st3_permute_cost */
723 0, /* ld4_st4_permute_cost */
724 2, /* permute_cost */
725 2, /* reduc_i8_cost */
726 2, /* reduc_i16_cost */
727 2, /* reduc_i32_cost */
728 2, /* reduc_i64_cost */
729 2, /* reduc_f16_cost */
730 2, /* reduc_f32_cost */
731 2, /* reduc_f64_cost */
732 2, /* store_elt_extra_cost */
733 2, /* vec_to_scalar_cost */
734 1, /* scalar_to_vec_cost */
735 1, /* align_load_cost */
736 1, /* unalign_load_cost */
737 1, /* unalign_store_cost */
738 1 /* store_cost */
740 2, /* clast_cost */
741 2, /* fadda_f16_cost */
742 2, /* fadda_f32_cost */
743 2, /* fadda_f64_cost */
744 4, /* gather_load_x32_cost */
745 2, /* gather_load_x64_cost */
746 1 /* scatter_store_elt_cost */
749 /* Generic costs for vector insn classes. */
750 static const struct cpu_vector_cost generic_vector_cost =
752 1, /* scalar_int_stmt_cost */
753 1, /* scalar_fp_stmt_cost */
754 1, /* scalar_load_cost */
755 1, /* scalar_store_cost */
756 3, /* cond_taken_branch_cost */
757 1, /* cond_not_taken_branch_cost */
758 &generic_advsimd_vector_cost, /* advsimd */
759 &generic_sve_vector_cost, /* sve */
760 nullptr /* issue_info */
763 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
765 2, /* int_stmt_cost */
766 5, /* fp_stmt_cost */
767 0, /* ld2_st2_permute_cost */
768 0, /* ld3_st3_permute_cost */
769 0, /* ld4_st4_permute_cost */
770 3, /* permute_cost */
771 13, /* reduc_i8_cost */
772 13, /* reduc_i16_cost */
773 13, /* reduc_i32_cost */
774 13, /* reduc_i64_cost */
775 13, /* reduc_f16_cost */
776 13, /* reduc_f32_cost */
777 13, /* reduc_f64_cost */
778 13, /* store_elt_extra_cost */
779 13, /* vec_to_scalar_cost */
780 4, /* scalar_to_vec_cost */
781 6, /* align_load_cost */
782 6, /* unalign_load_cost */
783 1, /* unalign_store_cost */
784 1 /* store_cost */
787 static const sve_vec_cost a64fx_sve_vector_cost =
790 2, /* int_stmt_cost */
791 5, /* fp_stmt_cost */
792 0, /* ld2_st2_permute_cost */
793 0, /* ld3_st3_permute_cost */
794 0, /* ld4_st4_permute_cost */
795 3, /* permute_cost */
796 13, /* reduc_i8_cost */
797 13, /* reduc_i16_cost */
798 13, /* reduc_i32_cost */
799 13, /* reduc_i64_cost */
800 13, /* reduc_f16_cost */
801 13, /* reduc_f32_cost */
802 13, /* reduc_f64_cost */
803 13, /* store_elt_extra_cost */
804 13, /* vec_to_scalar_cost */
805 4, /* scalar_to_vec_cost */
806 6, /* align_load_cost */
807 6, /* unalign_load_cost */
808 1, /* unalign_store_cost */
809 1 /* store_cost */
811 13, /* clast_cost */
812 13, /* fadda_f16_cost */
813 13, /* fadda_f32_cost */
814 13, /* fadda_f64_cost */
815 64, /* gather_load_x32_cost */
816 32, /* gather_load_x64_cost */
817 1 /* scatter_store_elt_cost */
820 static const struct cpu_vector_cost a64fx_vector_cost =
822 1, /* scalar_int_stmt_cost */
823 5, /* scalar_fp_stmt_cost */
824 4, /* scalar_load_cost */
825 1, /* scalar_store_cost */
826 3, /* cond_taken_branch_cost */
827 1, /* cond_not_taken_branch_cost */
828 &a64fx_advsimd_vector_cost, /* advsimd */
829 &a64fx_sve_vector_cost, /* sve */
830 nullptr /* issue_info */
833 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
835 1, /* int_stmt_cost */
836 3, /* fp_stmt_cost */
837 0, /* ld2_st2_permute_cost */
838 0, /* ld3_st3_permute_cost */
839 0, /* ld4_st4_permute_cost */
840 2, /* permute_cost */
841 1, /* reduc_i8_cost */
842 1, /* reduc_i16_cost */
843 1, /* reduc_i32_cost */
844 1, /* reduc_i64_cost */
845 1, /* reduc_f16_cost */
846 1, /* reduc_f32_cost */
847 1, /* reduc_f64_cost */
848 1, /* store_elt_extra_cost */
849 1, /* vec_to_scalar_cost */
850 1, /* scalar_to_vec_cost */
851 1, /* align_load_cost */
852 1, /* unalign_load_cost */
853 1, /* unalign_store_cost */
854 1 /* store_cost */
857 /* QDF24XX costs for vector insn classes. */
858 static const struct cpu_vector_cost qdf24xx_vector_cost =
860 1, /* scalar_int_stmt_cost */
861 1, /* scalar_fp_stmt_cost */
862 1, /* scalar_load_cost */
863 1, /* scalar_store_cost */
864 3, /* cond_taken_branch_cost */
865 1, /* cond_not_taken_branch_cost */
866 &qdf24xx_advsimd_vector_cost, /* advsimd */
867 nullptr, /* sve */
868 nullptr /* issue_info */
872 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
874 4, /* int_stmt_cost */
875 1, /* fp_stmt_cost */
876 0, /* ld2_st2_permute_cost */
877 0, /* ld3_st3_permute_cost */
878 0, /* ld4_st4_permute_cost */
879 4, /* permute_cost */
880 2, /* reduc_i8_cost */
881 2, /* reduc_i16_cost */
882 2, /* reduc_i32_cost */
883 2, /* reduc_i64_cost */
884 2, /* reduc_f16_cost */
885 2, /* reduc_f32_cost */
886 2, /* reduc_f64_cost */
887 2, /* store_elt_extra_cost */
888 2, /* vec_to_scalar_cost */
889 2, /* scalar_to_vec_cost */
890 3, /* align_load_cost */
891 5, /* unalign_load_cost */
892 5, /* unalign_store_cost */
893 1 /* store_cost */
896 /* ThunderX costs for vector insn classes. */
897 static const struct cpu_vector_cost thunderx_vector_cost =
899 1, /* scalar_int_stmt_cost */
900 1, /* scalar_fp_stmt_cost */
901 3, /* scalar_load_cost */
902 1, /* scalar_store_cost */
903 3, /* cond_taken_branch_cost */
904 3, /* cond_not_taken_branch_cost */
905 &thunderx_advsimd_vector_cost, /* advsimd */
906 nullptr, /* sve */
907 nullptr /* issue_info */
910 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
912 2, /* int_stmt_cost */
913 2, /* fp_stmt_cost */
914 0, /* ld2_st2_permute_cost */
915 0, /* ld3_st3_permute_cost */
916 0, /* ld4_st4_permute_cost */
917 2, /* permute_cost */
918 3, /* reduc_i8_cost */
919 3, /* reduc_i16_cost */
920 3, /* reduc_i32_cost */
921 3, /* reduc_i64_cost */
922 3, /* reduc_f16_cost */
923 3, /* reduc_f32_cost */
924 3, /* reduc_f64_cost */
925 3, /* store_elt_extra_cost */
926 3, /* vec_to_scalar_cost */
927 2, /* scalar_to_vec_cost */
928 5, /* align_load_cost */
929 5, /* unalign_load_cost */
930 1, /* unalign_store_cost */
931 1 /* store_cost */
934 static const struct cpu_vector_cost tsv110_vector_cost =
936 1, /* scalar_int_stmt_cost */
937 1, /* scalar_fp_stmt_cost */
938 5, /* scalar_load_cost */
939 1, /* scalar_store_cost */
940 1, /* cond_taken_branch_cost */
941 1, /* cond_not_taken_branch_cost */
942 &tsv110_advsimd_vector_cost, /* advsimd */
943 nullptr, /* sve */
944 nullptr /* issue_info */
947 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
949 2, /* int_stmt_cost */
950 2, /* fp_stmt_cost */
951 0, /* ld2_st2_permute_cost */
952 0, /* ld3_st3_permute_cost */
953 0, /* ld4_st4_permute_cost */
954 3, /* permute_cost */
955 8, /* reduc_i8_cost */
956 8, /* reduc_i16_cost */
957 8, /* reduc_i32_cost */
958 8, /* reduc_i64_cost */
959 8, /* reduc_f16_cost */
960 8, /* reduc_f32_cost */
961 8, /* reduc_f64_cost */
962 8, /* store_elt_extra_cost */
963 8, /* vec_to_scalar_cost */
964 8, /* scalar_to_vec_cost */
965 4, /* align_load_cost */
966 4, /* unalign_load_cost */
967 1, /* unalign_store_cost */
968 1 /* store_cost */
971 /* Cortex-A57 costs for vector insn classes. */
972 static const struct cpu_vector_cost cortexa57_vector_cost =
974 1, /* scalar_int_stmt_cost */
975 1, /* scalar_fp_stmt_cost */
976 4, /* scalar_load_cost */
977 1, /* scalar_store_cost */
978 1, /* cond_taken_branch_cost */
979 1, /* cond_not_taken_branch_cost */
980 &cortexa57_advsimd_vector_cost, /* advsimd */
981 nullptr, /* sve */
982 nullptr /* issue_info */
985 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
987 3, /* int_stmt_cost */
988 3, /* fp_stmt_cost */
989 0, /* ld2_st2_permute_cost */
990 0, /* ld3_st3_permute_cost */
991 0, /* ld4_st4_permute_cost */
992 3, /* permute_cost */
993 3, /* reduc_i8_cost */
994 3, /* reduc_i16_cost */
995 3, /* reduc_i32_cost */
996 3, /* reduc_i64_cost */
997 3, /* reduc_f16_cost */
998 3, /* reduc_f32_cost */
999 3, /* reduc_f64_cost */
1000 3, /* store_elt_extra_cost */
1001 3, /* vec_to_scalar_cost */
1002 3, /* scalar_to_vec_cost */
1003 5, /* align_load_cost */
1004 5, /* unalign_load_cost */
1005 1, /* unalign_store_cost */
1006 1 /* store_cost */
1009 static const struct cpu_vector_cost exynosm1_vector_cost =
1011 1, /* scalar_int_stmt_cost */
1012 1, /* scalar_fp_stmt_cost */
1013 5, /* scalar_load_cost */
1014 1, /* scalar_store_cost */
1015 1, /* cond_taken_branch_cost */
1016 1, /* cond_not_taken_branch_cost */
1017 &exynosm1_advsimd_vector_cost, /* advsimd */
1018 nullptr, /* sve */
1019 nullptr /* issue_info */
1022 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1024 2, /* int_stmt_cost */
1025 2, /* fp_stmt_cost */
1026 0, /* ld2_st2_permute_cost */
1027 0, /* ld3_st3_permute_cost */
1028 0, /* ld4_st4_permute_cost */
1029 2, /* permute_cost */
1030 4, /* reduc_i8_cost */
1031 4, /* reduc_i16_cost */
1032 4, /* reduc_i32_cost */
1033 4, /* reduc_i64_cost */
1034 4, /* reduc_f16_cost */
1035 4, /* reduc_f32_cost */
1036 4, /* reduc_f64_cost */
1037 4, /* store_elt_extra_cost */
1038 4, /* vec_to_scalar_cost */
1039 4, /* scalar_to_vec_cost */
1040 10, /* align_load_cost */
1041 10, /* unalign_load_cost */
1042 2, /* unalign_store_cost */
1043 2 /* store_cost */
1046 /* Generic costs for vector insn classes. */
1047 static const struct cpu_vector_cost xgene1_vector_cost =
1049 1, /* scalar_int_stmt_cost */
1050 1, /* scalar_fp_stmt_cost */
1051 5, /* scalar_load_cost */
1052 1, /* scalar_store_cost */
1053 2, /* cond_taken_branch_cost */
1054 1, /* cond_not_taken_branch_cost */
1055 &xgene1_advsimd_vector_cost, /* advsimd */
1056 nullptr, /* sve */
1057 nullptr /* issue_info */
1060 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1062 4, /* int_stmt_cost */
1063 5, /* fp_stmt_cost */
1064 0, /* ld2_st2_permute_cost */
1065 0, /* ld3_st3_permute_cost */
1066 0, /* ld4_st4_permute_cost */
1067 10, /* permute_cost */
1068 6, /* reduc_i8_cost */
1069 6, /* reduc_i16_cost */
1070 6, /* reduc_i32_cost */
1071 6, /* reduc_i64_cost */
1072 6, /* reduc_f16_cost */
1073 6, /* reduc_f32_cost */
1074 6, /* reduc_f64_cost */
1075 6, /* store_elt_extra_cost */
1076 6, /* vec_to_scalar_cost */
1077 5, /* scalar_to_vec_cost */
1078 4, /* align_load_cost */
1079 4, /* unalign_load_cost */
1080 1, /* unalign_store_cost */
1081 1 /* store_cost */
1084 /* Costs for vector insn classes for Vulcan. */
1085 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1087 1, /* scalar_int_stmt_cost */
1088 6, /* scalar_fp_stmt_cost */
1089 4, /* scalar_load_cost */
1090 1, /* scalar_store_cost */
1091 2, /* cond_taken_branch_cost */
1092 1, /* cond_not_taken_branch_cost */
1093 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1094 nullptr, /* sve */
1095 nullptr /* issue_info */
1098 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1100 5, /* int_stmt_cost */
1101 5, /* fp_stmt_cost */
1102 0, /* ld2_st2_permute_cost */
1103 0, /* ld3_st3_permute_cost */
1104 0, /* ld4_st4_permute_cost */
1105 10, /* permute_cost */
1106 5, /* reduc_i8_cost */
1107 5, /* reduc_i16_cost */
1108 5, /* reduc_i32_cost */
1109 5, /* reduc_i64_cost */
1110 5, /* reduc_f16_cost */
1111 5, /* reduc_f32_cost */
1112 5, /* reduc_f64_cost */
1113 5, /* store_elt_extra_cost */
1114 5, /* vec_to_scalar_cost */
1115 5, /* scalar_to_vec_cost */
1116 4, /* align_load_cost */
1117 4, /* unalign_load_cost */
1118 4, /* unalign_store_cost */
1119 4 /* store_cost */
1122 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1124 1, /* scalar_int_stmt_cost */
1125 5, /* scalar_fp_stmt_cost */
1126 4, /* scalar_load_cost */
1127 1, /* scalar_store_cost */
1128 2, /* cond_taken_branch_cost */
1129 1, /* cond_not_taken_branch_cost */
1130 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1131 nullptr, /* sve */
1132 nullptr /* issue_info */
1135 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1137 3, /* int_stmt_cost */
1138 3, /* fp_stmt_cost */
1139 0, /* ld2_st2_permute_cost */
1140 0, /* ld3_st3_permute_cost */
1141 0, /* ld4_st4_permute_cost */
1142 2, /* permute_cost */
1143 12, /* reduc_i8_cost */
1144 9, /* reduc_i16_cost */
1145 6, /* reduc_i32_cost */
1146 5, /* reduc_i64_cost */
1147 9, /* reduc_f16_cost */
1148 6, /* reduc_f32_cost */
1149 5, /* reduc_f64_cost */
1150 8, /* store_elt_extra_cost */
1151 6, /* vec_to_scalar_cost */
1152 7, /* scalar_to_vec_cost */
1153 5, /* align_load_cost */
1154 5, /* unalign_load_cost */
1155 2, /* unalign_store_cost */
1156 2 /* store_cost */
1159 /* Ampere-1 costs for vector insn classes. */
1160 static const struct cpu_vector_cost ampere1_vector_cost =
1162 1, /* scalar_int_stmt_cost */
1163 1, /* scalar_fp_stmt_cost */
1164 4, /* scalar_load_cost */
1165 1, /* scalar_store_cost */
1166 1, /* cond_taken_branch_cost */
1167 1, /* cond_not_taken_branch_cost */
1168 &ampere1_advsimd_vector_cost, /* advsimd */
1169 nullptr, /* sve */
1170 nullptr /* issue_info */
1173 /* Generic costs for branch instructions. */
1174 static const struct cpu_branch_cost generic_branch_cost =
1176 1, /* Predictable. */
1177 3 /* Unpredictable. */
1180 /* Generic approximation modes. */
1181 static const cpu_approx_modes generic_approx_modes =
1183 AARCH64_APPROX_NONE, /* division */
1184 AARCH64_APPROX_NONE, /* sqrt */
1185 AARCH64_APPROX_NONE /* recip_sqrt */
1188 /* Approximation modes for Exynos M1. */
1189 static const cpu_approx_modes exynosm1_approx_modes =
1191 AARCH64_APPROX_NONE, /* division */
1192 AARCH64_APPROX_ALL, /* sqrt */
1193 AARCH64_APPROX_ALL /* recip_sqrt */
1196 /* Approximation modes for X-Gene 1. */
1197 static const cpu_approx_modes xgene1_approx_modes =
1199 AARCH64_APPROX_NONE, /* division */
1200 AARCH64_APPROX_NONE, /* sqrt */
1201 AARCH64_APPROX_ALL /* recip_sqrt */
1204 /* Generic prefetch settings (which disable prefetch). */
1205 static const cpu_prefetch_tune generic_prefetch_tune =
1207 0, /* num_slots */
1208 -1, /* l1_cache_size */
1209 -1, /* l1_cache_line_size */
1210 -1, /* l2_cache_size */
1211 true, /* prefetch_dynamic_strides */
1212 -1, /* minimum_stride */
1213 -1 /* default_opt_level */
1216 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1218 0, /* num_slots */
1219 -1, /* l1_cache_size */
1220 64, /* l1_cache_line_size */
1221 -1, /* l2_cache_size */
1222 true, /* prefetch_dynamic_strides */
1223 -1, /* minimum_stride */
1224 -1 /* default_opt_level */
1227 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1229 4, /* num_slots */
1230 32, /* l1_cache_size */
1231 64, /* l1_cache_line_size */
1232 512, /* l2_cache_size */
1233 false, /* prefetch_dynamic_strides */
1234 2048, /* minimum_stride */
1235 3 /* default_opt_level */
1238 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1240 8, /* num_slots */
1241 32, /* l1_cache_size */
1242 128, /* l1_cache_line_size */
1243 16*1024, /* l2_cache_size */
1244 true, /* prefetch_dynamic_strides */
1245 -1, /* minimum_stride */
1246 3 /* default_opt_level */
1249 static const cpu_prefetch_tune thunderx_prefetch_tune =
1251 8, /* num_slots */
1252 32, /* l1_cache_size */
1253 128, /* l1_cache_line_size */
1254 -1, /* l2_cache_size */
1255 true, /* prefetch_dynamic_strides */
1256 -1, /* minimum_stride */
1257 -1 /* default_opt_level */
1260 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1262 8, /* num_slots */
1263 32, /* l1_cache_size */
1264 64, /* l1_cache_line_size */
1265 256, /* l2_cache_size */
1266 true, /* prefetch_dynamic_strides */
1267 -1, /* minimum_stride */
1268 -1 /* default_opt_level */
1271 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1273 8, /* num_slots */
1274 32, /* l1_cache_size */
1275 64, /* l1_cache_line_size */
1276 256, /* l2_cache_size */
1277 true, /* prefetch_dynamic_strides */
1278 -1, /* minimum_stride */
1279 -1 /* default_opt_level */
1282 static const cpu_prefetch_tune tsv110_prefetch_tune =
1284 0, /* num_slots */
1285 64, /* l1_cache_size */
1286 64, /* l1_cache_line_size */
1287 512, /* l2_cache_size */
1288 true, /* prefetch_dynamic_strides */
1289 -1, /* minimum_stride */
1290 -1 /* default_opt_level */
1293 static const cpu_prefetch_tune xgene1_prefetch_tune =
1295 8, /* num_slots */
1296 32, /* l1_cache_size */
1297 64, /* l1_cache_line_size */
1298 256, /* l2_cache_size */
1299 true, /* prefetch_dynamic_strides */
1300 -1, /* minimum_stride */
1301 -1 /* default_opt_level */
1304 static const cpu_prefetch_tune a64fx_prefetch_tune =
1306 8, /* num_slots */
1307 64, /* l1_cache_size */
1308 256, /* l1_cache_line_size */
1309 32768, /* l2_cache_size */
1310 true, /* prefetch_dynamic_strides */
1311 -1, /* minimum_stride */
1312 -1 /* default_opt_level */
1315 static const cpu_prefetch_tune ampere1_prefetch_tune =
1317 0, /* num_slots */
1318 64, /* l1_cache_size */
1319 64, /* l1_cache_line_size */
1320 2048, /* l2_cache_size */
1321 true, /* prefetch_dynamic_strides */
1322 -1, /* minimum_stride */
1323 -1 /* default_opt_level */
1326 static const struct tune_params generic_tunings =
1328 &cortexa57_extra_costs,
1329 &generic_addrcost_table,
1330 &generic_regmove_cost,
1331 &generic_vector_cost,
1332 &generic_branch_cost,
1333 &generic_approx_modes,
1334 SVE_NOT_IMPLEMENTED, /* sve_width */
1335 { 4, /* load_int. */
1336 4, /* store_int. */
1337 4, /* load_fp. */
1338 4, /* store_fp. */
1339 4, /* load_pred. */
1340 4 /* store_pred. */
1341 }, /* memmov_cost. */
1342 2, /* issue_rate */
1343 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1344 "16:12", /* function_align. */
1345 "4", /* jump_align. */
1346 "8", /* loop_align. */
1347 2, /* int_reassoc_width. */
1348 4, /* fp_reassoc_width. */
1349 1, /* vec_reassoc_width. */
1350 2, /* min_div_recip_mul_sf. */
1351 2, /* min_div_recip_mul_df. */
1352 0, /* max_case_values. */
1353 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1354 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1355 Neoverse V1. It does not have a noticeable effect on A64FX and should
1356 have at most a very minor effect on SVE2 cores. */
1357 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1358 &generic_prefetch_tune
1361 static const struct tune_params cortexa35_tunings =
1363 &cortexa53_extra_costs,
1364 &generic_addrcost_table,
1365 &cortexa53_regmove_cost,
1366 &generic_vector_cost,
1367 &generic_branch_cost,
1368 &generic_approx_modes,
1369 SVE_NOT_IMPLEMENTED, /* sve_width */
1370 { 4, /* load_int. */
1371 4, /* store_int. */
1372 4, /* load_fp. */
1373 4, /* store_fp. */
1374 4, /* load_pred. */
1375 4 /* store_pred. */
1376 }, /* memmov_cost. */
1377 1, /* issue_rate */
1378 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1379 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1380 "16", /* function_align. */
1381 "4", /* jump_align. */
1382 "8", /* loop_align. */
1383 2, /* int_reassoc_width. */
1384 4, /* fp_reassoc_width. */
1385 1, /* vec_reassoc_width. */
1386 2, /* min_div_recip_mul_sf. */
1387 2, /* min_div_recip_mul_df. */
1388 0, /* max_case_values. */
1389 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1390 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1391 &generic_prefetch_tune
1394 static const struct tune_params cortexa53_tunings =
1396 &cortexa53_extra_costs,
1397 &generic_addrcost_table,
1398 &cortexa53_regmove_cost,
1399 &generic_vector_cost,
1400 &generic_branch_cost,
1401 &generic_approx_modes,
1402 SVE_NOT_IMPLEMENTED, /* sve_width */
1403 { 4, /* load_int. */
1404 4, /* store_int. */
1405 4, /* load_fp. */
1406 4, /* store_fp. */
1407 4, /* load_pred. */
1408 4 /* store_pred. */
1409 }, /* memmov_cost. */
1410 2, /* issue_rate */
1411 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1412 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1413 "16", /* function_align. */
1414 "4", /* jump_align. */
1415 "8", /* loop_align. */
1416 2, /* int_reassoc_width. */
1417 4, /* fp_reassoc_width. */
1418 1, /* vec_reassoc_width. */
1419 2, /* min_div_recip_mul_sf. */
1420 2, /* min_div_recip_mul_df. */
1421 0, /* max_case_values. */
1422 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1423 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1424 &generic_prefetch_tune
1427 static const struct tune_params cortexa57_tunings =
1429 &cortexa57_extra_costs,
1430 &generic_addrcost_table,
1431 &cortexa57_regmove_cost,
1432 &cortexa57_vector_cost,
1433 &generic_branch_cost,
1434 &generic_approx_modes,
1435 SVE_NOT_IMPLEMENTED, /* sve_width */
1436 { 4, /* load_int. */
1437 4, /* store_int. */
1438 4, /* load_fp. */
1439 4, /* store_fp. */
1440 4, /* load_pred. */
1441 4 /* store_pred. */
1442 }, /* memmov_cost. */
1443 3, /* issue_rate */
1444 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1445 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1446 "16", /* function_align. */
1447 "4", /* jump_align. */
1448 "8", /* loop_align. */
1449 2, /* int_reassoc_width. */
1450 4, /* fp_reassoc_width. */
1451 1, /* vec_reassoc_width. */
1452 2, /* min_div_recip_mul_sf. */
1453 2, /* min_div_recip_mul_df. */
1454 0, /* max_case_values. */
1455 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1456 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1457 &generic_prefetch_tune
1460 static const struct tune_params cortexa72_tunings =
1462 &cortexa57_extra_costs,
1463 &generic_addrcost_table,
1464 &cortexa57_regmove_cost,
1465 &cortexa57_vector_cost,
1466 &generic_branch_cost,
1467 &generic_approx_modes,
1468 SVE_NOT_IMPLEMENTED, /* sve_width */
1469 { 4, /* load_int. */
1470 4, /* store_int. */
1471 4, /* load_fp. */
1472 4, /* store_fp. */
1473 4, /* load_pred. */
1474 4 /* store_pred. */
1475 }, /* memmov_cost. */
1476 3, /* issue_rate */
1477 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1478 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1479 "16", /* function_align. */
1480 "4", /* jump_align. */
1481 "8", /* loop_align. */
1482 2, /* int_reassoc_width. */
1483 4, /* fp_reassoc_width. */
1484 1, /* vec_reassoc_width. */
1485 2, /* min_div_recip_mul_sf. */
1486 2, /* min_div_recip_mul_df. */
1487 0, /* max_case_values. */
1488 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1489 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1490 &generic_prefetch_tune
1493 static const struct tune_params cortexa73_tunings =
1495 &cortexa57_extra_costs,
1496 &generic_addrcost_table,
1497 &cortexa57_regmove_cost,
1498 &cortexa57_vector_cost,
1499 &generic_branch_cost,
1500 &generic_approx_modes,
1501 SVE_NOT_IMPLEMENTED, /* sve_width */
1502 { 4, /* load_int. */
1503 4, /* store_int. */
1504 4, /* load_fp. */
1505 4, /* store_fp. */
1506 4, /* load_pred. */
1507 4 /* store_pred. */
1508 }, /* memmov_cost. */
1509 2, /* issue_rate. */
1510 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1511 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1512 "16", /* function_align. */
1513 "4", /* jump_align. */
1514 "8", /* loop_align. */
1515 2, /* int_reassoc_width. */
1516 4, /* fp_reassoc_width. */
1517 1, /* vec_reassoc_width. */
1518 2, /* min_div_recip_mul_sf. */
1519 2, /* min_div_recip_mul_df. */
1520 0, /* max_case_values. */
1521 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1522 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1523 &generic_prefetch_tune
1528 static const struct tune_params exynosm1_tunings =
1530 &exynosm1_extra_costs,
1531 &exynosm1_addrcost_table,
1532 &exynosm1_regmove_cost,
1533 &exynosm1_vector_cost,
1534 &generic_branch_cost,
1535 &exynosm1_approx_modes,
1536 SVE_NOT_IMPLEMENTED, /* sve_width */
1537 { 4, /* load_int. */
1538 4, /* store_int. */
1539 4, /* load_fp. */
1540 4, /* store_fp. */
1541 4, /* load_pred. */
1542 4 /* store_pred. */
1543 }, /* memmov_cost. */
1544 3, /* issue_rate */
1545 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1546 "4", /* function_align. */
1547 "4", /* jump_align. */
1548 "4", /* loop_align. */
1549 2, /* int_reassoc_width. */
1550 4, /* fp_reassoc_width. */
1551 1, /* vec_reassoc_width. */
1552 2, /* min_div_recip_mul_sf. */
1553 2, /* min_div_recip_mul_df. */
1554 48, /* max_case_values. */
1555 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1556 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1557 &exynosm1_prefetch_tune
1560 static const struct tune_params thunderxt88_tunings =
1562 &thunderx_extra_costs,
1563 &generic_addrcost_table,
1564 &thunderx_regmove_cost,
1565 &thunderx_vector_cost,
1566 &generic_branch_cost,
1567 &generic_approx_modes,
1568 SVE_NOT_IMPLEMENTED, /* sve_width */
1569 { 6, /* load_int. */
1570 6, /* store_int. */
1571 6, /* load_fp. */
1572 6, /* store_fp. */
1573 6, /* load_pred. */
1574 6 /* store_pred. */
1575 }, /* memmov_cost. */
1576 2, /* issue_rate */
1577 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1578 "8", /* function_align. */
1579 "8", /* jump_align. */
1580 "8", /* loop_align. */
1581 2, /* int_reassoc_width. */
1582 4, /* fp_reassoc_width. */
1583 1, /* vec_reassoc_width. */
1584 2, /* min_div_recip_mul_sf. */
1585 2, /* min_div_recip_mul_df. */
1586 0, /* max_case_values. */
1587 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1588 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1589 &thunderxt88_prefetch_tune
1592 static const struct tune_params thunderx_tunings =
1594 &thunderx_extra_costs,
1595 &generic_addrcost_table,
1596 &thunderx_regmove_cost,
1597 &thunderx_vector_cost,
1598 &generic_branch_cost,
1599 &generic_approx_modes,
1600 SVE_NOT_IMPLEMENTED, /* sve_width */
1601 { 6, /* load_int. */
1602 6, /* store_int. */
1603 6, /* load_fp. */
1604 6, /* store_fp. */
1605 6, /* load_pred. */
1606 6 /* store_pred. */
1607 }, /* memmov_cost. */
1608 2, /* issue_rate */
1609 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1610 "8", /* function_align. */
1611 "8", /* jump_align. */
1612 "8", /* loop_align. */
1613 2, /* int_reassoc_width. */
1614 4, /* fp_reassoc_width. */
1615 1, /* vec_reassoc_width. */
1616 2, /* min_div_recip_mul_sf. */
1617 2, /* min_div_recip_mul_df. */
1618 0, /* max_case_values. */
1619 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1620 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1621 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1622 &thunderx_prefetch_tune
1625 static const struct tune_params tsv110_tunings =
1627 &tsv110_extra_costs,
1628 &tsv110_addrcost_table,
1629 &tsv110_regmove_cost,
1630 &tsv110_vector_cost,
1631 &generic_branch_cost,
1632 &generic_approx_modes,
1633 SVE_NOT_IMPLEMENTED, /* sve_width */
1634 { 4, /* load_int. */
1635 4, /* store_int. */
1636 4, /* load_fp. */
1637 4, /* store_fp. */
1638 4, /* load_pred. */
1639 4 /* store_pred. */
1640 }, /* memmov_cost. */
1641 4, /* issue_rate */
1642 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1643 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1644 "16", /* function_align. */
1645 "4", /* jump_align. */
1646 "8", /* loop_align. */
1647 2, /* int_reassoc_width. */
1648 4, /* fp_reassoc_width. */
1649 1, /* vec_reassoc_width. */
1650 2, /* min_div_recip_mul_sf. */
1651 2, /* min_div_recip_mul_df. */
1652 0, /* max_case_values. */
1653 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1654 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1655 &tsv110_prefetch_tune
1658 static const struct tune_params xgene1_tunings =
1660 &xgene1_extra_costs,
1661 &xgene1_addrcost_table,
1662 &xgene1_regmove_cost,
1663 &xgene1_vector_cost,
1664 &generic_branch_cost,
1665 &xgene1_approx_modes,
1666 SVE_NOT_IMPLEMENTED, /* sve_width */
1667 { 6, /* load_int. */
1668 6, /* store_int. */
1669 6, /* load_fp. */
1670 6, /* store_fp. */
1671 6, /* load_pred. */
1672 6 /* store_pred. */
1673 }, /* memmov_cost. */
1674 4, /* issue_rate */
1675 AARCH64_FUSE_NOTHING, /* fusible_ops */
1676 "16", /* function_align. */
1677 "16", /* jump_align. */
1678 "16", /* loop_align. */
1679 2, /* int_reassoc_width. */
1680 4, /* fp_reassoc_width. */
1681 1, /* vec_reassoc_width. */
1682 2, /* min_div_recip_mul_sf. */
1683 2, /* min_div_recip_mul_df. */
1684 17, /* max_case_values. */
1685 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1686 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1687 &xgene1_prefetch_tune
1690 static const struct tune_params emag_tunings =
1692 &xgene1_extra_costs,
1693 &xgene1_addrcost_table,
1694 &xgene1_regmove_cost,
1695 &xgene1_vector_cost,
1696 &generic_branch_cost,
1697 &xgene1_approx_modes,
1698 SVE_NOT_IMPLEMENTED,
1699 { 6, /* load_int. */
1700 6, /* store_int. */
1701 6, /* load_fp. */
1702 6, /* store_fp. */
1703 6, /* load_pred. */
1704 6 /* store_pred. */
1705 }, /* memmov_cost. */
1706 4, /* issue_rate */
1707 AARCH64_FUSE_NOTHING, /* fusible_ops */
1708 "16", /* function_align. */
1709 "16", /* jump_align. */
1710 "16", /* loop_align. */
1711 2, /* int_reassoc_width. */
1712 4, /* fp_reassoc_width. */
1713 1, /* vec_reassoc_width. */
1714 2, /* min_div_recip_mul_sf. */
1715 2, /* min_div_recip_mul_df. */
1716 17, /* max_case_values. */
1717 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1718 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1719 &xgene1_prefetch_tune
1722 static const struct tune_params qdf24xx_tunings =
1724 &qdf24xx_extra_costs,
1725 &qdf24xx_addrcost_table,
1726 &qdf24xx_regmove_cost,
1727 &qdf24xx_vector_cost,
1728 &generic_branch_cost,
1729 &generic_approx_modes,
1730 SVE_NOT_IMPLEMENTED, /* sve_width */
1731 { 4, /* load_int. */
1732 4, /* store_int. */
1733 4, /* load_fp. */
1734 4, /* store_fp. */
1735 4, /* load_pred. */
1736 4 /* store_pred. */
1737 }, /* memmov_cost. */
1738 4, /* issue_rate */
1739 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1740 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1741 "16", /* function_align. */
1742 "8", /* jump_align. */
1743 "16", /* loop_align. */
1744 2, /* int_reassoc_width. */
1745 4, /* fp_reassoc_width. */
1746 1, /* vec_reassoc_width. */
1747 2, /* min_div_recip_mul_sf. */
1748 2, /* min_div_recip_mul_df. */
1749 0, /* max_case_values. */
1750 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1751 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1752 &qdf24xx_prefetch_tune
1755 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1756 for now. */
1757 static const struct tune_params saphira_tunings =
1759 &generic_extra_costs,
1760 &generic_addrcost_table,
1761 &generic_regmove_cost,
1762 &generic_vector_cost,
1763 &generic_branch_cost,
1764 &generic_approx_modes,
1765 SVE_NOT_IMPLEMENTED, /* sve_width */
1766 { 4, /* load_int. */
1767 4, /* store_int. */
1768 4, /* load_fp. */
1769 4, /* store_fp. */
1770 4, /* load_pred. */
1771 4 /* store_pred. */
1772 }, /* memmov_cost. */
1773 4, /* issue_rate */
1774 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1775 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1776 "16", /* function_align. */
1777 "8", /* jump_align. */
1778 "16", /* loop_align. */
1779 2, /* int_reassoc_width. */
1780 4, /* fp_reassoc_width. */
1781 1, /* vec_reassoc_width. */
1782 2, /* min_div_recip_mul_sf. */
1783 2, /* min_div_recip_mul_df. */
1784 0, /* max_case_values. */
1785 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1786 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1787 &generic_prefetch_tune
1790 static const struct tune_params thunderx2t99_tunings =
1792 &thunderx2t99_extra_costs,
1793 &thunderx2t99_addrcost_table,
1794 &thunderx2t99_regmove_cost,
1795 &thunderx2t99_vector_cost,
1796 &generic_branch_cost,
1797 &generic_approx_modes,
1798 SVE_NOT_IMPLEMENTED, /* sve_width */
1799 { 4, /* load_int. */
1800 4, /* store_int. */
1801 4, /* load_fp. */
1802 4, /* store_fp. */
1803 4, /* load_pred. */
1804 4 /* store_pred. */
1805 }, /* memmov_cost. */
1806 4, /* issue_rate. */
1807 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1808 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1809 "16", /* function_align. */
1810 "8", /* jump_align. */
1811 "16", /* loop_align. */
1812 3, /* int_reassoc_width. */
1813 2, /* fp_reassoc_width. */
1814 2, /* vec_reassoc_width. */
1815 2, /* min_div_recip_mul_sf. */
1816 2, /* min_div_recip_mul_df. */
1817 0, /* max_case_values. */
1818 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1819 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1820 &thunderx2t99_prefetch_tune
1823 static const struct tune_params thunderx3t110_tunings =
1825 &thunderx3t110_extra_costs,
1826 &thunderx3t110_addrcost_table,
1827 &thunderx3t110_regmove_cost,
1828 &thunderx3t110_vector_cost,
1829 &generic_branch_cost,
1830 &generic_approx_modes,
1831 SVE_NOT_IMPLEMENTED, /* sve_width */
1832 { 4, /* load_int. */
1833 4, /* store_int. */
1834 4, /* load_fp. */
1835 4, /* store_fp. */
1836 4, /* load_pred. */
1837 4 /* store_pred. */
1838 }, /* memmov_cost. */
1839 6, /* issue_rate. */
1840 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1841 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1842 "16", /* function_align. */
1843 "8", /* jump_align. */
1844 "16", /* loop_align. */
1845 3, /* int_reassoc_width. */
1846 2, /* fp_reassoc_width. */
1847 2, /* vec_reassoc_width. */
1848 2, /* min_div_recip_mul_sf. */
1849 2, /* min_div_recip_mul_df. */
1850 0, /* max_case_values. */
1851 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1853 &thunderx3t110_prefetch_tune
1856 static const struct tune_params neoversen1_tunings =
1858 &cortexa76_extra_costs,
1859 &generic_addrcost_table,
1860 &generic_regmove_cost,
1861 &cortexa57_vector_cost,
1862 &generic_branch_cost,
1863 &generic_approx_modes,
1864 SVE_NOT_IMPLEMENTED, /* sve_width */
1865 { 4, /* load_int. */
1866 2, /* store_int. */
1867 5, /* load_fp. */
1868 2, /* store_fp. */
1869 4, /* load_pred. */
1870 4 /* store_pred. */
1871 }, /* memmov_cost. */
1872 3, /* issue_rate */
1873 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1874 "32:16", /* function_align. */
1875 "4", /* jump_align. */
1876 "32:16", /* loop_align. */
1877 2, /* int_reassoc_width. */
1878 4, /* fp_reassoc_width. */
1879 2, /* vec_reassoc_width. */
1880 2, /* min_div_recip_mul_sf. */
1881 2, /* min_div_recip_mul_df. */
1882 0, /* max_case_values. */
1883 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1884 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1885 &generic_prefetch_tune
1888 static const struct tune_params ampere1_tunings =
1890 &ampere1_extra_costs,
1891 &generic_addrcost_table,
1892 &generic_regmove_cost,
1893 &ampere1_vector_cost,
1894 &generic_branch_cost,
1895 &generic_approx_modes,
1896 SVE_NOT_IMPLEMENTED, /* sve_width */
1897 { 4, /* load_int. */
1898 4, /* store_int. */
1899 4, /* load_fp. */
1900 4, /* store_fp. */
1901 4, /* load_pred. */
1902 4 /* store_pred. */
1903 }, /* memmov_cost. */
1904 4, /* issue_rate */
1905 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1906 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1907 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1908 AARCH64_FUSE_CMP_BRANCH),
1909 /* fusible_ops */
1910 "32", /* function_align. */
1911 "4", /* jump_align. */
1912 "32:16", /* loop_align. */
1913 2, /* int_reassoc_width. */
1914 4, /* fp_reassoc_width. */
1915 2, /* vec_reassoc_width. */
1916 2, /* min_div_recip_mul_sf. */
1917 2, /* min_div_recip_mul_df. */
1918 0, /* max_case_values. */
1919 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1920 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1921 &ampere1_prefetch_tune
1924 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1926 2, /* int_stmt_cost */
1927 2, /* fp_stmt_cost */
1928 4, /* ld2_st2_permute_cost */
1929 4, /* ld3_st3_permute_cost */
1930 5, /* ld4_st4_permute_cost */
1931 3, /* permute_cost */
1932 4, /* reduc_i8_cost */
1933 4, /* reduc_i16_cost */
1934 2, /* reduc_i32_cost */
1935 2, /* reduc_i64_cost */
1936 6, /* reduc_f16_cost */
1937 3, /* reduc_f32_cost */
1938 2, /* reduc_f64_cost */
1939 2, /* store_elt_extra_cost */
1940 /* This value is just inherited from the Cortex-A57 table. */
1941 8, /* vec_to_scalar_cost */
1942 /* This depends very much on what the scalar value is and
1943 where it comes from. E.g. some constants take two dependent
1944 instructions or a load, while others might be moved from a GPR.
1945 4 seems to be a reasonable compromise in practice. */
1946 4, /* scalar_to_vec_cost */
1947 4, /* align_load_cost */
1948 4, /* unalign_load_cost */
1949 /* Although stores have a latency of 2 and compete for the
1950 vector pipes, in practice it's better not to model that. */
1951 1, /* unalign_store_cost */
1952 1 /* store_cost */
1955 static const sve_vec_cost neoversev1_sve_vector_cost =
1958 2, /* int_stmt_cost */
1959 2, /* fp_stmt_cost */
1960 4, /* ld2_st2_permute_cost */
1961 7, /* ld3_st3_permute_cost */
1962 8, /* ld4_st4_permute_cost */
1963 3, /* permute_cost */
1964 /* Theoretically, a reduction involving 31 scalar ADDs could
1965 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
1966 completes in 14 cycles, so give it a cost of 31 + 5. */
1967 36, /* reduc_i8_cost */
1968 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
1969 22, /* reduc_i16_cost */
1970 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
1971 14, /* reduc_i32_cost */
1972 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
1973 11, /* reduc_i64_cost */
1974 /* Theoretically, a reduction involving 15 scalar FADDs could
1975 complete in ~9 cycles and would have a cost of 30. FADDV
1976 completes in 13 cycles, so give it a cost of 30 + 4. */
1977 34, /* reduc_f16_cost */
1978 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
1979 19, /* reduc_f32_cost */
1980 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
1981 11, /* reduc_f64_cost */
1982 2, /* store_elt_extra_cost */
1983 /* This value is just inherited from the Cortex-A57 table. */
1984 8, /* vec_to_scalar_cost */
1985 /* See the comment above the Advanced SIMD versions. */
1986 4, /* scalar_to_vec_cost */
1987 4, /* align_load_cost */
1988 4, /* unalign_load_cost */
1989 /* Although stores have a latency of 2 and compete for the
1990 vector pipes, in practice it's better not to model that. */
1991 1, /* unalign_store_cost */
1992 1 /* store_cost */
1994 3, /* clast_cost */
1995 19, /* fadda_f16_cost */
1996 11, /* fadda_f32_cost */
1997 8, /* fadda_f64_cost */
1998 32, /* gather_load_x32_cost */
1999 16, /* gather_load_x64_cost */
2000 3 /* scatter_store_elt_cost */
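/* Illustrative sketch (a hypothetical helper, not used by the compiler):
   the reduction costs in neoversev1_sve_vector_cost above follow the
   pattern spelled out in their comments -- the cost of the equivalent
   scalar sequence plus however many extra cycles the reduction
   instruction needs over that sequence.  */
constexpr int
aarch64_reduc_cost_sketch (int scalar_ops, int scalar_stmt_cost,
			   int scalar_cycles, int reduc_cycles)
{
  return scalar_ops * scalar_stmt_cost + (reduc_cycles - scalar_cycles);
}
/* 31 scalar ADDs (~9 cycles, cost 1 each) vs. [SU]ADDV in 14 cycles.  */
static_assert (aarch64_reduc_cost_sketch (31, 1, 9, 14) == 36,
	       "matches reduc_i8_cost above");
/* 15 scalar FADDs (~9 cycles, cost 2 each) vs. FADDV in 13 cycles.  */
static_assert (aarch64_reduc_cost_sketch (15, 2, 9, 13) == 34,
	       "matches reduc_f16_cost above");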
2003 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2005 3, /* loads_stores_per_cycle */
2006 2, /* stores_per_cycle */
2007 4, /* general_ops_per_cycle */
2008 0, /* fp_simd_load_general_ops */
2009 1 /* fp_simd_store_general_ops */
2012 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2015 3, /* loads_stores_per_cycle */
2016 2, /* stores_per_cycle */
2017 4, /* general_ops_per_cycle */
2018 0, /* fp_simd_load_general_ops */
2019 1 /* fp_simd_store_general_ops */
2021 2, /* ld2_st2_general_ops */
2022 2, /* ld3_st3_general_ops */
2023 3 /* ld4_st4_general_ops */
2026 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2030 2, /* loads_per_cycle */
2031 2, /* stores_per_cycle */
2032 2, /* general_ops_per_cycle */
2033 0, /* fp_simd_load_general_ops */
2034 1 /* fp_simd_store_general_ops */
2036 2, /* ld2_st2_general_ops */
2037 2, /* ld3_st3_general_ops */
2038 3 /* ld4_st4_general_ops */
2040 1, /* pred_ops_per_cycle */
2041 2, /* while_pred_ops */
2042 2, /* int_cmp_pred_ops */
2043 1, /* fp_cmp_pred_ops */
2044 1, /* gather_scatter_pair_general_ops */
2045 1 /* gather_scatter_pair_pred_ops */
2048 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2050 &neoversev1_scalar_issue_info,
2051 &neoversev1_advsimd_issue_info,
2052 &neoversev1_sve_issue_info
2055 /* Neoverse V1 costs for vector insn classes. */
2056 static const struct cpu_vector_cost neoversev1_vector_cost =
2058 1, /* scalar_int_stmt_cost */
2059 2, /* scalar_fp_stmt_cost */
2060 4, /* scalar_load_cost */
2061 1, /* scalar_store_cost */
2062 1, /* cond_taken_branch_cost */
2063 1, /* cond_not_taken_branch_cost */
2064 &neoversev1_advsimd_vector_cost, /* advsimd */
2065 &neoversev1_sve_vector_cost, /* sve */
2066 &neoversev1_vec_issue_info /* issue_info */
2069 static const struct tune_params neoversev1_tunings =
2071 &cortexa76_extra_costs,
2072 &neoversev1_addrcost_table,
2073 &neoversev1_regmove_cost,
2074 &neoversev1_vector_cost,
2075 &generic_branch_cost,
2076 &generic_approx_modes,
2077 SVE_256, /* sve_width */
2078 { 4, /* load_int. */
2079 2, /* store_int. */
2080 6, /* load_fp. */
2081 2, /* store_fp. */
2082 6, /* load_pred. */
2083 1 /* store_pred. */
2084 }, /* memmov_cost. */
2085 3, /* issue_rate */
2086 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2087 "32:16", /* function_align. */
2088 "4", /* jump_align. */
2089 "32:16", /* loop_align. */
2090 2, /* int_reassoc_width. */
2091 4, /* fp_reassoc_width. */
2092 2, /* vec_reassoc_width. */
2093 2, /* min_div_recip_mul_sf. */
2094 2, /* min_div_recip_mul_df. */
2095 0, /* max_case_values. */
2096 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2097 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2098 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2099 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2100 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
2101 &generic_prefetch_tune
2104 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2107 2, /* int_stmt_cost */
2108 2, /* fp_stmt_cost */
2109 4, /* ld2_st2_permute_cost */
2110 5, /* ld3_st3_permute_cost */
2111 5, /* ld4_st4_permute_cost */
2112 3, /* permute_cost */
2113 /* Theoretically, a reduction involving 15 scalar ADDs could
2114 complete in ~5 cycles and would have a cost of 15. Assume that
2115 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2116 21, /* reduc_i8_cost */
2117 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2118 13, /* reduc_i16_cost */
2119 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2120 9, /* reduc_i32_cost */
2121 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2122 8, /* reduc_i64_cost */
2123 /* Theoretically, a reduction involving 7 scalar FADDs could
2124 complete in ~6 cycles and would have a cost of 14. Assume that
2125 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2126 16, /* reduc_f16_cost */
2127 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2128 8, /* reduc_f32_cost */
2129 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2130 4, /* reduc_f64_cost */
2131 2, /* store_elt_extra_cost */
2132 /* This value is just inherited from the Cortex-A57 table. */
2133 8, /* vec_to_scalar_cost */
2134 /* This depends very much on what the scalar value is and
2135 where it comes from. E.g. some constants take two dependent
2136 instructions or a load, while others might be moved from a GPR.
2137 4 seems to be a reasonable compromise in practice. */
2138 4, /* scalar_to_vec_cost */
2139 4, /* align_load_cost */
2140 4, /* unalign_load_cost */
2141 /* Although stores generally have a latency of 2 and compete for the
2142 vector pipes, in practice it's better not to model that. */
2143 1, /* unalign_store_cost */
2144 1 /* store_cost */
2146 3, /* clast_cost */
2147 10, /* fadda_f16_cost */
2148 6, /* fadda_f32_cost */
2149 4, /* fadda_f64_cost */
2150 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2151 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2152 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2153 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2154 (cost 2) to that, to avoid the difference being lost in rounding.
2156 There is no easy comparison between a strided Advanced SIMD x32 load
2157 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2158 operation more than a 64-bit gather. */
2159 14, /* gather_load_x32_cost */
2160 12, /* gather_load_x64_cost */
2161 3 /* scatter_store_elt_cost */
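/* Illustrative sketch (hypothetical constants, not used by the compiler):
   the gather costs above follow the derivation in the comment -- two
   scalar loads plus a vec_construct plus one full vector operation for
   the 64-bit gather, and one further vector operation for the 32-bit
   gather.  */
constexpr int aarch64_gather_x64_cost_sketch
  = 2 * 4 /* scalar loads */ + 2 /* vec_construct */ + 2 /* vector op */;
constexpr int aarch64_gather_x32_cost_sketch
  = aarch64_gather_x64_cost_sketch + 2 /* one more vector op */;
static_assert (aarch64_gather_x64_cost_sketch == 12
	       && aarch64_gather_x32_cost_sketch == 14,
	       "matches gather_load_x64_cost and gather_load_x32_cost above");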
2164 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2168 3, /* loads_per_cycle */
2169 2, /* stores_per_cycle */
2170 4, /* general_ops_per_cycle */
2171 0, /* fp_simd_load_general_ops */
2172 1 /* fp_simd_store_general_ops */
2174 2, /* ld2_st2_general_ops */
2175 2, /* ld3_st3_general_ops */
2176 3 /* ld4_st4_general_ops */
2178 2, /* pred_ops_per_cycle */
2179 2, /* while_pred_ops */
2180 2, /* int_cmp_pred_ops */
2181 1, /* fp_cmp_pred_ops */
2182 1, /* gather_scatter_pair_general_ops */
2183 1 /* gather_scatter_pair_pred_ops */
2186 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2188 &neoversev1_scalar_issue_info,
2189 &neoversev1_advsimd_issue_info,
2190 &neoverse512tvb_sve_issue_info
2193 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2195 1, /* scalar_int_stmt_cost */
2196 2, /* scalar_fp_stmt_cost */
2197 4, /* scalar_load_cost */
2198 1, /* scalar_store_cost */
2199 1, /* cond_taken_branch_cost */
2200 1, /* cond_not_taken_branch_cost */
2201 &neoversev1_advsimd_vector_cost, /* advsimd */
2202 &neoverse512tvb_sve_vector_cost, /* sve */
2203 &neoverse512tvb_vec_issue_info /* issue_info */
2206 static const struct tune_params neoverse512tvb_tunings =
2208 &cortexa76_extra_costs,
2209 &neoversev1_addrcost_table,
2210 &neoversev1_regmove_cost,
2211 &neoverse512tvb_vector_cost,
2212 &generic_branch_cost,
2213 &generic_approx_modes,
2214 SVE_128 | SVE_256, /* sve_width */
2215 { 4, /* load_int. */
2216 2, /* store_int. */
2217 6, /* load_fp. */
2218 2, /* store_fp. */
2219 6, /* load_pred. */
2220 1 /* store_pred. */
2221 }, /* memmov_cost. */
2222 3, /* issue_rate */
2223 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2224 "32:16", /* function_align. */
2225 "4", /* jump_align. */
2226 "32:16", /* loop_align. */
2227 2, /* int_reassoc_width. */
2228 4, /* fp_reassoc_width. */
2229 2, /* vec_reassoc_width. */
2230 2, /* min_div_recip_mul_sf. */
2231 2, /* min_div_recip_mul_df. */
2232 0, /* max_case_values. */
2233 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2234 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2235 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2236 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2237 &generic_prefetch_tune
2240 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2242 2, /* int_stmt_cost */
2243 2, /* fp_stmt_cost */
2244 2, /* ld2_st2_permute_cost */
2245 2, /* ld3_st3_permute_cost */
2246 3, /* ld4_st4_permute_cost */
2247 3, /* permute_cost */
2248 4, /* reduc_i8_cost */
2249 4, /* reduc_i16_cost */
2250 2, /* reduc_i32_cost */
2251 2, /* reduc_i64_cost */
2252 6, /* reduc_f16_cost */
2253 4, /* reduc_f32_cost */
2254 2, /* reduc_f64_cost */
2255 2, /* store_elt_extra_cost */
2256 /* This value is just inherited from the Cortex-A57 table. */
2257 8, /* vec_to_scalar_cost */
2258 /* This depends very much on what the scalar value is and
2259 where it comes from. E.g. some constants take two dependent
2260 instructions or a load, while others might be moved from a GPR.
2261 4 seems to be a reasonable compromise in practice. */
2262 4, /* scalar_to_vec_cost */
2263 4, /* align_load_cost */
2264 4, /* unalign_load_cost */
2265 /* Although stores have a latency of 2 and compete for the
2266 vector pipes, in practice it's better not to model that. */
2267 1, /* unalign_store_cost */
2268 1 /* store_cost */
2271 static const sve_vec_cost neoversen2_sve_vector_cost =
2274 2, /* int_stmt_cost */
2275 2, /* fp_stmt_cost */
2276 3, /* ld2_st2_permute_cost */
2277 4, /* ld3_st3_permute_cost */
2278 4, /* ld4_st4_permute_cost */
2279 3, /* permute_cost */
2280 /* Theoretically, a reduction involving 15 scalar ADDs could
2281 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2282 completes in 11 cycles, so give it a cost of 15 + 6. */
2283 21, /* reduc_i8_cost */
2284 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2285 13, /* reduc_i16_cost */
2286 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2287 9, /* reduc_i32_cost */
2288       /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
2289 2, /* reduc_i64_cost */
2290 /* Theoretically, a reduction involving 7 scalar FADDs could
2291 complete in ~8 cycles and would have a cost of 14. FADDV
2292 completes in 6 cycles, so give it a cost of 14 - 2. */
2293 12, /* reduc_f16_cost */
2294 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2295 6, /* reduc_f32_cost */
2296 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2297 2, /* reduc_f64_cost */
2298 2, /* store_elt_extra_cost */
2299 /* This value is just inherited from the Cortex-A57 table. */
2300 8, /* vec_to_scalar_cost */
2301 /* See the comment above the Advanced SIMD versions. */
2302 4, /* scalar_to_vec_cost */
2303 4, /* align_load_cost */
2304 4, /* unalign_load_cost */
2305 /* Although stores have a latency of 2 and compete for the
2306 vector pipes, in practice it's better not to model that. */
2307 1, /* unalign_store_cost */
2308 1 /* store_cost */
2310 3, /* clast_cost */
2311 10, /* fadda_f16_cost */
2312 6, /* fadda_f32_cost */
2313 4, /* fadda_f64_cost */
2314 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2315 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2316 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2317 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2318 (cost 2) to that, to avoid the difference being lost in rounding.
2320 There is no easy comparison between a strided Advanced SIMD x32 load
2321 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2322 operation more than a 64-bit gather. */
2323 14, /* gather_load_x32_cost */
2324 12, /* gather_load_x64_cost */
2325 3 /* scatter_store_elt_cost */
2328 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2330 3, /* loads_stores_per_cycle */
2331 2, /* stores_per_cycle */
2332 4, /* general_ops_per_cycle */
2333 0, /* fp_simd_load_general_ops */
2334 1 /* fp_simd_store_general_ops */
2337 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2340 3, /* loads_stores_per_cycle */
2341 2, /* stores_per_cycle */
2342 2, /* general_ops_per_cycle */
2343 0, /* fp_simd_load_general_ops */
2344 1 /* fp_simd_store_general_ops */
2346 2, /* ld2_st2_general_ops */
2347 2, /* ld3_st3_general_ops */
2348 3 /* ld4_st4_general_ops */
2351 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2355 3, /* loads_per_cycle */
2356 2, /* stores_per_cycle */
2357 2, /* general_ops_per_cycle */
2358 0, /* fp_simd_load_general_ops */
2359 1 /* fp_simd_store_general_ops */
2361 2, /* ld2_st2_general_ops */
2362 3, /* ld3_st3_general_ops */
2363 3 /* ld4_st4_general_ops */
2365 2, /* pred_ops_per_cycle */
2366 2, /* while_pred_ops */
2367 2, /* int_cmp_pred_ops */
2368 1, /* fp_cmp_pred_ops */
2369 1, /* gather_scatter_pair_general_ops */
2370 1 /* gather_scatter_pair_pred_ops */
2373 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2375 &neoversen2_scalar_issue_info,
2376 &neoversen2_advsimd_issue_info,
2377 &neoversen2_sve_issue_info
2380 /* Neoverse N2 costs for vector insn classes. */
2381 static const struct cpu_vector_cost neoversen2_vector_cost =
2383 1, /* scalar_int_stmt_cost */
2384 2, /* scalar_fp_stmt_cost */
2385 4, /* scalar_load_cost */
2386 1, /* scalar_store_cost */
2387 1, /* cond_taken_branch_cost */
2388 1, /* cond_not_taken_branch_cost */
2389 &neoversen2_advsimd_vector_cost, /* advsimd */
2390 &neoversen2_sve_vector_cost, /* sve */
2391 &neoversen2_vec_issue_info /* issue_info */
2394 static const struct tune_params neoversen2_tunings =
2396 &cortexa76_extra_costs,
2397 &neoversen2_addrcost_table,
2398 &neoversen2_regmove_cost,
2399 &neoversen2_vector_cost,
2400 &generic_branch_cost,
2401 &generic_approx_modes,
2402 SVE_128, /* sve_width */
2403 { 4, /* load_int. */
2404 1, /* store_int. */
2405 6, /* load_fp. */
2406 2, /* store_fp. */
2407 6, /* load_pred. */
2408 1 /* store_pred. */
2409 }, /* memmov_cost. */
2410 3, /* issue_rate */
2411 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2412 "32:16", /* function_align. */
2413 "4", /* jump_align. */
2414 "32:16", /* loop_align. */
2415 2, /* int_reassoc_width. */
2416 4, /* fp_reassoc_width. */
2417 2, /* vec_reassoc_width. */
2418 2, /* min_div_recip_mul_sf. */
2419 2, /* min_div_recip_mul_df. */
2420 0, /* max_case_values. */
2421 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2422 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2423 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2424 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2425 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2426 &generic_prefetch_tune
2429 static const advsimd_vec_cost demeter_advsimd_vector_cost =
2431 2, /* int_stmt_cost */
2432 2, /* fp_stmt_cost */
2433 2, /* ld2_st2_permute_cost */
2434 2, /* ld3_st3_permute_cost */
2435 3, /* ld4_st4_permute_cost */
2436 3, /* permute_cost */
2437 4, /* reduc_i8_cost */
2438 4, /* reduc_i16_cost */
2439 2, /* reduc_i32_cost */
2440 2, /* reduc_i64_cost */
2441 6, /* reduc_f16_cost */
2442 3, /* reduc_f32_cost */
2443 2, /* reduc_f64_cost */
2444 2, /* store_elt_extra_cost */
2445 /* This value is just inherited from the Cortex-A57 table. */
2446 8, /* vec_to_scalar_cost */
2447 /* This depends very much on what the scalar value is and
2448 where it comes from. E.g. some constants take two dependent
2449 instructions or a load, while others might be moved from a GPR.
2450 4 seems to be a reasonable compromise in practice. */
2451 4, /* scalar_to_vec_cost */
2452 4, /* align_load_cost */
2453 4, /* unalign_load_cost */
2454 /* Although stores have a latency of 2 and compete for the
2455 vector pipes, in practice it's better not to model that. */
2456 1, /* unalign_store_cost */
2457 1 /* store_cost */
2460 static const sve_vec_cost demeter_sve_vector_cost =
2463 2, /* int_stmt_cost */
2464 2, /* fp_stmt_cost */
2465 3, /* ld2_st2_permute_cost */
2466 3, /* ld3_st3_permute_cost */
2467 4, /* ld4_st4_permute_cost */
2468 3, /* permute_cost */
2469 /* Theoretically, a reduction involving 15 scalar ADDs could
2470 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2471 completes in 11 cycles, so give it a cost of 15 + 8. */
2472 21, /* reduc_i8_cost */
2473 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2474 14, /* reduc_i16_cost */
2475 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2476 7, /* reduc_i32_cost */
2477       /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
2478 2, /* reduc_i64_cost */
2479 /* Theoretically, a reduction involving 7 scalar FADDs could
2480 complete in ~6 cycles and would have a cost of 14. FADDV
2481 completes in 8 cycles, so give it a cost of 14 + 2. */
2482 16, /* reduc_f16_cost */
2483 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2484 8, /* reduc_f32_cost */
2485 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2486 4, /* reduc_f64_cost */
2487 2, /* store_elt_extra_cost */
2488 /* This value is just inherited from the Cortex-A57 table. */
2489 8, /* vec_to_scalar_cost */
2490 /* See the comment above the Advanced SIMD versions. */
2491 4, /* scalar_to_vec_cost */
2492 4, /* align_load_cost */
2493 4, /* unalign_load_cost */
2494 /* Although stores have a latency of 2 and compete for the
2495 vector pipes, in practice it's better not to model that. */
2496 1, /* unalign_store_cost */
2497 1 /* store_cost */
2499 3, /* clast_cost */
2500 10, /* fadda_f16_cost */
2501 6, /* fadda_f32_cost */
2502 4, /* fadda_f64_cost */
2503 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2504 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2505 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2506 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2507 (cost 2) to that, to avoid the difference being lost in rounding.
2509 There is no easy comparison between a strided Advanced SIMD x32 load
2510 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2511 operation more than a 64-bit gather. */
2512 14, /* gather_load_x32_cost */
2513 12, /* gather_load_x64_cost */
2514 3 /* scatter_store_elt_cost */
2517 static const aarch64_scalar_vec_issue_info demeter_scalar_issue_info =
2519 3, /* loads_stores_per_cycle */
2520 2, /* stores_per_cycle */
2521 6, /* general_ops_per_cycle */
2522 0, /* fp_simd_load_general_ops */
2523 1 /* fp_simd_store_general_ops */
2526 static const aarch64_advsimd_vec_issue_info demeter_advsimd_issue_info =
2529 3, /* loads_stores_per_cycle */
2530 2, /* stores_per_cycle */
2531 4, /* general_ops_per_cycle */
2532 0, /* fp_simd_load_general_ops */
2533 1 /* fp_simd_store_general_ops */
2535 2, /* ld2_st2_general_ops */
2536 2, /* ld3_st3_general_ops */
2537 3 /* ld4_st4_general_ops */
2540 static const aarch64_sve_vec_issue_info demeter_sve_issue_info =
2544 3, /* loads_per_cycle */
2545 2, /* stores_per_cycle */
2546 4, /* general_ops_per_cycle */
2547 0, /* fp_simd_load_general_ops */
2548 1 /* fp_simd_store_general_ops */
2550 2, /* ld2_st2_general_ops */
2551 3, /* ld3_st3_general_ops */
2552 3 /* ld4_st4_general_ops */
2554 2, /* pred_ops_per_cycle */
2555 2, /* while_pred_ops */
2556 2, /* int_cmp_pred_ops */
2557 1, /* fp_cmp_pred_ops */
2558 1, /* gather_scatter_pair_general_ops */
2559 1 /* gather_scatter_pair_pred_ops */
2562 static const aarch64_vec_issue_info demeter_vec_issue_info =
2564 &demeter_scalar_issue_info,
2565 &demeter_advsimd_issue_info,
2566 &demeter_sve_issue_info
2569 /* Demeter costs for vector insn classes. */
2570 static const struct cpu_vector_cost demeter_vector_cost =
2572 1, /* scalar_int_stmt_cost */
2573 2, /* scalar_fp_stmt_cost */
2574 4, /* scalar_load_cost */
2575 1, /* scalar_store_cost */
2576 1, /* cond_taken_branch_cost */
2577 1, /* cond_not_taken_branch_cost */
2578 &demeter_advsimd_vector_cost, /* advsimd */
2579 &demeter_sve_vector_cost, /* sve */
2580 &demeter_vec_issue_info /* issue_info */
2583 static const struct tune_params demeter_tunings =
2585 &cortexa76_extra_costs,
2586 &demeter_addrcost_table,
2587 &demeter_regmove_cost,
2588 &demeter_vector_cost,
2589 &generic_branch_cost,
2590 &generic_approx_modes,
2591 SVE_128, /* sve_width */
2592 { 4, /* load_int. */
2593 2, /* store_int. */
2594 6, /* load_fp. */
2595 1, /* store_fp. */
2596 6, /* load_pred. */
2597 2 /* store_pred. */
2598 }, /* memmov_cost. */
2599 5, /* issue_rate */
2600 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2601 "32:16", /* function_align. */
2602 "4", /* jump_align. */
2603 "32:16", /* loop_align. */
2604 3, /* int_reassoc_width. */
2605 6, /* fp_reassoc_width. */
2606 3, /* vec_reassoc_width. */
2607 2, /* min_div_recip_mul_sf. */
2608 2, /* min_div_recip_mul_df. */
2609 0, /* max_case_values. */
2610 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2611 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2612 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2613 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2614 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2615 &generic_prefetch_tune
2618 static const struct tune_params a64fx_tunings =
2620 &a64fx_extra_costs,
2621 &a64fx_addrcost_table,
2622 &a64fx_regmove_cost,
2623 &a64fx_vector_cost,
2624 &generic_branch_cost,
2625 &generic_approx_modes,
2626 SVE_512, /* sve_width */
2627 { 4, /* load_int. */
2628 4, /* store_int. */
2629 4, /* load_fp. */
2630 4, /* store_fp. */
2631 4, /* load_pred. */
2632 4 /* store_pred. */
2633 }, /* memmov_cost. */
2634 7, /* issue_rate */
2635 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2636 "32", /* function_align. */
2637 "16", /* jump_align. */
2638 "32", /* loop_align. */
2639 4, /* int_reassoc_width. */
2640 2, /* fp_reassoc_width. */
2641 2, /* vec_reassoc_width. */
2642 2, /* min_div_recip_mul_sf. */
2643 2, /* min_div_recip_mul_df. */
2644 0, /* max_case_values. */
2645 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2646 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2647 &a64fx_prefetch_tune
2650 /* Support for fine-grained override of the tuning structures. */
2651 struct aarch64_tuning_override_function
2653 const char* name;
2654 void (*parse_override)(const char*, struct tune_params*);
2657 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2658 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2659 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2661 static const struct aarch64_tuning_override_function
2662 aarch64_tuning_override_functions[] =
2664 { "fuse", aarch64_parse_fuse_string },
2665 { "tune", aarch64_parse_tune_string },
2666 { "sve_width", aarch64_parse_sve_width_string },
2667 { NULL, NULL }
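/* Illustrative usage sketch (see the -moverride documentation in
   invoke.texi for the authoritative syntax): the strings dispatched
   through the table above come from the developer option -moverride,
   for example

       gcc -mcpu=neoverse-v1 -moverride=sve_width=256 ...

   where each NAME=VALUE pair is matched against "fuse", "tune" or
   "sve_width" and handed to the corresponding parser.  */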
2670 /* A processor implementing AArch64. */
2671 struct processor
2673 const char *const name;
2674 enum aarch64_processor ident;
2675 enum aarch64_processor sched_core;
2676 enum aarch64_arch arch;
2677 const uint64_t flags;
2678 const struct tune_params *const tune;
2681 /* Architectures implementing AArch64. */
2682 static const struct processor all_architectures[] =
2684 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
2685 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, FLAGS, NULL},
2686 #include "aarch64-arches.def"
2687 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2690 /* Processor cores implementing AArch64. */
2691 static const struct processor all_cores[] =
2693 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
2694 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2695 FLAGS, &COSTS##_tunings},
2696 #include "aarch64-cores.def"
2697 {"generic", generic, cortexa53, AARCH64_ARCH_8A,
2698 AARCH64_FL_FOR_ARCH8, &generic_tunings},
2699 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2702 /* The current tuning set. */
2703 struct tune_params aarch64_tune_params = generic_tunings;
2705 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2707 static tree
2708 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2709 int, bool *no_add_attrs)
2711 /* Since we set fn_type_req to true, the caller should have checked
2712 this for us. */
2713 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2714 switch ((arm_pcs) fntype_abi (*node).id ())
2716 case ARM_PCS_AAPCS64:
2717 case ARM_PCS_SIMD:
2718 return NULL_TREE;
2720 case ARM_PCS_SVE:
2721 error ("the %qE attribute cannot be applied to an SVE function type",
2722 name);
2723 *no_add_attrs = true;
2724 return NULL_TREE;
2726 case ARM_PCS_TLSDESC:
2727 case ARM_PCS_UNKNOWN:
2728 break;
2730 gcc_unreachable ();
2733 /* Table of machine attributes. */
2734 static const struct attribute_spec aarch64_attribute_table[] =
2736 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2737 affects_type_identity, handler, exclude } */
2738 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2739 handle_aarch64_vector_pcs_attribute, NULL },
2740 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2741 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2742 NULL },
2743 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
2744 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
2745 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
2746 { NULL, 0, 0, false, false, false, false, NULL, NULL }
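/* Illustrative source-level sketch (the declaration below is an example,
   not part of GCC): the handler above accepts

     void f (float *x) __attribute__ ((aarch64_vector_pcs));

   and switches F's type to the vector PCS, whereas applying the
   attribute to a function whose type already uses the SVE PCS (for
   example one taking an SVE vector argument) is rejected with the
   ARM_PCS_SVE error in handle_aarch64_vector_pcs_attribute.  */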
2749 /* An ISA extension in the co-processor and main instruction set space. */
2750 struct aarch64_option_extension
2752 const char *const name;
2753 const unsigned long flags_on;
2754 const unsigned long flags_off;
2757 typedef enum aarch64_cond_code
2759 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2760 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2761 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2763 aarch64_cc;
2765 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
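/* Sanity-check sketch: inverse condition codes are adjacent in the enum
   and differ only in bit 0, which is what the XOR above relies on.  */
static_assert (AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE,
	       "EQ inverts to NE");
static_assert (AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT,
	       "GE inverts to LT");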
2767 struct aarch64_branch_protect_type
2769 /* The type's name that the user passes to the branch-protection option
2770 string. */
2771 const char* name;
2772 /* Function to handle the protection type and set global variables.
2773      First argument is the string token corresponding to this type and the
2774 second argument is the next token in the option string.
2775 Return values:
2776      * AARCH64_PARSE_OK: Handling was successful.
2777 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
2778 should print an error.
2779 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
2780 own error. */
2781 enum aarch64_parse_opt_result (*handler)(char*, char*);
2782 /* A list of types that can follow this type in the option string. */
2783 const aarch64_branch_protect_type* subtypes;
2784 unsigned int num_subtypes;
2787 static enum aarch64_parse_opt_result
2788 aarch64_handle_no_branch_protection (char* str, char* rest)
2790 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2791 aarch64_enable_bti = 0;
2792 if (rest)
2794 error ("unexpected %<%s%> after %<%s%>", rest, str);
2795 return AARCH64_PARSE_INVALID_FEATURE;
2797 return AARCH64_PARSE_OK;
2800 static enum aarch64_parse_opt_result
2801 aarch64_handle_standard_branch_protection (char* str, char* rest)
2803 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2804 aarch64_ra_sign_key = AARCH64_KEY_A;
2805 aarch64_enable_bti = 1;
2806 if (rest)
2808 error ("unexpected %<%s%> after %<%s%>", rest, str);
2809 return AARCH64_PARSE_INVALID_FEATURE;
2811 return AARCH64_PARSE_OK;
2814 static enum aarch64_parse_opt_result
2815 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2816 char* rest ATTRIBUTE_UNUSED)
2818 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2819 aarch64_ra_sign_key = AARCH64_KEY_A;
2820 return AARCH64_PARSE_OK;
2823 static enum aarch64_parse_opt_result
2824 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2825 char* rest ATTRIBUTE_UNUSED)
2827 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2828 return AARCH64_PARSE_OK;
2831 static enum aarch64_parse_opt_result
2832 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2833 char* rest ATTRIBUTE_UNUSED)
2835 aarch64_ra_sign_key = AARCH64_KEY_B;
2836 return AARCH64_PARSE_OK;
2839 static enum aarch64_parse_opt_result
2840 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2841 char* rest ATTRIBUTE_UNUSED)
2843 aarch64_enable_bti = 1;
2844 return AARCH64_PARSE_OK;
2847 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2848 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2849 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2850 { NULL, NULL, NULL, 0 }
2853 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2854 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2855 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2856 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2857 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2858 { "bti", aarch64_handle_bti_protection, NULL, 0 },
2859 { NULL, NULL, NULL, 0 }
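/* Illustrative usage sketch (see the -mbranch-protection documentation
   for the authoritative syntax): the tables above parse option strings
   such as

       -mbranch-protection=standard
       -mbranch-protection=pac-ret+leaf+b-key
       -mbranch-protection=bti

   where "leaf" and "b-key" are only valid as subtypes of "pac-ret",
   as encoded by aarch64_pac_ret_subtypes.  */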
2862 /* The condition codes of the processor, and the inverse function. */
2863 static const char * const aarch64_condition_codes[] =
2865 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2866 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2869 /* The preferred condition codes for SVE conditions. */
2870 static const char *const aarch64_sve_condition_codes[] =
2872 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2873 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2876 /* Return the assembly token for svpattern value VALUE. */
2878 static const char *
2879 svpattern_token (enum aarch64_svpattern pattern)
2881 switch (pattern)
2883 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2884 AARCH64_FOR_SVPATTERN (CASE)
2885 #undef CASE
2886 case AARCH64_NUM_SVPATTERNS:
2887 break;
2889 gcc_unreachable ();
2892 /* Return the location of a piece that is known to be passed or returned
2893 in registers. FIRST_ZR is the first unused vector argument register
2894 and FIRST_PR is the first unused predicate argument register. */
2897 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2898 unsigned int first_pr) const
2900 gcc_assert (VECTOR_MODE_P (mode)
2901 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2902 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2904 if (num_zr > 0 && num_pr == 0)
2905 return gen_rtx_REG (mode, first_zr);
2907 if (num_zr == 0 && num_pr == 1)
2908 return gen_rtx_REG (mode, first_pr);
2910 gcc_unreachable ();
2913 /* Return the total number of vector registers required by the PST. */
2915 unsigned int
2916 pure_scalable_type_info::num_zr () const
2918 unsigned int res = 0;
2919 for (unsigned int i = 0; i < pieces.length (); ++i)
2920 res += pieces[i].num_zr;
2921 return res;
2924 /* Return the total number of predicate registers required by the PST. */
2926 unsigned int
2927 pure_scalable_type_info::num_pr () const
2929 unsigned int res = 0;
2930 for (unsigned int i = 0; i < pieces.length (); ++i)
2931 res += pieces[i].num_pr;
2932 return res;
2935 /* Return the location of a PST that is known to be passed or returned
2936 in registers. FIRST_ZR is the first unused vector argument register
2937 and FIRST_PR is the first unused predicate argument register. */
2940 pure_scalable_type_info::get_rtx (machine_mode mode,
2941 unsigned int first_zr,
2942 unsigned int first_pr) const
2944 /* Try to return a single REG if possible. This leads to better
2945 code generation; it isn't required for correctness. */
2946 if (mode == pieces[0].mode)
2948 gcc_assert (pieces.length () == 1);
2949 return pieces[0].get_rtx (first_zr, first_pr);
2952 /* Build up a PARALLEL that contains the individual pieces. */
2953 rtvec rtxes = rtvec_alloc (pieces.length ());
2954 for (unsigned int i = 0; i < pieces.length (); ++i)
2956 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
2957 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
2958 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
2959 first_zr += pieces[i].num_zr;
2960 first_pr += pieces[i].num_pr;
2962 return gen_rtx_PARALLEL (mode, rtxes);
2965 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
2966 in the AAPCS64. */
2968 pure_scalable_type_info::analysis_result
2969 pure_scalable_type_info::analyze (const_tree type)
2971 /* Prevent accidental reuse. */
2972 gcc_assert (pieces.is_empty ());
2974 /* No code will be generated for erroneous types, so we won't establish
2975 an ABI mapping. */
2976 if (type == error_mark_node)
2977 return NO_ABI_IDENTITY;
2979 /* Zero-sized types disappear in the language->ABI mapping. */
2980 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2981 return NO_ABI_IDENTITY;
2983 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
2984 piece p = {};
2985 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
2987 machine_mode mode = TYPE_MODE_RAW (type);
2988 gcc_assert (VECTOR_MODE_P (mode)
2989 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
2991 p.mode = p.orig_mode = mode;
2992 add_piece (p);
2993 return IS_PST;
2996 /* Check for user-defined PSTs. */
2997 if (TREE_CODE (type) == ARRAY_TYPE)
2998 return analyze_array (type);
2999 if (TREE_CODE (type) == RECORD_TYPE)
3000 return analyze_record (type);
3002 return ISNT_PST;
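/* Illustrative source-level sketch (assumes the svfloat32_t and svbool_t
   types from arm_sve.h; the struct is an example, not part of GCC):
   under the AAPCS64 rules implemented above,

     struct example_pst { svfloat32_t v0; svfloat32_t v1; svbool_t p; };

   is a Pure Scalable Type whose pieces are two Z registers and one
   P register, whereas adding a plain "int" member would make
   analyze_record return ISNT_PST.  */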
3005 /* Analyze a type that is known not to be passed or returned in memory.
3006 Return true if it has an ABI identity and is a Pure Scalable Type. */
3008 bool
3009 pure_scalable_type_info::analyze_registers (const_tree type)
3011 analysis_result result = analyze (type);
3012 gcc_assert (result != DOESNT_MATTER);
3013 return result == IS_PST;
3016 /* Subroutine of analyze for handling ARRAY_TYPEs. */
3018 pure_scalable_type_info::analysis_result
3019 pure_scalable_type_info::analyze_array (const_tree type)
3021 /* Analyze the element type. */
3022 pure_scalable_type_info element_info;
3023 analysis_result result = element_info.analyze (TREE_TYPE (type));
3024 if (result != IS_PST)
3025 return result;
3027 /* An array of unknown, flexible or variable length will be passed and
3028 returned by reference whatever we do. */
3029 tree nelts_minus_one = array_type_nelts (type);
3030 if (!tree_fits_uhwi_p (nelts_minus_one))
3031 return DOESNT_MATTER;
3033 /* Likewise if the array is constant-sized but too big to be interesting.
3034 The double checks against MAX_PIECES are to protect against overflow. */
3035 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
3036 if (count > MAX_PIECES)
3037 return DOESNT_MATTER;
3038 count += 1;
3039 if (count * element_info.pieces.length () > MAX_PIECES)
3040 return DOESNT_MATTER;
3042 /* The above checks should have weeded out elements of unknown size. */
3043 poly_uint64 element_bytes;
3044 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3045 gcc_unreachable ();
3047 /* Build up the list of individual vectors and predicates. */
3048 gcc_assert (!element_info.pieces.is_empty ());
3049 for (unsigned int i = 0; i < count; ++i)
3050 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3052 piece p = element_info.pieces[j];
3053 p.offset += i * element_bytes;
3054 add_piece (p);
3056 return IS_PST;
3059 /* Subroutine of analyze for handling RECORD_TYPEs. */
3061 pure_scalable_type_info::analysis_result
3062 pure_scalable_type_info::analyze_record (const_tree type)
3064 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3066 if (TREE_CODE (field) != FIELD_DECL)
3067 continue;
3069 /* Zero-sized fields disappear in the language->ABI mapping. */
3070 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3071 continue;
3073 /* All fields with an ABI identity must be PSTs for the record as
3074 a whole to be a PST. If any individual field is too big to be
3075 interesting then the record is too. */
3076 pure_scalable_type_info field_info;
3077 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3078 if (subresult == NO_ABI_IDENTITY)
3079 continue;
3080 if (subresult != IS_PST)
3081 return subresult;
3083 /* Since all previous fields are PSTs, we ought to be able to track
3084 the field offset using poly_ints. */
3085 tree bitpos = bit_position (field);
3086 gcc_assert (poly_int_tree_p (bitpos));
3088 /* For the same reason, it shouldn't be possible to create a PST field
3089 whose offset isn't byte-aligned. */
3090 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3091 BITS_PER_UNIT);
3093 /* Punt if the record is too big to be interesting. */
3094 poly_uint64 bytepos;
3095 if (!wide_bytepos.to_uhwi (&bytepos)
3096 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3097 return DOESNT_MATTER;
3099 /* Add the individual vectors and predicates in the field to the
3100 record's list. */
3101 gcc_assert (!field_info.pieces.is_empty ());
3102 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3104 piece p = field_info.pieces[i];
3105 p.offset += bytepos;
3106 add_piece (p);
3109 /* Empty structures disappear in the language->ABI mapping. */
3110 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3113 /* Add P to the list of pieces in the type. */
3115 void
3116 pure_scalable_type_info::add_piece (const piece &p)
3118 /* Try to fold the new piece into the previous one to form a
3119 single-mode PST. For example, if we see three consecutive vectors
3120 of the same mode, we can represent them using the corresponding
3121 3-tuple mode.
3123 This is purely an optimization. */
3124 if (!pieces.is_empty ())
3126 piece &prev = pieces.last ();
3127 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3128 unsigned int nelems1, nelems2;
3129 if (prev.orig_mode == p.orig_mode
3130 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3131 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3132 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3133 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3134 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3135 && targetm.array_mode (p.orig_mode,
3136 nelems1 + nelems2).exists (&prev.mode))
3138 prev.num_zr += p.num_zr;
3139 prev.num_pr += p.num_pr;
3140 return;
3143 pieces.quick_push (p);
3146 /* Return true if at least one possible value of type TYPE includes at
3147 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3149 This is a relatively expensive test for some types, so it should
3150 generally be made as late as possible. */
3152 static bool
3153 aarch64_some_values_include_pst_objects_p (const_tree type)
3155 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3156 return false;
3158 if (aarch64_sve::builtin_type_p (type))
3159 return true;
3161 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3162 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3164 if (RECORD_OR_UNION_TYPE_P (type))
3165 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3166 if (TREE_CODE (field) == FIELD_DECL
3167 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3168 return true;
3170 return false;
3173 /* Return the descriptor of the SIMD ABI. */
3175 static const predefined_function_abi &
3176 aarch64_simd_abi (void)
3178 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3179 if (!simd_abi.initialized_p ())
3181 HARD_REG_SET full_reg_clobbers
3182 = default_function_abi.full_reg_clobbers ();
3183 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3184 if (FP_SIMD_SAVED_REGNUM_P (regno))
3185 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3186 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3188 return simd_abi;
3191 /* Return the descriptor of the SVE PCS. */
3193 static const predefined_function_abi &
3194 aarch64_sve_abi (void)
3196 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3197 if (!sve_abi.initialized_p ())
3199 HARD_REG_SET full_reg_clobbers
3200 = default_function_abi.full_reg_clobbers ();
3201 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3202 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3203 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3204 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3205 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3207 return sve_abi;
3210 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3211 wraps, otherwise return X itself. */
3213 static rtx
3214 strip_salt (rtx x)
3216 rtx search = x;
3217 if (GET_CODE (search) == CONST)
3218 search = XEXP (search, 0);
3219 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3220 x = XVECEXP (search, 0, 0);
3221 return x;
3224 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3225 expression. */
3227 static rtx
3228 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3230 return strip_salt (strip_offset (addr, offset));
3233 /* Generate code to enable conditional branches in functions over 1 MiB. */
3234 const char *
3235 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3236 const char * branch_format)
3238 rtx_code_label * tmp_label = gen_label_rtx ();
3239 char label_buf[256];
3240 char buffer[128];
3241 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3242 CODE_LABEL_NUMBER (tmp_label));
3243 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3244 rtx dest_label = operands[pos_label];
3245 operands[pos_label] = tmp_label;
3247 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3248 output_asm_insn (buffer, operands);
3250 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3251 operands[pos_label] = dest_label;
3252 output_asm_insn (buffer, operands);
3253 return "";
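/* Illustrative sketch of the sequence emitted above, assuming the caller
   passes the inverted condition in BRANCH_FORMAT (as the far-branch
   alternatives in aarch64.md do):

	<inverted conditional branch>	.Ltmp
	b	<original target>
     .Ltmp:

   i.e. the limited-range conditional branch only has to skip one
   instruction, while the unconditional B can reach the distant label.  */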
3256 void
3257 aarch64_err_no_fpadvsimd (machine_mode mode)
3259 if (TARGET_GENERAL_REGS_ONLY)
3260 if (FLOAT_MODE_P (mode))
3261 error ("%qs is incompatible with the use of floating-point types",
3262 "-mgeneral-regs-only");
3263 else
3264 error ("%qs is incompatible with the use of vector types",
3265 "-mgeneral-regs-only");
3266 else
3267 if (FLOAT_MODE_P (mode))
3268 error ("%qs feature modifier is incompatible with the use of"
3269 " floating-point types", "+nofp");
3270 else
3271 error ("%qs feature modifier is incompatible with the use of"
3272 " vector types", "+nofp");
3275 /* Report when we try to do something that requires SVE when SVE is disabled.
3276 This is an error of last resort and isn't very high-quality. It usually
3277 involves attempts to measure the vector length in some way. */
3278 static void
3279 aarch64_report_sve_required (void)
3281 static bool reported_p = false;
3283 /* Avoid reporting a slew of messages for a single oversight. */
3284 if (reported_p)
3285 return;
3287 error ("this operation requires the SVE ISA extension");
3288 inform (input_location, "you can enable SVE using the command-line"
3289 " option %<-march%>, or by using the %<target%>"
3290 " attribute or pragma");
3291 reported_p = true;
3294 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3295 registers. */
3296 inline bool
3297 pr_or_ffr_regnum_p (unsigned int regno)
3299 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3302 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3303 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3304 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3305 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3306 and GENERAL_REGS is lower than the memory cost (in this case the best class
3307    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
3308 cost results in bad allocations with many redundant int<->FP moves which
3309 are expensive on various cores.
3310 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3311 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3312 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3313 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
3314 The result of this is that it is no longer inefficient to have a higher
3315 memory move cost than the register move cost.
3318 static reg_class_t
3319 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3320 reg_class_t best_class)
3322 machine_mode mode;
3324 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3325 || !reg_class_subset_p (FP_REGS, allocno_class))
3326 return allocno_class;
3328 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3329 || !reg_class_subset_p (FP_REGS, best_class))
3330 return best_class;
3332 mode = PSEUDO_REGNO_MODE (regno);
3333 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
3336 static unsigned int
3337 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3339 if (GET_MODE_UNIT_SIZE (mode) == 4)
3340 return aarch64_tune_params.min_div_recip_mul_sf;
3341 return aarch64_tune_params.min_div_recip_mul_df;
3344 /* Return the reassociation width of treeop OPC with mode MODE. */
3345 static int
3346 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3348 if (VECTOR_MODE_P (mode))
3349 return aarch64_tune_params.vec_reassoc_width;
3350 if (INTEGRAL_MODE_P (mode))
3351 return aarch64_tune_params.int_reassoc_width;
3352 /* Avoid reassociating floating point addition so we emit more FMAs. */
3353 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
3354 return aarch64_tune_params.fp_reassoc_width;
3355 return 1;
3358 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
3359 unsigned
3360 aarch64_dbx_register_number (unsigned regno)
3362 if (GP_REGNUM_P (regno))
3363 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3364 else if (regno == SP_REGNUM)
3365 return AARCH64_DWARF_SP;
3366 else if (FP_REGNUM_P (regno))
3367 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3368 else if (PR_REGNUM_P (regno))
3369 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3370 else if (regno == VG_REGNUM)
3371 return AARCH64_DWARF_VG;
3373 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3374 equivalent DWARF register. */
3375 return DWARF_FRAME_REGISTERS;
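/* Example mapping: x3 -> AARCH64_DWARF_R0 + 3, sp -> AARCH64_DWARF_SP,
   v5 -> AARCH64_DWARF_V0 + 5, p2 -> AARCH64_DWARF_P0 + 2, while any
   other register (e.g. the FFR) falls through to the
   DWARF_FRAME_REGISTERS return, meaning it has no mapping here.  */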
3378 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3379 integer, otherwise return X unmodified. */
3380 static rtx
3381 aarch64_bit_representation (rtx x)
3383 if (CONST_DOUBLE_P (x))
3384 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3385 return x;
3388 /* Return an estimate for the number of quadwords in an SVE vector. This is
3389 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3390 static unsigned int
3391 aarch64_estimated_sve_vq ()
3393 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
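/* For example, with -msve-vector-bits=512 in force this returns 4;
   for the default scalable setting it returns the tuning target's
   estimate of the vector length divided into 128-bit quadwords.  */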
3396 /* Return true if MODE is an SVE predicate mode. */
3397 static bool
3398 aarch64_sve_pred_mode_p (machine_mode mode)
3400 return (TARGET_SVE
3401 && (mode == VNx16BImode
3402 || mode == VNx8BImode
3403 || mode == VNx4BImode
3404 || mode == VNx2BImode));
3407 /* Three mutually-exclusive flags describing a vector or predicate type. */
3408 const unsigned int VEC_ADVSIMD = 1;
3409 const unsigned int VEC_SVE_DATA = 2;
3410 const unsigned int VEC_SVE_PRED = 4;
3411 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3412 a structure of 2, 3 or 4 vectors. */
3413 const unsigned int VEC_STRUCT = 8;
3414 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3415 vector has fewer significant bytes than a full SVE vector. */
3416 const unsigned int VEC_PARTIAL = 16;
3417 /* Useful combinations of the above. */
3418 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3419 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3421 /* Return a set of flags describing the vector properties of mode MODE.
3422 Ignore modes that are not supported by the current target. */
3423 static unsigned int
3424 aarch64_classify_vector_mode (machine_mode mode)
3426 if (aarch64_sve_pred_mode_p (mode))
3427 return VEC_SVE_PRED;
3429 /* Make the decision based on the mode's enum value rather than its
3430 properties, so that we keep the correct classification regardless
3431 of -msve-vector-bits. */
3432 switch (mode)
3434 /* Partial SVE QI vectors. */
3435 case E_VNx2QImode:
3436 case E_VNx4QImode:
3437 case E_VNx8QImode:
3438 /* Partial SVE HI vectors. */
3439 case E_VNx2HImode:
3440 case E_VNx4HImode:
3441 /* Partial SVE SI vector. */
3442 case E_VNx2SImode:
3443 /* Partial SVE HF vectors. */
3444 case E_VNx2HFmode:
3445 case E_VNx4HFmode:
3446 /* Partial SVE BF vectors. */
3447 case E_VNx2BFmode:
3448 case E_VNx4BFmode:
3449 /* Partial SVE SF vector. */
3450 case E_VNx2SFmode:
3451 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3453 case E_VNx16QImode:
3454 case E_VNx8HImode:
3455 case E_VNx4SImode:
3456 case E_VNx2DImode:
3457 case E_VNx8BFmode:
3458 case E_VNx8HFmode:
3459 case E_VNx4SFmode:
3460 case E_VNx2DFmode:
3461 return TARGET_SVE ? VEC_SVE_DATA : 0;
3463 /* x2 SVE vectors. */
3464 case E_VNx32QImode:
3465 case E_VNx16HImode:
3466 case E_VNx8SImode:
3467 case E_VNx4DImode:
3468 case E_VNx16BFmode:
3469 case E_VNx16HFmode:
3470 case E_VNx8SFmode:
3471 case E_VNx4DFmode:
3472 /* x3 SVE vectors. */
3473 case E_VNx48QImode:
3474 case E_VNx24HImode:
3475 case E_VNx12SImode:
3476 case E_VNx6DImode:
3477 case E_VNx24BFmode:
3478 case E_VNx24HFmode:
3479 case E_VNx12SFmode:
3480 case E_VNx6DFmode:
3481 /* x4 SVE vectors. */
3482 case E_VNx64QImode:
3483 case E_VNx32HImode:
3484 case E_VNx16SImode:
3485 case E_VNx8DImode:
3486 case E_VNx32BFmode:
3487 case E_VNx32HFmode:
3488 case E_VNx16SFmode:
3489 case E_VNx8DFmode:
3490 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3492 case E_OImode:
3493 case E_CImode:
3494 case E_XImode:
3495 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
3497 /* Structures of 64-bit Advanced SIMD vectors. */
3498 case E_V2x8QImode:
3499 case E_V2x4HImode:
3500 case E_V2x2SImode:
3501 case E_V2x1DImode:
3502 case E_V2x4BFmode:
3503 case E_V2x4HFmode:
3504 case E_V2x2SFmode:
3505 case E_V2x1DFmode:
3506 case E_V3x8QImode:
3507 case E_V3x4HImode:
3508 case E_V3x2SImode:
3509 case E_V3x1DImode:
3510 case E_V3x4BFmode:
3511 case E_V3x4HFmode:
3512 case E_V3x2SFmode:
3513 case E_V3x1DFmode:
3514 case E_V4x8QImode:
3515 case E_V4x4HImode:
3516 case E_V4x2SImode:
3517 case E_V4x1DImode:
3518 case E_V4x4BFmode:
3519 case E_V4x4HFmode:
3520 case E_V4x2SFmode:
3521 case E_V4x1DFmode:
3522 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3524 /* Structures of 128-bit Advanced SIMD vectors. */
3525 case E_V2x16QImode:
3526 case E_V2x8HImode:
3527 case E_V2x4SImode:
3528 case E_V2x2DImode:
3529 case E_V2x8BFmode:
3530 case E_V2x8HFmode:
3531 case E_V2x4SFmode:
3532 case E_V2x2DFmode:
3533 case E_V3x16QImode:
3534 case E_V3x8HImode:
3535 case E_V3x4SImode:
3536 case E_V3x2DImode:
3537 case E_V3x8BFmode:
3538 case E_V3x8HFmode:
3539 case E_V3x4SFmode:
3540 case E_V3x2DFmode:
3541 case E_V4x16QImode:
3542 case E_V4x8HImode:
3543 case E_V4x4SImode:
3544 case E_V4x2DImode:
3545 case E_V4x8BFmode:
3546 case E_V4x8HFmode:
3547 case E_V4x4SFmode:
3548 case E_V4x2DFmode:
3549 return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
3551 /* 64-bit Advanced SIMD vectors. */
3552 case E_V8QImode:
3553 case E_V4HImode:
3554 case E_V2SImode:
3555 /* ...E_V1DImode doesn't exist. */
3556 case E_V4HFmode:
3557 case E_V4BFmode:
3558 case E_V2SFmode:
3559 case E_V1DFmode:
3560 /* 128-bit Advanced SIMD vectors. */
3561 case E_V16QImode:
3562 case E_V8HImode:
3563 case E_V4SImode:
3564 case E_V2DImode:
3565 case E_V8HFmode:
3566 case E_V8BFmode:
3567 case E_V4SFmode:
3568 case E_V2DFmode:
3569 return TARGET_SIMD ? VEC_ADVSIMD : 0;
3571 default:
3572 return 0;
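/* A few examples of the classification above (assuming the relevant
   target features are enabled): V4SImode -> VEC_ADVSIMD,
   VNx4SImode -> VEC_SVE_DATA, VNx2SImode -> VEC_SVE_DATA | VEC_PARTIAL
   (32-bit elements in 64-bit containers), V3x4SImode -> VEC_ADVSIMD
   | VEC_STRUCT, VNx8SImode -> VEC_SVE_DATA | VEC_STRUCT and
   VNx4BImode -> VEC_SVE_PRED.  */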
3576 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3577 bool
3578 aarch64_advsimd_struct_mode_p (machine_mode mode)
3580 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3581 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3584 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3585 static bool
3586 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3588 return (aarch64_classify_vector_mode (mode)
3589 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3592 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3593 static bool
3594 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3596 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3599 /* Return true if MODE is any of the data vector modes, including
3600 structure modes. */
3601 static bool
3602 aarch64_vector_data_mode_p (machine_mode mode)
3604 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3607 /* Return true if MODE is any form of SVE mode, including predicates,
3608 vectors and structures. */
3609 bool
3610 aarch64_sve_mode_p (machine_mode mode)
3612 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3615 /* Return true if MODE is an SVE data vector mode; either a single vector
3616 or a structure of vectors. */
3617 static bool
3618 aarch64_sve_data_mode_p (machine_mode mode)
3620 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3623 /* Return the number of defined bytes in one constituent vector of
3624 SVE mode MODE, which has vector flags VEC_FLAGS. */
3625 static poly_int64
3626 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3628 if (vec_flags & VEC_PARTIAL)
3629 /* A single partial vector. */
3630 return GET_MODE_SIZE (mode);
3632 if (vec_flags & VEC_SVE_DATA)
3633 /* A single vector or a tuple. */
3634 return BYTES_PER_SVE_VECTOR;
3636 /* A single predicate. */
3637 gcc_assert (vec_flags & VEC_SVE_PRED);
3638 return BYTES_PER_SVE_PRED;
3641 /* If MODE holds an array of vectors, return the number of vectors
3642 in the array, otherwise return 1. */
3644 static unsigned int
3645 aarch64_ldn_stn_vectors (machine_mode mode)
3647 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3648 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3649 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3650 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3651 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3652 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3653 return exact_div (GET_MODE_SIZE (mode),
3654 BYTES_PER_SVE_VECTOR).to_constant ();
3655 return 1;
3658 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3659 corresponding vector structure mode. */
3660 static opt_machine_mode
3661 aarch64_advsimd_vector_array_mode (machine_mode mode,
3662 unsigned HOST_WIDE_INT nelems)
3664 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3665 if (known_eq (GET_MODE_SIZE (mode), 8))
3666 flags |= VEC_PARTIAL;
3668 machine_mode struct_mode;
3669 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3670 if (aarch64_classify_vector_mode (struct_mode) == flags
3671 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3672 && known_eq (GET_MODE_NUNITS (struct_mode),
3673 GET_MODE_NUNITS (mode) * nelems))
3674 return struct_mode;
3675 return opt_machine_mode ();
3678 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3680 opt_machine_mode
3681 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3683 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3684 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3685 machine_mode mode;
3686 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3687 if (inner_mode == GET_MODE_INNER (mode)
3688 && known_eq (nunits, GET_MODE_NUNITS (mode))
3689 && aarch64_sve_data_mode_p (mode))
3690 return mode;
3691 return opt_machine_mode ();
3694 /* Implement target hook TARGET_ARRAY_MODE. */
3695 static opt_machine_mode
3696 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3698 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3699 && IN_RANGE (nelems, 2, 4))
3700 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3701 GET_MODE_NUNITS (mode) * nelems);
3702 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3703 && IN_RANGE (nelems, 2, 4))
3704 return aarch64_advsimd_vector_array_mode (mode, nelems);
3706 return opt_machine_mode ();
3709 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3710 static bool
3711 aarch64_array_mode_supported_p (machine_mode mode,
3712 unsigned HOST_WIDE_INT nelems)
3714 if (TARGET_SIMD
3715 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3716 || AARCH64_VALID_SIMD_DREG_MODE (mode))
3717 && (nelems >= 2 && nelems <= 4))
3718 return true;
3720 return false;
3723 /* MODE is some form of SVE vector mode. For data modes, return the number
3724 of vector register bits that each element of MODE occupies, such as 64
3725 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3726 in a 64-bit container). For predicate modes, return the number of
3727 data bits controlled by each significant predicate bit. */
3729 static unsigned int
3730 aarch64_sve_container_bits (machine_mode mode)
3732 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3733 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3734 ? BITS_PER_SVE_VECTOR
3735 : GET_MODE_BITSIZE (mode));
3736 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3739 /* Return the SVE predicate mode to use for elements that have
3740 ELEM_NBYTES bytes, if such a mode exists. */
3742 opt_machine_mode
3743 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3745 if (TARGET_SVE)
3747 if (elem_nbytes == 1)
3748 return VNx16BImode;
3749 if (elem_nbytes == 2)
3750 return VNx8BImode;
3751 if (elem_nbytes == 4)
3752 return VNx4BImode;
3753 if (elem_nbytes == 8)
3754 return VNx2BImode;
3756 return opt_machine_mode ();
3759 /* Return the SVE predicate mode that should be used to control
3760 SVE mode MODE. */
3762 machine_mode
3763 aarch64_sve_pred_mode (machine_mode mode)
3765 unsigned int bits = aarch64_sve_container_bits (mode);
3766 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3769 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3771 static opt_machine_mode
3772 aarch64_get_mask_mode (machine_mode mode)
3774 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3775 if (vec_flags & VEC_SVE_DATA)
3776 return aarch64_sve_pred_mode (mode);
3778 return default_get_mask_mode (mode);
3781 /* Return the integer element mode associated with SVE mode MODE. */
3783 static scalar_int_mode
3784 aarch64_sve_element_int_mode (machine_mode mode)
3786 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3787 ? BITS_PER_SVE_VECTOR
3788 : GET_MODE_BITSIZE (mode));
3789 unsigned int elt_bits = vector_element_size (vector_bits,
3790 GET_MODE_NUNITS (mode));
3791 return int_mode_for_size (elt_bits, 0).require ();
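/* For example, VNx4SImode and VNx4SFmode both map to SImode, while the
   predicate mode VNx2BImode maps to DImode, since each significant bit
   of a VNx2BI controls a 64-bit container.  */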
3794 /* Return an integer element mode that contains exactly
3795 aarch64_sve_container_bits (MODE) bits. This is wider than
3796 aarch64_sve_element_int_mode if MODE is a partial vector,
3797 otherwise it's the same. */
3799 static scalar_int_mode
3800 aarch64_sve_container_int_mode (machine_mode mode)
3802 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3805 /* Return the integer vector mode associated with SVE mode MODE.
3806 Unlike related_int_vector_mode, this can handle the case in which
3807 MODE is a predicate (and thus has a different total size). */
3809 machine_mode
3810 aarch64_sve_int_mode (machine_mode mode)
3812 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3813 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3816 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3818 static opt_machine_mode
3819 aarch64_vectorize_related_mode (machine_mode vector_mode,
3820 scalar_mode element_mode,
3821 poly_uint64 nunits)
3823 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3825 /* If we're operating on SVE vectors, try to return an SVE mode. */
3826 poly_uint64 sve_nunits;
3827 if ((vec_flags & VEC_SVE_DATA)
3828 && multiple_p (BYTES_PER_SVE_VECTOR,
3829 GET_MODE_SIZE (element_mode), &sve_nunits))
3831 machine_mode sve_mode;
3832 if (maybe_ne (nunits, 0U))
3834 /* Try to find a full or partial SVE mode with exactly
3835 NUNITS units. */
3836 if (multiple_p (sve_nunits, nunits)
3837 && aarch64_sve_data_mode (element_mode,
3838 nunits).exists (&sve_mode))
3839 return sve_mode;
3841 else
3843 /* Take the preferred number of units from the number of bytes
3844 that fit in VECTOR_MODE. We always start by "autodetecting"
3845 a full vector mode with preferred_simd_mode, so vectors
3846 chosen here will also be full vector modes. Then
3847 autovectorize_vector_modes tries smaller starting modes
3848 and thus smaller preferred numbers of units. */
3849 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3850 if (aarch64_sve_data_mode (element_mode,
3851 sve_nunits).exists (&sve_mode))
3852 return sve_mode;
3856 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3857 if ((vec_flags & VEC_ADVSIMD)
3858 && known_eq (nunits, 0U)
3859 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3860 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3861 * GET_MODE_NUNITS (vector_mode), 128U))
3863 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3864 if (VECTOR_MODE_P (res))
3865 return res;
3868 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
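/* As an illustration of the 128-bit preference above: if VECTOR_MODE is
   V8QImode, ELEMENT_MODE is HImode and NUNITS is unspecified (zero), we
   return the single 128-bit vector mode V8HImode rather than falling back
   to a pair of 64-bit V4HImode vectors.  */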
3871 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3872 prefer to use the first arithmetic operand as the else value if
3873 the else value doesn't matter, since that exactly matches the SVE
3874 destructive merging form. For ternary operations we could either
3875 pick the first operand and use FMAD-like instructions or the last
3876 operand and use FMLA-like instructions; the latter seems more
3877 natural. */
3879 static tree
3880 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3882 return nops == 3 ? ops[2] : ops[0];
3885 /* Implement TARGET_HARD_REGNO_NREGS. */
3887 static unsigned int
3888 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3890 /* ??? Logically we should only need to provide a value when
3891 HARD_REGNO_MODE_OK says that the combination is valid,
3892 but at the moment we need to handle all modes. Just ignore
3893 any runtime parts for registers that can't store them. */
3894 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3895 switch (aarch64_regno_regclass (regno))
3897 case FP_REGS:
3898 case FP_LO_REGS:
3899 case FP_LO8_REGS:
3901 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3902 if (vec_flags & VEC_SVE_DATA)
3903 return exact_div (GET_MODE_SIZE (mode),
3904 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3905 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3906 return GET_MODE_SIZE (mode).to_constant () / 8;
3907 return CEIL (lowest_size, UNITS_PER_VREG);
3909 case PR_REGS:
3910 case PR_LO_REGS:
3911 case PR_HI_REGS:
3912 case FFR_REGS:
3913 case PR_AND_FFR_REGS:
3914 return 1;
3915 default:
3916 return CEIL (lowest_size, UNITS_PER_WORD);
3918 gcc_unreachable ();
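/* Some examples: in the FP registers, the SVE tuple mode VNx32QImode
   occupies 2 registers whatever the runtime vector length, while TImode
   occupies 1; in the GP registers, TImode occupies 2.  Predicate and FFR
   registers always count as 1.  */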
3921 /* Implement TARGET_HARD_REGNO_MODE_OK. */
3923 static bool
3924 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3926 if (mode == V8DImode)
3927 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
3928 && multiple_p (regno - R0_REGNUM, 2);
3930 if (GET_MODE_CLASS (mode) == MODE_CC)
3931 return regno == CC_REGNUM;
3933 if (regno == VG_REGNUM)
3934 /* This must have the same size as _Unwind_Word. */
3935 return mode == DImode;
3937 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3938 if (vec_flags & VEC_SVE_PRED)
3939 return pr_or_ffr_regnum_p (regno);
3941 if (pr_or_ffr_regnum_p (regno))
3942 return false;
3944 if (regno == SP_REGNUM)
3945 /* The purpose of comparing with ptr_mode is to support the
3946 global register variable associated with the stack pointer
3947 register via the syntax of asm ("wsp") in ILP32. */
3948 return mode == Pmode || mode == ptr_mode;
3950 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
3951 return mode == Pmode;
3953 if (GP_REGNUM_P (regno))
3955 if (vec_flags & VEC_ANY_SVE)
3956 return false;
3957 if (known_le (GET_MODE_SIZE (mode), 8))
3958 return true;
3959 if (known_le (GET_MODE_SIZE (mode), 16))
3960 return (regno & 1) == 0;
3962 else if (FP_REGNUM_P (regno))
3964 if (vec_flags & VEC_STRUCT)
3965 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
3966 else
3967 return !VECTOR_MODE_P (mode) || vec_flags != 0;
3970 return false;
3973 /* Return true if a function with type FNTYPE returns its value in
3974 SVE vector or predicate registers. */
3976 static bool
3977 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
3979 tree return_type = TREE_TYPE (fntype);
3981 pure_scalable_type_info pst_info;
3982 switch (pst_info.analyze (return_type))
3984 case pure_scalable_type_info::IS_PST:
3985 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
3986 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
3988 case pure_scalable_type_info::DOESNT_MATTER:
3989 gcc_assert (aarch64_return_in_memory_1 (return_type));
3990 return false;
3992 case pure_scalable_type_info::NO_ABI_IDENTITY:
3993 case pure_scalable_type_info::ISNT_PST:
3994 return false;
3996 gcc_unreachable ();
3999 /* Return true if a function with type FNTYPE takes arguments in
4000 SVE vector or predicate registers. */
4002 static bool
4003 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
4005 CUMULATIVE_ARGS args_so_far_v;
4006 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
4007 NULL_TREE, 0, true);
4008 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4010 for (tree chain = TYPE_ARG_TYPES (fntype);
4011 chain && chain != void_list_node;
4012 chain = TREE_CHAIN (chain))
4014 tree arg_type = TREE_VALUE (chain);
4015 if (arg_type == error_mark_node)
4016 return false;
4018 function_arg_info arg (arg_type, /*named=*/true);
4019 apply_pass_by_reference_rules (&args_so_far_v, arg);
4020 pure_scalable_type_info pst_info;
4021 if (pst_info.analyze_registers (arg.type))
4023 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4024 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4025 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4026 return true;
4029 targetm.calls.function_arg_advance (args_so_far, arg);
4031 return false;
4034 /* Implement TARGET_FNTYPE_ABI. */
4036 static const predefined_function_abi &
4037 aarch64_fntype_abi (const_tree fntype)
4039 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4040 return aarch64_simd_abi ();
4042 if (aarch64_returns_value_in_sve_regs_p (fntype)
4043 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4044 return aarch64_sve_abi ();
4046 return default_function_abi;
4049 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4051 static bool
4052 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4054 return (aarch64_sve::builtin_type_p (type1)
4055 == aarch64_sve::builtin_type_p (type2));
4058 /* Return true if we should emit CFI for register REGNO. */
4060 static bool
4061 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4063 return (GP_REGNUM_P (regno)
4064 || !default_function_abi.clobbers_full_reg_p (regno));
4067 /* Return the mode we should use to save and restore register REGNO. */
4069 static machine_mode
4070 aarch64_reg_save_mode (unsigned int regno)
4072 if (GP_REGNUM_P (regno))
4073 return DImode;
4075 if (FP_REGNUM_P (regno))
4076 switch (crtl->abi->id ())
4078 case ARM_PCS_AAPCS64:
4079 /* Only the low 64 bits are saved by the base PCS. */
4080 return DFmode;
4082 case ARM_PCS_SIMD:
4083 /* The vector PCS saves the low 128 bits (which is the full
4084 register on non-SVE targets). */
4085 return TFmode;
4087 case ARM_PCS_SVE:
4088 /* Use vectors of DImode for registers that need frame
4089 information, so that the first 64 bits of the save slot
4090 are always the equivalent of what storing D<n> would give. */
4091 if (aarch64_emit_cfi_for_reg_p (regno))
4092 return VNx2DImode;
4094 /* Use vectors of bytes otherwise, so that the layout is
4095 endian-agnostic, and so that we can use LDR and STR for
4096 big-endian targets. */
4097 return VNx16QImode;
4099 case ARM_PCS_TLSDESC:
4100 case ARM_PCS_UNKNOWN:
4101 break;
4104 if (PR_REGNUM_P (regno))
4105 /* Save the full predicate register. */
4106 return VNx16BImode;
4108 gcc_unreachable ();
4111 /* Implement TARGET_INSN_CALLEE_ABI. */
4113 const predefined_function_abi &
4114 aarch64_insn_callee_abi (const rtx_insn *insn)
4116 rtx pat = PATTERN (insn);
4117 gcc_assert (GET_CODE (pat) == PARALLEL);
4118 rtx unspec = XVECEXP (pat, 0, 1);
4119 gcc_assert (GET_CODE (unspec) == UNSPEC
4120 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4121 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4124 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4125 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4126 clobbers the top 64 bits when restoring the bottom 64 bits. */
4128 static bool
4129 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4130 unsigned int regno,
4131 machine_mode mode)
4133 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4135 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4136 unsigned int nregs = hard_regno_nregs (regno, mode);
4137 if (nregs > 1)
4138 per_register_size = exact_div (per_register_size, nregs);
4139 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4140 return maybe_gt (per_register_size, 16);
4141 return maybe_gt (per_register_size, 8);
4143 return false;
4146 /* Implement REGMODE_NATURAL_SIZE. */
4147 poly_uint64
4148 aarch64_regmode_natural_size (machine_mode mode)
4150 /* The natural size for SVE data modes is one SVE data vector,
4151 and similarly for predicates. We can't independently modify
4152 anything smaller than that. */
4153 /* ??? For now, only do this for variable-width SVE registers.
4154 Doing it for constant-sized registers breaks lower-subreg.cc. */
4155 /* ??? And once that's fixed, we should probably have similar
4156 code for Advanced SIMD. */
4157 if (!aarch64_sve_vg.is_constant ())
4159 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4160 if (vec_flags & VEC_SVE_PRED)
4161 return BYTES_PER_SVE_PRED;
4162 if (vec_flags & VEC_SVE_DATA)
4163 return BYTES_PER_SVE_VECTOR;
4165 return UNITS_PER_WORD;
4168 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
4169 machine_mode
4170 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4171 machine_mode mode)
4173 /* The predicate mode determines which bits are significant and
4174 which are "don't care". Decreasing the number of lanes would
4175 lose data while increasing the number of lanes would make bits
4176 unnecessarily significant. */
4177 if (PR_REGNUM_P (regno))
4178 return mode;
4179 if (known_ge (GET_MODE_SIZE (mode), 4))
4180 return mode;
4181 else
4182 return SImode;
4185 /* Return true if I's bits are consecutive ones from the MSB. */
4186 bool
4187 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4189 return exact_log2 (-i) != HOST_WIDE_INT_M1;
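/* For example, 0xffffffffffff0000 qualifies because its negation, 0x10000,
   is a power of two, whereas 0x7fff000000000000 does not (its top bit is
   clear, so the ones are not anchored at the MSB).  */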
4192 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4193 that strcpy from constants will be faster. */
4195 static HOST_WIDE_INT
4196 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4198 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4199 return MAX (align, BITS_PER_WORD);
4200 return align;
4203 /* Return true if calls to DECL should be treated as
4204 long-calls (i.e. called via a register). */
4205 static bool
4206 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4208 return false;
4211 /* Return true if calls to symbol-ref SYM should be treated as
4212 long-calls (i.e. called via a register). */
4213 bool
4214 aarch64_is_long_call_p (rtx sym)
4216 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4219 /* Return true if calls to symbol-ref SYM should not go through
4220 plt stubs. */
4222 bool
4223 aarch64_is_noplt_call_p (rtx sym)
4225 const_tree decl = SYMBOL_REF_DECL (sym);
4227 if (flag_pic
4228 && decl
4229 && (!flag_plt
4230 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4231 && !targetm.binds_local_p (decl))
4232 return true;
4234 return false;
4237 /* Emit an insn that's a simple single-set. Both the operands must be
4238 known to be valid. */
4239 inline static rtx_insn *
4240 emit_set_insn (rtx x, rtx y)
4242 return emit_insn (gen_rtx_SET (x, y));
4245 /* X and Y are two things to compare using CODE. Emit the compare insn and
4246 return the rtx for register 0 in the proper mode. */
4247 rtx
4248 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4250 machine_mode cmp_mode = GET_MODE (x);
4251 machine_mode cc_mode;
4252 rtx cc_reg;
4254 if (cmp_mode == TImode)
4256 gcc_assert (code == NE);
4258 cc_mode = CCmode;
4259 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4261 rtx x_lo = operand_subword (x, 0, 0, TImode);
4262 rtx y_lo = operand_subword (y, 0, 0, TImode);
4263 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4265 rtx x_hi = operand_subword (x, 1, 0, TImode);
4266 rtx y_hi = operand_subword (y, 1, 0, TImode);
4267 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4268 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4269 GEN_INT (AARCH64_EQ)));
4271 else
4273 cc_mode = SELECT_CC_MODE (code, x, y);
4274 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4275 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4277 return cc_reg;
4280 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4282 static rtx
4283 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4284 machine_mode y_mode)
4286 if (y_mode == E_QImode || y_mode == E_HImode)
4288 if (CONST_INT_P (y))
4290 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4291 y_mode = SImode;
4293 else
4295 rtx t, cc_reg;
4296 machine_mode cc_mode;
4298 t = gen_rtx_ZERO_EXTEND (SImode, y);
4299 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4300 cc_mode = CC_SWPmode;
4301 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4302 emit_set_insn (cc_reg, t);
4303 return cc_reg;
4307 if (!aarch64_plus_operand (y, y_mode))
4308 y = force_reg (y_mode, y);
4310 return aarch64_gen_compare_reg (code, x, y);
4313 /* Consider the operation:
4315 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4317 where:
4319 - CODE is [SU]MAX or [SU]MIN
4320 - OPERANDS[2] and OPERANDS[3] are constant integers
4321 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4322 - all operands have mode MODE
4324 Decide whether it is possible to implement the operation using:
4326 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4328 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4330 followed by:
4332 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4334 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4335 If GENERATE_P is true, also update OPERANDS as follows:
4337 OPERANDS[4] = -OPERANDS[3]
4338 OPERANDS[5] = the rtl condition representing <cond>
4339 OPERANDS[6] = <tmp>
4340 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
4341 bool
4342 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4344 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4345 rtx dst = operands[0];
4346 rtx maxmin_op = operands[2];
4347 rtx add_op = operands[3];
4348 machine_mode mode = GET_MODE (dst);
4350 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4351 == (x >= y ? x : y) - z
4352 == (x > y ? x : y) - z
4353 == (x > y - 1 ? x : y) - z
4355 min (x, y) - z == (x <= y - 1 ? x : y) - z
4356 == (x <= y ? x : y) - z
4357 == (x < y ? x : y) - z
4358 == (x < y + 1 ? x : y) - z
4360 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4361 which x is compared with z. Set DIFF to y - z. Thus the supported
4362 combinations are as follows, with DIFF being the value after the ":":
4364 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4365 == x >= y ? x - y : 0 [z == y]
4366 == x > y ? x - y : 0 [z == y]
4367 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4369 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4370 == x <= y ? x - y : 0 [z == y]
4371 == x < y ? x - y : 0 [z == y]
4372 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4373 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4374 auto add_val = rtx_mode_t (add_op, mode);
4375 auto sub_val = wi::neg (add_val);
4376 auto diff = wi::sub (maxmin_val, sub_val);
4377 if (!(diff == 0
4378 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4379 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4380 return false;
4382 if (!generate_p)
4383 return true;
4385 rtx_code cmp;
4386 switch (code)
4388 case SMAX:
4389 cmp = diff == 1 ? GT : GE;
4390 break;
4391 case UMAX:
4392 cmp = diff == 1 ? GTU : GEU;
4393 break;
4394 case SMIN:
4395 cmp = diff == -1 ? LT : LE;
4396 break;
4397 case UMIN:
4398 cmp = diff == -1 ? LTU : LEU;
4399 break;
4400 default:
4401 gcc_unreachable ();
4403 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4405 operands[4] = immed_wide_int_const (sub_val, mode);
4406 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4407 if (can_create_pseudo_p ())
4408 operands[6] = gen_reg_rtx (mode);
4409 else
4410 operands[6] = dst;
4411 operands[7] = immed_wide_int_const (diff, mode);
4413 return true;
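/* A worked example ("tmp" and "dest" below are placeholder register names):
   for smin (x, 5) - 5 we have DIFF == 0, so the operation can be emitted as

	subs	tmp, x, #5
	csel	dest, tmp, xzr, le

   i.e. x - 5 when x <= 5 and 0 otherwise.  */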
4417 /* Build the SYMBOL_REF for __tls_get_addr. */
4419 static GTY(()) rtx tls_get_addr_libfunc;
4421 static rtx
4422 aarch64_tls_get_addr (void)
4424 if (!tls_get_addr_libfunc)
4425 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4426 return tls_get_addr_libfunc;
4429 /* Return the TLS model to use for ADDR. */
4431 static enum tls_model
4432 tls_symbolic_operand_type (rtx addr)
4434 enum tls_model tls_kind = TLS_MODEL_NONE;
4435 poly_int64 offset;
4436 addr = strip_offset_and_salt (addr, &offset);
4437 if (SYMBOL_REF_P (addr))
4438 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4440 return tls_kind;
4443 /* We'll allow lo_sum's in our legitimate addresses so that combine
4444 can take care of combining addresses where necessary, but for
4445 generation purposes, we'll generate the address as:
4447 RTL Absolute
4448 tmp = hi (symbol_ref); adrp x1, foo
4449 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
4452 PIC TLS
4453 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4454 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4455 bl __tls_get_addr
4458 Load TLS symbol, depending on TLS mechanism and TLS access model.
4460 Global Dynamic - Traditional TLS:
4461 adrp tmp, :tlsgd:imm
4462 add dest, tmp, #:tlsgd_lo12:imm
4463 bl __tls_get_addr
4465 Global Dynamic - TLS Descriptors:
4466 adrp dest, :tlsdesc:imm
4467 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4468 add dest, dest, #:tlsdesc_lo12:imm
4469 blr tmp
4470 mrs tp, tpidr_el0
4471 add dest, dest, tp
4473 Initial Exec:
4474 mrs tp, tpidr_el0
4475 adrp tmp, :gottprel:imm
4476 ldr dest, [tmp, #:gottprel_lo12:imm]
4477 add dest, dest, tp
4479 Local Exec:
4480 mrs tp, tpidr_el0
4481 add t0, tp, #:tprel_hi12:imm, lsl #12
4482 add t0, t0, #:tprel_lo12_nc:imm
4485 static void
4486 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4487 enum aarch64_symbol_type type)
4489 switch (type)
4491 case SYMBOL_SMALL_ABSOLUTE:
4493 /* In ILP32, the mode of dest can be either SImode or DImode. */
4494 rtx tmp_reg = dest;
4495 machine_mode mode = GET_MODE (dest);
4497 gcc_assert (mode == Pmode || mode == ptr_mode);
4499 if (can_create_pseudo_p ())
4500 tmp_reg = gen_reg_rtx (mode);
4502 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4503 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4504 return;
4507 case SYMBOL_TINY_ABSOLUTE:
4508 emit_insn (gen_rtx_SET (dest, imm));
4509 return;
4511 case SYMBOL_SMALL_GOT_28K:
4513 machine_mode mode = GET_MODE (dest);
4514 rtx gp_rtx = pic_offset_table_rtx;
4515 rtx insn;
4516 rtx mem;
4518 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4519 here before RTL expansion. Tree IVOPTS will generate an RTL pattern
4520 to compute rtx costs, in which case pic_offset_table_rtx is not
4521 initialized. In that case there is no need to generate the first
4522 adrp instruction, as the final cost for global variable access is
4523 one instruction. */
4524 if (gp_rtx != NULL)
4526 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
4527 use the page base as the GOT base, the first page may be wasted; in
4528 the worst case only 28K of GOT space is left).
4530 The generated instruction sequence for accessing a global variable is:
4533 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4535 Only one instruction is needed, but we must initialize
4536 pic_offset_table_rtx properly. We generate an initialization insn
4537 for every global access and let CSE remove the redundant ones.
4539 The final instruction sequence will look like the following
4540 when multiple global variables are accessed.
4542 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4544 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4545 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4546 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4547 ... */
4549 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4550 crtl->uses_pic_offset_table = 1;
4551 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4553 if (mode != GET_MODE (gp_rtx))
4554 gp_rtx = gen_lowpart (mode, gp_rtx);
4558 if (mode == ptr_mode)
4560 if (mode == DImode)
4561 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4562 else
4563 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4565 mem = XVECEXP (SET_SRC (insn), 0, 0);
4567 else
4569 gcc_assert (mode == Pmode);
4571 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4572 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4575 /* The operand is expected to be a MEM. Whenever the related insn
4576 pattern changes, the code above that computes MEM should be
4577 updated. */
4578 gcc_assert (MEM_P (mem));
4579 MEM_READONLY_P (mem) = 1;
4580 MEM_NOTRAP_P (mem) = 1;
4581 emit_insn (insn);
4582 return;
4585 case SYMBOL_SMALL_GOT_4G:
4586 emit_insn (gen_rtx_SET (dest, imm));
4587 return;
4589 case SYMBOL_SMALL_TLSGD:
4591 rtx_insn *insns;
4592 /* The return type of __tls_get_addr is the C pointer type
4593 so use ptr_mode. */
4594 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4595 rtx tmp_reg = dest;
4597 if (GET_MODE (dest) != ptr_mode)
4598 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4600 start_sequence ();
4601 if (ptr_mode == SImode)
4602 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4603 else
4604 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4605 insns = get_insns ();
4606 end_sequence ();
4608 RTL_CONST_CALL_P (insns) = 1;
4609 emit_libcall_block (insns, tmp_reg, result, imm);
4610 /* Convert back to the mode of the dest adding a zero_extend
4611 from SImode (ptr_mode) to DImode (Pmode). */
4612 if (dest != tmp_reg)
4613 convert_move (dest, tmp_reg, true);
4614 return;
4617 case SYMBOL_SMALL_TLSDESC:
4619 machine_mode mode = GET_MODE (dest);
4620 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4621 rtx tp;
4623 gcc_assert (mode == Pmode || mode == ptr_mode);
4625 /* In ILP32, the GOT entry is always of SImode size. Unlike
4626 the small GOT case, the dest is fixed at register 0. */
4627 if (TARGET_ILP32)
4628 emit_insn (gen_tlsdesc_small_si (imm));
4629 else
4630 emit_insn (gen_tlsdesc_small_di (imm));
4631 tp = aarch64_load_tp (NULL);
4633 if (mode != Pmode)
4634 tp = gen_lowpart (mode, tp);
4636 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4637 if (REG_P (dest))
4638 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4639 return;
4642 case SYMBOL_SMALL_TLSIE:
4644 /* In ILP32, the mode of dest can be either SImode or DImode,
4645 while the GOT entry is always of SImode size. The mode of
4646 dest depends on how dest is used: if dest is assigned to a
4647 pointer (e.g. stored in memory), it has SImode; it may have
4648 DImode if dest is dereferenced to access memory.
4649 This is why we have to handle three different tlsie_small
4650 patterns here (two patterns for ILP32). */
4651 machine_mode mode = GET_MODE (dest);
4652 rtx tmp_reg = gen_reg_rtx (mode);
4653 rtx tp = aarch64_load_tp (NULL);
4655 if (mode == ptr_mode)
4657 if (mode == DImode)
4658 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4659 else
4661 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4662 tp = gen_lowpart (mode, tp);
4665 else
4667 gcc_assert (mode == Pmode);
4668 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4671 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4672 if (REG_P (dest))
4673 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4674 return;
4677 case SYMBOL_TLSLE12:
4678 case SYMBOL_TLSLE24:
4679 case SYMBOL_TLSLE32:
4680 case SYMBOL_TLSLE48:
4682 machine_mode mode = GET_MODE (dest);
4683 rtx tp = aarch64_load_tp (NULL);
4685 if (mode != Pmode)
4686 tp = gen_lowpart (mode, tp);
4688 switch (type)
4690 case SYMBOL_TLSLE12:
4691 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4692 (dest, tp, imm));
4693 break;
4694 case SYMBOL_TLSLE24:
4695 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4696 (dest, tp, imm));
4697 break;
4698 case SYMBOL_TLSLE32:
4699 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4700 (dest, imm));
4701 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4702 (dest, dest, tp));
4703 break;
4704 case SYMBOL_TLSLE48:
4705 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4706 (dest, imm));
4707 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4708 (dest, dest, tp));
4709 break;
4710 default:
4711 gcc_unreachable ();
4714 if (REG_P (dest))
4715 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4716 return;
4719 case SYMBOL_TINY_GOT:
4721 rtx insn;
4722 machine_mode mode = GET_MODE (dest);
4724 if (mode == ptr_mode)
4725 insn = gen_ldr_got_tiny (mode, dest, imm);
4726 else
4728 gcc_assert (mode == Pmode);
4729 insn = gen_ldr_got_tiny_sidi (dest, imm);
4732 emit_insn (insn);
4733 return;
4736 case SYMBOL_TINY_TLSIE:
4738 machine_mode mode = GET_MODE (dest);
4739 rtx tp = aarch64_load_tp (NULL);
4741 if (mode == ptr_mode)
4743 if (mode == DImode)
4744 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4745 else
4747 tp = gen_lowpart (mode, tp);
4748 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4751 else
4753 gcc_assert (mode == Pmode);
4754 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4757 if (REG_P (dest))
4758 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4759 return;
4762 default:
4763 gcc_unreachable ();
4767 /* Emit a move from SRC to DEST. Assume that the move expanders can
4768 handle all moves if !can_create_pseudo_p (). The distinction is
4769 important because, unlike emit_move_insn, the move expanders know
4770 how to force Pmode objects into the constant pool even when the
4771 constant pool address is not itself legitimate. */
4772 static rtx
4773 aarch64_emit_move (rtx dest, rtx src)
4775 return (can_create_pseudo_p ()
4776 ? emit_move_insn (dest, src)
4777 : emit_move_insn_1 (dest, src));
4780 /* Apply UNOPTAB to OP and store the result in DEST. */
4782 static void
4783 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4785 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4786 if (dest != tmp)
4787 emit_move_insn (dest, tmp);
4790 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4792 static void
4793 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4795 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4796 OPTAB_DIRECT);
4797 if (dest != tmp)
4798 emit_move_insn (dest, tmp);
4801 /* Split a 128-bit move operation into two 64-bit move operations,
4802 taking care to handle partial overlap of register to register
4803 copies. Special cases are needed when moving between GP regs and
4804 FP regs. SRC can be a register, constant or memory; DST a register
4805 or memory. If either operand is memory it must not have any side
4806 effects. */
4807 void
4808 aarch64_split_128bit_move (rtx dst, rtx src)
4810 rtx dst_lo, dst_hi;
4811 rtx src_lo, src_hi;
4813 machine_mode mode = GET_MODE (dst);
4815 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
4816 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4817 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4819 if (REG_P (dst) && REG_P (src))
4821 int src_regno = REGNO (src);
4822 int dst_regno = REGNO (dst);
4824 /* Handle FP <-> GP regs. */
4825 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4827 src_lo = gen_lowpart (word_mode, src);
4828 src_hi = gen_highpart (word_mode, src);
4830 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4831 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4832 return;
4834 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4836 dst_lo = gen_lowpart (word_mode, dst);
4837 dst_hi = gen_highpart (word_mode, dst);
4839 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4840 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4841 return;
4845 dst_lo = gen_lowpart (word_mode, dst);
4846 dst_hi = gen_highpart (word_mode, dst);
4847 src_lo = gen_lowpart (word_mode, src);
4848 src_hi = gen_highpart_mode (word_mode, mode, src);
4850 /* At most one pairing may overlap. */
4851 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4853 aarch64_emit_move (dst_hi, src_hi);
4854 aarch64_emit_move (dst_lo, src_lo);
4856 else
4858 aarch64_emit_move (dst_lo, src_lo);
4859 aarch64_emit_move (dst_hi, src_hi);
4863 /* Return true if we should split a move from 128-bit value SRC
4864 to 128-bit register DEST. */
4866 bool
4867 aarch64_split_128bit_move_p (rtx dst, rtx src)
4869 if (FP_REGNUM_P (REGNO (dst)))
4870 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4871 /* All moves to GPRs need to be split. */
4872 return true;
4875 /* Split a complex SIMD move. */
4877 void
4878 aarch64_split_simd_move (rtx dst, rtx src)
4880 machine_mode src_mode = GET_MODE (src);
4881 machine_mode dst_mode = GET_MODE (dst);
4883 gcc_assert (VECTOR_MODE_P (dst_mode));
4885 if (REG_P (dst) && REG_P (src))
4887 gcc_assert (VECTOR_MODE_P (src_mode));
4888 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
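/* Return true if constant X (of mode XMODE) is equal to constant Y
   (of mode YMODE) zero-extended to XMODE.  */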
4892 bool
4893 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4894 machine_mode ymode, rtx y)
4896 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4897 gcc_assert (r != NULL);
4898 return rtx_equal_p (x, r);
4901 /* Return TARGET if it is nonnull and a register of mode MODE.
4902 Otherwise, return a fresh register of mode MODE if we can,
4903 or TARGET reinterpreted as MODE if we can't. */
4905 static rtx
4906 aarch64_target_reg (rtx target, machine_mode mode)
4908 if (target && REG_P (target) && GET_MODE (target) == mode)
4909 return target;
4910 if (!can_create_pseudo_p ())
4912 gcc_assert (target);
4913 return gen_lowpart (mode, target);
4915 return gen_reg_rtx (mode);
4918 /* Return a register that contains the constant in BUILDER, given that
4919 the constant is a legitimate move operand. Use TARGET as the register
4920 if it is nonnull and convenient. */
4922 static rtx
4923 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4925 rtx src = builder.build ();
4926 target = aarch64_target_reg (target, GET_MODE (src));
4927 emit_insn (gen_rtx_SET (target, src));
4928 return target;
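/* Return a register of mode MODE that contains VALUE: a fresh pseudo
   when one can be created, otherwise the existing register X (which
   must then be nonnull).  */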
4931 static rtx
4932 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
4934 if (can_create_pseudo_p ())
4935 return force_reg (mode, value);
4936 else
4938 gcc_assert (x);
4939 aarch64_emit_move (x, value);
4940 return x;
4944 /* Return true if predicate value X is a constant in which every element
4945 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
4946 value, i.e. as a predicate in which all bits are significant. */
4948 static bool
4949 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
4951 if (!CONST_VECTOR_P (x))
4952 return false;
4954 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
4955 GET_MODE_NUNITS (GET_MODE (x)));
4956 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
4957 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
4958 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
4960 unsigned int nelts = const_vector_encoded_nelts (x);
4961 for (unsigned int i = 0; i < nelts; ++i)
4963 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
4964 if (!CONST_INT_P (elt))
4965 return false;
4967 builder.quick_push (elt);
4968 for (unsigned int j = 1; j < factor; ++j)
4969 builder.quick_push (const0_rtx);
4971 builder.finalize ();
4972 return true;
4975 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
4976 widest predicate element size it can have (that is, the largest size
4977 for which each element would still be 0 or 1). */
4979 unsigned int
4980 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
4982 /* Start with the most optimistic assumption: that we only need
4983 one bit per pattern. This is what we will use if only the first
4984 bit in each pattern is ever set. */
4985 unsigned int mask = GET_MODE_SIZE (DImode);
4986 mask |= builder.npatterns ();
4988 /* Look for set bits. */
4989 unsigned int nelts = builder.encoded_nelts ();
4990 for (unsigned int i = 1; i < nelts; ++i)
4991 if (INTVAL (builder.elt (i)) != 0)
4993 if (i & 1)
4994 return 1;
4995 mask |= i;
4997 return mask & -mask;
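/* For example, the repeating constant { 1, 0, 0, 0, ... } built with four
   patterns gives 4 (the predicate is usable for elements of up to 4 bytes),
   while { 1, 1, 0, 0, ... } gives 1, since its second bit is set.  */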
5000 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5001 return that predicate mode, otherwise return opt_machine_mode (). */
5003 opt_machine_mode
5004 aarch64_ptrue_all_mode (rtx x)
5006 gcc_assert (GET_MODE (x) == VNx16BImode);
5007 if (!CONST_VECTOR_P (x)
5008 || !CONST_VECTOR_DUPLICATE_P (x)
5009 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5010 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5011 return opt_machine_mode ();
5013 unsigned int nelts = const_vector_encoded_nelts (x);
5014 for (unsigned int i = 1; i < nelts; ++i)
5015 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5016 return opt_machine_mode ();
5018 return aarch64_sve_pred_mode (nelts);
5021 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5022 that the constant would have with predicate element size ELT_SIZE
5023 (ignoring the upper bits in each element) and return:
5025 * -1 if all bits are set
5026 * N if the predicate has N leading set bits followed by all clear bits
5027 * 0 if the predicate does not have any of these forms. */
5029 int
5030 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5031 unsigned int elt_size)
5033 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5034 followed by set bits. */
5035 if (builder.nelts_per_pattern () == 3)
5036 return 0;
5038 /* Skip over leading set bits. */
5039 unsigned int nelts = builder.encoded_nelts ();
5040 unsigned int i = 0;
5041 for (; i < nelts; i += elt_size)
5042 if (INTVAL (builder.elt (i)) == 0)
5043 break;
5044 unsigned int vl = i / elt_size;
5046 /* Check for the all-true case. */
5047 if (i == nelts)
5048 return -1;
5050 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5051 repeating pattern of set bits followed by clear bits. */
5052 if (builder.nelts_per_pattern () != 2)
5053 return 0;
5055 /* We have a "foreground" value and a duplicated "background" value.
5056 If the background might repeat and the last set bit belongs to it,
5057 we might have set bits followed by clear bits followed by set bits. */
5058 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5059 return 0;
5061 /* Make sure that the rest are all clear. */
5062 for (; i < nelts; i += elt_size)
5063 if (INTVAL (builder.elt (i)) != 0)
5064 return 0;
5066 return vl;
5069 /* See if there is an svpattern that encodes an SVE predicate of mode
5070 PRED_MODE in which the first VL bits are set and the rest are clear.
5071 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5072 A VL of -1 indicates an all-true vector. */
5074 aarch64_svpattern
5075 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5077 if (vl < 0)
5078 return AARCH64_SV_ALL;
5080 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5081 return AARCH64_NUM_SVPATTERNS;
5083 if (vl >= 1 && vl <= 8)
5084 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5086 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5087 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5089 int max_vl;
5090 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5092 if (vl == (max_vl / 3) * 3)
5093 return AARCH64_SV_MUL3;
5094 /* These would only trigger for non-power-of-2 lengths. */
5095 if (vl == (max_vl & -4))
5096 return AARCH64_SV_MUL4;
5097 if (vl == (1 << floor_log2 (max_vl)))
5098 return AARCH64_SV_POW2;
5099 if (vl == max_vl)
5100 return AARCH64_SV_ALL;
5102 return AARCH64_NUM_SVPATTERNS;
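/* For example, a VL of 1..8 maps to AARCH64_SV_VL1..AARCH64_SV_VL8, a VL
   of 32 maps to AARCH64_SV_VL32 and a VL of -1 maps to AARCH64_SV_ALL.
   A VL such as 12 is representable only when the number of elements in
   PRED_MODE is a compile-time constant with a matching POW2, MUL3 or
   MUL4 value.  */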
5105 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5106 bits has the lowest bit set and the upper bits clear. This is the
5107 VNx16BImode equivalent of a PTRUE for controlling elements of
5108 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5109 all bits are significant, even the upper zeros. */
5111 rtx
5112 aarch64_ptrue_all (unsigned int elt_size)
5114 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5115 builder.quick_push (const1_rtx);
5116 for (unsigned int i = 1; i < elt_size; ++i)
5117 builder.quick_push (const0_rtx);
5118 return builder.build ();
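/* For example, aarch64_ptrue_all (4) builds the VNx16BImode constant
   { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the canonical all-true predicate
   for 32-bit elements.  */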
5121 /* Return an all-true predicate register of mode MODE. */
5123 rtx
5124 aarch64_ptrue_reg (machine_mode mode)
5126 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5127 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5128 return gen_lowpart (mode, reg);
5131 /* Return an all-false predicate register of mode MODE. */
5133 rtx
5134 aarch64_pfalse_reg (machine_mode mode)
5136 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5137 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5138 return gen_lowpart (mode, reg);
5141 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5142 for it. PRED2[0] is the predicate for the instruction whose result
5143 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5144 for it. Return true if we can prove that the two predicates are
5145 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5146 with PRED1[0] without changing behavior. */
5148 bool
5149 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5151 machine_mode mode = GET_MODE (pred1[0]);
5152 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5153 && mode == GET_MODE (pred2[0])
5154 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5155 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5157 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5158 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5159 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5160 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5161 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5164 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
5165 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5166 Use TARGET as the target register if nonnull and convenient. */
5168 static rtx
5169 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5170 machine_mode data_mode, rtx op1, rtx op2)
5172 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5173 expand_operand ops[5];
5174 create_output_operand (&ops[0], target, pred_mode);
5175 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5176 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5177 create_input_operand (&ops[3], op1, data_mode);
5178 create_input_operand (&ops[4], op2, data_mode);
5179 expand_insn (icode, 5, ops);
5180 return ops[0].value;
5183 /* Use a comparison to convert integer vector SRC into MODE, which is
5184 the corresponding SVE predicate mode. Use TARGET for the result
5185 if it's nonnull and convenient. */
5187 rtx
5188 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5190 machine_mode src_mode = GET_MODE (src);
5191 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5192 src, CONST0_RTX (src_mode));
5195 /* Return the assembly token for svprfop value PRFOP. */
5197 static const char *
5198 svprfop_token (enum aarch64_svprfop prfop)
5200 switch (prfop)
5202 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5203 AARCH64_FOR_SVPRFOP (CASE)
5204 #undef CASE
5205 case AARCH64_NUM_SVPRFOPS:
5206 break;
5208 gcc_unreachable ();
5211 /* Return the assembly string for an SVE prefetch operation with
5212 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5213 and that SUFFIX is the format for the remaining operands. */
5215 char *
5216 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5217 const char *suffix)
5219 static char buffer[128];
5220 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5221 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5222 mnemonic, svprfop_token (prfop), suffix);
5223 gcc_assert (written < sizeof (buffer));
5224 return buffer;
5227 /* Check whether we can calculate the number of elements in PATTERN
5228 at compile time, given that there are NELTS_PER_VQ elements per
5229 128-bit block. Return the value if so, otherwise return -1. */
5231 HOST_WIDE_INT
5232 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5234 unsigned int vl, const_vg;
5235 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5236 vl = 1 + (pattern - AARCH64_SV_VL1);
5237 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5238 vl = 16 << (pattern - AARCH64_SV_VL16);
5239 else if (aarch64_sve_vg.is_constant (&const_vg))
5241 /* There are two vector granules per quadword. */
5242 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5243 switch (pattern)
5245 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5246 case AARCH64_SV_MUL4: return nelts & -4;
5247 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5248 case AARCH64_SV_ALL: return nelts;
5249 default: gcc_unreachable ();
5252 else
5253 return -1;
5255 /* There are two vector granules per quadword. */
5256 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5257 if (known_le (vl, nelts_all))
5258 return vl;
5260 /* Requesting more elements than are available results in a PFALSE. */
5261 if (known_gt (vl, nelts_all))
5262 return 0;
5264 return -1;
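/* A worked example, assuming a fixed 128-bit vector length (VG == 2) and
   NELTS_PER_VQ == 4 (32-bit elements): AARCH64_SV_ALL and AARCH64_SV_POW2
   fold to 4, AARCH64_SV_MUL3 folds to 3, and AARCH64_SV_VL5 folds to 0,
   since it asks for more elements than are available.  */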
5267 /* Return true if we can move VALUE into a register using a single
5268 CNT[BHWD] instruction. */
5270 static bool
5271 aarch64_sve_cnt_immediate_p (poly_int64 value)
5273 HOST_WIDE_INT factor = value.coeffs[0];
5274 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5275 return (value.coeffs[1] == factor
5276 && IN_RANGE (factor, 2, 16 * 16)
5277 && (factor & 1) == 0
5278 && factor <= 16 * (factor & -factor));
5281 /* Likewise for rtx X. */
5283 bool
5284 aarch64_sve_cnt_immediate_p (rtx x)
5286 poly_int64 value;
5287 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
5290 /* Return the asm string for an instruction with a CNT-like vector size
5291 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5292 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5293 first part of the operands template (the part that comes before the
5294 vector size itself). PATTERN is the pattern to use. FACTOR is the
5295 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5296 in each quadword. If it is zero, we can use any element size. */
5298 static char *
5299 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5300 aarch64_svpattern pattern,
5301 unsigned int factor,
5302 unsigned int nelts_per_vq)
5304 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5306 if (nelts_per_vq == 0)
5307 /* There is some overlap in the ranges of the four CNT instructions.
5308 Here we always use the smallest possible element size, so that the
5309 multiplier is 1 wherever possible. */
5310 nelts_per_vq = factor & -factor;
5311 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5312 gcc_assert (IN_RANGE (shift, 1, 4));
5313 char suffix = "dwhb"[shift - 1];
5315 factor >>= shift;
5316 unsigned int written;
5317 if (pattern == AARCH64_SV_ALL && factor == 1)
5318 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5319 prefix, suffix, operands);
5320 else if (factor == 1)
5321 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5322 prefix, suffix, operands, svpattern_token (pattern));
5323 else
5324 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5325 prefix, suffix, operands, svpattern_token (pattern),
5326 factor);
5327 gcc_assert (written < sizeof (buffer));
5328 return buffer;
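/* For example, ("inc", "%x0", AARCH64_SV_ALL, 32, 0) produces
   "incb\t%x0, all, mul #2", while ("cnt", "%x0", AARCH64_SV_ALL, 2, 2)
   produces "cntd\t%x0".  */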
5331 /* Return the asm string for an instruction with a CNT-like vector size
5332 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5333 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5334 first part of the operands template (the part that comes before the
5335 vector size itself). X is the value of the vector size operand,
5336 as a polynomial integer rtx; we need to convert this into an "all"
5337 pattern with a multiplier. */
5339 char *
5340 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5341 rtx x)
5343 poly_int64 value = rtx_to_poly_int64 (x);
5344 gcc_assert (aarch64_sve_cnt_immediate_p (value));
5345 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5346 value.coeffs[1], 0);
5349 /* Return the asm string for an instruction with a CNT-like vector size
5350 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5351 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5352 first part of the operands template (the part that comes before the
5353 vector size itself). CNT_PAT[0..2] are the operands of the
5354 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5356 char *
5357 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5358 const char *operands, rtx *cnt_pat)
5360 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5361 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5362 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5363 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5364 factor, nelts_per_vq);
5367 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5369 bool
5370 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5372 poly_int64 value;
5373 return (poly_int_rtx_p (x, &value)
5374 && (aarch64_sve_cnt_immediate_p (value)
5375 || aarch64_sve_cnt_immediate_p (-value)));
5378 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5379 operand 0. */
5381 char *
5382 aarch64_output_sve_scalar_inc_dec (rtx offset)
5384 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5385 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5386 if (offset_value.coeffs[1] > 0)
5387 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5388 offset_value.coeffs[1], 0);
5389 else
5390 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5391 -offset_value.coeffs[1], 0);
5394 /* Return true if we can add VALUE to a register using a single ADDVL
5395 or ADDPL instruction. */
5397 static bool
5398 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5400 HOST_WIDE_INT factor = value.coeffs[0];
5401 if (factor == 0 || value.coeffs[1] != factor)
5402 return false;
5403 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5404 and a value of 16 is one vector width. */
5405 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5406 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5409 /* Likewise for rtx X. */
5411 bool
5412 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5414 poly_int64 value;
5415 return (poly_int_rtx_p (x, &value)
5416 && aarch64_sve_addvl_addpl_immediate_p (value));
5419 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5420 to operand 1 and storing the result in operand 0. */
5422 char *
5423 aarch64_output_sve_addvl_addpl (rtx offset)
5425 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5426 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5427 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5429 int factor = offset_value.coeffs[1];
5430 if ((factor & 15) == 0)
5431 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5432 else
5433 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5434 return buffer;
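/* For example, an offset of one vector length (coefficients {16, 16})
   produces "addvl\t%x0, %x1, #1", while an offset of minus one predicate
   length (coefficients {-2, -2}) produces "addpl\t%x0, %x1, #-1".  */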
5437 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5438 instruction. If it is, store the number of elements in each vector
5439 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5440 factor in *FACTOR_OUT (if nonnull). */
5442 bool
5443 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5444 unsigned int *nelts_per_vq_out)
5446 rtx elt;
5447 poly_int64 value;
5449 if (!const_vec_duplicate_p (x, &elt)
5450 || !poly_int_rtx_p (elt, &value))
5451 return false;
5453 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5454 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5455 /* There's no vector INCB. */
5456 return false;
5458 HOST_WIDE_INT factor = value.coeffs[0];
5459 if (value.coeffs[1] != factor)
5460 return false;
5462 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5463 if ((factor % nelts_per_vq) != 0
5464 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5465 return false;
5467 if (factor_out)
5468 *factor_out = factor;
5469 if (nelts_per_vq_out)
5470 *nelts_per_vq_out = nelts_per_vq;
5471 return true;
5474 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5475 instruction. */
5477 bool
5478 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5480 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5483 /* Return the asm template for an SVE vector INC or DEC instruction.
5484 OPERANDS gives the operands before the vector count and X is the
5485 value of the vector count operand itself. */
5487 char *
5488 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5490 int factor;
5491 unsigned int nelts_per_vq;
5492 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5493 gcc_unreachable ();
5494 if (factor < 0)
5495 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5496 -factor, nelts_per_vq);
5497 else
5498 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5499 factor, nelts_per_vq);
5502 static int
5503 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5504 scalar_int_mode mode)
5506 int i;
5507 unsigned HOST_WIDE_INT val, val2, mask;
5508 int one_match, zero_match;
5509 int num_insns;
5511 val = INTVAL (imm);
5513 if (aarch64_move_imm (val, mode))
5515 if (generate)
5516 emit_insn (gen_rtx_SET (dest, imm));
5517 return 1;
5520 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
5521 (with XXXX non-zero). In that case check to see if the move can be done in
5522 a smaller mode. */
5523 val2 = val & 0xffffffff;
5524 if (mode == DImode
5525 && aarch64_move_imm (val2, SImode)
5526 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
5528 if (generate)
5529 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5531 /* Check if we have to emit a second instruction by checking to see
5532 if any of the upper 32 bits of the original DI mode value is set. */
5533 if (val == val2)
5534 return 1;
5536 i = (val >> 48) ? 48 : 32;
5538 if (generate)
5539 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5540 GEN_INT ((val >> i) & 0xffff)));
5542 return 2;
5545 if ((val >> 32) == 0 || mode == SImode)
5547 if (generate)
5549 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5550 if (mode == SImode)
5551 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5552 GEN_INT ((val >> 16) & 0xffff)));
5553 else
5554 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5555 GEN_INT ((val >> 16) & 0xffff)));
5557 return 2;
5560 /* Remaining cases are all for DImode. */
5562 mask = 0xffff;
5563 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5564 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5565 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5566 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5568 if (zero_match != 2 && one_match != 2)
5570 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
5571 For a 64-bit bitmask try whether changing 16 bits to all ones or
5572 zeroes creates a valid bitmask. To check any repeated bitmask,
5573 try using 16 bits from the other 32-bit half of val. */
5575 for (i = 0; i < 64; i += 16, mask <<= 16)
5577 val2 = val & ~mask;
5578 if (val2 != val && aarch64_bitmask_imm (val2, mode))
5579 break;
5580 val2 = val | mask;
5581 if (val2 != val && aarch64_bitmask_imm (val2, mode))
5582 break;
5583 val2 = val2 & ~mask;
5584 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
5585 if (val2 != val && aarch64_bitmask_imm (val2, mode))
5586 break;
5588 if (i != 64)
5590 if (generate)
5592 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5593 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5594 GEN_INT ((val >> i) & 0xffff)));
5596 return 2;
5600 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5601 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5602 otherwise skip zero bits. */
5604 num_insns = 1;
5605 mask = 0xffff;
5606 val2 = one_match > zero_match ? ~val : val;
5607 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5609 if (generate)
5610 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5611 ? (val | ~(mask << i))
5612 : (val & (mask << i)))));
5613 for (i += 16; i < 64; i += 16)
5615 if ((val2 & (mask << i)) == 0)
5616 continue;
5617 if (generate)
5618 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5619 GEN_INT ((val >> i) & 0xffff)));
5620 num_insns ++;
5623 return num_insns;
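/* Editor's sketch, not part of the GCC sources: a much-simplified model of
   the counting logic above, using plain C++ types and a hypothetical name.
   It ignores the bitmask-immediate and 32-bit special cases and keeps only
   the core idea: start from MOVZ or MOVN, whichever covers more 16-bit
   chunks, and patch each remaining nonzero chunk with a MOVK.  */
static int
sketch_count_mov_imm_insns (unsigned long long val)
{
  int zero_chunks = 0, one_chunks = 0;
  for (int i = 0; i < 64; i += 16)
    {
      unsigned int chunk = (val >> i) & 0xffff;
      zero_chunks += (chunk == 0);
      one_chunks += (chunk == 0xffff);
    }
  /* Work on the inverted value if MOVN is the better starting point.  */
  unsigned long long base = one_chunks > zero_chunks ? ~val : val;
  int insns = 1;                        /* The initial MOVZ or MOVN.  */
  bool covered_by_first = true;
  for (int i = 0; i < 64; i += 16)
    if (((base >> i) & 0xffff) != 0)
      {
        if (covered_by_first)
          covered_by_first = false;     /* Handled by the first insn.  */
        else
          insns++;                      /* One MOVK for this chunk.  */
      }
  return insns;
  /* Examples: 0x0000123400005678 -> 2 (MOVZ + MOVK);
     0xffffffff0000ffff -> 1 (a single MOVN).  */
}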
5626 /* Return whether imm is a 128-bit immediate which is simple enough to
5627 expand inline. */
5628 bool
5629 aarch64_mov128_immediate (rtx imm)
5631 if (CONST_INT_P (imm))
5632 return true;
5634 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5636 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5637 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5639 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5640 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5644 /* Return the number of temporary registers that aarch64_add_offset_1
5645 would need to add OFFSET to a register. */
5647 static unsigned int
5648 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5650 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5653 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5654 a non-polynomial OFFSET. MODE is the mode of the addition.
5655 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5656 be set and CFA adjustments added to the generated instructions.
5658 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5659 temporary if register allocation is already complete. This temporary
5660 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5661 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5662 the immediate again.
5664 Since this function may be used to adjust the stack pointer, we must
5665 ensure that it cannot cause transient stack deallocation (for example
5666 by first incrementing SP and then decrementing when adjusting by a
5667 large immediate). */
5669 static void
5670 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5671 rtx src, HOST_WIDE_INT offset, rtx temp1,
5672 bool frame_related_p, bool emit_move_imm)
5674 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5675 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5677 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5678 rtx_insn *insn;
5680 if (!moffset)
5682 if (!rtx_equal_p (dest, src))
5684 insn = emit_insn (gen_rtx_SET (dest, src));
5685 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5687 return;
5690 /* Single instruction adjustment. */
5691 if (aarch64_uimm12_shift (moffset))
5693 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5694 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5695 return;
5698 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
5699 and either:
5701 a) the offset cannot be loaded by a 16-bit move or
5702 b) there is no spare register into which we can move it. */
5703 if (moffset < 0x1000000
5704 && ((!temp1 && !can_create_pseudo_p ())
5705 || !aarch64_move_imm (moffset, mode)))
5707 HOST_WIDE_INT low_off = moffset & 0xfff;
5709 low_off = offset < 0 ? -low_off : low_off;
5710 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
5711 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5712 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
5713 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5714 return;
5717 /* Emit a move immediate if required and an addition/subtraction. */
5718 if (emit_move_imm)
5720 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
5721 temp1 = aarch64_force_temporary (mode, temp1,
5722 gen_int_mode (moffset, mode));
5724 insn = emit_insn (offset < 0
5725 ? gen_sub3_insn (dest, src, temp1)
5726 : gen_add3_insn (dest, src, temp1));
5727 if (frame_related_p)
5729 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5730 rtx adj = plus_constant (mode, src, offset);
5731 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
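/* Editor's sketch, not part of the GCC sources: how the sub-24-bit branch
   above splits an adjustment into two ADD/SUB immediates.  Hypothetical
   standalone helper using plain C++ types.  Both addends carry the sign of
   OFFSET, so applying them one after the other moves the register
   monotonically and never transiently deallocates the stack.  */
static void
sketch_split_24bit_offset (long long offset, long long *first, long long *second)
{
  unsigned long long moffset
    = offset < 0 ? -(unsigned long long) offset : (unsigned long long) offset;
  long long low = moffset & 0xfff;
  if (offset < 0)
    low = -low;
  *first = low;                 /* Fits the unshifted 12-bit immediate.  */
  *second = offset - low;       /* Multiple of 4096; fits "..., lsl #12".  */
  /* Example: -0x123456 splits into -0x456 and -0x123000.  */
}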
5735 /* Return the number of temporary registers that aarch64_add_offset
5736 would need to move OFFSET into a register or add OFFSET to a register;
5737 ADD_P is true if we want the latter rather than the former. */
5739 static unsigned int
5740 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
5742 /* This follows the same structure as aarch64_add_offset. */
5743 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
5744 return 0;
5746 unsigned int count = 0;
5747 HOST_WIDE_INT factor = offset.coeffs[1];
5748 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5749 poly_int64 poly_offset (factor, factor);
5750 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5751 /* Need one register for the ADDVL/ADDPL result. */
5752 count += 1;
5753 else if (factor != 0)
5755 factor = abs (factor);
5756 if (factor > 16 * (factor & -factor))
5757 /* Need one register for the CNT result and one for the multiplication
5758 factor. If necessary, the second temporary can be reused for the
5759 constant part of the offset. */
5760 return 2;
5761 /* Need one register for the CNT result (which might then
5762 be shifted). */
5763 count += 1;
5765 return count + aarch64_add_offset_1_temporaries (constant);
5768 /* If X can be represented as a poly_int64, return the number
5769 of temporaries that are required to add it to a register.
5770 Return -1 otherwise. */
5773 aarch64_add_offset_temporaries (rtx x)
5775 poly_int64 offset;
5776 if (!poly_int_rtx_p (x, &offset))
5777 return -1;
5778 return aarch64_offset_temporaries (true, offset);
5781 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
5782 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5783 be set and CFA adjustments added to the generated instructions.
5785 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5786 temporary if register allocation is already complete. This temporary
5787 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
5788 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
5789 false to avoid emitting the immediate again.
5791 TEMP2, if nonnull, is a second temporary register that doesn't
5792 overlap either DEST or REG.
5794 Since this function may be used to adjust the stack pointer, we must
5795 ensure that it cannot cause transient stack deallocation (for example
5796 by first incrementing SP and then decrementing when adjusting by a
5797 large immediate). */
5799 static void
5800 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5801 poly_int64 offset, rtx temp1, rtx temp2,
5802 bool frame_related_p, bool emit_move_imm = true)
5804 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5805 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5806 gcc_assert (temp1 == NULL_RTX
5807 || !frame_related_p
5808 || !reg_overlap_mentioned_p (temp1, dest));
5809 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
5811 /* Try using ADDVL or ADDPL to add the whole value. */
5812 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
5814 rtx offset_rtx = gen_int_mode (offset, mode);
5815 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5816 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5817 return;
5820 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
5821 SVE vector register, over and above the minimum size of 128 bits.
5822 This is equivalent to half the value returned by CNTD with a
5823 vector shape of ALL. */
5824 HOST_WIDE_INT factor = offset.coeffs[1];
5825 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
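  /* Editor's worked example, not part of the GCC sources: with N 128-bit
     quadwords per vector (N = VL / 128), a poly_int64 offset
     (coeffs[0], coeffs[1]) stands for coeffs[0] + coeffs[1] * (N - 1)
     bytes.  For offset (4, 16) on a 256-bit vector (N == 2) that is
     4 + 16 * 1 = 20 bytes; FACTOR is 16, CONSTANT is 4 - 16 = -12, and
     the CNT/ADDVL-based code below supplies the remaining
     FACTOR * N = 32 bytes (-12 + 32 = 20).  */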
5827 /* Try using ADDVL or ADDPL to add the VG-based part. */
5828 poly_int64 poly_offset (factor, factor);
5829 if (src != const0_rtx
5830 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5832 rtx offset_rtx = gen_int_mode (poly_offset, mode);
5833 if (frame_related_p)
5835 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5836 RTX_FRAME_RELATED_P (insn) = true;
5837 src = dest;
5839 else
5841 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
5842 src = aarch64_force_temporary (mode, temp1, addr);
5843 temp1 = temp2;
5844 temp2 = NULL_RTX;
5847 /* Otherwise use a CNT-based sequence. */
5848 else if (factor != 0)
5850 /* Use a subtraction if we have a negative factor. */
5851 rtx_code code = PLUS;
5852 if (factor < 0)
5854 factor = -factor;
5855 code = MINUS;
5858 /* Calculate CNTD * FACTOR / 2. First try to fold the division
5859 into the multiplication. */
5860 rtx val;
5861 int shift = 0;
5862 if (factor & 1)
5863 /* Use a right shift by 1. */
5864 shift = -1;
5865 else
5866 factor /= 2;
5867 HOST_WIDE_INT low_bit = factor & -factor;
5868 if (factor <= 16 * low_bit)
5870 if (factor > 16 * 8)
5872 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
5873 the value with the minimum multiplier and shift it into
5874 position. */
5875 int extra_shift = exact_log2 (low_bit);
5876 shift += extra_shift;
5877 factor >>= extra_shift;
5879 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
5881 else
5883 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
5884 directly, since that should increase the chances of being
5885 able to use a shift and add sequence. If LOW_BIT itself
5886 is out of range, just use CNTD. */
5887 if (low_bit <= 16 * 8)
5888 factor /= low_bit;
5889 else
5890 low_bit = 1;
5892 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
5893 val = aarch64_force_temporary (mode, temp1, val);
5895 if (can_create_pseudo_p ())
5897 rtx coeff1 = gen_int_mode (factor, mode);
5898 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
5900 else
5902 /* Go back to using a negative multiplication factor if we have
5903 no register from which to subtract. */
5904 if (code == MINUS && src == const0_rtx)
5906 factor = -factor;
5907 code = PLUS;
5909 rtx coeff1 = gen_int_mode (factor, mode);
5910 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
5911 val = gen_rtx_MULT (mode, val, coeff1);
5915 if (shift > 0)
5917 /* Multiply by 1 << SHIFT. */
5918 val = aarch64_force_temporary (mode, temp1, val);
5919 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
5921 else if (shift == -1)
5923 /* Divide by 2. */
5924 val = aarch64_force_temporary (mode, temp1, val);
5925 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
5928 /* Calculate SRC +/- CNTD * FACTOR / 2. */
5929 if (src != const0_rtx)
5931 val = aarch64_force_temporary (mode, temp1, val);
5932 val = gen_rtx_fmt_ee (code, mode, src, val);
5934 else if (code == MINUS)
5936 val = aarch64_force_temporary (mode, temp1, val);
5937 val = gen_rtx_NEG (mode, val);
5940 if (constant == 0 || frame_related_p)
5942 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
5943 if (frame_related_p)
5945 RTX_FRAME_RELATED_P (insn) = true;
5946 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5947 gen_rtx_SET (dest, plus_constant (Pmode, src,
5948 poly_offset)));
5950 src = dest;
5951 if (constant == 0)
5952 return;
5954 else
5956 src = aarch64_force_temporary (mode, temp1, val);
5957 temp1 = temp2;
5958 temp2 = NULL_RTX;
5961 emit_move_imm = true;
5964 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
5965 frame_related_p, emit_move_imm);
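/* Editor's sketch, not part of the GCC sources: the odd/even decision made
   above when computing CNTD * FACTOR / 2, for a FACTOR that has already
   been made non-negative.  It deliberately omits the later low-bit and
   out-of-range adjustments; the helper name is hypothetical.  */
static void
sketch_fold_cntd_div2 (long long factor, long long *cnt_multiplier, int *rshift)
{
  if (factor & 1)
    {
      *cnt_multiplier = factor;         /* Compute CNTD * FACTOR ...  */
      *rshift = 1;                      /* ... then shift right by one.  */
    }
  else
    {
      *cnt_multiplier = factor / 2;     /* Fold the /2 into the multiplier.  */
      *rshift = 0;
    }
  /* Examples: FACTOR == 6 -> CNTD * 3; FACTOR == 5 -> (CNTD * 5) >> 1.  */
}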
5968 /* Like aarch64_add_offset, but the offset is given as an rtx rather
5969 than a poly_int64. */
5971 void
5972 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5973 rtx offset_rtx, rtx temp1, rtx temp2)
5975 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
5976 temp1, temp2, false);
5979 /* Add DELTA to the stack pointer, marking the instructions frame-related.
5980 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
5981 if TEMP1 already contains abs (DELTA). */
5983 static inline void
5984 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
5986 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
5987 temp1, temp2, true, emit_move_imm);
5990 /* Subtract DELTA from the stack pointer, marking the instructions
5991 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
5992 if nonnull. */
5994 static inline void
5995 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
5996 bool emit_move_imm = true)
5998 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
5999 temp1, temp2, frame_related_p, emit_move_imm);
6002 /* Set DEST to (vec_series BASE STEP). */
6004 static void
6005 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6007 machine_mode mode = GET_MODE (dest);
6008 scalar_mode inner = GET_MODE_INNER (mode);
6010 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6011 if (!aarch64_sve_index_immediate_p (base))
6012 base = force_reg (inner, base);
6013 if (!aarch64_sve_index_immediate_p (step))
6014 step = force_reg (inner, step);
6016 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
6019 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6020 register of mode MODE. Use TARGET for the result if it's nonnull
6021 and convenient.
6023 The two vector modes must have the same element mode. The behavior
6024 is to duplicate architectural lane N of SRC into architectural lanes
6025 N + I * STEP of the result. On big-endian targets, architectural
6026 lane 0 of an Advanced SIMD vector is the last element of the vector
6027 in memory layout, so for big-endian targets this operation has the
6028 effect of reversing SRC before duplicating it. Callers need to
6029 account for this. */
6032 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6034 machine_mode src_mode = GET_MODE (src);
6035 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6036 insn_code icode = (BYTES_BIG_ENDIAN
6037 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6038 : code_for_aarch64_vec_duplicate_vq_le (mode));
6040 unsigned int i = 0;
6041 expand_operand ops[3];
6042 create_output_operand (&ops[i++], target, mode);
6043 create_output_operand (&ops[i++], src, src_mode);
6044 if (BYTES_BIG_ENDIAN)
6046 /* Create a PARALLEL describing the reversal of SRC. */
6047 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6048 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6049 nelts_per_vq - 1, -1);
6050 create_fixed_operand (&ops[i++], sel);
6052 expand_insn (icode, i, ops);
6053 return ops[0].value;
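/* Editor's sketch, not part of the GCC sources: the little-endian lane
   mapping described in the comment above, with a hypothetical helper name.
   Result lane I of the SVE register is a copy of source lane
   I % NELTS_PER_VQ of the 128-bit input.  */
static unsigned int
sketch_dupq_source_lane (unsigned int result_lane, unsigned int nelts_per_vq)
{
  return result_lane % nelts_per_vq;
  /* Example: for .H elements (nelts_per_vq == 8), result lane 11 is a
     copy of source lane 3.  */
}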
6056 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6057 the memory image into DEST. Return true on success. */
6059 static bool
6060 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6062 src = force_const_mem (GET_MODE (src), src);
6063 if (!src)
6064 return false;
6066 /* Make sure that the address is legitimate. */
6067 if (!aarch64_sve_ld1rq_operand_p (src))
6069 rtx addr = force_reg (Pmode, XEXP (src, 0));
6070 src = replace_equiv_address (src, addr);
6073 machine_mode mode = GET_MODE (dest);
6074 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6075 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6076 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6077 return true;
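/* Editor's worked example, not part of the GCC sources: for the VNx4SI
   constant { 1, 2, 3, 4, 1, 2, 3, 4, ... } the 16-byte repeating block
   { 1, 2, 3, 4 } is placed in the constant pool and the expansion above
   produces, roughly:

	adrp	x0, .LC0
	add	x0, x0, :lo12:.LC0
	ptrue	p0.s
	ld1rqw	{ z0.s }, p0/z, [x0]

   Register numbers and the label are illustrative only.  */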
6080 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6081 by N "background" values. Try to move it into TARGET using:
6083 PTRUE PRED.<T>, VL<N>
6084 MOV TRUE.<T>, #<foreground>
6085 MOV FALSE.<T>, #<background>
6086 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6088 The PTRUE is always a single instruction but the MOVs might need a
6089 longer sequence. If the background value is zero (as it often is),
6090 the sequence can sometimes collapse to a PTRUE followed by a
6091 zero-predicated move.
6093 Return the target on success, otherwise return null. */
6095 static rtx
6096 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6098 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6100 /* Make sure that the PTRUE is valid. */
6101 machine_mode mode = GET_MODE (src);
6102 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6103 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6104 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6105 == AARCH64_NUM_SVPATTERNS)
6106 return NULL_RTX;
6108 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6109 rtx_vector_builder true_builder (mode, npatterns, 1);
6110 rtx_vector_builder false_builder (mode, npatterns, 1);
6111 for (unsigned int i = 0; i < npatterns; ++i)
6113 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6114 pred_builder.quick_push (CONST1_RTX (BImode));
6116 for (unsigned int i = 0; i < npatterns; ++i)
6118 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6119 pred_builder.quick_push (CONST0_RTX (BImode));
6121 expand_operand ops[4];
6122 create_output_operand (&ops[0], target, mode);
6123 create_input_operand (&ops[1], true_builder.build (), mode);
6124 create_input_operand (&ops[2], false_builder.build (), mode);
6125 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6126 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6127 return target;
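/* Editor's worked example, not part of the GCC sources: a VNx4SI constant
   encoded as one pattern of two elements { 5, 0 } -- i.e. the vector
   { 5, 0, 0, 0, ... } -- would be expanded roughly as

	ptrue	p0.s, vl1
	mov	z1.s, #5		// foreground
	mov	z0.s, #0		// background
	sel	z2.s, p0, z1.s, z0.s

   and, because the background is zero, later optimization can sometimes
   reduce this to the PTRUE plus a single zero-predicated move
   (mov z2.s, p0/z, #5).  Register numbers are illustrative only.  */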
6130 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6131 SVE data mode and isn't a legitimate constant. Use TARGET for the
6132 result if convenient.
6134 The returned register can have whatever mode seems most natural
6135 given the contents of SRC. */
6137 static rtx
6138 aarch64_expand_sve_const_vector (rtx target, rtx src)
6140 machine_mode mode = GET_MODE (src);
6141 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6142 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6143 scalar_mode elt_mode = GET_MODE_INNER (mode);
6144 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6145 unsigned int container_bits = aarch64_sve_container_bits (mode);
6146 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6148 if (nelts_per_pattern == 1
6149 && encoded_bits <= 128
6150 && container_bits != elt_bits)
6152 /* We have a partial vector mode and a constant whose full-vector
6153 equivalent would occupy a repeating 128-bit sequence. Build that
6154 full-vector equivalent instead, so that we have the option of
6155 using LD1RQ and Advanced SIMD operations. */
6156 unsigned int repeat = container_bits / elt_bits;
6157 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6158 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6159 for (unsigned int i = 0; i < npatterns; ++i)
6160 for (unsigned int j = 0; j < repeat; ++j)
6161 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6162 target = aarch64_target_reg (target, full_mode);
6163 return aarch64_expand_sve_const_vector (target, builder.build ());
6166 if (nelts_per_pattern == 1 && encoded_bits == 128)
6168 /* The constant is a duplicated quadword but can't be narrowed
6169 beyond a quadword. Get the memory image of the first quadword
6170 as a 128-bit vector and try using LD1RQ to load it from memory.
6172 The effect for both endiannesses is to load memory lane N into
6173 architectural lanes N + I * STEP of the result. On big-endian
6174 targets, the layout of the 128-bit vector in an Advanced SIMD
6175 register would be different from its layout in an SVE register,
6176 but this 128-bit vector is a memory value only. */
6177 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6178 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6179 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6180 return target;
6183 if (nelts_per_pattern == 1 && encoded_bits < 128)
6185 /* The vector is a repeating sequence of 64 bits or fewer.
6186 See if we can load them using an Advanced SIMD move and then
6187 duplicate it to fill a vector. This is better than using a GPR
6188 move because it keeps everything in the same register file. */
6189 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6190 rtx_vector_builder builder (vq_mode, npatterns, 1);
6191 for (unsigned int i = 0; i < npatterns; ++i)
6193 /* We want memory lane N to go into architectural lane N,
6194 so reverse for big-endian targets. The DUP .Q pattern
6195 has a compensating reverse built-in. */
6196 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6197 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6199 rtx vq_src = builder.build ();
6200 if (aarch64_simd_valid_immediate (vq_src, NULL))
6202 vq_src = force_reg (vq_mode, vq_src);
6203 return aarch64_expand_sve_dupq (target, mode, vq_src);
6206 /* Get an integer representation of the repeating part of Advanced
6207 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6208 which for big-endian targets is lane-swapped wrt a normal
6209 Advanced SIMD vector. This means that for both endiannesses,
6210 memory lane N of SVE vector SRC corresponds to architectural
6211 lane N of a register holding VQ_SRC. This in turn means that
6212 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6213 as a single 128-bit value) and thus that memory lane 0 of SRC is
6214 in the lsb of the integer. Duplicating the integer therefore
6215 ensures that memory lane N of SRC goes into architectural lane
6216 N + I * INDEX of the SVE register. */
6217 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6218 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6219 if (elt_value)
6221 /* Pretend that we had a vector of INT_MODE to start with. */
6222 elt_mode = int_mode;
6223 mode = aarch64_full_sve_mode (int_mode).require ();
6225 /* If the integer can be moved into a general register by a
6226 single instruction, do that and duplicate the result. */
6227 if (CONST_INT_P (elt_value)
6228 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
6230 elt_value = force_reg (elt_mode, elt_value);
6231 return expand_vector_broadcast (mode, elt_value);
6234 else if (npatterns == 1)
6235 /* We're duplicating a single value, but can't do better than
6236 force it to memory and load from there. This handles things
6237 like symbolic constants. */
6238 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6240 if (elt_value)
6242 /* Load the element from memory if we can, otherwise move it into
6243 a register and use a DUP. */
6244 rtx op = force_const_mem (elt_mode, elt_value);
6245 if (!op)
6246 op = force_reg (elt_mode, elt_value);
6247 return expand_vector_broadcast (mode, op);
6251 /* Try using INDEX. */
6252 rtx base, step;
6253 if (const_vec_series_p (src, &base, &step))
6255 aarch64_expand_vec_series (target, base, step);
6256 return target;
6259 /* From here on, it's better to force the whole constant to memory
6260 if we can. */
6261 if (GET_MODE_NUNITS (mode).is_constant ())
6262 return NULL_RTX;
6264 if (nelts_per_pattern == 2)
6265 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6266 return res;
6268 /* Expand each pattern individually. */
6269 gcc_assert (npatterns > 1);
6270 rtx_vector_builder builder;
6271 auto_vec<rtx, 16> vectors (npatterns);
6272 for (unsigned int i = 0; i < npatterns; ++i)
6274 builder.new_vector (mode, 1, nelts_per_pattern);
6275 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6276 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6277 vectors.quick_push (force_reg (mode, builder.build ()));
6280 /* Use permutes to interleave the separate vectors. */
6281 while (npatterns > 1)
6283 npatterns /= 2;
6284 for (unsigned int i = 0; i < npatterns; ++i)
6286 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6287 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6288 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6289 vectors[i] = tmp;
6292 gcc_assert (vectors[0] == target);
6293 return target;
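/* Editor's sketch, not part of the GCC sources: one round of the ZIP1
   interleaving used above, modelled on plain arrays with a hypothetical
   helper name.  Each round halves the number of per-pattern vectors by
   interleaving pairs, so log2(NPATTERNS) rounds restore the original
   element order.  */
static void
sketch_zip1_round (const int *a, const int *b, int *out, unsigned int n)
{
  /* ZIP1 takes the low halves of A and B and interleaves them; this
     sketch interleaves the first N/2 elements of each into an N-element
     result.  */
  for (unsigned int i = 0; i < n / 2; ++i)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
  /* Example with two patterns A and B: interleaving { A, A, ... } and
     { B, B, ... } yields { A, B, A, B, ... }, the required constant.  */
}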
6296 /* Use WHILE to set a predicate register of mode MODE in which the first
6297 VL bits are set and the rest are clear. Use TARGET for the register
6298 if it's nonnull and convenient. */
6300 static rtx
6301 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6302 unsigned int vl)
6304 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6305 target = aarch64_target_reg (target, mode);
6306 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6307 target, const0_rtx, limit));
6308 return target;
6311 static rtx
6312 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6314 /* BUILDER is a constant predicate in which the index of every set bit
6315 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6316 by inverting every element at a multiple of ELT_SIZE and EORing the
6317 result with an ELT_SIZE PTRUE.
6319 Return a register that contains the constant on success, otherwise
6320 return null. Use TARGET as the register if it is nonnull and
6321 convenient. */
6323 static rtx
6324 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6325 unsigned int elt_size)
6327 /* Invert every element at a multiple of ELT_SIZE, keeping the
6328 other bits zero. */
6329 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6330 builder.nelts_per_pattern ());
6331 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6332 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6333 inv_builder.quick_push (const1_rtx);
6334 else
6335 inv_builder.quick_push (const0_rtx);
6336 inv_builder.finalize ();
6338 /* See if we can load the constant cheaply. */
6339 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6340 if (!inv)
6341 return NULL_RTX;
6343 /* EOR the result with an ELT_SIZE PTRUE. */
6344 rtx mask = aarch64_ptrue_all (elt_size);
6345 mask = force_reg (VNx16BImode, mask);
6346 inv = gen_lowpart (VNx16BImode, inv);
6347 target = aarch64_target_reg (target, VNx16BImode);
6348 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6349 return target;
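/* Editor's worked example, not part of the GCC sources: the .H predicate
   whose first element is clear and all other elements set cannot be
   loaded with a single PTRUE, but its element-wise inverse has only the
   first element set and is simply PTRUE Pn.H, VL1.  EORing that inverse
   with PTRUE Pm.H, ALL (under a zeroing .H governing predicate, as
   emitted above) recovers the original constant.  */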
6352 /* BUILDER is a constant predicate in which the index of every set bit
6353 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6354 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6355 register on success, otherwise return null. Use TARGET as the register
6356 if nonnull and convenient. */
6358 static rtx
6359 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6360 unsigned int elt_size,
6361 unsigned int permute_size)
6363 /* We're going to split the constant into two new constants A and B,
6364 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6365 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6367 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6368 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6370 where _ indicates elements that will be discarded by the permute.
6372 First calculate the ELT_SIZEs for A and B. */
6373 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6374 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6375 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6376 if (INTVAL (builder.elt (i)) != 0)
6378 if (i & permute_size)
6379 b_elt_size |= i - permute_size;
6380 else
6381 a_elt_size |= i;
6383 a_elt_size &= -a_elt_size;
6384 b_elt_size &= -b_elt_size;
6386 /* Now construct the vectors themselves. */
6387 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6388 builder.nelts_per_pattern ());
6389 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6390 builder.nelts_per_pattern ());
6391 unsigned int nelts = builder.encoded_nelts ();
6392 for (unsigned int i = 0; i < nelts; ++i)
6393 if (i & (elt_size - 1))
6395 a_builder.quick_push (const0_rtx);
6396 b_builder.quick_push (const0_rtx);
6398 else if ((i & permute_size) == 0)
6400 /* The A and B elements are significant. */
6401 a_builder.quick_push (builder.elt (i));
6402 b_builder.quick_push (builder.elt (i + permute_size));
6404 else
6406 /* The A and B elements are going to be discarded, so pick whatever
6407 is likely to give a nice constant. We are targeting element
6408 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6409 with the aim of each being a sequence of ones followed by
6410 a sequence of zeros. So:
6412 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6413 duplicate the last X_ELT_SIZE element, to extend the
6414 current sequence of ones or zeros.
6416 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6417 zero, so that the constant really does have X_ELT_SIZE and
6418 not a smaller size. */
6419 if (a_elt_size > permute_size)
6420 a_builder.quick_push (const0_rtx);
6421 else
6422 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6423 if (b_elt_size > permute_size)
6424 b_builder.quick_push (const0_rtx);
6425 else
6426 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6428 a_builder.finalize ();
6429 b_builder.finalize ();
6431 /* Try loading A into a register. */
6432 rtx_insn *last = get_last_insn ();
6433 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6434 if (!a)
6435 return NULL_RTX;
6437 /* Try loading B into a register. */
6438 rtx b = a;
6439 if (a_builder != b_builder)
6441 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6442 if (!b)
6444 delete_insns_since (last);
6445 return NULL_RTX;
6449 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6450 operands but permutes them as though they had mode MODE. */
6451 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6452 target = aarch64_target_reg (target, GET_MODE (a));
6453 rtx type_reg = CONST0_RTX (mode);
6454 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6455 return target;
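/* Editor's sketch, not part of the GCC sources: the TRN1 permutation used
   above, modelled on arrays of PERMUTE_SIZE-wide groups with a
   hypothetical helper name.  Even-numbered result groups come from A and
   odd-numbered result groups from the corresponding even group of B.  */
static void
sketch_trn1_groups (const int *a, const int *b, int *out,
		    unsigned int ngroups, unsigned int group_size)
{
  for (unsigned int g = 0; g + 1 < ngroups; g += 2)
    for (unsigned int j = 0; j < group_size; ++j)
      {
	out[g * group_size + j] = a[g * group_size + j];
	out[(g + 1) * group_size + j] = b[g * group_size + j];
      }
  /* With GROUP_SIZE == 4 and the A/B example in the comment above, the
     result is { 0, 1, 2, ..., 15 }, i.e. the original constant.  */
}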
6458 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6459 constant in BUILDER into an SVE predicate register. Return the register
6460 on success, otherwise return null. Use TARGET for the register if
6461 nonnull and convenient.
6463 ALLOW_RECURSE_P is true if we can use methods that would call this
6464 function recursively. */
6466 static rtx
6467 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6468 bool allow_recurse_p)
6470 if (builder.encoded_nelts () == 1)
6471 /* A PFALSE or a PTRUE .B ALL. */
6472 return aarch64_emit_set_immediate (target, builder);
6474 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6475 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6477 /* If we can load the constant using PTRUE, use it as-is. */
6478 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6479 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6480 return aarch64_emit_set_immediate (target, builder);
6482 /* Otherwise use WHILE to set the first VL bits. */
6483 return aarch64_sve_move_pred_via_while (target, mode, vl);
6486 if (!allow_recurse_p)
6487 return NULL_RTX;
6489 /* Try inverting the vector in element size ELT_SIZE and then EORing
6490 the result with an ELT_SIZE PTRUE. */
6491 if (INTVAL (builder.elt (0)) == 0)
6492 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6493 elt_size))
6494 return res;
6496 /* Try using TRN1 to permute two simpler constants. */
6497 for (unsigned int i = elt_size; i <= 8; i *= 2)
6498 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6499 elt_size, i))
6500 return res;
6502 return NULL_RTX;
6505 /* Return an SVE predicate register that contains the VNx16BImode
6506 constant in BUILDER, without going through the move expanders.
6508 The returned register can have whatever mode seems most natural
6509 given the contents of BUILDER. Use TARGET for the result if
6510 convenient. */
6512 static rtx
6513 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6515 /* Try loading the constant using pure predicate operations. */
6516 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6517 return res;
6519 /* Try forcing the constant to memory. */
6520 if (builder.full_nelts ().is_constant ())
6521 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6523 target = aarch64_target_reg (target, VNx16BImode);
6524 emit_move_insn (target, mem);
6525 return target;
6528 /* The last resort is to load the constant as an integer and then
6529 compare it against zero. Use -1 for set bits in order to increase
6530 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6531 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6532 builder.nelts_per_pattern ());
6533 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6534 int_builder.quick_push (INTVAL (builder.elt (i))
6535 ? constm1_rtx : const0_rtx);
6536 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6537 int_builder.build ());
6540 /* Set DEST to immediate IMM. */
6542 void
6543 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6545 machine_mode mode = GET_MODE (dest);
6547 /* Check on what type of symbol it is. */
6548 scalar_int_mode int_mode;
6549 if ((SYMBOL_REF_P (imm)
6550 || LABEL_REF_P (imm)
6551 || GET_CODE (imm) == CONST
6552 || GET_CODE (imm) == CONST_POLY_INT)
6553 && is_a <scalar_int_mode> (mode, &int_mode))
6555 rtx mem;
6556 poly_int64 offset;
6557 HOST_WIDE_INT const_offset;
6558 enum aarch64_symbol_type sty;
6560 /* If we have (const (plus symbol offset)), separate out the offset
6561 before we start classifying the symbol. */
6562 rtx base = strip_offset (imm, &offset);
6564 /* We must always add an offset involving VL separately, rather than
6565 folding it into the relocation. */
6566 if (!offset.is_constant (&const_offset))
6568 if (!TARGET_SVE)
6570 aarch64_report_sve_required ();
6571 return;
6573 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6574 emit_insn (gen_rtx_SET (dest, imm));
6575 else
6577 /* Do arithmetic on 32-bit values if the result is smaller
6578 than that. */
6579 if (partial_subreg_p (int_mode, SImode))
6581 /* It is invalid to do symbol calculations in modes
6582 narrower than SImode. */
6583 gcc_assert (base == const0_rtx);
6584 dest = gen_lowpart (SImode, dest);
6585 int_mode = SImode;
6587 if (base != const0_rtx)
6589 base = aarch64_force_temporary (int_mode, dest, base);
6590 aarch64_add_offset (int_mode, dest, base, offset,
6591 NULL_RTX, NULL_RTX, false);
6593 else
6594 aarch64_add_offset (int_mode, dest, base, offset,
6595 dest, NULL_RTX, false);
6597 return;
6600 sty = aarch64_classify_symbol (base, const_offset);
6601 switch (sty)
6603 case SYMBOL_FORCE_TO_MEM:
6604 if (int_mode != ptr_mode)
6605 imm = convert_memory_address (ptr_mode, imm);
6607 if (const_offset != 0
6608 && targetm.cannot_force_const_mem (ptr_mode, imm))
6610 gcc_assert (can_create_pseudo_p ());
6611 base = aarch64_force_temporary (int_mode, dest, base);
6612 aarch64_add_offset (int_mode, dest, base, const_offset,
6613 NULL_RTX, NULL_RTX, false);
6614 return;
6617 mem = force_const_mem (ptr_mode, imm);
6618 gcc_assert (mem);
6620 /* If we aren't generating PC relative literals, then
6621 we need to expand the literal pool access carefully.
6622 This is something that needs to be done in a number
6623 of places, so could well live as a separate function. */
6624 if (!aarch64_pcrelative_literal_loads)
6626 gcc_assert (can_create_pseudo_p ());
6627 base = gen_reg_rtx (ptr_mode);
6628 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6629 if (ptr_mode != Pmode)
6630 base = convert_memory_address (Pmode, base);
6631 mem = gen_rtx_MEM (ptr_mode, base);
6634 if (int_mode != ptr_mode)
6635 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6637 emit_insn (gen_rtx_SET (dest, mem));
6639 return;
6641 case SYMBOL_SMALL_TLSGD:
6642 case SYMBOL_SMALL_TLSDESC:
6643 case SYMBOL_SMALL_TLSIE:
6644 case SYMBOL_SMALL_GOT_28K:
6645 case SYMBOL_SMALL_GOT_4G:
6646 case SYMBOL_TINY_GOT:
6647 case SYMBOL_TINY_TLSIE:
6648 if (const_offset != 0)
6650 gcc_assert (can_create_pseudo_p ());
6651 base = aarch64_force_temporary (int_mode, dest, base);
6652 aarch64_add_offset (int_mode, dest, base, const_offset,
6653 NULL_RTX, NULL_RTX, false);
6654 return;
6656 /* FALLTHRU */
6658 case SYMBOL_SMALL_ABSOLUTE:
6659 case SYMBOL_TINY_ABSOLUTE:
6660 case SYMBOL_TLSLE12:
6661 case SYMBOL_TLSLE24:
6662 case SYMBOL_TLSLE32:
6663 case SYMBOL_TLSLE48:
6664 aarch64_load_symref_appropriately (dest, imm, sty);
6665 return;
6667 default:
6668 gcc_unreachable ();
6672 if (!CONST_INT_P (imm))
6674 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6676 /* Only the low bit of each .H, .S and .D element is defined,
6677 so we can set the upper bits to whatever we like. If the
6678 predicate is all-true in MODE, prefer to set all the undefined
6679 bits as well, so that we can share a single .B predicate for
6680 all modes. */
6681 if (imm == CONSTM1_RTX (mode))
6682 imm = CONSTM1_RTX (VNx16BImode);
6684 /* All methods for constructing predicate modes wider than VNx16BI
6685 will set the upper bits of each element to zero. Expose this
6686 by moving such constants as a VNx16BI, so that all bits are
6687 significant and so that constants for different modes can be
6688 shared. The wider constant will still be available as a
6689 REG_EQUAL note. */
6690 rtx_vector_builder builder;
6691 if (aarch64_get_sve_pred_bits (builder, imm))
6693 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6694 if (dest != res)
6695 emit_move_insn (dest, gen_lowpart (mode, res));
6696 return;
6700 if (GET_CODE (imm) == HIGH
6701 || aarch64_simd_valid_immediate (imm, NULL))
6703 emit_insn (gen_rtx_SET (dest, imm));
6704 return;
6707 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6708 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6710 if (dest != res)
6711 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6712 return;
6715 rtx mem = force_const_mem (mode, imm);
6716 gcc_assert (mem);
6717 emit_move_insn (dest, mem);
6718 return;
6721 aarch64_internal_mov_immediate (dest, imm, true,
6722 as_a <scalar_int_mode> (mode));
6725 /* Return the MEM rtx that provides the canary value that should be used
6726 for stack-smashing protection. MODE is the mode of the memory.
6727 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6728 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6729 indicates whether the caller is performing a SET or a TEST operation. */
6732 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6733 aarch64_salt_type salt_type)
6735 rtx addr;
6736 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6738 gcc_assert (MEM_P (decl_rtl));
6739 addr = XEXP (decl_rtl, 0);
6740 poly_int64 offset;
6741 rtx base = strip_offset_and_salt (addr, &offset);
6742 if (!SYMBOL_REF_P (base))
6743 return decl_rtl;
6745 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6746 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6747 addr = gen_rtx_CONST (Pmode, addr);
6748 addr = plus_constant (Pmode, addr, offset);
6750 else
6752 /* Calculate the address from the system register. */
6753 rtx salt = GEN_INT (salt_type);
6754 addr = gen_reg_rtx (mode);
6755 if (mode == DImode)
6756 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6757 else
6759 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6760 addr = convert_memory_address (Pmode, addr);
6762 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6764 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
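/* Editor's worked example, not part of the GCC sources: with, e.g.,
   -mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0
   -mstack-protector-guard-offset=8, the SSP_GLOBAL path above is not
   used; the address is instead formed by reading SP_EL0 through the
   reg_stack_protect_address pattern and adding the offset of 8, so the
   canary is loaded from [SP_EL0 + 8] rather than from
   __stack_chk_guard.  */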
6767 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6768 that is known to contain PTRUE. */
6770 void
6771 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6773 expand_operand ops[3];
6774 machine_mode mode = GET_MODE (dest);
6775 create_output_operand (&ops[0], dest, mode);
6776 create_input_operand (&ops[1], pred, GET_MODE (pred));
6777 create_input_operand (&ops[2], src, mode);
6778 temporary_volatile_ok v (true);
6779 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6782 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6783 operand is in memory. In this case we need to use the predicated LD1
6784 and ST1 instead of LDR and STR, both for correctness on big-endian
6785 targets and because LD1 and ST1 support a wider range of addressing modes.
6786 PRED_MODE is the mode of the predicate.
6788 See the comment at the head of aarch64-sve.md for details about the
6789 big-endian handling. */
6791 void
6792 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6794 machine_mode mode = GET_MODE (dest);
6795 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6796 if (!register_operand (src, mode)
6797 && !register_operand (dest, mode))
6799 rtx tmp = gen_reg_rtx (mode);
6800 if (MEM_P (src))
6801 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6802 else
6803 emit_move_insn (tmp, src);
6804 src = tmp;
6806 aarch64_emit_sve_pred_move (dest, ptrue, src);
6809 /* Called only on big-endian targets. See whether an SVE vector move
6810 from SRC to DEST is effectively a REV[BHW] instruction, because at
6811 least one operand is a subreg of an SVE vector that has wider or
6812 narrower elements. Return true and emit the instruction if so.
6814 For example:
6816 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6818 represents a VIEW_CONVERT between the following vectors, viewed
6819 in memory order:
6821 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6822 R1: { [0], [1], [2], [3], ... }
6824 The high part of lane X in R2 should therefore correspond to lane X*2
6825 of R1, but the register representations are:
6827 msb lsb
6828 R2: ...... [1].high [1].low [0].high [0].low
6829 R1: ...... [3] [2] [1] [0]
6831 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6832 We therefore need a reverse operation to swap the high and low values
6833 around.
6835 This is purely an optimization. Without it we would spill the
6836 subreg operand to the stack in one mode and reload it in the
6837 other mode, which has the same effect as the REV. */
6839 bool
6840 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6842 gcc_assert (BYTES_BIG_ENDIAN);
6844 /* Do not try to optimize subregs that LRA has created for matched
6845 reloads. These subregs only exist as a temporary measure to make
6846 the RTL well-formed, but they are exempt from the usual
6847 TARGET_CAN_CHANGE_MODE_CLASS rules.
6849 For example, if we have:
6851 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6853 and the constraints require R1 and R2 to be in the same register,
6854 LRA may need to create RTL such as:
6856 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6857 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6858 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6860 which forces both the input and output of the original instruction
6861 to use the same hard register. But for this to work, the normal
6862 rules have to be suppressed on the subreg input, otherwise LRA
6863 would need to reload that input too, meaning that the process
6864 would never terminate. To compensate for this, the normal rules
6865 are also suppressed for the subreg output of the first move.
6866 Ignoring the special case and handling the first move normally
6867 would therefore generate wrong code: we would reverse the elements
6868 for the first subreg but not reverse them back for the second subreg. */
6869 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6870 dest = SUBREG_REG (dest);
6871 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6872 src = SUBREG_REG (src);
6874 /* The optimization handles two single SVE REGs with different element
6875 sizes. */
6876 if (!REG_P (dest)
6877 || !REG_P (src)
6878 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6879 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6880 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6881 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6882 return false;
6884 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6885 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6886 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6887 UNSPEC_REV_SUBREG);
6888 emit_insn (gen_rtx_SET (dest, unspec));
6889 return true;
6892 /* Return a copy of X with mode MODE, without changing its other
6893 attributes. Unlike gen_lowpart, this doesn't care whether the
6894 mode change is valid. */
6897 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6899 if (GET_MODE (x) == mode)
6900 return x;
6902 x = shallow_copy_rtx (x);
6903 set_mode_and_regno (x, mode, REGNO (x));
6904 return x;
6907 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6908 stored in wider integer containers. */
6910 static unsigned int
6911 aarch64_sve_rev_unspec (machine_mode mode)
6913 switch (GET_MODE_UNIT_SIZE (mode))
6915 case 1: return UNSPEC_REVB;
6916 case 2: return UNSPEC_REVH;
6917 case 4: return UNSPEC_REVW;
6919 gcc_unreachable ();
6922 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6923 operands. */
6925 void
6926 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6928 /* Decide which REV operation we need. The mode with wider elements
6929 determines the mode of the operands and the mode with the narrower
6930 elements determines the reverse width. */
6931 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6932 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6933 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6934 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6935 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6937 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6938 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6940 /* Get the operands in the appropriate modes and emit the instruction. */
6941 ptrue = gen_lowpart (pred_mode, ptrue);
6942 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6943 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6944 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6945 dest, ptrue, src));
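/* Editor's worked example, not part of the GCC sources: for a move
   between a VNx8HI register and a VNx16QI subreg of it, the mode with
   the wider elements is VNx8HI and the narrower unit size is 1 byte, so
   the UNSPEC chosen above is REVB and the emitted instruction is roughly
   "revb z0.h, p0/m, z1.h", i.e. the two bytes inside every halfword
   container are swapped, as described before
   aarch64_maybe_expand_sve_subreg_move.  */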
6948 static bool
6949 aarch64_function_ok_for_sibcall (tree, tree exp)
6951 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6952 return false;
6954 return true;
6957 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6958 passed in SVE registers. */
6960 static bool
6961 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6962 const function_arg_info &arg)
6964 HOST_WIDE_INT size;
6965 machine_mode dummymode;
6966 int nregs;
6968 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6969 if (arg.mode == BLKmode && arg.type)
6970 size = int_size_in_bytes (arg.type);
6971 else
6972 /* No frontends can create types with variable-sized modes, so we
6973 shouldn't be asked to pass or return them. */
6974 size = GET_MODE_SIZE (arg.mode).to_constant ();
6976 /* Aggregates are passed by reference based on their size. */
6977 if (arg.aggregate_type_p ())
6978 size = int_size_in_bytes (arg.type);
6980 /* Variable sized arguments are always returned by reference. */
6981 if (size < 0)
6982 return true;
6984 /* Can this be a candidate to be passed in fp/simd register(s)? */
6985 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6986 &dummymode, &nregs, NULL,
6987 !pcum || pcum->silent_p))
6988 return false;
6990 /* Arguments which are variable sized or larger than 2 registers are
6991 passed by reference unless they are a homogeneous floating-point
6992 aggregate. */
6993 return size > 2 * UNITS_PER_WORD;
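/* Editor's illustrative example, not part of the GCC sources: under the
   rules above, a hypothetical

       struct big { long long a, b, c; };	// 24 bytes, not an HFA

   is passed by reference because it is larger than two 8-byte GPRs,
   whereas

       struct hfa { double a, b, c, d; };	// 32 bytes, HFA

   is a Homogeneous Floating-point Aggregate and is passed in four
   FP/SIMD registers, which is why the candidate check above comes
   before the size check.  */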
6996 /* Implement TARGET_PASS_BY_REFERENCE. */
6998 static bool
6999 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7000 const function_arg_info &arg)
7002 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7004 if (!arg.type)
7005 return aarch64_pass_by_reference_1 (pcum, arg);
7007 pure_scalable_type_info pst_info;
7008 switch (pst_info.analyze (arg.type))
7010 case pure_scalable_type_info::IS_PST:
7011 if (pcum && !pcum->silent_p && !TARGET_SVE)
7012 /* We can't gracefully recover at this point, so make this a
7013 fatal error. */
7014 fatal_error (input_location, "arguments of type %qT require"
7015 " the SVE ISA extension", arg.type);
7017 /* Variadic SVE types are passed by reference. Normal non-variadic
7018 arguments are too if we've run out of registers. */
7019 return (!arg.named
7020 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7021 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7023 case pure_scalable_type_info::DOESNT_MATTER:
7024 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7025 return true;
7027 case pure_scalable_type_info::NO_ABI_IDENTITY:
7028 case pure_scalable_type_info::ISNT_PST:
7029 return aarch64_pass_by_reference_1 (pcum, arg);
7031 gcc_unreachable ();
7034 /* Return TRUE if VALTYPE is padded to its least significant bits. */
7035 static bool
7036 aarch64_return_in_msb (const_tree valtype)
7038 machine_mode dummy_mode;
7039 int dummy_int;
7041 /* Never happens in little-endian mode. */
7042 if (!BYTES_BIG_ENDIAN)
7043 return false;
7045 /* Only composite types smaller than or equal to 16 bytes can
7046 be potentially returned in registers. */
7047 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7048 || int_size_in_bytes (valtype) <= 0
7049 || int_size_in_bytes (valtype) > 16)
7050 return false;
7052 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7053 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7054 is always passed/returned in the least significant bits of fp/simd
7055 register(s). */
7056 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7057 &dummy_mode, &dummy_int, NULL,
7058 false))
7059 return false;
7061 /* Likewise pure scalable types for SVE vector and predicate registers. */
7062 pure_scalable_type_info pst_info;
7063 if (pst_info.analyze_registers (valtype))
7064 return false;
7066 return true;
7069 /* Implement TARGET_FUNCTION_VALUE.
7070 Define how to find the value returned by a function. */
7072 static rtx
7073 aarch64_function_value (const_tree type, const_tree func,
7074 bool outgoing ATTRIBUTE_UNUSED)
7076 machine_mode mode;
7077 int unsignedp;
7079 mode = TYPE_MODE (type);
7080 if (INTEGRAL_TYPE_P (type))
7081 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7083 pure_scalable_type_info pst_info;
7084 if (type && pst_info.analyze_registers (type))
7085 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7087 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7088 are returned in memory, not by value. */
7089 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7090 bool sve_p = (vec_flags & VEC_ANY_SVE);
7092 if (aarch64_return_in_msb (type))
7094 HOST_WIDE_INT size = int_size_in_bytes (type);
7096 if (size % UNITS_PER_WORD != 0)
7098 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7099 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7103 int count;
7104 machine_mode ag_mode;
7105 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7106 NULL, false))
7108 gcc_assert (!sve_p);
7109 if (!aarch64_composite_type_p (type, mode))
7111 gcc_assert (count == 1 && mode == ag_mode);
7112 return gen_rtx_REG (mode, V0_REGNUM);
7114 else if (aarch64_advsimd_full_struct_mode_p (mode)
7115 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7116 return gen_rtx_REG (mode, V0_REGNUM);
7117 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7118 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7119 return gen_rtx_REG (mode, V0_REGNUM);
7120 else
7122 int i;
7123 rtx par;
7125 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7126 for (i = 0; i < count; i++)
7128 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7129 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7130 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7131 XVECEXP (par, 0, i) = tmp;
7133 return par;
7136 else
7138 if (sve_p)
7140 /* Vector types can acquire a partial SVE mode using things like
7141 __attribute__((vector_size(N))), and this is potentially useful.
7142 However, the choice of mode doesn't affect the type's ABI
7143 identity, so we should treat the types as though they had
7144 the associated integer mode, just like they did before SVE
7145 was introduced.
7147 We know that the vector must be 128 bits or smaller,
7148 otherwise we'd have returned it in memory instead. */
7149 gcc_assert (type
7150 && (aarch64_some_values_include_pst_objects_p (type)
7151 || (vec_flags & VEC_PARTIAL)));
7153 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7154 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7155 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7156 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7158 return gen_rtx_REG (mode, R0_REGNUM);
7162 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7163 Return true if REGNO is the number of a hard register in which the values
7164 of called function may come back. */
7166 static bool
7167 aarch64_function_value_regno_p (const unsigned int regno)
7169 /* A maximum of 16 bytes can be returned in the general registers. Examples
7170 of 16-byte return values are: 128-bit integers and 16-byte small
7171 structures (excluding homogeneous floating-point aggregates). */
7172 if (regno == R0_REGNUM || regno == R1_REGNUM)
7173 return true;
7175 /* Up to four fp/simd registers can return a function value, e.g. a
7176 homogeneous floating-point aggregate having four members. */
7177 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7178 return TARGET_FLOAT;
7180 return false;
7183 /* Subroutine for aarch64_return_in_memory for types that are not returned
7184 in SVE registers. */
7186 static bool
7187 aarch64_return_in_memory_1 (const_tree type)
7189 HOST_WIDE_INT size;
7190 machine_mode ag_mode;
7191 int count;
7193 if (!AGGREGATE_TYPE_P (type)
7194 && TREE_CODE (type) != COMPLEX_TYPE
7195 && TREE_CODE (type) != VECTOR_TYPE)
7196 /* Simple scalar types are always returned in registers. */
7197 return false;
7199 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7200 &ag_mode, &count, NULL, false))
7201 return false;
7203 /* Types larger than 2 registers are returned in memory. */
7204 size = int_size_in_bytes (type);
7205 return (size < 0 || size > 2 * UNITS_PER_WORD);
7208 /* Implement TARGET_RETURN_IN_MEMORY.
7210 If the type T of the result of a function is such that
7211 void func (T arg)
7212 would require that arg be passed as a value in a register (or set of
7213 registers) according to the parameter passing rules, then the result
7214 is returned in the same registers as would be used for such an
7215 argument. */
7217 static bool
7218 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7220 pure_scalable_type_info pst_info;
7221 switch (pst_info.analyze (type))
7223 case pure_scalable_type_info::IS_PST:
7224 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7225 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7227 case pure_scalable_type_info::DOESNT_MATTER:
7228 gcc_assert (aarch64_return_in_memory_1 (type));
7229 return true;
7231 case pure_scalable_type_info::NO_ABI_IDENTITY:
7232 case pure_scalable_type_info::ISNT_PST:
7233 return aarch64_return_in_memory_1 (type);
7235 gcc_unreachable ();
7238 static bool
7239 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7240 const_tree type, int *nregs)
7242 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7243 return aarch64_vfp_is_call_or_return_candidate (mode, type,
7244 &pcum->aapcs_vfp_rmode,
7245 nregs, NULL, pcum->silent_p);
7248 /* Given MODE and TYPE of a function argument, return the alignment in
7249 bits. The idea is to suppress any stronger alignment requested by
7250 the user and opt for the natural alignment (specified in AAPCS64 \S
7251 4.1). ABI_BREAK is set to the (incorrect) alignment that GCC versions
7252 prior to GCC 9 would have used, or to zero otherwise. This is a helper
7253 function for local use only. */
7255 static unsigned int
7256 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7257 unsigned int *abi_break)
7259 *abi_break = 0;
7260 if (!type)
7261 return GET_MODE_ALIGNMENT (mode);
7263 if (integer_zerop (TYPE_SIZE (type)))
7264 return 0;
7266 gcc_assert (TYPE_MODE (type) == mode);
7268 if (!AGGREGATE_TYPE_P (type))
7269 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
7271 if (TREE_CODE (type) == ARRAY_TYPE)
7272 return TYPE_ALIGN (TREE_TYPE (type));
7274 unsigned int alignment = 0;
7275 unsigned int bitfield_alignment = 0;
7276 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7277 if (TREE_CODE (field) == FIELD_DECL)
7279 /* Note that we explicitly consider zero-sized fields here,
7280 even though they don't map to AAPCS64 machine types.
7281 For example, in:
7283 struct __attribute__((aligned(8))) empty {};
7285 struct s {
7286 [[no_unique_address]] empty e;
7287 int x;
7288 };
7290 "s" contains only one Fundamental Data Type (the int field)
7291 but gains 8-byte alignment and size thanks to "e". */
7292 alignment = std::max (alignment, DECL_ALIGN (field));
7293 if (DECL_BIT_FIELD_TYPE (field))
7294 bitfield_alignment
7295 = std::max (bitfield_alignment,
7296 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7299 if (bitfield_alignment > alignment)
7301 *abi_break = alignment;
7302 return bitfield_alignment;
7305 return alignment;
7308 /* Layout a function argument according to the AAPCS64 rules. The rule
7309 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7310 mode that was originally given to us by the target hook, whereas the
7311 mode in ARG might be the result of replacing partial SVE modes with
7312 the equivalent integer mode. */
7314 static void
7315 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7317 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7318 tree type = arg.type;
7319 machine_mode mode = arg.mode;
7320 int ncrn, nvrn, nregs;
7321 bool allocate_ncrn, allocate_nvrn;
7322 HOST_WIDE_INT size;
7323 unsigned int abi_break;
7325 /* We need to do this once per argument. */
7326 if (pcum->aapcs_arg_processed)
7327 return;
7329 pcum->aapcs_arg_processed = true;
7331 pure_scalable_type_info pst_info;
7332 if (type && pst_info.analyze_registers (type))
7334 /* The PCS says that it is invalid to pass an SVE value to an
7335 unprototyped function. There is no ABI-defined location we
7336 can return in this case, so we have no real choice but to raise
7337 an error immediately, even though this is only a query function. */
7338 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7340 gcc_assert (!pcum->silent_p);
7341 error ("SVE type %qT cannot be passed to an unprototyped function",
7342 arg.type);
7343 /* Avoid repeating the message, and avoid tripping the assert
7344 below. */
7345 pcum->pcs_variant = ARM_PCS_SVE;
7348 /* We would have converted the argument into pass-by-reference
7349 form if it didn't fit in registers. */
7350 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7351 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7352 gcc_assert (arg.named
7353 && pcum->pcs_variant == ARM_PCS_SVE
7354 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7355 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7356 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7357 P0_REGNUM + pcum->aapcs_nprn);
7358 return;
7361 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7362 are passed by reference, not by value. */
7363 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7364 bool sve_p = (vec_flags & VEC_ANY_SVE);
7365 if (sve_p)
7366 /* Vector types can acquire a partial SVE mode using things like
7367 __attribute__((vector_size(N))), and this is potentially useful.
7368 However, the choice of mode doesn't affect the type's ABI
7369 identity, so we should treat the types as though they had
7370 the associated integer mode, just like they did before SVE
7371 was introduced.
7373 We know that the vector must be 128 bits or smaller,
7374 otherwise we'd have passed it in memory instead. */
7375 gcc_assert (type
7376 && (aarch64_some_values_include_pst_objects_p (type)
7377 || (vec_flags & VEC_PARTIAL)));
7379 /* Size in bytes, rounded up to a multiple of 8 bytes. */
7380 if (type)
7381 size = int_size_in_bytes (type);
7382 else
7383 /* No frontends can create types with variable-sized modes, so we
7384 shouldn't be asked to pass or return them. */
7385 size = GET_MODE_SIZE (mode).to_constant ();
7386 size = ROUND_UP (size, UNITS_PER_WORD);
7388 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7389 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7390 mode,
7391 type,
7392 &nregs);
7393 gcc_assert (!sve_p || !allocate_nvrn);
7395 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
7396 The following code thus handles passing by SIMD/FP registers first. */
7398 nvrn = pcum->aapcs_nvrn;
7400 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
7401 and homogeneous short-vector aggregates (HVA). */
7402 if (allocate_nvrn)
7404 if (!pcum->silent_p && !TARGET_FLOAT)
7405 aarch64_err_no_fpadvsimd (mode);
7407 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7409 pcum->aapcs_nextnvrn = nvrn + nregs;
7410 if (!aarch64_composite_type_p (type, mode))
7412 gcc_assert (nregs == 1);
7413 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7415 else if (aarch64_advsimd_full_struct_mode_p (mode)
7416 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7417 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7418 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7419 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7420 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7421 else
7423 rtx par;
7424 int i;
7425 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7426 for (i = 0; i < nregs; i++)
7428 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7429 V0_REGNUM + nvrn + i);
7430 rtx offset = gen_int_mode
7431 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7432 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7433 XVECEXP (par, 0, i) = tmp;
7435 pcum->aapcs_reg = par;
7437 return;
7439 else
7441 /* C.3 NSRN is set to 8. */
7442 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7443 goto on_stack;
7447 ncrn = pcum->aapcs_ncrn;
7448 nregs = size / UNITS_PER_WORD;
7450 /* C.6 - C.9, though the sign and zero extension semantics are
7451 handled elsewhere. This is the case where the argument fits
7452 entirely in general registers. */
7453 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7455 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7457 /* C.8: if the argument has an alignment of 16 then the NGRN is
7458 rounded up to the next even number. */
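/* For example, an __int128 argument that would otherwise start in x1
   is instead passed in x2/x3, leaving x1 unused. */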
7459 if (nregs == 2
7460 && ncrn % 2
7461 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7462 comparison is there because for > 16 * BITS_PER_UNIT
7463 alignment nregs should be > 2 and therefore it should be
7464 passed by reference rather than value. */
7465 && (aarch64_function_arg_alignment (mode, type, &abi_break)
7466 == 16 * BITS_PER_UNIT))
7468 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
7469 inform (input_location, "parameter passing for argument of type "
7470 "%qT changed in GCC 9.1", type);
7471 ++ncrn;
7472 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7475 /* If an argument with an SVE mode needs to be shifted up to the
7476 high part of the register, treat it as though it had an integer mode.
7477 Using the normal (parallel [...]) would suppress the shifting. */
7478 if (sve_p
7479 && BYTES_BIG_ENDIAN
7480 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7481 && aarch64_pad_reg_upward (mode, type, false))
7483 mode = int_mode_for_mode (mode).require ();
7484 sve_p = false;
7487 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7488 A reg is still generated for it, but the caller should be smart
7489 enough not to use it. */
7490 if (nregs == 0
7491 || (nregs == 1 && !sve_p)
7492 || GET_MODE_CLASS (mode) == MODE_INT)
7493 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7494 else
7496 rtx par;
7497 int i;
7499 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7500 for (i = 0; i < nregs; i++)
7502 scalar_int_mode reg_mode = word_mode;
7503 if (nregs == 1)
7504 reg_mode = int_mode_for_mode (mode).require ();
7505 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7506 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7507 GEN_INT (i * UNITS_PER_WORD));
7508 XVECEXP (par, 0, i) = tmp;
7510 pcum->aapcs_reg = par;
7513 pcum->aapcs_nextncrn = ncrn + nregs;
7514 return;
7517 /* C.11 */
7518 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7520 /* The argument is passed on the stack; record the needed number of words for
7521 this argument and align the total size if necessary. */
7522 on_stack:
7523 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7525 if (aarch64_function_arg_alignment (mode, type, &abi_break)
7526 == 16 * BITS_PER_UNIT)
7528 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7529 if (pcum->aapcs_stack_size != new_size)
7531 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
7532 inform (input_location, "parameter passing for argument of type "
7533 "%qT changed in GCC 9.1", type);
7534 pcum->aapcs_stack_size = new_size;
7537 return;
7540 /* Implement TARGET_FUNCTION_ARG. */
7542 static rtx
7543 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7545 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7546 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7547 || pcum->pcs_variant == ARM_PCS_SIMD
7548 || pcum->pcs_variant == ARM_PCS_SVE);
7550 if (arg.end_marker_p ())
7551 return gen_int_mode (pcum->pcs_variant, DImode);
7553 aarch64_layout_arg (pcum_v, arg);
7554 return pcum->aapcs_reg;
7557 void
7558 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7559 const_tree fntype,
7560 rtx libname ATTRIBUTE_UNUSED,
7561 const_tree fndecl ATTRIBUTE_UNUSED,
7562 unsigned n_named ATTRIBUTE_UNUSED,
7563 bool silent_p)
7565 pcum->aapcs_ncrn = 0;
7566 pcum->aapcs_nvrn = 0;
7567 pcum->aapcs_nprn = 0;
7568 pcum->aapcs_nextncrn = 0;
7569 pcum->aapcs_nextnvrn = 0;
7570 pcum->aapcs_nextnprn = 0;
7571 if (fntype)
7572 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7573 else
7574 pcum->pcs_variant = ARM_PCS_AAPCS64;
7575 pcum->aapcs_reg = NULL_RTX;
7576 pcum->aapcs_arg_processed = false;
7577 pcum->aapcs_stack_words = 0;
7578 pcum->aapcs_stack_size = 0;
7579 pcum->silent_p = silent_p;
7581 if (!silent_p
7582 && !TARGET_FLOAT
7583 && fntype && fntype != error_mark_node)
7585 const_tree type = TREE_TYPE (fntype);
7586 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7587 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7588 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7589 &mode, &nregs, NULL, false))
7590 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7593 if (!silent_p
7594 && !TARGET_SVE
7595 && pcum->pcs_variant == ARM_PCS_SVE)
7597 /* We can't gracefully recover at this point, so make this a
7598 fatal error. */
7599 if (fndecl)
7600 fatal_error (input_location, "%qE requires the SVE ISA extension",
7601 fndecl);
7602 else
7603 fatal_error (input_location, "calls to functions of type %qT require"
7604 " the SVE ISA extension", fntype);
7608 static void
7609 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7610 const function_arg_info &arg)
7612 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7613 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7614 || pcum->pcs_variant == ARM_PCS_SIMD
7615 || pcum->pcs_variant == ARM_PCS_SVE)
7617 aarch64_layout_arg (pcum_v, arg);
7618 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7619 != (pcum->aapcs_stack_words != 0));
7620 pcum->aapcs_arg_processed = false;
7621 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7622 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7623 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7624 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7625 pcum->aapcs_stack_words = 0;
7626 pcum->aapcs_reg = NULL_RTX;
7630 bool
7631 aarch64_function_arg_regno_p (unsigned regno)
7633 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7634 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
7637 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7638 PARM_BOUNDARY bits of alignment, but will be given anything up
7639 to STACK_BOUNDARY bits if the type requires it. This makes sure
7640 that both before and after the layout of each argument, the Next
7641 Stacked Argument Address (NSAA) will have a minimum alignment of
7642 8 bytes. */
7644 static unsigned int
7645 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7647 unsigned int abi_break;
7648 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7649 &abi_break);
7650 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7651 if (abi_break && warn_psabi)
7653 abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
7654 if (alignment != abi_break)
7655 inform (input_location, "parameter passing for argument of type "
7656 "%qT changed in GCC 9.1", type);
7659 return alignment;
7662 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7664 static fixed_size_mode
7665 aarch64_get_reg_raw_mode (int regno)
7667 if (TARGET_SVE && FP_REGNUM_P (regno))
7668 /* Don't use the SVE part of the register for __builtin_apply and
7669 __builtin_return. The SVE registers aren't used by the normal PCS,
7670 so using them there would be a waste of time. The PCS extensions
7671 for SVE types are fundamentally incompatible with the
7672 __builtin_return/__builtin_apply interface. */
7673 return as_a <fixed_size_mode> (V16QImode);
7674 return default_get_reg_raw_mode (regno);
7677 /* Implement TARGET_FUNCTION_ARG_PADDING.
7679 Small aggregate types are placed at the lowest memory address.
7681 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7683 static pad_direction
7684 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7686 /* On little-endian targets, the least significant byte of every stack
7687 argument is passed at the lowest byte address of the stack slot. */
7688 if (!BYTES_BIG_ENDIAN)
7689 return PAD_UPWARD;
7691 /* Otherwise, integral, floating-point and pointer types are padded downward:
7692 the least significant byte of a stack argument is passed at the highest
7693 byte address of the stack slot. */
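/* For example, on big-endian a char argument passed on the stack
   occupies the highest-addressed byte of its slot, whereas a small
   structure (handled below) starts at the lowest address. */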
7694 if (type
7695 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7696 || POINTER_TYPE_P (type))
7697 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7698 return PAD_DOWNWARD;
7700 /* Everything else padded upward, i.e. data in first byte of stack slot. */
7701 return PAD_UPWARD;
7704 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7706 It specifies padding for the last (and possibly the only)
7707 element of a block move between registers and memory. Assuming
7708 the block is in memory, padding upward means that the last element
7709 is padded after its most significant byte, while with downward
7710 padding the last element is padded on its least significant byte
7711 side.
7713 Small aggregates and small complex types are always padded
7714 upwards.
7716 We don't need to worry about homogeneous floating-point or
7717 short-vector aggregates; their move is not affected by the
7718 padding direction determined here. Regardless of endianness,
7719 each element of such an aggregate is put in the least
7720 significant bits of an fp/simd register.
7722 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7723 register has useful data, and return the opposite if the most
7724 significant byte does. */
7726 bool
7727 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7728 bool first ATTRIBUTE_UNUSED)
7731 /* Aside from pure scalable types, small composite types are always
7732 padded upward. */
7733 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7735 HOST_WIDE_INT size;
7736 if (type)
7737 size = int_size_in_bytes (type);
7738 else
7739 /* No frontends can create types with variable-sized modes, so we
7740 shouldn't be asked to pass or return them. */
7741 size = GET_MODE_SIZE (mode).to_constant ();
7742 if (size < 2 * UNITS_PER_WORD)
7744 pure_scalable_type_info pst_info;
7745 if (pst_info.analyze_registers (type))
7746 return false;
7747 return true;
7751 /* Otherwise, use the default padding. */
7752 return !BYTES_BIG_ENDIAN;
7755 static scalar_int_mode
7756 aarch64_libgcc_cmp_return_mode (void)
7758 return SImode;
7761 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7763 /* We use the 12-bit shifted immediate arithmetic instructions so values
7764 must be a multiple of (1 << 12), i.e. 4096. */
7765 #define ARITH_FACTOR 4096
7767 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7768 #error Cannot use simple address calculation for stack probing
7769 #endif
7771 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7772 inclusive. These are offsets from the current stack pointer. */
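/* For example, with the default PROBE_INTERVAL of 4096, a constant
   12 KiB range starting at FIRST == 0 is probed at offsets 4096, 8192
   and 12288 below the incoming stack pointer. */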
7774 static void
7775 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7777 HOST_WIDE_INT size;
7778 if (!poly_size.is_constant (&size))
7780 sorry ("stack probes for SVE frames");
7781 return;
7784 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7786 /* See the same assertion on PROBE_INTERVAL above. */
7787 gcc_assert ((first % ARITH_FACTOR) == 0);
7789 /* See if we have a constant small number of probes to generate. If so,
7790 that's the easy case. */
7791 if (size <= PROBE_INTERVAL)
7793 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7795 emit_set_insn (reg1,
7796 plus_constant (Pmode,
7797 stack_pointer_rtx, -(first + base)));
7798 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7801 /* The run-time loop is made up of 8 insns in the generic case while the
7802 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
7803 else if (size <= 4 * PROBE_INTERVAL)
7805 HOST_WIDE_INT i, rem;
7807 emit_set_insn (reg1,
7808 plus_constant (Pmode,
7809 stack_pointer_rtx,
7810 -(first + PROBE_INTERVAL)));
7811 emit_stack_probe (reg1);
7813 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7814 it exceeds SIZE. If only two probes are needed, this will not
7815 generate any code. Then probe at FIRST + SIZE. */
7816 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7818 emit_set_insn (reg1,
7819 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7820 emit_stack_probe (reg1);
7823 rem = size - (i - PROBE_INTERVAL);
7824 if (rem > 256)
7826 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7828 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7829 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7831 else
7832 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7835 /* Otherwise, do the same as above, but in a loop. Note that we must be
7836 extra careful with variables wrapping around because we might be at
7837 the very top (or the very bottom) of the address space and we have
7838 to be able to handle this case properly; in particular, we use an
7839 equality test for the loop condition. */
7840 else
7842 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7844 /* Step 1: round SIZE to the previous multiple of the interval. */
7846 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7849 /* Step 2: compute initial and final value of the loop counter. */
7851 /* TEST_ADDR = SP + FIRST. */
7852 emit_set_insn (reg1,
7853 plus_constant (Pmode, stack_pointer_rtx, -first));
7855 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7856 HOST_WIDE_INT adjustment = - (first + rounded_size);
7857 if (! aarch64_uimm12_shift (adjustment))
7859 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7860 true, Pmode);
7861 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7863 else
7864 emit_set_insn (reg2,
7865 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7867 /* Step 3: the loop
7871 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7872 probe at TEST_ADDR
7874 while (TEST_ADDR != LAST_ADDR)
7876 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7877 until it is equal to ROUNDED_SIZE. */
7879 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7882 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7883 that SIZE is equal to ROUNDED_SIZE. */
7885 if (size != rounded_size)
7887 HOST_WIDE_INT rem = size - rounded_size;
7889 if (rem > 256)
7891 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7893 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7894 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7896 else
7897 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7901 /* Make sure nothing is scheduled before we are done. */
7902 emit_insn (gen_blockage ());
7905 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7906 absolute addresses. */
7908 const char *
7909 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7911 static int labelno = 0;
7912 char loop_lab[32];
7913 rtx xops[2];
7915 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7917 /* Loop. */
7918 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7920 HOST_WIDE_INT stack_clash_probe_interval
7921 = 1 << param_stack_clash_protection_guard_size;
7923 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7924 xops[0] = reg1;
7925 HOST_WIDE_INT interval;
7926 if (flag_stack_clash_protection)
7927 interval = stack_clash_probe_interval;
7928 else
7929 interval = PROBE_INTERVAL;
7931 gcc_assert (aarch64_uimm12_shift (interval));
7932 xops[1] = GEN_INT (interval);
7934 output_asm_insn ("sub\t%0, %0, %1", xops);
7936 /* If doing stack clash protection then we probe up by the ABI specified
7937 amount. We do this because we're dropping full pages at a time in the
7938 loop. But if we're doing non-stack clash probing, probe at offset 0 from SP. */
7939 if (flag_stack_clash_protection)
7940 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7941 else
7942 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7944 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7945 by this amount for each iteration. */
7946 output_asm_insn ("str\txzr, [%0, %1]", xops);
7948 /* Test if TEST_ADDR == LAST_ADDR. */
7949 xops[1] = reg2;
7950 output_asm_insn ("cmp\t%0, %1", xops);
7952 /* Branch. */
7953 fputs ("\tb.ne\t", asm_out_file);
7954 assemble_name_raw (asm_out_file, loop_lab);
7955 fputc ('\n', asm_out_file);
7957 return "";
7960 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7961 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7962 of GUARD_SIZE. When a probe is emitted it is done at most
7963 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7964 at most MIN_PROBE_THRESHOLD. By the end of this function
7965 BASE = BASE - ADJUSTMENT. */
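/* The emitted sequence therefore has this rough shape:

     .LSTART: cmp   adjustment, #residual_probe_guard
              b.lt  .LEND
              sub   base, base, #residual_probe_guard
              str   xzr, [base]
              sub   adjustment, adjustment, #residual_probe_guard
              b     .LSTART
     .LEND:   sub   base, base, adjustment  */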
7967 const char *
7968 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7969 rtx min_probe_threshold, rtx guard_size)
7971 /* This function is not allowed to use any instruction generation function
7972 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7973 so instead emit the code you want using output_asm_insn. */
7974 gcc_assert (flag_stack_clash_protection);
7975 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7976 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7978 /* The minimum required allocation before the residual requires probing. */
7979 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7981 /* Clamp the value down to the nearest value that can be used with a cmp. */
7982 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7983 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7985 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7986 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7988 static int labelno = 0;
7989 char loop_start_lab[32];
7990 char loop_end_lab[32];
7991 rtx xops[2];
7993 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7994 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7996 /* Emit loop start label. */
7997 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7999 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8000 xops[0] = adjustment;
8001 xops[1] = probe_offset_value_rtx;
8002 output_asm_insn ("cmp\t%0, %1", xops);
8004 /* Branch to end if not enough adjustment to probe. */
8005 fputs ("\tb.lt\t", asm_out_file);
8006 assemble_name_raw (asm_out_file, loop_end_lab);
8007 fputc ('\n', asm_out_file);
8009 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8010 xops[0] = base;
8011 xops[1] = probe_offset_value_rtx;
8012 output_asm_insn ("sub\t%0, %0, %1", xops);
8014 /* Probe at BASE. */
8015 xops[1] = const0_rtx;
8016 output_asm_insn ("str\txzr, [%0, %1]", xops);
8018 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8019 xops[0] = adjustment;
8020 xops[1] = probe_offset_value_rtx;
8021 output_asm_insn ("sub\t%0, %0, %1", xops);
8023 /* Branch to start if still more bytes to allocate. */
8024 fputs ("\tb\t", asm_out_file);
8025 assemble_name_raw (asm_out_file, loop_start_lab);
8026 fputc ('\n', asm_out_file);
8028 /* No more probes needed; leave the loop. */
8029 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8031 /* BASE = BASE - ADJUSTMENT. */
8032 xops[0] = base;
8033 xops[1] = adjustment;
8034 output_asm_insn ("sub\t%0, %0, %1", xops);
8035 return "";
8038 /* Determine whether a frame chain needs to be generated. */
8039 static bool
8040 aarch64_needs_frame_chain (void)
8042 /* Force a frame chain for EH returns so the return address is at FP+8. */
8043 if (frame_pointer_needed || crtl->calls_eh_return)
8044 return true;
8046 /* A leaf function cannot have calls or write LR. */
8047 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8049 /* Don't use a frame chain in leaf functions if leaf frame pointers
8050 are disabled. */
8051 if (flag_omit_leaf_frame_pointer && is_leaf)
8052 return false;
8054 return aarch64_use_frame_pointer;
8057 /* Mark the registers that need to be saved by the callee and calculate
8058 the size of the callee-saved registers area and frame record (both FP
8059 and LR may be omitted). */
8060 static void
8061 aarch64_layout_frame (void)
8063 poly_int64 offset = 0;
8064 int regno, last_fp_reg = INVALID_REGNUM;
8065 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8066 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8067 bool frame_related_fp_reg_p = false;
8068 aarch64_frame &frame = cfun->machine->frame;
8070 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8072 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8073 the mid-end is doing. */
8074 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8076 #define SLOT_NOT_REQUIRED (-2)
8077 #define SLOT_REQUIRED (-1)
8079 frame.wb_push_candidate1 = INVALID_REGNUM;
8080 frame.wb_push_candidate2 = INVALID_REGNUM;
8081 frame.spare_pred_reg = INVALID_REGNUM;
8083 /* First mark all the registers that really need to be saved... */
8084 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8085 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8087 /* ... that includes the eh data registers (if needed)... */
8088 if (crtl->calls_eh_return)
8089 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8090 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8092 /* ... and any callee saved register that dataflow says is live. */
8093 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8094 if (df_regs_ever_live_p (regno)
8095 && !fixed_regs[regno]
8096 && (regno == R30_REGNUM
8097 || !crtl->abi->clobbers_full_reg_p (regno)))
8098 frame.reg_offset[regno] = SLOT_REQUIRED;
8100 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8101 if (df_regs_ever_live_p (regno)
8102 && !fixed_regs[regno]
8103 && !crtl->abi->clobbers_full_reg_p (regno))
8105 frame.reg_offset[regno] = SLOT_REQUIRED;
8106 last_fp_reg = regno;
8107 if (aarch64_emit_cfi_for_reg_p (regno))
8108 frame_related_fp_reg_p = true;
8111 /* Big-endian SVE frames need a spare predicate register in order
8112 to save Z8-Z15. Decide which register they should use. Prefer
8113 an unused argument register if possible, so that we don't force P4
8114 to be saved unnecessarily. */
8115 if (frame_related_fp_reg_p
8116 && crtl->abi->id () == ARM_PCS_SVE
8117 && BYTES_BIG_ENDIAN)
8119 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8120 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8121 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8122 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8123 break;
8124 gcc_assert (regno <= P7_REGNUM);
8125 frame.spare_pred_reg = regno;
8126 df_set_regs_ever_live (regno, true);
8129 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8130 if (df_regs_ever_live_p (regno)
8131 && !fixed_regs[regno]
8132 && !crtl->abi->clobbers_full_reg_p (regno))
8133 frame.reg_offset[regno] = SLOT_REQUIRED;
8135 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8136 LR counts as an implicit probe which allows us to maintain the invariant
8137 described in the comment at expand_prologue. */
8138 gcc_assert (crtl->is_leaf
8139 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8141 /* Now assign stack slots for the registers. Start with the predicate
8142 registers, since predicate LDR and STR have a relatively small
8143 offset range. These saves happen below the hard frame pointer. */
8144 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8145 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8147 frame.reg_offset[regno] = offset;
8148 offset += BYTES_PER_SVE_PRED;
8151 if (maybe_ne (offset, 0))
8153 /* If we have any vector registers to save above the predicate registers,
8154 the offset of the vector register save slots needs to be a multiple
8155 of the vector size. This lets us use the immediate forms of LDR/STR
8156 (or LD1/ST1 for big-endian).
8158 A vector register is 8 times the size of a predicate register,
8159 and we need to save a maximum of 12 predicate registers, so the
8160 first vector register will be at either #1, MUL VL or #2, MUL VL.
8162 If we don't have any vector registers to save, and we know how
8163 big the predicate save area is, we can just round it up to the
8164 next 16-byte boundary. */
8165 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8166 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8167 else
8169 if (known_le (offset, vector_save_size))
8170 offset = vector_save_size;
8171 else if (known_le (offset, vector_save_size * 2))
8172 offset = vector_save_size * 2;
8173 else
8174 gcc_unreachable ();
8178 /* If we need to save any SVE vector registers, add them next. */
8179 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8180 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8181 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8183 frame.reg_offset[regno] = offset;
8184 offset += vector_save_size;
8187 /* OFFSET is now the offset of the hard frame pointer from the bottom
8188 of the callee save area. */
8189 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8190 frame.below_hard_fp_saved_regs_size = offset;
8191 if (frame.emit_frame_chain)
8193 /* FP and LR are placed in the linkage record. */
8194 frame.reg_offset[R29_REGNUM] = offset;
8195 frame.wb_push_candidate1 = R29_REGNUM;
8196 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
8197 frame.wb_push_candidate2 = R30_REGNUM;
8198 offset += 2 * UNITS_PER_WORD;
8201 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8202 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8204 frame.reg_offset[regno] = offset;
8205 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8206 frame.wb_push_candidate1 = regno;
8207 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8208 frame.wb_push_candidate2 = regno;
8209 offset += UNITS_PER_WORD;
8212 poly_int64 max_int_offset = offset;
8213 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8214 bool has_align_gap = maybe_ne (offset, max_int_offset);
8216 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8217 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8219 /* If there is an alignment gap between integer and fp callee-saves,
8220 allocate the last fp register to it if possible. */
8221 if (regno == last_fp_reg
8222 && has_align_gap
8223 && known_eq (vector_save_size, 8)
8224 && multiple_p (offset, 16))
8226 frame.reg_offset[regno] = max_int_offset;
8227 break;
8230 frame.reg_offset[regno] = offset;
8231 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8232 frame.wb_push_candidate1 = regno;
8233 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8234 && frame.wb_push_candidate1 >= V0_REGNUM)
8235 frame.wb_push_candidate2 = regno;
8236 offset += vector_save_size;
8239 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8241 frame.saved_regs_size = offset;
8243 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
8245 poly_int64 above_outgoing_args
8246 = aligned_upper_bound (varargs_and_saved_regs_size
8247 + get_frame_size (),
8248 STACK_BOUNDARY / BITS_PER_UNIT);
8250 frame.hard_fp_offset
8251 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8253 /* Both these values are already aligned. */
8254 gcc_assert (multiple_p (crtl->outgoing_args_size,
8255 STACK_BOUNDARY / BITS_PER_UNIT));
8256 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
8258 frame.locals_offset = frame.saved_varargs_size;
8260 frame.initial_adjust = 0;
8261 frame.final_adjust = 0;
8262 frame.callee_adjust = 0;
8263 frame.sve_callee_adjust = 0;
8264 frame.callee_offset = 0;
8266 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8267 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8269 /* Shadow call stack only deals with functions where the LR is pushed
8270 onto the stack and that do not specify the "no_sanitize" attribute
8271 with the argument "shadow-call-stack". */
8272 frame.is_scs_enabled
8273 = (!crtl->calls_eh_return
8274 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8275 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8277 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8278 restore x30, and we don't need to pop x30 again in the traditional
8279 way. Pop candidates record the registers that need to be popped
8280 eventually. */
8281 if (frame.is_scs_enabled)
8283 if (frame.wb_pop_candidate2 == R30_REGNUM)
8284 frame.wb_pop_candidate2 = INVALID_REGNUM;
8285 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8286 frame.wb_pop_candidate1 = INVALID_REGNUM;
8289 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8290 256 to ensure that the offset meets the requirements of emit_move_insn.
8291 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8292 max_push_offset to 0, because no registers are popped at this time,
8293 so callee_adjust cannot be adjusted. */
8294 HOST_WIDE_INT max_push_offset = 0;
8295 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8296 max_push_offset = 512;
8297 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8298 max_push_offset = 256;
8300 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
8301 HOST_WIDE_INT const_saved_regs_size;
8302 if (frame.frame_size.is_constant (&const_size)
8303 && const_size < max_push_offset
8304 && known_eq (frame.hard_fp_offset, const_size))
8306 /* Simple, small frame with no outgoing arguments:
8308 stp reg1, reg2, [sp, -frame_size]!
8309 stp reg3, reg4, [sp, 16] */
8310 frame.callee_adjust = const_size;
8312 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
8313 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8314 && const_outgoing_args_size + const_saved_regs_size < 512
8315 /* We could handle this case even with outgoing args, provided
8316 that the number of args left us with valid offsets for all
8317 predicate and vector save slots. It's such a rare case that
8318 it hardly seems worth the effort though. */
8319 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
8320 && !(cfun->calls_alloca
8321 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8322 && const_fp_offset < max_push_offset))
8324 /* Frame with small outgoing arguments:
8326 sub sp, sp, frame_size
8327 stp reg1, reg2, [sp, outgoing_args_size]
8328 stp reg3, reg4, [sp, outgoing_args_size + 16] */
8329 frame.initial_adjust = frame.frame_size;
8330 frame.callee_offset = const_outgoing_args_size;
8332 else if (saves_below_hard_fp_p
8333 && known_eq (frame.saved_regs_size,
8334 frame.below_hard_fp_saved_regs_size))
8336 /* Frame in which all saves are SVE saves:
8338 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8339 save SVE registers relative to SP
8340 sub sp, sp, outgoing_args_size */
8341 frame.initial_adjust = (frame.hard_fp_offset
8342 + frame.below_hard_fp_saved_regs_size);
8343 frame.final_adjust = crtl->outgoing_args_size;
8345 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
8346 && const_fp_offset < max_push_offset)
8348 /* Frame with large outgoing arguments or SVE saves, but with
8349 a small local area:
8351 stp reg1, reg2, [sp, -hard_fp_offset]!
8352 stp reg3, reg4, [sp, 16]
8353 [sub sp, sp, below_hard_fp_saved_regs_size]
8354 [save SVE registers relative to SP]
8355 sub sp, sp, outgoing_args_size */
8356 frame.callee_adjust = const_fp_offset;
8357 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8358 frame.final_adjust = crtl->outgoing_args_size;
8360 else
8362 /* Frame with large local area and outgoing arguments or SVE saves,
8363 using frame pointer:
8365 sub sp, sp, hard_fp_offset
8366 stp x29, x30, [sp, 0]
8367 add x29, sp, 0
8368 stp reg3, reg4, [sp, 16]
8369 [sub sp, sp, below_hard_fp_saved_regs_size]
8370 [save SVE registers relative to SP]
8371 sub sp, sp, outgoing_args_size */
8372 frame.initial_adjust = frame.hard_fp_offset;
8373 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8374 frame.final_adjust = crtl->outgoing_args_size;
8377 /* Make sure the individual adjustments add up to the full frame size. */
8378 gcc_assert (known_eq (frame.initial_adjust
8379 + frame.callee_adjust
8380 + frame.sve_callee_adjust
8381 + frame.final_adjust, frame.frame_size));
8383 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8385 /* We've decided not to associate any register saves with the initial
8386 stack allocation. */
8387 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8388 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8391 frame.laid_out = true;
8394 /* Return true if the register REGNO is saved on entry to
8395 the current function. */
8397 static bool
8398 aarch64_register_saved_on_entry (int regno)
8400 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8403 /* Return the next register, from REGNO up to LIMIT, that the callee
8404 needs to save. */
8406 static unsigned
8407 aarch64_next_callee_save (unsigned regno, unsigned limit)
8409 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8410 regno++;
8411 return regno;
8414 /* Push the register number REGNO of mode MODE to the stack with write-back
8415 adjusting the stack by ADJUSTMENT. */
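/* For example, for DImode, register x30 and ADJUSTMENT == 16 this emits
   the equivalent of "str x30, [sp, #-16]!". */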
8417 static void
8418 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8419 HOST_WIDE_INT adjustment)
8421 rtx base_rtx = stack_pointer_rtx;
8422 rtx insn, reg, mem;
8424 reg = gen_rtx_REG (mode, regno);
8425 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8426 plus_constant (Pmode, base_rtx, -adjustment));
8427 mem = gen_frame_mem (mode, mem);
8429 insn = emit_move_insn (mem, reg);
8430 RTX_FRAME_RELATED_P (insn) = 1;
8433 /* Generate and return an instruction to store the pair of registers
8434 REG and REG2 of mode MODE to location BASE with write-back adjusting
8435 the stack location BASE by ADJUSTMENT. */
8437 static rtx
8438 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8439 HOST_WIDE_INT adjustment)
8441 switch (mode)
8443 case E_DImode:
8444 return gen_storewb_pairdi_di (base, base, reg, reg2,
8445 GEN_INT (-adjustment),
8446 GEN_INT (UNITS_PER_WORD - adjustment));
8447 case E_DFmode:
8448 return gen_storewb_pairdf_di (base, base, reg, reg2,
8449 GEN_INT (-adjustment),
8450 GEN_INT (UNITS_PER_WORD - adjustment));
8451 case E_TFmode:
8452 return gen_storewb_pairtf_di (base, base, reg, reg2,
8453 GEN_INT (-adjustment),
8454 GEN_INT (UNITS_PER_VREG - adjustment));
8455 default:
8456 gcc_unreachable ();
8460 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8461 stack pointer by ADJUSTMENT. */
8463 static void
8464 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8466 rtx_insn *insn;
8467 machine_mode mode = aarch64_reg_save_mode (regno1);
8469 if (regno2 == INVALID_REGNUM)
8470 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8472 rtx reg1 = gen_rtx_REG (mode, regno1);
8473 rtx reg2 = gen_rtx_REG (mode, regno2);
8475 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8476 reg2, adjustment));
8477 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8478 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8479 RTX_FRAME_RELATED_P (insn) = 1;
8482 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
8483 adjusting it by ADJUSTMENT afterwards. */
8485 static rtx
8486 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8487 HOST_WIDE_INT adjustment)
8489 switch (mode)
8491 case E_DImode:
8492 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8493 GEN_INT (UNITS_PER_WORD));
8494 case E_DFmode:
8495 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8496 GEN_INT (UNITS_PER_WORD));
8497 case E_TFmode:
8498 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8499 GEN_INT (UNITS_PER_VREG));
8500 default:
8501 gcc_unreachable ();
8505 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8506 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8507 into CFI_OPS. */
8509 static void
8510 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8511 rtx *cfi_ops)
8513 machine_mode mode = aarch64_reg_save_mode (regno1);
8514 rtx reg1 = gen_rtx_REG (mode, regno1);
8516 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8518 if (regno2 == INVALID_REGNUM)
8520 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8521 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8522 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8524 else
8526 rtx reg2 = gen_rtx_REG (mode, regno2);
8527 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8528 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8529 reg2, adjustment));
8533 /* Generate and return a store pair instruction of mode MODE to store
8534 register REG1 to MEM1 and register REG2 to MEM2. */
8536 static rtx
8537 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8538 rtx reg2)
8540 switch (mode)
8542 case E_DImode:
8543 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8545 case E_DFmode:
8546 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8548 case E_TFmode:
8549 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8551 case E_V4SImode:
8552 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8554 case E_V16QImode:
8555 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8557 default:
8558 gcc_unreachable ();
8562 /* Generate and return a load pair instruction of mode MODE to load register
8563 REG1 from MEM1 and register REG2 from MEM2. */
8565 static rtx
8566 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8567 rtx mem2)
8569 switch (mode)
8571 case E_DImode:
8572 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8574 case E_DFmode:
8575 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8577 case E_TFmode:
8578 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8580 case E_V4SImode:
8581 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8583 default:
8584 gcc_unreachable ();
8588 /* Return TRUE if return address signing should be enabled for the current
8589 function, otherwise return FALSE. */
8591 bool
8592 aarch64_return_address_signing_enabled (void)
8594 /* This function should only be called after the frame is laid out. */
8595 gcc_assert (cfun->machine->frame.laid_out);
8597 /* Turn return address signing off in any function that uses
8598 __builtin_eh_return. The address passed to __builtin_eh_return
8599 is not signed so either it has to be signed (with original sp)
8600 or the code path that uses it has to avoid authenticating it.
8601 Currently eh return introduces a return-to-anywhere gadget, no
8602 matter what we do here, since it uses ret with a user-provided
8603 address. An ideal fix for that is to use an indirect branch, which
8604 can be protected with BTI j (to some extent). */
8605 if (crtl->calls_eh_return)
8606 return false;
8608 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8609 if its LR is pushed onto the stack. */
8610 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
8611 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
8612 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8615 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8616 bool
8617 aarch64_bti_enabled (void)
8619 return (aarch64_enable_bti == 1);
8622 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8623 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8624 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8626 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8627 or LD1D address
8629 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
8630 if the variable isn't already nonnull
8632 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8633 Handle this case using a temporary base register that is suitable for
8634 all offsets in that range. Use ANCHOR_REG as this base register if it
8635 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
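/* For example, a save at BASE + 10 * GET_MODE_SIZE (MODE) is out of the
   [-8, 7] * GET_MODE_SIZE (MODE) range of ST1D/LD1D addressing, so an
   anchor at BASE + 16 * GET_MODE_SIZE (MODE) is used and the save becomes
   an access at anchor - 6 * GET_MODE_SIZE (MODE). */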
8637 static inline void
8638 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8639 rtx &anchor_reg, poly_int64 &offset,
8640 rtx &ptrue)
8642 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8644 /* This is the maximum valid offset of the anchor from the base.
8645 Lower values would be valid too. */
8646 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8647 if (!anchor_reg)
8649 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8650 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8651 gen_int_mode (anchor_offset, Pmode)));
8653 base_rtx = anchor_reg;
8654 offset -= anchor_offset;
8656 if (!ptrue)
8658 int pred_reg = cfun->machine->frame.spare_pred_reg;
8659 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8660 CONSTM1_RTX (VNx16BImode));
8661 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
8665 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8666 is saved at BASE + OFFSET. */
8668 static void
8669 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8670 rtx base, poly_int64 offset)
8672 rtx mem = gen_frame_mem (GET_MODE (reg),
8673 plus_constant (Pmode, base, offset));
8674 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8677 /* Emit code to save the callee-saved registers from register number START
8678 to LIMIT to the stack at the location starting at offset START_OFFSET,
8679 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
8680 is true if the hard frame pointer has been set up. */
8682 static void
8683 aarch64_save_callee_saves (poly_int64 start_offset,
8684 unsigned start, unsigned limit, bool skip_wb,
8685 bool hard_fp_valid_p)
8687 rtx_insn *insn;
8688 unsigned regno;
8689 unsigned regno2;
8690 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8692 for (regno = aarch64_next_callee_save (start, limit);
8693 regno <= limit;
8694 regno = aarch64_next_callee_save (regno + 1, limit))
8696 rtx reg, mem;
8697 poly_int64 offset;
8698 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8700 if (skip_wb
8701 && (regno == cfun->machine->frame.wb_push_candidate1
8702 || regno == cfun->machine->frame.wb_push_candidate2))
8703 continue;
8705 if (cfun->machine->reg_is_wrapped_separately[regno])
8706 continue;
8708 machine_mode mode = aarch64_reg_save_mode (regno);
8709 reg = gen_rtx_REG (mode, regno);
8710 offset = start_offset + cfun->machine->frame.reg_offset[regno];
8711 rtx base_rtx = stack_pointer_rtx;
8712 poly_int64 sp_offset = offset;
8714 HOST_WIDE_INT const_offset;
8715 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8716 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8717 offset, ptrue);
8718 else if (GP_REGNUM_P (regno)
8719 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8721 gcc_assert (known_eq (start_offset, 0));
8722 poly_int64 fp_offset
8723 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8724 if (hard_fp_valid_p)
8725 base_rtx = hard_frame_pointer_rtx;
8726 else
8728 if (!anchor_reg)
8730 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8731 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8732 gen_int_mode (fp_offset, Pmode)));
8734 base_rtx = anchor_reg;
8736 offset -= fp_offset;
8738 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8739 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
8741 if (!aarch64_sve_mode_p (mode)
8742 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
8743 && !cfun->machine->reg_is_wrapped_separately[regno2]
8744 && known_eq (GET_MODE_SIZE (mode),
8745 cfun->machine->frame.reg_offset[regno2]
8746 - cfun->machine->frame.reg_offset[regno]))
8748 rtx reg2 = gen_rtx_REG (mode, regno2);
8749 rtx mem2;
8751 offset += GET_MODE_SIZE (mode);
8752 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8753 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
8754 reg2));
8756 /* The first part of a frame-related parallel insn is
8757 always assumed to be relevant to the frame
8758 calculations; subsequent parts are only
8759 frame-related if explicitly marked. */
8760 if (aarch64_emit_cfi_for_reg_p (regno2))
8762 if (need_cfa_note_p)
8763 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
8764 sp_offset + GET_MODE_SIZE (mode));
8765 else
8766 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8769 regno = regno2;
8771 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8773 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
8774 need_cfa_note_p = true;
8776 else if (aarch64_sve_mode_p (mode))
8777 insn = emit_insn (gen_rtx_SET (mem, reg));
8778 else
8779 insn = emit_move_insn (mem, reg);
8781 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8782 if (frame_related_p && need_cfa_note_p)
8783 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
8787 /* Emit code to restore the callee registers from register number START
8788 up to and including LIMIT. Restore from the stack offset START_OFFSET,
8789 skipping any write-back candidates if SKIP_WB is true. Write the
8790 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
8792 static void
8793 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
8794 unsigned limit, bool skip_wb, rtx *cfi_ops)
8796 unsigned regno;
8797 unsigned regno2;
8798 poly_int64 offset;
8799 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8801 for (regno = aarch64_next_callee_save (start, limit);
8802 regno <= limit;
8803 regno = aarch64_next_callee_save (regno + 1, limit))
8805 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8806 if (cfun->machine->reg_is_wrapped_separately[regno])
8807 continue;
8809 rtx reg, mem;
8811 if (skip_wb
8812 && (regno == cfun->machine->frame.wb_pop_candidate1
8813 || regno == cfun->machine->frame.wb_pop_candidate2))
8814 continue;
8816 machine_mode mode = aarch64_reg_save_mode (regno);
8817 reg = gen_rtx_REG (mode, regno);
8818 offset = start_offset + cfun->machine->frame.reg_offset[regno];
8819 rtx base_rtx = stack_pointer_rtx;
8820 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8821 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8822 offset, ptrue);
8823 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8825 if (!aarch64_sve_mode_p (mode)
8826 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
8827 && !cfun->machine->reg_is_wrapped_separately[regno2]
8828 && known_eq (GET_MODE_SIZE (mode),
8829 cfun->machine->frame.reg_offset[regno2]
8830 - cfun->machine->frame.reg_offset[regno]))
8832 rtx reg2 = gen_rtx_REG (mode, regno2);
8833 rtx mem2;
8835 offset += GET_MODE_SIZE (mode);
8836 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8837 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8839 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8840 regno = regno2;
8842 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8843 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8844 else if (aarch64_sve_mode_p (mode))
8845 emit_insn (gen_rtx_SET (reg, mem));
8846 else
8847 emit_move_insn (reg, mem);
8848 if (frame_related_p)
8849 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8853 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8854 of MODE. */
8856 static inline bool
8857 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8859 HOST_WIDE_INT multiple;
8860 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8861 && IN_RANGE (multiple, -8, 7));
8864 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8865 of MODE. */
8867 static inline bool
8868 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8870 HOST_WIDE_INT multiple;
8871 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8872 && IN_RANGE (multiple, -32, 31));
8875 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8876 of MODE. */
8878 static inline bool
8879 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8881 HOST_WIDE_INT multiple;
8882 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8883 && IN_RANGE (multiple, 0, 63));
8886 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8887 of MODE. */
8889 bool
8890 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8892 HOST_WIDE_INT multiple;
8893 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8894 && IN_RANGE (multiple, -64, 63));
8897 /* Return true if OFFSET is a signed 9-bit value. */
8899 bool
8900 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8901 poly_int64 offset)
8903 HOST_WIDE_INT const_offset;
8904 return (offset.is_constant (&const_offset)
8905 && IN_RANGE (const_offset, -256, 255));
8908 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8909 of MODE. */
8911 static inline bool
8912 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8914 HOST_WIDE_INT multiple;
8915 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8916 && IN_RANGE (multiple, -256, 255));
8919 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8920 of MODE. */
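/* For example, for DImode this accepts byte offsets 0, 8, ..., 32760,
   matching the unsigned scaled immediate range of LDR/STR Xt, [Xn, #imm]. */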
8922 static inline bool
8923 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8925 HOST_WIDE_INT multiple;
8926 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8927 && IN_RANGE (multiple, 0, 4095));
8930 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8932 static sbitmap
8933 aarch64_get_separate_components (void)
8935 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8936 bitmap_clear (components);
8938 /* The registers we need saved to the frame. */
8939 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8940 if (aarch64_register_saved_on_entry (regno))
8942 /* Punt on saves and restores that use ST1D and LD1D. We could
8943 try to be smarter, but it would involve making sure that the
8944 spare predicate register itself is safe to use at the save
8945 and restore points. Also, when a frame pointer is being used,
8946 the slots are often out of reach of ST1D and LD1D anyway. */
8947 machine_mode mode = aarch64_reg_save_mode (regno);
8948 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8949 continue;
8951 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8953 /* If the register is saved in the first SVE save slot, we use
8954 it as a stack probe for -fstack-clash-protection. */
8955 if (flag_stack_clash_protection
8956 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
8957 && known_eq (offset, 0))
8958 continue;
8960 /* Get the offset relative to the register we'll use. */
8961 if (frame_pointer_needed)
8962 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8963 else
8964 offset += crtl->outgoing_args_size;
8966 /* Check that we can access the stack slot of the register with one
8967 direct load with no adjustments needed. */
8968 if (aarch64_sve_mode_p (mode)
8969 ? offset_9bit_signed_scaled_p (mode, offset)
8970 : offset_12bit_unsigned_scaled_p (mode, offset))
8971 bitmap_set_bit (components, regno);
8974 /* Don't mess with the hard frame pointer. */
8975 if (frame_pointer_needed)
8976 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8978 /* If the spare predicate register used by big-endian SVE code
8979 is call-preserved, it must be saved in the main prologue
8980 before any saves that use it. */
8981 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
8982 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
8984 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
8985 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
8986 /* If registers have been chosen to be stored/restored with
8987 writeback, don't interfere with them to avoid having to output explicit
8988 stack adjustment instructions. */
8989 if (reg2 != INVALID_REGNUM)
8990 bitmap_clear_bit (components, reg2);
8991 if (reg1 != INVALID_REGNUM)
8992 bitmap_clear_bit (components, reg1);
8994 bitmap_clear_bit (components, LR_REGNUM);
8995 bitmap_clear_bit (components, SP_REGNUM);
8997 return components;
9000 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9002 static sbitmap
9003 aarch64_components_for_bb (basic_block bb)
9005 bitmap in = DF_LIVE_IN (bb);
9006 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9007 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9009 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9010 bitmap_clear (components);
9012 /* Clobbered registers don't generate values in any meaningful sense,
9013 since nothing after the clobber can rely on their value. And we can't
9014 say that partially-clobbered registers are unconditionally killed,
9015 because whether they're killed or not depends on the mode of the
9016 value they're holding. Thus partially call-clobbered registers
9017 appear in neither the kill set nor the gen set.
9019 Check manually for any calls that clobber more of a register than the
9020 current function can. */
9021 function_abi_aggregator callee_abis;
9022 rtx_insn *insn;
9023 FOR_BB_INSNS (bb, insn)
9024 if (CALL_P (insn))
9025 callee_abis.note_callee_abi (insn_callee_abi (insn));
9026 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9028 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9029 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9030 if (!fixed_regs[regno]
9031 && !crtl->abi->clobbers_full_reg_p (regno)
9032 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9033 || bitmap_bit_p (in, regno)
9034 || bitmap_bit_p (gen, regno)
9035 || bitmap_bit_p (kill, regno)))
9037 bitmap_set_bit (components, regno);
9039 /* If there is a callee-save at an adjacent offset, add it too
9040 to increase the use of LDP/STP. */
9041 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9042 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9044 if (regno2 <= LAST_SAVED_REGNUM)
9046 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9047 if (regno < regno2
9048 ? known_eq (offset + 8, offset2)
9049 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9050 bitmap_set_bit (components, regno2);
9054 return components;
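/* Editorial note: illustrative sketch only, not part of the GCC sources.
   The pairing heuristic above also selects the register whose save slot is
   8 bytes away so that both slots can later be accessed with a single
   LDP/STP; the lower slot must be 16-byte aligned.  A hypothetical
   standalone check over constant offsets:  */
static int
example_can_pair_slots_p (long long offset1, long long offset2)
{
  long long lo = offset1 < offset2 ? offset1 : offset2;
  long long hi = offset1 < offset2 ? offset2 : offset1;
  return lo % 16 == 0 && hi == lo + 8;
}
/* e.g. slots at offsets 16 and 24 pair up; slots at 24 and 32 do not.  */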
9057 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9058 Nothing to do for aarch64. */
9060 static void
9061 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9065 /* Return the next set bit in BMP from START onwards. Return the total number
9066 of bits in BMP if no set bit is found at or after START. */
9068 static unsigned int
9069 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9071 unsigned int nbits = SBITMAP_SIZE (bmp);
9072 if (start == nbits)
9073 return start;
9075 gcc_assert (start < nbits);
9076 for (unsigned int i = start; i < nbits; i++)
9077 if (bitmap_bit_p (bmp, i))
9078 return i;
9080 return nbits;
9083 /* Do the work for aarch64_emit_prologue_components and
9084 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9085 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9086 for these components or the epilogue sequence. That is, it determines
9087 whether we should emit stores or loads and what kind of CFA notes to attach
9088 to the insns. Otherwise the logic for the two sequences is very
9089 similar. */
9091 static void
9092 aarch64_process_components (sbitmap components, bool prologue_p)
9094 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9095 ? HARD_FRAME_POINTER_REGNUM
9096 : STACK_POINTER_REGNUM);
9098 unsigned last_regno = SBITMAP_SIZE (components);
9099 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9100 rtx_insn *insn = NULL;
9102 while (regno != last_regno)
9104 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9105 machine_mode mode = aarch64_reg_save_mode (regno);
9107 rtx reg = gen_rtx_REG (mode, regno);
9108 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9109 if (frame_pointer_needed)
9110 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9111 else
9112 offset += crtl->outgoing_args_size;
9114 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9115 rtx mem = gen_frame_mem (mode, addr);
9117 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9118 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9119 /* No more registers to handle after REGNO.
9120 Emit a single save/restore and exit. */
9121 if (regno2 == last_regno)
9123 insn = emit_insn (set);
9124 if (frame_related_p)
9126 RTX_FRAME_RELATED_P (insn) = 1;
9127 if (prologue_p)
9128 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9129 else
9130 add_reg_note (insn, REG_CFA_RESTORE, reg);
9132 break;
9135 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9136 /* The next register is not of the same class or its offset is not
9137 mergeable with the current one into a pair. */
9138 if (aarch64_sve_mode_p (mode)
9139 || !satisfies_constraint_Ump (mem)
9140 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9141 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9142 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9143 GET_MODE_SIZE (mode)))
9145 insn = emit_insn (set);
9146 if (frame_related_p)
9148 RTX_FRAME_RELATED_P (insn) = 1;
9149 if (prologue_p)
9150 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9151 else
9152 add_reg_note (insn, REG_CFA_RESTORE, reg);
9155 regno = regno2;
9156 continue;
9159 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9161 /* REGNO2 can be saved/restored in a pair with REGNO. */
9162 rtx reg2 = gen_rtx_REG (mode, regno2);
9163 if (frame_pointer_needed)
9164 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9165 else
9166 offset2 += crtl->outgoing_args_size;
9167 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9168 rtx mem2 = gen_frame_mem (mode, addr2);
9169 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9170 : gen_rtx_SET (reg2, mem2);
9172 if (prologue_p)
9173 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9174 else
9175 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9177 if (frame_related_p || frame_related2_p)
9179 RTX_FRAME_RELATED_P (insn) = 1;
9180 if (prologue_p)
9182 if (frame_related_p)
9183 add_reg_note (insn, REG_CFA_OFFSET, set);
9184 if (frame_related2_p)
9185 add_reg_note (insn, REG_CFA_OFFSET, set2);
9187 else
9189 if (frame_related_p)
9190 add_reg_note (insn, REG_CFA_RESTORE, reg);
9191 if (frame_related2_p)
9192 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9196 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9200 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9202 static void
9203 aarch64_emit_prologue_components (sbitmap components)
9205 aarch64_process_components (components, true);
9208 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9210 static void
9211 aarch64_emit_epilogue_components (sbitmap components)
9213 aarch64_process_components (components, false);
9216 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9218 static void
9219 aarch64_set_handled_components (sbitmap components)
9221 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9222 if (bitmap_bit_p (components, regno))
9223 cfun->machine->reg_is_wrapped_separately[regno] = true;
9226 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9227 determine the probe offset for alloca. */
9229 static HOST_WIDE_INT
9230 aarch64_stack_clash_protection_alloca_probe_range (void)
9232 return STACK_CLASH_CALLER_GUARD;
9236 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9237 registers. If POLY_SIZE is not large enough to require a probe, this function
9238 will only adjust the stack. When allocating the stack space,
9239 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
9240 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9241 arguments. If we are, we ensure that any allocation larger than the ABI
9242 defined buffer needs a probe so that the invariant of having a 1KB buffer is
9243 maintained.
9245 We emit barriers after each stack adjustment to prevent optimizations from
9246 breaking the invariant that we never drop the stack more than a page. This
9247 invariant is needed to make it easier to correctly handle asynchronous
9248 events, e.g. if we were to allow the stack to be dropped by more than a page
9249 and then set up multiple probes, and a signal arrived somewhere in between,
9250 the signal handler would not know the state of the stack and could make no
9251 assumptions about which pages have been probed. */
9253 static void
9254 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9255 poly_int64 poly_size,
9256 bool frame_related_p,
9257 bool final_adjustment_p)
9259 HOST_WIDE_INT guard_size
9260 = 1 << param_stack_clash_protection_guard_size;
9261 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9262 HOST_WIDE_INT min_probe_threshold
9263 = (final_adjustment_p
9264 ? guard_used_by_caller
9265 : guard_size - guard_used_by_caller);
9266 /* When doing the final adjustment for the outgoing arguments, take into
9267 account any unprobed space there is above the current SP. There are
9268 two cases:
9270 - When saving SVE registers below the hard frame pointer, we force
9271 the lowest save to take place in the prologue before doing the final
9272 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9273 This acts as a probe at SP, so there is no unprobed space.
9275 - When there are no SVE register saves, we use the store of the link
9276 register as a probe. We can't assume that LR was saved at position 0
9277 though, so treat any space below it as unprobed. */
9278 if (final_adjustment_p
9279 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9281 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9282 if (known_ge (lr_offset, 0))
9283 min_probe_threshold -= lr_offset.to_constant ();
9284 else
9285 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9288 poly_int64 frame_size = cfun->machine->frame.frame_size;
9290 /* We should always have a positive probe threshold. */
9291 gcc_assert (min_probe_threshold > 0);
9293 if (flag_stack_clash_protection && !final_adjustment_p)
9295 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9296 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9297 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9299 if (known_eq (frame_size, 0))
9301 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9303 else if (known_lt (initial_adjust + sve_callee_adjust,
9304 guard_size - guard_used_by_caller)
9305 && known_lt (final_adjust, guard_used_by_caller))
9307 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9311 /* If SIZE is not large enough to require probing, just adjust the stack and
9312 exit. */
9313 if (known_lt (poly_size, min_probe_threshold)
9314 || !flag_stack_clash_protection)
9316 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9317 return;
9320 HOST_WIDE_INT size;
9321 /* Handle the SVE non-constant case first. */
9322 if (!poly_size.is_constant (&size))
9324 if (dump_file)
9326 fprintf (dump_file, "Stack clash SVE prologue: ");
9327 print_dec (poly_size, dump_file);
9328 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9331 /* First calculate the amount of bytes we're actually spilling. */
9332 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9333 poly_size, temp1, temp2, false, true);
9335 rtx_insn *insn = get_last_insn ();
9337 if (frame_related_p)
9339 /* This is done to provide unwinding information for the stack
9340 adjustments we're about to do, however to prevent the optimizers
9341 from removing the R11 move and leaving the CFA note (which would be
9342 very wrong) we tie the old and new stack pointer together.
9343 The tie will expand to nothing but the optimizers will not touch
9344 the instruction. */
9345 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9346 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9347 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9349 /* We want the CFA independent of the stack pointer for the
9350 duration of the loop. */
9351 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9352 RTX_FRAME_RELATED_P (insn) = 1;
9355 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9356 rtx guard_const = gen_int_mode (guard_size, Pmode);
9358 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9359 stack_pointer_rtx, temp1,
9360 probe_const, guard_const));
9362 /* Now reset the CFA register if needed. */
9363 if (frame_related_p)
9365 add_reg_note (insn, REG_CFA_DEF_CFA,
9366 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9367 gen_int_mode (poly_size, Pmode)));
9368 RTX_FRAME_RELATED_P (insn) = 1;
9371 return;
9374 if (dump_file)
9375 fprintf (dump_file,
9376 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9377 " bytes, probing will be required.\n", size);
9379 /* Round size to the nearest multiple of guard_size, and calculate the
9380 residual as the difference between the original size and the rounded
9381 size. */
9382 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9383 HOST_WIDE_INT residual = size - rounded_size;
9385 /* We can handle a small number of allocations/probes inline. Otherwise
9386 punt to a loop. */
9387 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9389 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9391 aarch64_sub_sp (NULL, temp2, guard_size, true);
9392 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9393 guard_used_by_caller));
9394 emit_insn (gen_blockage ());
9396 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9398 else
9400 /* Compute the ending address. */
9401 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9402 temp1, NULL, false, true);
9403 rtx_insn *insn = get_last_insn ();
9405 /* For the initial allocation, we don't have a frame pointer
9406 set up, so we always need CFI notes. If we're doing the
9407 final allocation, then we may have a frame pointer, in which
9408 case it is the CFA, otherwise we need CFI notes.
9410 We can determine which allocation we are doing by looking at
9411 the value of FRAME_RELATED_P since the final allocations are not
9412 frame related. */
9413 if (frame_related_p)
9415 /* We want the CFA independent of the stack pointer for the
9416 duration of the loop. */
9417 add_reg_note (insn, REG_CFA_DEF_CFA,
9418 plus_constant (Pmode, temp1, rounded_size));
9419 RTX_FRAME_RELATED_P (insn) = 1;
9422 /* This allocates and probes the stack. Note that this re-uses some of
9423 the existing Ada stack protection code. However we are guaranteed not
9424 to enter the non loop or residual branches of that code.
9426 The non-loop part won't be entered because if our allocation amount
9427 doesn't require a loop, the case above would handle it.
9429 The residual amount won't be entered because TEMP1 is a multiple of
9430 the allocation size. The residual will always be 0. As such, the only
9431 part we are actually using from that code is the loop setup. The
9432 actual probing is done in aarch64_output_probe_stack_range. */
9433 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9434 stack_pointer_rtx, temp1));
9436 /* Now reset the CFA register if needed. */
9437 if (frame_related_p)
9439 add_reg_note (insn, REG_CFA_DEF_CFA,
9440 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9441 RTX_FRAME_RELATED_P (insn) = 1;
9444 emit_insn (gen_blockage ());
9445 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9448 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9449 be probed. This maintains the requirement that each page is probed at
9450 least once. For initial probing we probe only if the allocation is
9451 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9452 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9453 GUARD_SIZE. This means that for any allocation that is large enough to
9454 trigger a probe here, we'll have at least one, and if an allocation is not
9455 large enough for this code to emit anything for it, the page will have been
9456 probed by the saving of FP/LR either by this function or any callees. If
9457 we don't have any callees then we won't have more stack adjustments and so
9458 are still safe. */
9459 if (residual)
9461 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9462 /* If we're doing final adjustments, and we've done any full page
9463 allocations then any residual needs to be probed. */
9464 if (final_adjustment_p && rounded_size != 0)
9465 min_probe_threshold = 0;
9466 /* If doing a small final adjustment, we always probe at offset 0.
9467 This is done to avoid issues when LR is not at position 0 or when
9468 the final adjustment is smaller than the probing offset. */
9469 else if (final_adjustment_p && rounded_size == 0)
9470 residual_probe_offset = 0;
9472 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9473 if (residual >= min_probe_threshold)
9475 if (dump_file)
9476 fprintf (dump_file,
9477 "Stack clash AArch64 prologue residuals: "
9478 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9479 "\n", residual);
9481 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9482 residual_probe_offset));
9483 emit_insn (gen_blockage ());
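/* Editorial note: illustrative sketch only, not part of the GCC sources.
   The probing code above splits a constant allocation into whole
   GUARD_SIZE-sized pages (each probed) plus a residual, which is probed
   separately when it is large enough.  Hypothetical arithmetic with plain
   integers:  */
static void
example_split_allocation (long long size, long long guard_size,
                          long long *rounded_size, long long *residual)
{
  *rounded_size = (size / guard_size) * guard_size;     /* ROUND_DOWN  */
  *residual = size - *rounded_size;
}
/* e.g. a 150000-byte allocation with a 64KB guard gives
   rounded_size = 131072 (two probed pages) and residual = 18928.  */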
9488 /* Return 1 if the register is used by the epilogue. We need to say the
9489 return register is used, but only after epilogue generation is complete.
9490 Note that in the case of sibcalls, the values "used by the epilogue" are
9491 considered live at the start of the called function.
9493 For SIMD functions we need to return 1 for FP registers that are saved and
9494 restored by a function but are not zero in call_used_regs. If we do not do
9495 this, optimizations may remove the restore of the register. */
9497 int
9498 aarch64_epilogue_uses (int regno)
9500 if (epilogue_completed)
9502 if (regno == LR_REGNUM)
9503 return 1;
9505 return 0;
9508 /* AArch64 stack frames generated by this compiler look like:
9510 +-------------------------------+
9512 | incoming stack arguments |
9514 +-------------------------------+
9515 | | <-- incoming stack pointer (aligned)
9516 | callee-allocated save area |
9517 | for register varargs |
9519 +-------------------------------+
9520 | local variables | <-- frame_pointer_rtx
9522 +-------------------------------+
9523 | padding | \
9524 +-------------------------------+ |
9525 | callee-saved registers | | frame.saved_regs_size
9526 +-------------------------------+ |
9527 | LR' | |
9528 +-------------------------------+ |
9529 | FP' | |
9530 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9531 | SVE vector registers | | \
9532 +-------------------------------+ | | below_hard_fp_saved_regs_size
9533 | SVE predicate registers | / /
9534 +-------------------------------+
9535 | dynamic allocation |
9536 +-------------------------------+
9537 | padding |
9538 +-------------------------------+
9539 | outgoing stack arguments | <-- arg_pointer
9541 +-------------------------------+
9542 | | <-- stack_pointer_rtx (aligned)
9544 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9545 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9546 unchanged.
9548 By default for stack-clash we assume the guard is at least 64KB, but this
9549 value is configurable to either 4KB or 64KB. We also force the guard size to
9550 be the same as the probing interval and both values are kept in sync.
9552 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9553 on the guard size) of stack space without probing.
9555 When probing is needed, we emit a probe at the start of the prologue
9556 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9558 We have to track how much space has been allocated and the only stores
9559 to the stack we track as implicit probes are the FP/LR stores.
9561 For outgoing arguments we probe if the size is larger than 1KB, such that
9562 the ABI specified buffer is maintained for the next callee.
9564 The following registers are reserved during frame layout and should not be
9565 used for any other purpose:
9567 - r11: Used by stack clash protection when SVE is enabled, and also
9568 as an anchor register when saving and restoring registers
9569 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9570 - r14 and r15: Used for speculation tracking.
9571 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9572 - r30(LR), r29(FP): Used by standard frame layout.
9574 These registers must be avoided in frame layout related code unless the
9575 explicit intention is to interact with one of the features listed above. */
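/* Editorial note: illustrative sketch only, not part of the GCC sources.
   With the default 64KB guard and the 1KB caller-reserved buffer described
   above, an initial allocation needs probing once it reaches
   guard_size - 1KB = 63KB, while an outgoing-arguments allocation needs
   probing once it reaches 1KB.  A hypothetical restatement (the 64KB guard
   is an assumption; it is configurable):  */
static int
example_allocation_needs_probe_p (long long alloc, int final_adjustment_p)
{
  const long long guard_size = 64 * 1024;      /* assumed 64KB guard  */
  const long long caller_guard = 1024;         /* ABI 1KB caller buffer  */
  long long threshold = final_adjustment_p
                        ? caller_guard
                        : guard_size - caller_guard;
  return alloc >= threshold;
}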
9577 /* Generate the prologue instructions for entry into a function.
9578 Establish the stack frame by decreasing the stack pointer with a
9579 properly calculated size and, if necessary, create a frame record
9580 filled with the values of LR and previous frame pointer. The
9581 current FP is also set up if it is in use. */
9583 void
9584 aarch64_expand_prologue (void)
9586 poly_int64 frame_size = cfun->machine->frame.frame_size;
9587 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9588 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9589 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9590 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9591 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9592 poly_int64 below_hard_fp_saved_regs_size
9593 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9594 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9595 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9596 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
9597 rtx_insn *insn;
9599 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
9601 /* Fold the SVE allocation into the initial allocation.
9602 We don't do this in aarch64_layout_frame to avoid pessimizing
9603 the epilogue code. */
9604 initial_adjust += sve_callee_adjust;
9605 sve_callee_adjust = 0;
9608 /* Sign return address for functions. */
9609 if (aarch64_return_address_signing_enabled ())
9611 switch (aarch64_ra_sign_key)
9613 case AARCH64_KEY_A:
9614 insn = emit_insn (gen_paciasp ());
9615 break;
9616 case AARCH64_KEY_B:
9617 insn = emit_insn (gen_pacibsp ());
9618 break;
9619 default:
9620 gcc_unreachable ();
9622 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9623 RTX_FRAME_RELATED_P (insn) = 1;
9626 /* Push return address to shadow call stack. */
9627 if (cfun->machine->frame.is_scs_enabled)
9628 emit_insn (gen_scs_push ());
9630 if (flag_stack_usage_info)
9631 current_function_static_stack_size = constant_lower_bound (frame_size);
9633 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9635 if (crtl->is_leaf && !cfun->calls_alloca)
9637 if (maybe_gt (frame_size, PROBE_INTERVAL)
9638 && maybe_gt (frame_size, get_stack_check_protect ()))
9639 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9640 (frame_size
9641 - get_stack_check_protect ()));
9643 else if (maybe_gt (frame_size, 0))
9644 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9647 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9648 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9650 /* In theory we should never have both an initial adjustment
9651 and a callee save adjustment. Verify that is the case since the
9652 code below does not handle it for -fstack-clash-protection. */
9653 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9655 /* Will only probe if the initial adjustment is larger than the guard
9656 less the amount of the guard reserved for use by the caller's
9657 outgoing args. */
9658 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9659 true, false);
9661 if (callee_adjust != 0)
9662 aarch64_push_regs (reg1, reg2, callee_adjust);
9664 /* The offset of the frame chain record (if any) from the current SP. */
9665 poly_int64 chain_offset = (initial_adjust + callee_adjust
9666 - cfun->machine->frame.hard_fp_offset);
9667 gcc_assert (known_ge (chain_offset, 0));
9669 /* The offset of the bottom of the save area from the current SP. */
9670 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
9672 if (emit_frame_chain)
9674 if (callee_adjust == 0)
9676 reg1 = R29_REGNUM;
9677 reg2 = R30_REGNUM;
9678 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
9679 false, false);
9681 else
9682 gcc_assert (known_eq (chain_offset, 0));
9683 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9684 stack_pointer_rtx, chain_offset,
9685 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
9686 if (frame_pointer_needed && !frame_size.is_constant ())
9688 /* Variable-sized frames need to describe the save slot
9689 address using DW_CFA_expression rather than DW_CFA_offset.
9690 This means that, without taking further action, the
9691 locations of the registers that we've already saved would
9692 remain based on the stack pointer even after we redefine
9693 the CFA based on the frame pointer. We therefore need new
9694 DW_CFA_expressions to re-express the save slots with addresses
9695 based on the frame pointer. */
9696 rtx_insn *insn = get_last_insn ();
9697 gcc_assert (RTX_FRAME_RELATED_P (insn));
9699 /* Add an explicit CFA definition if this was previously
9700 implicit. */
9701 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9703 rtx src = plus_constant (Pmode, stack_pointer_rtx,
9704 callee_offset);
9705 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9706 gen_rtx_SET (hard_frame_pointer_rtx, src));
9709 /* Change the save slot expressions for the registers that
9710 we've already saved. */
9711 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9712 hard_frame_pointer_rtx, UNITS_PER_WORD);
9713 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9714 hard_frame_pointer_rtx, 0);
9716 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
9719 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
9720 callee_adjust != 0 || emit_frame_chain,
9721 emit_frame_chain);
9722 if (maybe_ne (sve_callee_adjust, 0))
9724 gcc_assert (!flag_stack_clash_protection
9725 || known_eq (initial_adjust, 0));
9726 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9727 sve_callee_adjust,
9728 !frame_pointer_needed, false);
9729 saved_regs_offset += sve_callee_adjust;
9731 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
9732 false, emit_frame_chain);
9733 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
9734 callee_adjust != 0 || emit_frame_chain,
9735 emit_frame_chain);
9737 /* We may need to probe the final adjustment if it is larger than the guard
9738 that is assumed by the callee. */
9739 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9740 !frame_pointer_needed, true);
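/* Editorial note: illustrative sketch only, not part of the GCC sources.
   The two key offsets used while saving registers above are derived from
   the frame layout as follows (hypothetical restatement with plain
   integers):  */
static void
example_prologue_offsets (long long initial_adjust, long long callee_adjust,
                          long long hard_fp_offset,
                          long long below_hard_fp_saved_regs_size,
                          long long *chain_offset,
                          long long *saved_regs_offset)
{
  /* Offset of the frame chain record (FP/LR pair) from the current SP.  */
  *chain_offset = initial_adjust + callee_adjust - hard_fp_offset;
  /* Offset of the bottom of the save area from the current SP.  */
  *saved_regs_offset = *chain_offset - below_hard_fp_saved_regs_size;
}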
9743 /* Return TRUE if we can use a simple_return insn.
9745 This function checks whether the callee saved stack is empty, which
9746 means no restore actions are needed. The pro_and_epilogue pass will use
9747 this to check whether shrink-wrapping opt is feasible. */
9749 bool
9750 aarch64_use_return_insn_p (void)
9752 if (!reload_completed)
9753 return false;
9755 if (crtl->profile)
9756 return false;
9758 return known_eq (cfun->machine->frame.frame_size, 0);
9761 /* Generate the epilogue instructions for returning from a function.
9762 This is almost exactly the reverse of the prolog sequence, except
9763 that we need to insert barriers to avoid scheduling loads that read
9764 from a deallocated stack, and we optimize the unwind records by
9765 emitting them all together if possible. */
9766 void
9767 aarch64_expand_epilogue (bool for_sibcall)
9769 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9770 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9771 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9772 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9773 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9774 poly_int64 below_hard_fp_saved_regs_size
9775 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9776 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
9777 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
9778 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
9779 ? R29_REGNUM : R30_REGNUM);
9780 rtx cfi_ops = NULL;
9781 rtx_insn *insn;
9782 /* A stack clash protection prologue may not have left EP0_REGNUM or
9783 EP1_REGNUM in a usable state. The same is true for allocations
9784 with an SVE component, since we then need both temporary registers
9785 for each allocation. For stack clash we are in a usable state if
9786 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9787 HOST_WIDE_INT guard_size
9788 = 1 << param_stack_clash_protection_guard_size;
9789 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9791 /* We can re-use the registers when:
9793 (a) the deallocation amount is the same as the corresponding
9794 allocation amount (which is false if we combine the initial
9795 and SVE callee save allocations in the prologue); and
9797 (b) the allocation amount doesn't need a probe (which is false
9798 if the amount is guard_size - guard_used_by_caller or greater).
9800 In such situations the register should remain live with the correct
9801 value. */
9802 bool can_inherit_p = (initial_adjust.is_constant ()
9803 && final_adjust.is_constant ()
9804 && (!flag_stack_clash_protection
9805 || (known_lt (initial_adjust,
9806 guard_size - guard_used_by_caller)
9807 && known_eq (sve_callee_adjust, 0))));
9809 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
9810 bool need_barrier_p
9811 = maybe_ne (get_frame_size ()
9812 + cfun->machine->frame.saved_varargs_size, 0);
9814 /* Emit a barrier to prevent loads from a deallocated stack. */
9815 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9816 || cfun->calls_alloca
9817 || crtl->calls_eh_return)
9819 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9820 need_barrier_p = false;
9823 /* Restore the stack pointer from the frame pointer if it may not
9824 be the same as the stack pointer. */
9825 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9826 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9827 if (frame_pointer_needed
9828 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9829 /* If writeback is used when restoring callee-saves, the CFA
9830 is restored on the instruction doing the writeback. */
9831 aarch64_add_offset (Pmode, stack_pointer_rtx,
9832 hard_frame_pointer_rtx,
9833 -callee_offset - below_hard_fp_saved_regs_size,
9834 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
9835 else
9836 /* The case where we need to re-use the register here is very rare, so
9837 avoid the complicated condition and just always emit a move if the
9838 immediate doesn't fit. */
9839 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
9841 /* Restore the vector registers before the predicate registers,
9842 so that we can use P4 as a temporary for big-endian SVE frames. */
9843 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
9844 callee_adjust != 0, &cfi_ops);
9845 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
9846 false, &cfi_ops);
9847 if (maybe_ne (sve_callee_adjust, 0))
9848 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
9850 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9851 restore x30, so we don't need to restore x30 again in the traditional
9852 way. */
9853 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
9854 R0_REGNUM, last_gpr,
9855 callee_adjust != 0, &cfi_ops);
9857 if (need_barrier_p)
9858 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9860 if (callee_adjust != 0)
9861 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9863 /* If we have no register restore information, the CFA must have been
9864 defined in terms of the stack pointer since the end of the prologue. */
9865 gcc_assert (cfi_ops || !frame_pointer_needed);
9867 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9869 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9870 insn = get_last_insn ();
9871 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9872 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9873 RTX_FRAME_RELATED_P (insn) = 1;
9874 cfi_ops = NULL;
9877 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
9878 restrict the emit_move optimization to leaf functions. */
9879 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
9880 (!can_inherit_p || !crtl->is_leaf
9881 || df_regs_ever_live_p (EP0_REGNUM)));
9883 if (cfi_ops)
9885 /* Emit delayed restores and reset the CFA to be SP. */
9886 insn = get_last_insn ();
9887 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9888 REG_NOTES (insn) = cfi_ops;
9889 RTX_FRAME_RELATED_P (insn) = 1;
9892 /* Pop return address from shadow call stack. */
9893 if (cfun->machine->frame.is_scs_enabled)
9895 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
9896 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
9898 insn = emit_insn (gen_scs_pop ());
9899 add_reg_note (insn, REG_CFA_RESTORE, reg);
9900 RTX_FRAME_RELATED_P (insn) = 1;
9903 /* We prefer to emit the combined return/authenticate instruction RETAA;
9904 however, there are two cases in which we must instead emit an explicit
9905 authentication instruction.
9907 1) Sibcalls don't return in a normal way, so if we're about to call one
9908 we must authenticate.
9910 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
9911 generating code for !TARGET_ARMV8_3 we can't use it and must
9912 explicitly authenticate.
9914 if (aarch64_return_address_signing_enabled ()
9915 && (for_sibcall || !TARGET_ARMV8_3))
9917 switch (aarch64_ra_sign_key)
9919 case AARCH64_KEY_A:
9920 insn = emit_insn (gen_autiasp ());
9921 break;
9922 case AARCH64_KEY_B:
9923 insn = emit_insn (gen_autibsp ());
9924 break;
9925 default:
9926 gcc_unreachable ();
9928 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9929 RTX_FRAME_RELATED_P (insn) = 1;
9932 /* Stack adjustment for exception handler. */
9933 if (crtl->calls_eh_return && !for_sibcall)
9935 /* We need to unwind the stack by the offset computed by
9936 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
9937 to be SP; letting the CFA move during this adjustment
9938 is just as correct as retaining the CFA from the body
9939 of the function. Therefore, do nothing special. */
9940 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
9943 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
9944 if (!for_sibcall)
9945 emit_jump_insn (ret_rtx);
9948 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
9949 normally or return to a previous frame after unwinding.
9951 An EH return uses a single shared return sequence. The epilogue is
9952 exactly like a normal epilogue except that it has an extra input
9953 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
9954 that must be applied after the frame has been destroyed. An extra label
9955 is inserted before the epilogue which initializes this register to zero,
9956 and this is the entry point for a normal return.
9958 An actual EH return updates the return address, initializes the stack
9959 adjustment and jumps directly into the epilogue (bypassing the zeroing
9960 of the adjustment). Since the return address is typically saved on the
9961 stack when a function makes a call, the saved LR must be updated outside
9962 the epilogue.
9964 This poses problems as the store is generated well before the epilogue,
9965 so the offset of LR is not known yet. Also optimizations will remove the
9966 store as it appears dead, even after the epilogue is generated (as the
9967 base or offset for loading LR is different in many cases).
9969 To avoid these problems this implementation forces the frame pointer
9970 in eh_return functions so that the location of LR is fixed and known early.
9971 It also marks the store volatile, so no optimization is permitted to
9972 remove the store. */
9973 rtx
9974 aarch64_eh_return_handler_rtx (void)
9976 rtx tmp = gen_frame_mem (Pmode,
9977 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
9979 /* Mark the store volatile, so no optimization is permitted to remove it. */
9980 MEM_VOLATILE_P (tmp) = true;
9981 return tmp;
9984 /* Output code to add DELTA to the first argument, and then jump
9985 to FUNCTION. Used for C++ multiple inheritance. */
9986 static void
9987 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
9988 HOST_WIDE_INT delta,
9989 HOST_WIDE_INT vcall_offset,
9990 tree function)
9992 /* The this pointer is always in x0. Note that this differs from
9993 Arm where the this pointer may be bumped to r1 if r0 is required
9994 to return a pointer to an aggregate. On AArch64 a result value
9995 pointer will be in x8. */
9996 int this_regno = R0_REGNUM;
9997 rtx this_rtx, temp0, temp1, addr, funexp;
9998 rtx_insn *insn;
9999 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10001 if (aarch64_bti_enabled ())
10002 emit_insn (gen_bti_c());
10004 reload_completed = 1;
10005 emit_note (NOTE_INSN_PROLOGUE_END);
10007 this_rtx = gen_rtx_REG (Pmode, this_regno);
10008 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10009 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10011 if (vcall_offset == 0)
10012 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10013 else
10015 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10017 addr = this_rtx;
10018 if (delta != 0)
10020 if (delta >= -256 && delta < 256)
10021 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10022 plus_constant (Pmode, this_rtx, delta));
10023 else
10024 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10025 temp1, temp0, false);
10028 if (Pmode == ptr_mode)
10029 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10030 else
10031 aarch64_emit_move (temp0,
10032 gen_rtx_ZERO_EXTEND (Pmode,
10033 gen_rtx_MEM (ptr_mode, addr)));
10035 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10036 addr = plus_constant (Pmode, temp0, vcall_offset);
10037 else
10039 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10040 Pmode);
10041 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10044 if (Pmode == ptr_mode)
10045 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
10046 else
10047 aarch64_emit_move (temp1,
10048 gen_rtx_SIGN_EXTEND (Pmode,
10049 gen_rtx_MEM (ptr_mode, addr)));
10051 emit_insn (gen_add2_insn (this_rtx, temp1));
10054 /* Generate a tail call to the target function. */
10055 if (!TREE_USED (function))
10057 assemble_external (function);
10058 TREE_USED (function) = 1;
10060 funexp = XEXP (DECL_RTL (function), 0);
10061 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10062 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10063 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10064 SIBLING_CALL_P (insn) = 1;
10066 insn = get_insns ();
10067 shorten_branches (insn);
10069 assemble_start_function (thunk, fnname);
10070 final_start_function (insn, file, 1);
10071 final (insn, file, 1);
10072 final_end_function ();
10073 assemble_end_function (thunk, fnname);
10075 /* Stop pretending to be a post-reload pass. */
10076 reload_completed = 0;
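/* Editorial note: illustrative sketch only, not part of the GCC sources.
   In C terms, the thunk emitted above adjusts the `this' pointer roughly as
   the hypothetical helper below does before tail-calling FUNCTION: add the
   fixed DELTA, then, for virtual bases, add the value stored at
   VCALL_OFFSET inside the vtable (an LP64 `long' stands in for the
   pointer-sized slot).  */
static void *
example_thunk_adjust_this (void *this_ptr, long delta, long vcall_offset)
{
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      char *vtable = *(char **) p;              /* load the vtable pointer  */
      p += *(long *) (vtable + vcall_offset);   /* add the vcall slot  */
    }
  return p;
}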
10079 static bool
10080 aarch64_tls_referenced_p (rtx x)
10082 if (!TARGET_HAVE_TLS)
10083 return false;
10084 subrtx_iterator::array_type array;
10085 FOR_EACH_SUBRTX (iter, array, x, ALL)
10087 const_rtx x = *iter;
10088 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10089 return true;
10090 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10091 TLS offsets, not real symbol references. */
10092 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10093 iter.skip_subrtxes ();
10095 return false;
10099 /* Return true if val can be encoded as a 12-bit unsigned immediate with
10100 a left shift of 0 or 12 bits. */
10101 bool
10102 aarch64_uimm12_shift (HOST_WIDE_INT val)
10104 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
10105 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
10109 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
10110 that can be created with a left shift of 0 or 12. */
10111 static HOST_WIDE_INT
10112 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
10114 /* Check to see if the value fits in 24 bits, as that is the maximum we can
10115 handle correctly. */
10116 gcc_assert ((val & 0xffffff) == val);
10118 if (((val & 0xfff) << 0) == val)
10119 return val;
10121 return val & (0xfff << 12);
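/* Editorial note: illustrative sketch only, not part of the GCC sources.
   ADD/SUB (immediate) accepts a 12-bit value optionally shifted left by 12,
   so any constant of up to 24 bits can be added with at most two
   instructions.  A hypothetical standalone test mirroring the predicate
   above:  */
static int
example_uimm12_shift_p (unsigned long long val)
{
  return (val & 0xfffULL) == val || (val & (0xfffULL << 12)) == val;
}
/* e.g. 0xabc    -> 1 (shift 0)
        0xabc000 -> 1 (shift 12)
        0xabcdef -> 0 (split into 0xabc000 then 0xdef).  */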
10124 /* Return true if val is an immediate that can be loaded into a
10125 register by a MOVZ instruction. */
10126 static bool
10127 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
10129 if (GET_MODE_SIZE (mode) > 4)
10131 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
10132 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
10133 return 1;
10135 else
10137 /* Ignore sign extension. */
10138 val &= (HOST_WIDE_INT) 0xffffffff;
10140 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
10141 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
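/* Editorial note: illustrative sketch only, not part of the GCC sources.
   MOVZ materialises a constant consisting of a single 16-bit chunk placed
   at bit position 0, 16, 32 or 48.  A hypothetical standalone test for
   64-bit values:  */
static int
example_movz_imm_p (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffULL << shift)) == val)
      return 1;
  return 0;
}
/* e.g. 0xbeef0000 -> 1, but 0x10001 -> 0 (needs MOVZ followed by MOVK).  */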
10144 /* Test whether:
10146 X = (X & AND_VAL) | IOR_VAL;
10148 can be implemented using:
10150 MOVK X, #(IOR_VAL >> shift), LSL #shift
10152 Return the shift if so, otherwise return -1. */
10153 int
10154 aarch64_movk_shift (const wide_int_ref &and_val,
10155 const wide_int_ref &ior_val)
10157 unsigned int precision = and_val.get_precision ();
10158 unsigned HOST_WIDE_INT mask = 0xffff;
10159 for (unsigned int shift = 0; shift < precision; shift += 16)
10161 if (and_val == ~mask && (ior_val & mask) == ior_val)
10162 return shift;
10163 mask <<= 16;
10165 return -1;
10168 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
10169 64-bit (DImode) integer. */
10171 static unsigned HOST_WIDE_INT
10172 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
10174 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
10175 while (size < 64)
10177 val &= (HOST_WIDE_INT_1U << size) - 1;
10178 val |= val << size;
10179 size *= 2;
10181 return val;
10184 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
10186 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
10188 0x0000000100000001ull,
10189 0x0001000100010001ull,
10190 0x0101010101010101ull,
10191 0x1111111111111111ull,
10192 0x5555555555555555ull,
10196 /* Return true if val is a valid bitmask immediate. */
10198 bool
10199 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
10201 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
10202 int bits;
10204 /* Check for a single sequence of one bits and return quickly if so.
10205 The special cases of all ones and all zeroes returns false. */
10206 val = aarch64_replicate_bitmask_imm (val_in, mode);
10207 tmp = val + (val & -val);
10209 if (tmp == (tmp & -tmp))
10210 return (val + 1) > 1;
10212 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
10213 if (mode == SImode)
10214 val = (val << 32) | (val & 0xffffffff);
10216 /* Invert if the immediate doesn't start with a zero bit - this means we
10217 only need to search for sequences of one bits. */
10218 if (val & 1)
10219 val = ~val;
10221 /* Find the first set bit and set tmp to val with the first sequence of one
10222 bits removed. Return success if there is a single sequence of ones. */
10223 first_one = val & -val;
10224 tmp = val & (val + first_one);
10226 if (tmp == 0)
10227 return true;
10229 /* Find the next set bit and compute the difference in bit position. */
10230 next_one = tmp & -tmp;
10231 bits = clz_hwi (first_one) - clz_hwi (next_one);
10232 mask = val ^ tmp;
10234 /* Check the bit position difference is a power of 2, and that the first
10235 sequence of one bits fits within 'bits' bits. */
10236 if ((mask >> bits) != 0 || bits != (bits & -bits))
10237 return false;
10239 /* Check the sequence of one bits is repeated 64/bits times. */
10240 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
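/* Editorial note: illustrative sketch only, not part of the GCC sources.
   The fast path above relies on the fact that adding the lowest set bit to
   a value containing a single contiguous run of ones yields either a single
   bit or zero (on overflow).  A hypothetical standalone version of just
   that quick test, without the replication or multi-run handling:  */
static int
example_single_run_of_ones_p (unsigned long long val)
{
  if (val == 0 || val == ~0ULL)
    return 0;                           /* all-zeros/all-ones are excluded  */
  unsigned long long tmp = val + (val & -val);
  return (tmp & (tmp - 1)) == 0;        /* at most one bit remains set  */
}
/* e.g. 0x0ff0 -> 1 and 0x0f0f -> 0; runs that wrap around bit 63, such as
   0xff00000000000001, are left to the slower path above.  */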
10243 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
10244 Assumed precondition: VAL_IN is not zero. */
10246 unsigned HOST_WIDE_INT
10247 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
10249 int lowest_bit_set = ctz_hwi (val_in);
10250 int highest_bit_set = floor_log2 (val_in);
10251 gcc_assert (val_in != 0);
10253 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
10254 (HOST_WIDE_INT_1U << lowest_bit_set));
10257 /* Create a constant where all bits outside the span from the lowest set bit
10258 to the highest set bit of VAL_IN are set to 1. */
10260 unsigned HOST_WIDE_INT
10261 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
10263 return val_in | ~aarch64_and_split_imm1 (val_in);
10266 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
10268 bool
10269 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
10271 scalar_int_mode int_mode;
10272 if (!is_a <scalar_int_mode> (mode, &int_mode))
10273 return false;
10275 if (aarch64_bitmask_imm (val_in, int_mode))
10276 return false;
10278 if (aarch64_move_imm (val_in, int_mode))
10279 return false;
10281 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
10283 return aarch64_bitmask_imm (imm2, int_mode);
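/* Editorial note: illustrative sketch only, not part of the GCC sources.
   The split above rewrites one AND with an awkward constant as two ANDs
   with friendlier constants: IMM1 covers the contiguous span from the
   lowest to the highest set bit, and IMM2 is VAL with every bit outside
   that span forced to one, so IMM1 & IMM2 == VAL.  Hypothetical arithmetic
   (VAL must be nonzero):  */
static void
example_and_split (unsigned long long val,
                   unsigned long long *imm1, unsigned long long *imm2)
{
  int low = __builtin_ctzll (val);
  int high = 63 - __builtin_clzll (val);
  *imm1 = (2ULL << high) - (1ULL << low);       /* ones in [low, high]  */
  *imm2 = val | ~*imm1;                         /* ones outside the span  */
}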
10286 /* Return true if val is an immediate that can be loaded into a
10287 register in a single instruction. */
10288 bool
10289 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
10291 scalar_int_mode int_mode;
10292 if (!is_a <scalar_int_mode> (mode, &int_mode))
10293 return false;
10295 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
10296 return 1;
10297 return aarch64_bitmask_imm (val, int_mode);
10300 static bool
10301 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10303 if (GET_CODE (x) == HIGH)
10304 return true;
10306 /* There's no way to calculate VL-based values using relocations. */
10307 subrtx_iterator::array_type array;
10308 FOR_EACH_SUBRTX (iter, array, x, ALL)
10309 if (GET_CODE (*iter) == CONST_POLY_INT)
10310 return true;
10312 poly_int64 offset;
10313 rtx base = strip_offset_and_salt (x, &offset);
10314 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10316 /* We checked for POLY_INT_CST offsets above. */
10317 if (aarch64_classify_symbol (base, offset.to_constant ())
10318 != SYMBOL_FORCE_TO_MEM)
10319 return true;
10320 else
10321 /* Avoid generating a 64-bit relocation in ILP32; leave
10322 to aarch64_expand_mov_immediate to handle it properly. */
10323 return mode != ptr_mode;
10326 return aarch64_tls_referenced_p (x);
10329 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10330 The expansion for a table switch is quite expensive due to the number
10331 of instructions, the table lookup and the hard-to-predict indirect jump.
10332 When optimizing for speed, and -O3 enabled, use the per-core tuning if
10333 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10334 performance. When optimizing for size, use 8 for smallest codesize. */
10336 static unsigned int
10337 aarch64_case_values_threshold (void)
10339 /* Use the specified limit for the number of cases before using jump
10340 tables at higher optimization levels. */
10341 if (optimize > 2
10342 && aarch64_tune_params.max_case_values != 0)
10343 return aarch64_tune_params.max_case_values;
10344 else
10345 return optimize_size ? 8 : 11;
10348 /* Return true if register REGNO is a valid index register.
10349 STRICT_P is true if REG_OK_STRICT is in effect. */
10351 bool
10352 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10354 if (!HARD_REGISTER_NUM_P (regno))
10356 if (!strict_p)
10357 return true;
10359 if (!reg_renumber)
10360 return false;
10362 regno = reg_renumber[regno];
10364 return GP_REGNUM_P (regno);
10367 /* Return true if register REGNO is a valid base register.
10368 STRICT_P is true if REG_OK_STRICT is in effect. */
10370 bool
10371 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10373 if (!HARD_REGISTER_NUM_P (regno))
10375 if (!strict_p)
10376 return true;
10378 if (!reg_renumber)
10379 return false;
10381 regno = reg_renumber[regno];
10384 /* The fake registers will be eliminated to either the stack or
10385 hard frame pointer, both of which are usually valid base registers.
10386 Reload deals with the cases where the eliminated form isn't valid. */
10387 return (GP_REGNUM_P (regno)
10388 || regno == SP_REGNUM
10389 || regno == FRAME_POINTER_REGNUM
10390 || regno == ARG_POINTER_REGNUM);
10393 /* Return true if X is a valid base register.
10394 STRICT_P is true if REG_OK_STRICT is in effect. */
10396 static bool
10397 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10399 if (!strict_p
10400 && SUBREG_P (x)
10401 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10402 x = SUBREG_REG (x);
10404 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10407 /* Return true if address offset is a valid index. If it is, fill in INFO
10408 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10410 static bool
10411 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10412 machine_mode mode, bool strict_p)
10414 enum aarch64_address_type type;
10415 rtx index;
10416 int shift;
10418 /* (reg:P) */
10419 if ((REG_P (x) || SUBREG_P (x))
10420 && GET_MODE (x) == Pmode)
10422 type = ADDRESS_REG_REG;
10423 index = x;
10424 shift = 0;
10426 /* (sign_extend:DI (reg:SI)) */
10427 else if ((GET_CODE (x) == SIGN_EXTEND
10428 || GET_CODE (x) == ZERO_EXTEND)
10429 && GET_MODE (x) == DImode
10430 && GET_MODE (XEXP (x, 0)) == SImode)
10432 type = (GET_CODE (x) == SIGN_EXTEND)
10433 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10434 index = XEXP (x, 0);
10435 shift = 0;
10437 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10438 else if (GET_CODE (x) == MULT
10439 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10440 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10441 && GET_MODE (XEXP (x, 0)) == DImode
10442 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10443 && CONST_INT_P (XEXP (x, 1)))
10445 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10446 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10447 index = XEXP (XEXP (x, 0), 0);
10448 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10450 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10451 else if (GET_CODE (x) == ASHIFT
10452 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10453 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10454 && GET_MODE (XEXP (x, 0)) == DImode
10455 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10456 && CONST_INT_P (XEXP (x, 1)))
10458 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10459 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10460 index = XEXP (XEXP (x, 0), 0);
10461 shift = INTVAL (XEXP (x, 1));
10463 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10464 (const_int 0xffffffff<<shift)) */
10465 else if (GET_CODE (x) == AND
10466 && GET_MODE (x) == DImode
10467 && GET_CODE (XEXP (x, 0)) == MULT
10468 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10469 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10470 && CONST_INT_P (XEXP (x, 1)))
10472 type = ADDRESS_REG_UXTW;
10473 index = XEXP (XEXP (x, 0), 0);
10474 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10475 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10476 shift = -1;
10478 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10479 (const_int 0xffffffff<<shift)) */
10480 else if (GET_CODE (x) == AND
10481 && GET_MODE (x) == DImode
10482 && GET_CODE (XEXP (x, 0)) == ASHIFT
10483 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10484 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10485 && CONST_INT_P (XEXP (x, 1)))
10487 type = ADDRESS_REG_UXTW;
10488 index = XEXP (XEXP (x, 0), 0);
10489 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10490 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10491 shift = -1;
10493 /* (mult:P (reg:P) (const_int scale)) */
10494 else if (GET_CODE (x) == MULT
10495 && GET_MODE (x) == Pmode
10496 && GET_MODE (XEXP (x, 0)) == Pmode
10497 && CONST_INT_P (XEXP (x, 1)))
10499 type = ADDRESS_REG_REG;
10500 index = XEXP (x, 0);
10501 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10503 /* (ashift:P (reg:P) (const_int shift)) */
10504 else if (GET_CODE (x) == ASHIFT
10505 && GET_MODE (x) == Pmode
10506 && GET_MODE (XEXP (x, 0)) == Pmode
10507 && CONST_INT_P (XEXP (x, 1)))
10509 type = ADDRESS_REG_REG;
10510 index = XEXP (x, 0);
10511 shift = INTVAL (XEXP (x, 1));
10513 else
10514 return false;
10516 if (!strict_p
10517 && SUBREG_P (index)
10518 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10519 index = SUBREG_REG (index);
10521 if (aarch64_sve_data_mode_p (mode))
10523 if (type != ADDRESS_REG_REG
10524 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10525 return false;
10527 else
10529 if (shift != 0
10530 && !(IN_RANGE (shift, 1, 3)
10531 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10532 return false;
10535 if (REG_P (index)
10536 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10538 info->type = type;
10539 info->offset = index;
10540 info->shift = shift;
10541 return true;
10544 return false;
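/* Editorial note: illustrative sketch only, not part of the GCC sources.
   For non-SVE modes, the register-index forms accepted above allow
   [Rn, Rm, LSL #shift] only when the shift is 0 or matches the access
   size, i.e. 1 << shift == GET_MODE_SIZE (mode) with shift in 1..3.
   Hypothetical check using a byte size instead of a machine_mode:  */
static int
example_index_shift_ok_p (int shift, int access_size)
{
  if (shift == 0)
    return 1;
  return shift >= 1 && shift <= 3 && (1 << shift) == access_size;
}
/* e.g. LDR w0, [x1, x2, LSL #2] loads 4 bytes:
   example_index_shift_ok_p (2, 4) -> 1.  */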
10547 /* Return true if MODE is one of the modes for which we
10548 support LDP/STP operations. */
10550 static bool
10551 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10553 return mode == SImode || mode == DImode
10554 || mode == SFmode || mode == DFmode
10555 || mode == SDmode || mode == DDmode
10556 || (aarch64_vector_mode_supported_p (mode)
10557 && (known_eq (GET_MODE_SIZE (mode), 8)
10558 || (known_eq (GET_MODE_SIZE (mode), 16)
10559 && (aarch64_tune_params.extra_tuning_flags
10560 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10563 /* Return true if REGNO is a virtual pointer register, or an eliminable
10564 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10565 include stack_pointer or hard_frame_pointer. */
10566 static bool
10567 virt_or_elim_regno_p (unsigned regno)
10569 return ((regno >= FIRST_VIRTUAL_REGISTER
10570 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10571 || regno == FRAME_POINTER_REGNUM
10572 || regno == ARG_POINTER_REGNUM);
10575 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10576 If it is, fill in INFO appropriately. STRICT_P is true if
10577 REG_OK_STRICT is in effect. */
10579 bool
10580 aarch64_classify_address (struct aarch64_address_info *info,
10581 rtx x, machine_mode mode, bool strict_p,
10582 aarch64_addr_query_type type)
10584 enum rtx_code code = GET_CODE (x);
10585 rtx op0, op1;
10586 poly_int64 offset;
10588 HOST_WIDE_INT const_size;
10590 /* Whether a vector mode is partial doesn't affect address legitimacy.
10591 Partial vectors like VNx8QImode allow the same indexed addressing
10592 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10593 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10594 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10595 vec_flags &= ~VEC_PARTIAL;
10597 /* On BE, we use load/store pair for all large int mode load/stores.
10598 TI/TF/TDmode may also use a load/store pair. */
10599 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10600 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10601 || type == ADDR_QUERY_LDP_STP_N
10602 || mode == TImode
10603 || mode == TFmode
10604 || mode == TDmode
10605 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
10606 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10607 corresponds to the actual size of the memory being loaded/stored and
10608 the mode used when checking the address is half of that size. */
10609 if (type == ADDR_QUERY_LDP_STP_N)
10611 if (known_eq (GET_MODE_SIZE (mode), 16))
10612 mode = DFmode;
10613 else if (known_eq (GET_MODE_SIZE (mode), 8))
10614 mode = SFmode;
10615 else
10616 return false;
10619 bool allow_reg_index_p = (!load_store_pair_p
10620 && ((vec_flags == 0
10621 && known_lt (GET_MODE_SIZE (mode), 16))
10622 || vec_flags == VEC_ADVSIMD
10623 || vec_flags & VEC_SVE_DATA));
10625 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10626 The latter is not valid for SVE predicates, and that's rejected through
10627 allow_reg_index_p above. */
10628 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10629 && (code != REG && code != PLUS))
10630 return false;
10632 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10633 REG addressing. */
10634 if (advsimd_struct_p
10635 && !BYTES_BIG_ENDIAN
10636 && (code != POST_INC && code != REG))
10637 return false;
10639 gcc_checking_assert (GET_MODE (x) == VOIDmode
10640 || SCALAR_INT_MODE_P (GET_MODE (x)));
10642 switch (code)
10644 case REG:
10645 case SUBREG:
10646 info->type = ADDRESS_REG_IMM;
10647 info->base = x;
10648 info->offset = const0_rtx;
10649 info->const_offset = 0;
10650 return aarch64_base_register_rtx_p (x, strict_p);
10652 case PLUS:
10653 op0 = XEXP (x, 0);
10654 op1 = XEXP (x, 1);
10656 if (! strict_p
10657 && REG_P (op0)
10658 && virt_or_elim_regno_p (REGNO (op0))
10659 && poly_int_rtx_p (op1, &offset))
10661 info->type = ADDRESS_REG_IMM;
10662 info->base = op0;
10663 info->offset = op1;
10664 info->const_offset = offset;
10666 return true;
10669 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10670 && aarch64_base_register_rtx_p (op0, strict_p)
10671 && poly_int_rtx_p (op1, &offset))
10673 info->type = ADDRESS_REG_IMM;
10674 info->base = op0;
10675 info->offset = op1;
10676 info->const_offset = offset;
10678 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10679 registers and individual Q registers. The available
10680 address modes are:
10681 X,X: 7-bit signed scaled offset
10682 Q: 9-bit signed offset
10683 We conservatively require an offset representable in either mode.
10684 When performing the check for pairs of X registers i.e. LDP/STP
10685 pass down DImode since that is the natural size of the LDP/STP
10686 instruction memory accesses. */
10687 if (mode == TImode || mode == TFmode || mode == TDmode)
10688 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10689 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10690 || offset_12bit_unsigned_scaled_p (mode, offset)));
10692 if (mode == V8DImode)
10693 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10694 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10696 /* A 7-bit offset check because OImode will emit an ldp/stp
10697 instruction (only big-endian will get here).
10698 For ldp/stp instructions, the offset is scaled by the size of a
10699 single element of the pair. */
10700 if (aarch64_advsimd_partial_struct_mode_p (mode)
10701 && known_eq (GET_MODE_SIZE (mode), 16))
10702 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10703 if (aarch64_advsimd_full_struct_mode_p (mode)
10704 && known_eq (GET_MODE_SIZE (mode), 32))
10705 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10707 /* Three 9/12-bit offset checks because CImode will emit three
10708 ldr/str instructions (only big-endian will get here). */
10709 if (aarch64_advsimd_partial_struct_mode_p (mode)
10710 && known_eq (GET_MODE_SIZE (mode), 24))
10711 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10712 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10713 offset + 16)
10714 || offset_12bit_unsigned_scaled_p (DImode,
10715 offset + 16)));
10716 if (aarch64_advsimd_full_struct_mode_p (mode)
10717 && known_eq (GET_MODE_SIZE (mode), 48))
10718 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10719 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10720 offset + 32)
10721 || offset_12bit_unsigned_scaled_p (TImode,
10722 offset + 32)));
10724 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10725 instructions (only big-endian will get here). */
10726 if (aarch64_advsimd_partial_struct_mode_p (mode)
10727 && known_eq (GET_MODE_SIZE (mode), 32))
10728 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10729 && aarch64_offset_7bit_signed_scaled_p (DImode,
10730 offset + 16));
10731 if (aarch64_advsimd_full_struct_mode_p (mode)
10732 && known_eq (GET_MODE_SIZE (mode), 64))
10733 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10734 && aarch64_offset_7bit_signed_scaled_p (TImode,
10735 offset + 32));
10737 /* Make "m" use the LD1 offset range for SVE data modes, so
10738 that pre-RTL optimizers like ivopts will work to that
10739 instead of the wider LDR/STR range. */
10740 if (vec_flags == VEC_SVE_DATA)
10741 return (type == ADDR_QUERY_M
10742 ? offset_4bit_signed_scaled_p (mode, offset)
10743 : offset_9bit_signed_scaled_p (mode, offset));
10745 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10747 poly_int64 end_offset = (offset
10748 + GET_MODE_SIZE (mode)
10749 - BYTES_PER_SVE_VECTOR);
10750 return (type == ADDR_QUERY_M
10751 ? offset_4bit_signed_scaled_p (mode, offset)
10752 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10753 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10754 end_offset)));
10757 if (vec_flags == VEC_SVE_PRED)
10758 return offset_9bit_signed_scaled_p (mode, offset);
10760 if (load_store_pair_p)
10761 return ((known_eq (GET_MODE_SIZE (mode), 4)
10762 || known_eq (GET_MODE_SIZE (mode), 8)
10763 || known_eq (GET_MODE_SIZE (mode), 16))
10764 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10765 else
10766 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10767 || offset_12bit_unsigned_scaled_p (mode, offset));
10770 if (allow_reg_index_p)
10772 /* Look for base + (scaled/extended) index register. */
10773 if (aarch64_base_register_rtx_p (op0, strict_p)
10774 && aarch64_classify_index (info, op1, mode, strict_p))
10776 info->base = op0;
10777 return true;
10779 if (aarch64_base_register_rtx_p (op1, strict_p)
10780 && aarch64_classify_index (info, op0, mode, strict_p))
10782 info->base = op1;
10783 return true;
10787 return false;
10789 case POST_INC:
10790 case POST_DEC:
10791 case PRE_INC:
10792 case PRE_DEC:
10793 info->type = ADDRESS_REG_WB;
10794 info->base = XEXP (x, 0);
10795 info->offset = NULL_RTX;
10796 return aarch64_base_register_rtx_p (info->base, strict_p);
10798 case POST_MODIFY:
10799 case PRE_MODIFY:
10800 info->type = ADDRESS_REG_WB;
10801 info->base = XEXP (x, 0);
10802 if (GET_CODE (XEXP (x, 1)) == PLUS
10803 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10804 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10805 && aarch64_base_register_rtx_p (info->base, strict_p))
10807 info->offset = XEXP (XEXP (x, 1), 1);
10808 info->const_offset = offset;
10810 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10811 registers and individual Q registers. The available
10812 address modes are:
10813 X,X: 7-bit signed scaled offset
10814 Q: 9-bit signed offset
10815 We conservatively require an offset representable in either mode. */
10817 if (mode == TImode || mode == TFmode || mode == TDmode)
10818 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10819 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10821 if (load_store_pair_p)
10822 return ((known_eq (GET_MODE_SIZE (mode), 4)
10823 || known_eq (GET_MODE_SIZE (mode), 8)
10824 || known_eq (GET_MODE_SIZE (mode), 16))
10825 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10826 else
10827 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10829 return false;
10831 case CONST:
10832 case SYMBOL_REF:
10833 case LABEL_REF:
10834 /* Load literal: PC-relative constant pool entry. Only supported
10835 for SImode or larger. */
10836 info->type = ADDRESS_SYMBOLIC;
10838 if (!load_store_pair_p
10839 && GET_MODE_SIZE (mode).is_constant (&const_size)
10840 && const_size >= 4)
10842 poly_int64 offset;
10843 rtx sym = strip_offset_and_salt (x, &offset);
10844 return ((LABEL_REF_P (sym)
10845 || (SYMBOL_REF_P (sym)
10846 && CONSTANT_POOL_ADDRESS_P (sym)
10847 && aarch64_pcrelative_literal_loads)));
10849 return false;
10851 case LO_SUM:
10852 info->type = ADDRESS_LO_SUM;
10853 info->base = XEXP (x, 0);
10854 info->offset = XEXP (x, 1);
10855 if (allow_reg_index_p
10856 && aarch64_base_register_rtx_p (info->base, strict_p))
10858 poly_int64 offset;
10859 HOST_WIDE_INT const_offset;
10860 rtx sym = strip_offset_and_salt (info->offset, &offset);
10861 if (SYMBOL_REF_P (sym)
10862 && offset.is_constant (&const_offset)
10863 && (aarch64_classify_symbol (sym, const_offset)
10864 == SYMBOL_SMALL_ABSOLUTE))
10866 /* The symbol and offset must be aligned to the access size. */
10867 unsigned int align;
10869 if (CONSTANT_POOL_ADDRESS_P (sym))
10870 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10871 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10873 tree exp = SYMBOL_REF_DECL (sym);
10874 align = TYPE_ALIGN (TREE_TYPE (exp));
10875 align = aarch64_constant_alignment (exp, align);
10877 else if (SYMBOL_REF_DECL (sym))
10878 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10879 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10880 && SYMBOL_REF_BLOCK (sym) != NULL)
10881 align = SYMBOL_REF_BLOCK (sym)->alignment;
10882 else
10883 align = BITS_PER_UNIT;
10885 poly_int64 ref_size = GET_MODE_SIZE (mode);
10886 if (known_eq (ref_size, 0))
10887 ref_size = GET_MODE_SIZE (DImode);
10889 return (multiple_p (const_offset, ref_size)
10890 && multiple_p (align / BITS_PER_UNIT, ref_size));
10893 return false;
10895 default:
10896 return false;
10900 /* Return true if the address X is valid for a PRFM instruction.
10901 STRICT_P is true if we should do strict checking with
10902 aarch64_classify_address. */
10904 bool
10905 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10907 struct aarch64_address_info addr;
10909 /* PRFM accepts the same addresses as DImode... */
10910 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10911 if (!res)
10912 return false;
10914 /* ... except writeback forms. */
10915 return addr.type != ADDRESS_REG_WB;
10918 bool
10919 aarch64_symbolic_address_p (rtx x)
10921 poly_int64 offset;
10922 x = strip_offset_and_salt (x, &offset);
10923 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10926 /* Classify the base of symbolic expression X. */
10928 enum aarch64_symbol_type
10929 aarch64_classify_symbolic_expression (rtx x)
10931 rtx offset;
10933 split_const (x, &x, &offset);
10934 return aarch64_classify_symbol (x, INTVAL (offset));
10938 /* Return TRUE if X is a legitimate address for accessing memory in
10939 mode MODE. */
10940 static bool
10941 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
10943 struct aarch64_address_info addr;
10945 return aarch64_classify_address (&addr, x, mode, strict_p);
10948 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10949 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10950 bool
10951 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10952 aarch64_addr_query_type type)
10954 struct aarch64_address_info addr;
10956 return aarch64_classify_address (&addr, x, mode, strict_p, type);
10959 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10961 static bool
10962 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10963 poly_int64 orig_offset,
10964 machine_mode mode)
10966 HOST_WIDE_INT size;
10967 if (GET_MODE_SIZE (mode).is_constant (&size))
10969 HOST_WIDE_INT const_offset, second_offset;
10971 /* A general SVE offset is A * VQ + B. Remove the A component from
10972 coefficient 0 in order to get the constant B. */
10973 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10975 /* Split an out-of-range address displacement into a base and
10976 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10977 range otherwise to increase opportunities for sharing the base
10978 address of different sizes. Unaligned accesses use the signed
10979 9-bit range; TImode/TFmode/TDmode use the intersection of the signed
10980 scaled 7-bit and signed 9-bit offset ranges. */
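/* For instance (a sketch of the TImode case): a displacement of 0x2010 is
   split into 0x2000 + 16, since ((0x2010 + 0x100) & 0x1f8) - 0x100 == 16,
   and 16 is representable both as a signed scaled 7-bit and as a signed
   9-bit offset.  */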
10981 if (mode == TImode || mode == TFmode || mode == TDmode)
10982 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10983 else if ((const_offset & (size - 1)) != 0)
10984 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10985 else
10986 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10988 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10989 return false;
10991 /* Split the offset into second_offset and the rest. */
10992 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10993 *offset2 = gen_int_mode (second_offset, Pmode);
10994 return true;
10996 else
10998 /* Get the mode we should use as the basis of the range. For structure
10999 modes this is the mode of one vector. */
11000 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11001 machine_mode step_mode
11002 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11004 /* Get the "mul vl" multiplier we'd like to use. */
11005 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11006 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11007 if (vec_flags & VEC_SVE_DATA)
11008 /* LDR supports a 9-bit range, but the move patterns for
11009 structure modes require all vectors to be in range of the
11010 same base. The simplest way of accommodating that while still
11011 promoting reuse of anchor points between different modes is
11012 to use an 8-bit range unconditionally. */
11013 vnum = ((vnum + 128) & 255) - 128;
11014 else
11015 /* Predicates are only handled singly, so we might as well use
11016 the full range. */
11017 vnum = ((vnum + 256) & 511) - 256;
11018 if (vnum == 0)
11019 return false;
11021 /* Convert the "mul vl" multiplier into a byte offset. */
11022 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11023 if (known_eq (second_offset, orig_offset))
11024 return false;
11026 /* Split the offset into second_offset and the rest. */
11027 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11028 *offset2 = gen_int_mode (second_offset, Pmode);
11029 return true;
11033 /* Return the binary representation of floating point constant VALUE in INTVAL.
11034 If the value cannot be converted, return false without setting INTVAL.
11035 The conversion is done in the given MODE. */
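/* For example, (const_double:DF 1.0) is returned as 0x3ff0000000000000 and
   (const_double:SF 1.0) as 0x3f800000; 0.0 is special-cased below.  */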
11036 bool
11037 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11040 /* We make a general exception for 0. */
11041 if (aarch64_float_const_zero_rtx_p (value))
11043 *intval = 0;
11044 return true;
11047 scalar_float_mode mode;
11048 if (!CONST_DOUBLE_P (value)
11049 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11050 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11051 /* Only support up to DF mode. */
11052 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11053 return false;
11055 unsigned HOST_WIDE_INT ival = 0;
11057 long res[2];
11058 real_to_target (res,
11059 CONST_DOUBLE_REAL_VALUE (value),
11060 REAL_MODE_FORMAT (mode));
11062 if (mode == DFmode || mode == DDmode)
11064 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11065 ival = zext_hwi (res[order], 32);
11066 ival |= (zext_hwi (res[1 - order], 32) << 32);
11068 else
11069 ival = zext_hwi (res[0], 32);
11071 *intval = ival;
11072 return true;
11075 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11076 single MOV(+MOVK) followed by an FMOV. */
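/* For example (a sketch), DFmode 1.0 has the bit pattern 0x3ff0000000000000,
   which a single MOV immediate can materialise, so it is accepted; constants
   needing three or more MOV/MOVKs are left to a literal load instead.  */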
11077 bool
11078 aarch64_float_const_rtx_p (rtx x)
11080 machine_mode mode = GET_MODE (x);
11081 if (mode == VOIDmode)
11082 return false;
11084 /* Determine whether it's cheaper to write float constants as
11085 mov/movk pairs rather than as ldr/adrp pairs. */
11086 unsigned HOST_WIDE_INT ival;
11088 if (CONST_DOUBLE_P (x)
11089 && SCALAR_FLOAT_MODE_P (mode)
11090 && aarch64_reinterpret_float_as_int (x, &ival))
11092 scalar_int_mode imode = (mode == HFmode
11093 ? SImode
11094 : int_mode_for_mode (mode).require ());
11095 int num_instr = aarch64_internal_mov_immediate
11096 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11097 return num_instr < 3;
11100 return false;
11103 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11104 Floating Point). */
11105 bool
11106 aarch64_float_const_zero_rtx_p (rtx x)
11108 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11109 zr as our callers expect, so no need to check the actual
11110 value if X is of Decimal Floating Point type. */
11111 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11112 return false;
11114 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11115 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11116 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11119 /* Return TRUE if rtx X is an immediate constant that fits in a single
11120 MOVI immediate operation. */
11121 bool
11122 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11124 if (!TARGET_SIMD)
11125 return false;
11127 machine_mode vmode;
11128 scalar_int_mode imode;
11129 unsigned HOST_WIDE_INT ival;
11131 if (CONST_DOUBLE_P (x)
11132 && SCALAR_FLOAT_MODE_P (mode))
11134 if (!aarch64_reinterpret_float_as_int (x, &ival))
11135 return false;
11137 /* We make a general exception for 0. */
11138 if (aarch64_float_const_zero_rtx_p (x))
11139 return true;
11141 imode = int_mode_for_mode (mode).require ();
11143 else if (CONST_INT_P (x)
11144 && is_a <scalar_int_mode> (mode, &imode))
11145 ival = INTVAL (x);
11146 else
11147 return false;
11149 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
11150 use a 128-bit vector mode. */
11151 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11153 vmode = aarch64_simd_container_mode (imode, width);
11154 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11156 return aarch64_simd_valid_immediate (v_op, NULL);
11160 /* Return the fixed registers used for condition codes. */
11162 static bool
11163 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11165 *p1 = CC_REGNUM;
11166 *p2 = INVALID_REGNUM;
11167 return true;
11170 /* This function is used by the call expanders of the machine description.
11171 RESULT is the register in which the result is returned. It's NULL for
11172 "call" and "sibcall".
11173 MEM is the location of the function call.
11174 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11175 SIBCALL indicates whether this function call is a normal call or a sibling
11176 call; a different pattern is generated accordingly. */
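/* The emitted insn has roughly the following shape (a sketch):

     (parallel [(call (mem:DI ...) (const_int 0))
                (unspec:DI [(const_int <abi>)] UNSPEC_CALLEE_ABI)
                (clobber (reg:DI LR_REGNUM))])

   with the CALL wrapped in a SET of RESULT for value-returning calls and
   the CLOBBER replaced by a return rtx for sibling calls.  */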
11178 void
11179 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11181 rtx call, callee, tmp;
11182 rtvec vec;
11183 machine_mode mode;
11185 gcc_assert (MEM_P (mem));
11186 callee = XEXP (mem, 0);
11187 mode = GET_MODE (callee);
11188 gcc_assert (mode == Pmode);
11190 /* Decide if we should generate indirect calls by loading the
11191 address of the callee into a register before performing
11192 the branch-and-link. */
11193 if (SYMBOL_REF_P (callee)
11194 ? (aarch64_is_long_call_p (callee)
11195 || aarch64_is_noplt_call_p (callee))
11196 : !REG_P (callee))
11197 XEXP (mem, 0) = force_reg (mode, callee);
11199 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11201 if (result != NULL_RTX)
11202 call = gen_rtx_SET (result, call);
11204 if (sibcall)
11205 tmp = ret_rtx;
11206 else
11207 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11209 gcc_assert (CONST_INT_P (callee_abi));
11210 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11211 UNSPEC_CALLEE_ABI);
11213 vec = gen_rtvec (3, call, callee_abi, tmp);
11214 call = gen_rtx_PARALLEL (VOIDmode, vec);
11216 aarch64_emit_call_insn (call);
11219 /* Emit call insn with PAT and do aarch64-specific handling. */
11221 void
11222 aarch64_emit_call_insn (rtx pat)
11224 rtx insn = emit_call_insn (pat);
11226 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11227 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11228 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11231 machine_mode
11232 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11234 machine_mode mode_x = GET_MODE (x);
11235 rtx_code code_x = GET_CODE (x);
11237 /* All floating point compares return CCFP if it is an equality
11238 comparison, and CCFPE otherwise. */
11239 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11241 switch (code)
11243 case EQ:
11244 case NE:
11245 case UNORDERED:
11246 case ORDERED:
11247 case UNLT:
11248 case UNLE:
11249 case UNGT:
11250 case UNGE:
11251 case UNEQ:
11252 return CCFPmode;
11254 case LT:
11255 case LE:
11256 case GT:
11257 case GE:
11258 case LTGT:
11259 return CCFPEmode;
11261 default:
11262 gcc_unreachable ();
11266 /* Equality comparisons of short modes against zero can be performed
11267 using the TST instruction with the appropriate bitmask. */
11268 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11269 && (code == EQ || code == NE)
11270 && (mode_x == HImode || mode_x == QImode))
11271 return CC_NZmode;
11273 /* Similarly, comparisons of zero_extends from shorter modes can
11274 be performed using an ANDS with an immediate mask. */
11275 if (y == const0_rtx && code_x == ZERO_EXTEND
11276 && (mode_x == SImode || mode_x == DImode)
11277 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11278 && (code == EQ || code == NE))
11279 return CC_NZmode;
11281 if ((mode_x == SImode || mode_x == DImode)
11282 && y == const0_rtx
11283 && (code == EQ || code == NE || code == LT || code == GE)
11284 && (code_x == PLUS || code_x == MINUS || code_x == AND
11285 || code_x == NEG
11286 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11287 && CONST_INT_P (XEXP (x, 2)))))
11288 return CC_NZmode;
11290 /* A compare with a shifted operand. Because of canonicalization,
11291 the comparison will have to be swapped when we emit the assembly
11292 code. */
11293 if ((mode_x == SImode || mode_x == DImode)
11294 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11295 && (code_x == ASHIFT || code_x == ASHIFTRT
11296 || code_x == LSHIFTRT
11297 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11298 return CC_SWPmode;
11300 /* Similarly for a negated operand, but we can only do this for
11301 equalities. */
11302 if ((mode_x == SImode || mode_x == DImode)
11303 && (REG_P (y) || SUBREG_P (y))
11304 && (code == EQ || code == NE)
11305 && code_x == NEG)
11306 return CC_Zmode;
11308 /* A test for unsigned overflow from an addition. */
11309 if ((mode_x == DImode || mode_x == TImode)
11310 && (code == LTU || code == GEU)
11311 && code_x == PLUS
11312 && rtx_equal_p (XEXP (x, 0), y))
11313 return CC_Cmode;
11315 /* A test for unsigned overflow from an add with carry. */
11316 if ((mode_x == DImode || mode_x == TImode)
11317 && (code == LTU || code == GEU)
11318 && code_x == PLUS
11319 && CONST_SCALAR_INT_P (y)
11320 && (rtx_mode_t (y, mode_x)
11321 == (wi::shwi (1, mode_x)
11322 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11323 return CC_ADCmode;
11325 /* A test for signed overflow. */
11326 if ((mode_x == DImode || mode_x == TImode)
11327 && code == NE
11328 && code_x == PLUS
11329 && GET_CODE (y) == SIGN_EXTEND)
11330 return CC_Vmode;
11332 /* For everything else, return CCmode. */
11333 return CCmode;
11336 static int
11337 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11340 aarch64_get_condition_code (rtx x)
11342 machine_mode mode = GET_MODE (XEXP (x, 0));
11343 enum rtx_code comp_code = GET_CODE (x);
11345 if (GET_MODE_CLASS (mode) != MODE_CC)
11346 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11347 return aarch64_get_condition_code_1 (mode, comp_code);
11350 static int
11351 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11353 switch (mode)
11355 case E_CCFPmode:
11356 case E_CCFPEmode:
11357 switch (comp_code)
11359 case GE: return AARCH64_GE;
11360 case GT: return AARCH64_GT;
11361 case LE: return AARCH64_LS;
11362 case LT: return AARCH64_MI;
11363 case NE: return AARCH64_NE;
11364 case EQ: return AARCH64_EQ;
11365 case ORDERED: return AARCH64_VC;
11366 case UNORDERED: return AARCH64_VS;
11367 case UNLT: return AARCH64_LT;
11368 case UNLE: return AARCH64_LE;
11369 case UNGT: return AARCH64_HI;
11370 case UNGE: return AARCH64_PL;
11371 default: return -1;
11373 break;
11375 case E_CCmode:
11376 switch (comp_code)
11378 case NE: return AARCH64_NE;
11379 case EQ: return AARCH64_EQ;
11380 case GE: return AARCH64_GE;
11381 case GT: return AARCH64_GT;
11382 case LE: return AARCH64_LE;
11383 case LT: return AARCH64_LT;
11384 case GEU: return AARCH64_CS;
11385 case GTU: return AARCH64_HI;
11386 case LEU: return AARCH64_LS;
11387 case LTU: return AARCH64_CC;
11388 default: return -1;
11390 break;
11392 case E_CC_SWPmode:
11393 switch (comp_code)
11395 case NE: return AARCH64_NE;
11396 case EQ: return AARCH64_EQ;
11397 case GE: return AARCH64_LE;
11398 case GT: return AARCH64_LT;
11399 case LE: return AARCH64_GE;
11400 case LT: return AARCH64_GT;
11401 case GEU: return AARCH64_LS;
11402 case GTU: return AARCH64_CC;
11403 case LEU: return AARCH64_CS;
11404 case LTU: return AARCH64_HI;
11405 default: return -1;
11407 break;
11409 case E_CC_NZCmode:
11410 switch (comp_code)
11412 case NE: return AARCH64_NE; /* = any */
11413 case EQ: return AARCH64_EQ; /* = none */
11414 case GE: return AARCH64_PL; /* = nfrst */
11415 case LT: return AARCH64_MI; /* = first */
11416 case GEU: return AARCH64_CS; /* = nlast */
11417 case GTU: return AARCH64_HI; /* = pmore */
11418 case LEU: return AARCH64_LS; /* = plast */
11419 case LTU: return AARCH64_CC; /* = last */
11420 default: return -1;
11422 break;
11424 case E_CC_NZmode:
11425 switch (comp_code)
11427 case NE: return AARCH64_NE;
11428 case EQ: return AARCH64_EQ;
11429 case GE: return AARCH64_PL;
11430 case LT: return AARCH64_MI;
11431 default: return -1;
11433 break;
11435 case E_CC_Zmode:
11436 switch (comp_code)
11438 case NE: return AARCH64_NE;
11439 case EQ: return AARCH64_EQ;
11440 default: return -1;
11442 break;
11444 case E_CC_Cmode:
11445 switch (comp_code)
11447 case LTU: return AARCH64_CS;
11448 case GEU: return AARCH64_CC;
11449 default: return -1;
11451 break;
11453 case E_CC_ADCmode:
11454 switch (comp_code)
11456 case GEU: return AARCH64_CS;
11457 case LTU: return AARCH64_CC;
11458 default: return -1;
11460 break;
11462 case E_CC_Vmode:
11463 switch (comp_code)
11465 case NE: return AARCH64_VS;
11466 case EQ: return AARCH64_VC;
11467 default: return -1;
11469 break;
11471 default:
11472 return -1;
11475 return -1;
11478 bool
11479 aarch64_const_vec_all_same_in_range_p (rtx x,
11480 HOST_WIDE_INT minval,
11481 HOST_WIDE_INT maxval)
11483 rtx elt;
11484 return (const_vec_duplicate_p (x, &elt)
11485 && CONST_INT_P (elt)
11486 && IN_RANGE (INTVAL (elt), minval, maxval));
11489 bool
11490 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11492 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11495 /* Return true if VEC is a constant in which every element is in the range
11496 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11498 static bool
11499 aarch64_const_vec_all_in_range_p (rtx vec,
11500 HOST_WIDE_INT minval,
11501 HOST_WIDE_INT maxval)
11503 if (!CONST_VECTOR_P (vec)
11504 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11505 return false;
11507 int nunits;
11508 if (!CONST_VECTOR_STEPPED_P (vec))
11509 nunits = const_vector_encoded_nelts (vec);
11510 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11511 return false;
11513 for (int i = 0; i < nunits; i++)
11515 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11516 if (!CONST_INT_P (vec_elem)
11517 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11518 return false;
11520 return true;
11523 /* N Z C V. */
11524 #define AARCH64_CC_V 1
11525 #define AARCH64_CC_C (1 << 1)
11526 #define AARCH64_CC_Z (1 << 2)
11527 #define AARCH64_CC_N (1 << 3)
11529 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11530 static const int aarch64_nzcv_codes[] =
11532 0, /* EQ, Z == 1. */
11533 AARCH64_CC_Z, /* NE, Z == 0. */
11534 0, /* CS, C == 1. */
11535 AARCH64_CC_C, /* CC, C == 0. */
11536 0, /* MI, N == 1. */
11537 AARCH64_CC_N, /* PL, N == 0. */
11538 0, /* VS, V == 1. */
11539 AARCH64_CC_V, /* VC, V == 0. */
11540 0, /* HI, C == 1 && Z == 0. */
11541 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11542 AARCH64_CC_V, /* GE, N == V. */
11543 0, /* LT, N != V. */
11544 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11545 0, /* LE, !(Z == 0 && N == V). */
11546 0, /* AL, Any. */
11547 0 /* NV, Any. */
11550 /* Print floating-point vector immediate operand X to F, negating it
11551 first if NEGATE is true. Return true on success, false if it isn't
11552 a constant we can handle. */
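/* For example, a duplicate of 2.0 is printed as "2.0" (one of the fixed SVE
   forms below) and a duplicate of 1.5 as "1.5"; with NEGATE, a duplicate of
   -0.5 is printed as "0.5".  */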
11554 static bool
11555 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11557 rtx elt;
11559 if (!const_vec_duplicate_p (x, &elt))
11560 return false;
11562 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11563 if (negate)
11564 r = real_value_negate (&r);
11566 /* Handle the SVE single-bit immediates specially, since they have a
11567 fixed form in the assembly syntax. */
11568 if (real_equal (&r, &dconst0))
11569 asm_fprintf (f, "0.0");
11570 else if (real_equal (&r, &dconst2))
11571 asm_fprintf (f, "2.0");
11572 else if (real_equal (&r, &dconst1))
11573 asm_fprintf (f, "1.0");
11574 else if (real_equal (&r, &dconsthalf))
11575 asm_fprintf (f, "0.5");
11576 else
11578 const int buf_size = 20;
11579 char float_buf[buf_size] = {'\0'};
11580 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11581 1, GET_MODE (elt));
11582 asm_fprintf (f, "%s", float_buf);
11585 return true;
11588 /* Return the equivalent letter for size. */
11589 static char
11590 sizetochar (int size)
11592 switch (size)
11594 case 64: return 'd';
11595 case 32: return 's';
11596 case 16: return 'h';
11597 case 8 : return 'b';
11598 default: gcc_unreachable ();
11602 /* Print operand X to file F in a target specific manner according to CODE.
11603 The acceptable formatting commands given by CODE are:
11604 'c': An integer or symbol address without a preceding #
11605 sign.
11606 'C': Take the duplicated element in a vector constant
11607 and print it in hex.
11608 'D': Take the duplicated element in a vector constant
11609 and print it as an unsigned integer, in decimal.
11610 'e': Print the sign/zero-extend size as a character 8->b,
11611 16->h, 32->w. Can also be used for masks:
11612 0xff->b, 0xffff->h, 0xffffffff->w.
11613 'I': If the operand is a duplicated vector constant,
11614 replace it with the duplicated scalar. If the
11615 operand is then a floating-point constant, replace
11616 it with the integer bit representation. Print the
11617 transformed constant as a signed decimal number.
11618 'p': Prints N such that 2^N == X (X must be a power of 2 and
11619 a const_int).
11620 'P': Print the number of non-zero bits in X (a const_int).
11621 'H': Print the higher numbered register of a pair (TImode)
11622 of regs.
11623 'm': Print a condition (eq, ne, etc).
11624 'M': Same as 'm', but invert condition.
11625 'N': Take the duplicated element in a vector constant
11626 and print the negative of it in decimal.
11627 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11628 'S/T/U/V': Print a FP/SIMD register name for a register list.
11629 The register printed is the FP/SIMD register name
11630 of X + 0/1/2/3 for S/T/U/V.
11631 'R': Print a scalar Integer/FP/SIMD register name + 1.
11632 'X': Print bottom 16 bits of integer constant in hex.
11633 'w/x': Print a general register name or the zero register
11634 (32-bit or 64-bit).
11635 '0': Print a normal operand, if it's a general register,
11636 then we assume DImode.
11637 'k': Print NZCV for conditional compare instructions.
11638 'A': Output address constant representing the first
11639 argument of X, specifying a relocation offset
11640 if appropriate.
11641 'L': Output constant address specified by X
11642 with a relocation offset if appropriate.
11643 'G': Prints address of X, specifying a PC relative
11644 relocation mode if appropriate.
11645 'y': Output address of LDP or STP - this is used for
11646 some LDP/STPs which don't use a PARALLEL in their
11647 pattern (so the mode needs to be adjusted).
11648 'z': Output address of a typical LDP or STP. */
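/* A few illustrative examples of the codes above (a sketch, not a
   definitive list):

     %w0 on (reg:SI x3)            prints  w3
     %x0 on (reg:DI x3)            prints  x3
     %q0 on (reg:V4SI v1)          prints  q1
     %H0 on (reg:TI x4)            prints  x5
     %p0 on (const_int 8)          prints  3
     %P0 on (const_int 7)          prints  3
     %X0 on (const_int 0x12345678) prints  0x5678  */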
11650 static void
11651 aarch64_print_operand (FILE *f, rtx x, int code)
11653 rtx elt;
11654 switch (code)
11656 case 'c':
11657 if (CONST_INT_P (x))
11658 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11659 else
11661 poly_int64 offset;
11662 rtx base = strip_offset_and_salt (x, &offset);
11663 if (SYMBOL_REF_P (base))
11664 output_addr_const (f, x);
11665 else
11666 output_operand_lossage ("unsupported operand for code '%c'", code);
11668 break;
11670 case 'e':
11672 x = unwrap_const_vec_duplicate (x);
11673 if (!CONST_INT_P (x))
11675 output_operand_lossage ("invalid operand for '%%%c'", code);
11676 return;
11679 HOST_WIDE_INT val = INTVAL (x);
11680 if ((val & ~7) == 8 || val == 0xff)
11681 fputc ('b', f);
11682 else if ((val & ~7) == 16 || val == 0xffff)
11683 fputc ('h', f);
11684 else if ((val & ~7) == 32 || val == 0xffffffff)
11685 fputc ('w', f);
11686 else
11688 output_operand_lossage ("invalid operand for '%%%c'", code);
11689 return;
11692 break;
11694 case 'p':
11696 int n;
11698 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11700 output_operand_lossage ("invalid operand for '%%%c'", code);
11701 return;
11704 asm_fprintf (f, "%d", n);
11706 break;
11708 case 'P':
11709 if (!CONST_INT_P (x))
11711 output_operand_lossage ("invalid operand for '%%%c'", code);
11712 return;
11715 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11716 break;
11718 case 'H':
11719 if (x == const0_rtx)
11721 asm_fprintf (f, "xzr");
11722 break;
11725 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11727 output_operand_lossage ("invalid operand for '%%%c'", code);
11728 return;
11731 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11732 break;
11734 case 'I':
11736 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11737 if (CONST_INT_P (x))
11738 asm_fprintf (f, "%wd", INTVAL (x));
11739 else
11741 output_operand_lossage ("invalid operand for '%%%c'", code);
11742 return;
11744 break;
11747 case 'M':
11748 case 'm':
11750 int cond_code;
11751 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11752 if (x == const_true_rtx)
11754 if (code == 'M')
11755 fputs ("nv", f);
11756 return;
11759 if (!COMPARISON_P (x))
11761 output_operand_lossage ("invalid operand for '%%%c'", code);
11762 return;
11765 cond_code = aarch64_get_condition_code (x);
11766 gcc_assert (cond_code >= 0);
11767 if (code == 'M')
11768 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11769 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11770 fputs (aarch64_sve_condition_codes[cond_code], f);
11771 else
11772 fputs (aarch64_condition_codes[cond_code], f);
11774 break;
11776 case 'N':
11777 if (!const_vec_duplicate_p (x, &elt))
11779 output_operand_lossage ("invalid vector constant");
11780 return;
11783 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11784 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
11785 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11786 && aarch64_print_vector_float_operand (f, x, true))
11788 else
11790 output_operand_lossage ("invalid vector constant");
11791 return;
11793 break;
11795 case 'b':
11796 case 'h':
11797 case 's':
11798 case 'd':
11799 case 'q':
11800 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11802 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11803 return;
11805 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
11806 break;
11808 case 'S':
11809 case 'T':
11810 case 'U':
11811 case 'V':
11812 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11814 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11815 return;
11817 asm_fprintf (f, "%c%d",
11818 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
11819 REGNO (x) - V0_REGNUM + (code - 'S'));
11820 break;
11822 case 'R':
11823 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
11824 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
11825 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
11826 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
11827 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
11828 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11829 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
11830 else
11831 output_operand_lossage ("incompatible register operand for '%%%c'",
11832 code);
11833 break;
11835 case 'X':
11836 if (!CONST_INT_P (x))
11838 output_operand_lossage ("invalid operand for '%%%c'", code);
11839 return;
11841 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
11842 break;
11844 case 'C':
11846 /* Print a replicated constant in hex. */
11847 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11849 output_operand_lossage ("invalid operand for '%%%c'", code);
11850 return;
11852 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11853 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11855 break;
11857 case 'D':
11859 /* Print a replicated constant in decimal, treating it as
11860 unsigned. */
11861 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11863 output_operand_lossage ("invalid operand for '%%%c'", code);
11864 return;
11866 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11867 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11869 break;
11871 case 'w':
11872 case 'x':
11873 if (x == const0_rtx
11874 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
11876 asm_fprintf (f, "%czr", code);
11877 break;
11880 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11882 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
11883 break;
11886 if (REG_P (x) && REGNO (x) == SP_REGNUM)
11888 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
11889 break;
11892 /* Fall through */
11894 case 0:
11895 if (x == NULL)
11897 output_operand_lossage ("missing operand");
11898 return;
11901 switch (GET_CODE (x))
11903 case REG:
11904 if (aarch64_sve_data_mode_p (GET_MODE (x)))
11906 if (REG_NREGS (x) == 1)
11907 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
11908 else
11910 char suffix
11911 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
11912 asm_fprintf (f, "{z%d.%c - z%d.%c}",
11913 REGNO (x) - V0_REGNUM, suffix,
11914 END_REGNO (x) - V0_REGNUM - 1, suffix);
11917 else
11918 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
11919 break;
11921 case MEM:
11922 output_address (GET_MODE (x), XEXP (x, 0));
11923 break;
11925 case LABEL_REF:
11926 case SYMBOL_REF:
11927 output_addr_const (asm_out_file, x);
11928 break;
11930 case CONST_INT:
11931 asm_fprintf (f, "%wd", INTVAL (x));
11932 break;
11934 case CONST:
11935 if (!VECTOR_MODE_P (GET_MODE (x)))
11937 output_addr_const (asm_out_file, x);
11938 break;
11940 /* fall through */
11942 case CONST_VECTOR:
11943 if (!const_vec_duplicate_p (x, &elt))
11945 output_operand_lossage ("invalid vector constant");
11946 return;
11949 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11950 asm_fprintf (f, "%wd", INTVAL (elt));
11951 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11952 && aarch64_print_vector_float_operand (f, x, false))
11954 else
11956 output_operand_lossage ("invalid vector constant");
11957 return;
11959 break;
11961 case CONST_DOUBLE:
11962 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
11963 be getting CONST_DOUBLEs holding integers. */
11964 gcc_assert (GET_MODE (x) != VOIDmode);
11965 if (aarch64_float_const_zero_rtx_p (x))
11967 fputc ('0', f);
11968 break;
11970 else if (aarch64_float_const_representable_p (x))
11972 #define buf_size 20
11973 char float_buf[buf_size] = {'\0'};
11974 real_to_decimal_for_mode (float_buf,
11975 CONST_DOUBLE_REAL_VALUE (x),
11976 buf_size, buf_size,
11977 1, GET_MODE (x));
11978 asm_fprintf (asm_out_file, "%s", float_buf);
11979 break;
11980 #undef buf_size
11982 output_operand_lossage ("invalid constant");
11983 return;
11984 default:
11985 output_operand_lossage ("invalid operand");
11986 return;
11988 break;
11990 case 'A':
11991 if (GET_CODE (x) == HIGH)
11992 x = XEXP (x, 0);
11994 switch (aarch64_classify_symbolic_expression (x))
11996 case SYMBOL_SMALL_GOT_4G:
11997 asm_fprintf (asm_out_file, ":got:");
11998 break;
12000 case SYMBOL_SMALL_TLSGD:
12001 asm_fprintf (asm_out_file, ":tlsgd:");
12002 break;
12004 case SYMBOL_SMALL_TLSDESC:
12005 asm_fprintf (asm_out_file, ":tlsdesc:");
12006 break;
12008 case SYMBOL_SMALL_TLSIE:
12009 asm_fprintf (asm_out_file, ":gottprel:");
12010 break;
12012 case SYMBOL_TLSLE24:
12013 asm_fprintf (asm_out_file, ":tprel:");
12014 break;
12016 case SYMBOL_TINY_GOT:
12017 gcc_unreachable ();
12018 break;
12020 default:
12021 break;
12023 output_addr_const (asm_out_file, x);
12024 break;
12026 case 'L':
12027 switch (aarch64_classify_symbolic_expression (x))
12029 case SYMBOL_SMALL_GOT_4G:
12030 asm_fprintf (asm_out_file, ":got_lo12:");
12031 break;
12033 case SYMBOL_SMALL_TLSGD:
12034 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12035 break;
12037 case SYMBOL_SMALL_TLSDESC:
12038 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12039 break;
12041 case SYMBOL_SMALL_TLSIE:
12042 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12043 break;
12045 case SYMBOL_TLSLE12:
12046 asm_fprintf (asm_out_file, ":tprel_lo12:");
12047 break;
12049 case SYMBOL_TLSLE24:
12050 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12051 break;
12053 case SYMBOL_TINY_GOT:
12054 asm_fprintf (asm_out_file, ":got:");
12055 break;
12057 case SYMBOL_TINY_TLSIE:
12058 asm_fprintf (asm_out_file, ":gottprel:");
12059 break;
12061 default:
12062 break;
12064 output_addr_const (asm_out_file, x);
12065 break;
12067 case 'G':
12068 switch (aarch64_classify_symbolic_expression (x))
12070 case SYMBOL_TLSLE24:
12071 asm_fprintf (asm_out_file, ":tprel_hi12:");
12072 break;
12073 default:
12074 break;
12076 output_addr_const (asm_out_file, x);
12077 break;
12079 case 'k':
12081 HOST_WIDE_INT cond_code;
12083 if (!CONST_INT_P (x))
12085 output_operand_lossage ("invalid operand for '%%%c'", code);
12086 return;
12089 cond_code = INTVAL (x);
12090 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12091 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12093 break;
12095 case 'y':
12096 case 'z':
12098 machine_mode mode = GET_MODE (x);
12100 if (!MEM_P (x)
12101 || (code == 'y'
12102 && maybe_ne (GET_MODE_SIZE (mode), 8)
12103 && maybe_ne (GET_MODE_SIZE (mode), 16)))
12105 output_operand_lossage ("invalid operand for '%%%c'", code);
12106 return;
12109 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12110 code == 'y'
12111 ? ADDR_QUERY_LDP_STP_N
12112 : ADDR_QUERY_LDP_STP))
12113 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12115 break;
12117 default:
12118 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12119 return;
12123 /* Print address 'x' of a memory access with mode 'mode'. 'type' is the
12124 aarch64_addr_query_type context passed to aarch64_classify_address, e.g.
12125 ADDR_QUERY_ANY for a normal access or ADDR_QUERY_LDP_STP for an LDP/STP. */
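/* For reference, the forms emitted below look like (a sketch):
     [x0]  [x0, 16]  [x0, #2, mul vl]  [x0, x1, lsl 3]  [x0, w1, uxtw 2]
     [x0, w1, sxtw]  [x0, 16]!  [x0], 16  [x0, #:lo12:sym]  */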
12126 static bool
12127 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12128 aarch64_addr_query_type type)
12130 struct aarch64_address_info addr;
12131 unsigned int size, vec_flags;
12133 /* Check all addresses are Pmode - including ILP32. */
12134 if (GET_MODE (x) != Pmode
12135 && (!CONST_INT_P (x)
12136 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12138 output_operand_lossage ("invalid address mode");
12139 return false;
12142 if (aarch64_classify_address (&addr, x, mode, true, type))
12143 switch (addr.type)
12145 case ADDRESS_REG_IMM:
12146 if (known_eq (addr.const_offset, 0))
12148 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12149 return true;
12152 vec_flags = aarch64_classify_vector_mode (mode);
12153 if (vec_flags & VEC_ANY_SVE)
12155 HOST_WIDE_INT vnum
12156 = exact_div (addr.const_offset,
12157 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12158 asm_fprintf (f, "[%s, #%wd, mul vl]",
12159 reg_names[REGNO (addr.base)], vnum);
12160 return true;
12163 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12164 INTVAL (addr.offset));
12165 return true;
12167 case ADDRESS_REG_REG:
12168 if (addr.shift == 0)
12169 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12170 reg_names [REGNO (addr.offset)]);
12171 else
12172 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12173 reg_names [REGNO (addr.offset)], addr.shift);
12174 return true;
12176 case ADDRESS_REG_UXTW:
12177 if (addr.shift == 0)
12178 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12179 REGNO (addr.offset) - R0_REGNUM);
12180 else
12181 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12182 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12183 return true;
12185 case ADDRESS_REG_SXTW:
12186 if (addr.shift == 0)
12187 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12188 REGNO (addr.offset) - R0_REGNUM);
12189 else
12190 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12191 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12192 return true;
12194 case ADDRESS_REG_WB:
12195 /* Writeback is only supported for fixed-width modes. */
12196 size = GET_MODE_SIZE (mode).to_constant ();
12197 switch (GET_CODE (x))
12199 case PRE_INC:
12200 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12201 return true;
12202 case POST_INC:
12203 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12204 return true;
12205 case PRE_DEC:
12206 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12207 return true;
12208 case POST_DEC:
12209 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12210 return true;
12211 case PRE_MODIFY:
12212 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12213 INTVAL (addr.offset));
12214 return true;
12215 case POST_MODIFY:
12216 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12217 INTVAL (addr.offset));
12218 return true;
12219 default:
12220 break;
12222 break;
12224 case ADDRESS_LO_SUM:
12225 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12226 output_addr_const (f, addr.offset);
12227 asm_fprintf (f, "]");
12228 return true;
12230 case ADDRESS_SYMBOLIC:
12231 output_addr_const (f, x);
12232 return true;
12235 return false;
12238 /* Print address 'x' of a memory access with mode 'mode'. */
12239 static void
12240 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12242 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12243 output_addr_const (f, x);
12246 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12248 static bool
12249 aarch64_output_addr_const_extra (FILE *file, rtx x)
12251 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12253 output_addr_const (file, XVECEXP (x, 0, 0));
12254 return true;
12256 return false;
12259 bool
12260 aarch64_label_mentioned_p (rtx x)
12262 const char *fmt;
12263 int i;
12265 if (LABEL_REF_P (x))
12266 return true;
12268 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12269 referencing instruction, but they are constant offsets, not
12270 symbols. */
12271 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12272 return false;
12274 fmt = GET_RTX_FORMAT (GET_CODE (x));
12275 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12277 if (fmt[i] == 'E')
12279 int j;
12281 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12282 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12283 return 1;
12285 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12286 return 1;
12289 return 0;
12292 /* Implement REGNO_REG_CLASS. */
12294 enum reg_class
12295 aarch64_regno_regclass (unsigned regno)
12297 if (STUB_REGNUM_P (regno))
12298 return STUB_REGS;
12300 if (GP_REGNUM_P (regno))
12301 return GENERAL_REGS;
12303 if (regno == SP_REGNUM)
12304 return STACK_REG;
12306 if (regno == FRAME_POINTER_REGNUM
12307 || regno == ARG_POINTER_REGNUM)
12308 return POINTER_REGS;
12310 if (FP_REGNUM_P (regno))
12311 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12312 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12314 if (PR_REGNUM_P (regno))
12315 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12317 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12318 return FFR_REGS;
12320 return NO_REGS;
12323 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12324 If OFFSET is out of range, return an offset of an anchor point
12325 that is in range. Return 0 otherwise. */
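/* For example (a sketch), an unaligned SImode offset of 0x301 yields an
   anchor of 0x400, leaving a residual of -0xff which fits the signed 9-bit
   unscaled range.  */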
12327 static HOST_WIDE_INT
12328 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12329 machine_mode mode)
12331 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12332 if (size > 16)
12333 return (offset + 0x400) & ~0x7f0;
12335 /* For offsets that aren't a multiple of the access size, the limit is
12336 -256...255. */
12337 if (offset & (size - 1))
12339 /* BLKmode typically uses LDP of X-registers. */
12340 if (mode == BLKmode)
12341 return (offset + 512) & ~0x3ff;
12342 return (offset + 0x100) & ~0x1ff;
12345 /* Small negative offsets are supported. */
12346 if (IN_RANGE (offset, -256, 0))
12347 return 0;
12349 if (mode == TImode || mode == TFmode || mode == TDmode)
12350 return (offset + 0x100) & ~0x1ff;
12352 /* Use a 12-bit unsigned offset, scaled by the access size. */
12353 return offset & (~0xfff * size);
12356 static rtx
12357 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12359 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12360 where mask is selected by alignment and size of the offset.
12361 We try to pick as large a range for the offset as possible to
12362 maximize the chance of a CSE. However, for aligned addresses
12363 we limit the range to 4k so that structures with different sized
12364 elements are likely to use the same base. We need to be careful
12365 not to split a CONST for some forms of address expression, otherwise
12366 it will generate sub-optimal code. */
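/* For example (a sketch, with "tmp" standing for a scratch pseudo): an SImode
   access at (plus:DI (reg:DI x1) (const_int 0x12340)) is rewritten as
   tmp = x1 + 0x10000 followed by [tmp, 0x2340], so that nearby accesses can
   CSE the anchor held in tmp.  */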
12368 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12370 rtx base = XEXP (x, 0);
12371 rtx offset_rtx = XEXP (x, 1);
12372 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12374 if (GET_CODE (base) == PLUS)
12376 rtx op0 = XEXP (base, 0);
12377 rtx op1 = XEXP (base, 1);
12379 /* Force any scaling into a temp for CSE. */
12380 op0 = force_reg (Pmode, op0);
12381 op1 = force_reg (Pmode, op1);
12383 /* Let the pointer register be in op0. */
12384 if (REG_POINTER (op1))
12385 std::swap (op0, op1);
12387 /* If the pointer is virtual or frame related, then we know that
12388 virtual register instantiation or register elimination is going
12389 to apply a second constant. We want the two constants folded
12390 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12391 if (virt_or_elim_regno_p (REGNO (op0)))
12393 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12394 NULL_RTX, true, OPTAB_DIRECT);
12395 return gen_rtx_PLUS (Pmode, base, op1);
12398 /* Otherwise, in order to encourage CSE (and thence loop strength
12399 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12400 base = expand_binop (Pmode, add_optab, op0, op1,
12401 NULL_RTX, true, OPTAB_DIRECT);
12402 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12405 HOST_WIDE_INT size;
12406 if (GET_MODE_SIZE (mode).is_constant (&size))
12408 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12409 mode);
12410 if (base_offset != 0)
12412 base = plus_constant (Pmode, base, base_offset);
12413 base = force_operand (base, NULL_RTX);
12414 return plus_constant (Pmode, base, offset - base_offset);
12419 return x;
12422 static reg_class_t
12423 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12424 reg_class_t rclass,
12425 machine_mode mode,
12426 secondary_reload_info *sri)
12428 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12429 LDR and STR. See the comment at the head of aarch64-sve.md for
12430 more details about the big-endian handling. */
12431 if (reg_class_subset_p (rclass, FP_REGS)
12432 && !((REG_P (x) && HARD_REGISTER_P (x))
12433 || aarch64_simd_valid_immediate (x, NULL))
12434 && mode != VNx16QImode)
12436 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12437 if ((vec_flags & VEC_SVE_DATA)
12438 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12440 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12441 return NO_REGS;
12445 /* If we have to disable direct literal pool loads and stores because the
12446 function is too big, then we need a scratch register. */
12447 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12448 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12449 || targetm.vector_mode_supported_p (GET_MODE (x)))
12450 && !aarch64_pcrelative_literal_loads)
12452 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12453 return NO_REGS;
12456 /* Without the TARGET_SIMD instructions we cannot move a Q register
12457 to a Q register directly. We need a scratch. */
12458 if (REG_P (x)
12459 && (mode == TFmode || mode == TImode || mode == TDmode)
12460 && mode == GET_MODE (x)
12461 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
12462 && reg_class_subset_p (rclass, FP_REGS))
12464 sri->icode = code_for_aarch64_reload_mov (mode);
12465 return NO_REGS;
12468 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12469 because AArch64 has richer addressing modes for LDR/STR instructions
12470 than LDP/STP instructions. */
12471 if (TARGET_FLOAT && rclass == GENERAL_REGS
12472 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12473 return FP_REGS;
12475 if (rclass == FP_REGS
12476 && (mode == TImode || mode == TFmode || mode == TDmode)
12477 && CONSTANT_P(x))
12478 return GENERAL_REGS;
12480 return NO_REGS;
12483 static bool
12484 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12486 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12488 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12489 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12490 if (frame_pointer_needed)
12491 return to == HARD_FRAME_POINTER_REGNUM;
12492 return true;
12495 poly_int64
12496 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12498 if (to == HARD_FRAME_POINTER_REGNUM)
12500 if (from == ARG_POINTER_REGNUM)
12501 return cfun->machine->frame.hard_fp_offset;
12503 if (from == FRAME_POINTER_REGNUM)
12504 return cfun->machine->frame.hard_fp_offset
12505 - cfun->machine->frame.locals_offset;
12508 if (to == STACK_POINTER_REGNUM)
12510 if (from == FRAME_POINTER_REGNUM)
12511 return cfun->machine->frame.frame_size
12512 - cfun->machine->frame.locals_offset;
12515 return cfun->machine->frame.frame_size;
12519 /* Get return address without mangling. */
12522 aarch64_return_addr_rtx (void)
12524 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12525 /* Note: aarch64_return_address_signing_enabled only
12526 works after cfun->machine->frame.laid_out is set,
12527 so here we don't know if the return address will
12528 be signed or not. */
12529 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12530 emit_move_insn (lr, val);
12531 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12532 return lr;
12536 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12537 previous frame. */
12540 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12542 if (count != 0)
12543 return const0_rtx;
12544 return aarch64_return_addr_rtx ();
12547 static void
12548 aarch64_asm_trampoline_template (FILE *f)
12550 /* Even if the current function doesn't have branch protection, some
12551 later function might, so since this template is only generated once
12552 we have to add a BTI just in case. */
12553 asm_fprintf (f, "\thint\t34 // bti c\n");
12555 if (TARGET_ILP32)
12557 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12558 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12560 else
12562 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12563 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12565 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12567 /* We always emit a speculation barrier.
12568 This is because the same trampoline template is used for every nested
12569 function. Since nested functions are not particularly common or
12570 performance-critical, we don't worry too much about the extra
12571 instructions that get copied around.
12572 This is not yet a problem, since we have not yet implemented function
12573 specific attributes to choose between hardening against straight line
12574 speculation or not, but such function specific attributes are likely to
12575 happen in the future. */
12576 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12578 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12579 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
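/* Illustrative sketch (not from the original source): once
   aarch64_trampoline_init below has run, a trampoline consists of the
   24 bytes of code emitted above followed by two pointer-sized data
   slots, which the PC-relative ".+20"/".+24" LDRs above reach:

	bti	c
	ldr	IP1, .+20		// loads the target function address
	ldr	STATIC_CHAIN, .+24	// loads the static chain value
	br	IP1
	dsb	sy
	isb
	.xword	<address of the nested function>
	.xword	<static chain value>

   (In ILP32 the data slots are word-sized and the loads use W registers.)  */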
12582 static void
12583 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12585 rtx fnaddr, mem, a_tramp;
12586 const int tramp_code_sz = 24;
12588 /* We don't need to copy the trailing D-words; we fill those in below. */
12589 /* We create our own memory address in Pmode so that `emit_block_move` can
12590 use parts of the backend which expect Pmode addresses. */
12591 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12592 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12593 assemble_trampoline_template (),
12594 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12595 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12596 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12597 if (GET_MODE (fnaddr) != ptr_mode)
12598 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12599 emit_move_insn (mem, fnaddr);
12601 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12602 emit_move_insn (mem, chain_value);
12604 /* XXX We should really define a "clear_cache" pattern and use
12605 gen_clear_cache(). */
12606 a_tramp = XEXP (m_tramp, 0);
12607 maybe_emit_call_builtin___clear_cache (a_tramp,
12608 plus_constant (ptr_mode,
12609 a_tramp,
12610 TRAMPOLINE_SIZE));
12613 static unsigned char
12614 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12616 /* ??? Logically we should only need to provide a value when
12617 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12618 can hold MODE, but at the moment we need to handle all modes.
12619 Just ignore any runtime parts for registers that can't store them. */
12620 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12621 unsigned int nregs, vec_flags;
12622 switch (regclass)
12624 case STUB_REGS:
12625 case TAILCALL_ADDR_REGS:
12626 case POINTER_REGS:
12627 case GENERAL_REGS:
12628 case ALL_REGS:
12629 case POINTER_AND_FP_REGS:
12630 case FP_REGS:
12631 case FP_LO_REGS:
12632 case FP_LO8_REGS:
12633 vec_flags = aarch64_classify_vector_mode (mode);
12634 if ((vec_flags & VEC_SVE_DATA)
12635 && constant_multiple_p (GET_MODE_SIZE (mode),
12636 aarch64_vl_bytes (mode, vec_flags), &nregs))
12637 return nregs;
12638 return (vec_flags & VEC_ADVSIMD
12639 ? CEIL (lowest_size, UNITS_PER_VREG)
12640 : CEIL (lowest_size, UNITS_PER_WORD));
12641 case STACK_REG:
12642 case PR_REGS:
12643 case PR_LO_REGS:
12644 case PR_HI_REGS:
12645 case FFR_REGS:
12646 case PR_AND_FFR_REGS:
12647 return 1;
12649 case NO_REGS:
12650 return 0;
12652 default:
12653 break;
12655 gcc_unreachable ();
12658 static reg_class_t
12659 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12661 if (regclass == POINTER_REGS)
12662 return GENERAL_REGS;
12664 if (regclass == STACK_REG)
12666 if (REG_P(x)
12667 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12668 return regclass;
12670 return NO_REGS;
12673 /* Register elimination can result in a request for
12674 SP+constant->FP_REGS. We cannot support such operations, which
12675 use SP as the source and an FP_REG as the destination, so reject
12676 them outright now. */
12677 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12679 rtx lhs = XEXP (x, 0);
12681 /* Look through a possible SUBREG introduced by ILP32. */
12682 if (SUBREG_P (lhs))
12683 lhs = SUBREG_REG (lhs);
12685 gcc_assert (REG_P (lhs));
12686 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12687 POINTER_REGS));
12688 return NO_REGS;
12691 return regclass;
12694 void
12695 aarch64_asm_output_labelref (FILE* f, const char *name)
12697 asm_fprintf (f, "%U%s", name);
12700 static void
12701 aarch64_elf_asm_constructor (rtx symbol, int priority)
12703 if (priority == DEFAULT_INIT_PRIORITY)
12704 default_ctor_section_asm_out_constructor (symbol, priority);
12705 else
12707 section *s;
12708 /* Although priority is known to be in the range [0, 65535], so 18 bytes
12709 would be enough, the compiler might not know that. To avoid a
12710 -Wformat-truncation false positive, use a larger size.
12711 char buf[23];
12712 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12713 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12714 switch_to_section (s);
12715 assemble_align (POINTER_SIZE);
12716 assemble_aligned_integer (POINTER_BYTES, symbol);
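/* For example (illustrative): a constructor registered with priority 101
   is placed in a section named ".init_array.00101"; the %.5u zero-padding
   keeps the linker's name-based sorting of these sections consistent with
   numeric priority order.  */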
12720 static void
12721 aarch64_elf_asm_destructor (rtx symbol, int priority)
12723 if (priority == DEFAULT_INIT_PRIORITY)
12724 default_dtor_section_asm_out_destructor (symbol, priority);
12725 else
12727 section *s;
12728 /* Although priority is known to be in the range [0, 65535], so 18 bytes
12729 would be enough, the compiler might not know that. To avoid a
12730 -Wformat-truncation false positive, use a larger size.
12731 char buf[23];
12732 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12733 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12734 switch_to_section (s);
12735 assemble_align (POINTER_SIZE);
12736 assemble_aligned_integer (POINTER_BYTES, symbol);
12740 const char*
12741 aarch64_output_casesi (rtx *operands)
12743 char buf[100];
12744 char label[100];
12745 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12746 int index;
12747 static const char *const patterns[4][2] =
12750 "ldrb\t%w3, [%0,%w1,uxtw]",
12751 "add\t%3, %4, %w3, sxtb #2"
12754 "ldrh\t%w3, [%0,%w1,uxtw #1]",
12755 "add\t%3, %4, %w3, sxth #2"
12758 "ldr\t%w3, [%0,%w1,uxtw #2]",
12759 "add\t%3, %4, %w3, sxtw #2"
12761 /* We assume that DImode is only generated when not optimizing and
12762 that we don't really need 64-bit address offsets. That would
12763 imply an object file with 8GB of code in a single function! */
12765 "ldr\t%w3, [%0,%w1,uxtw #2]",
12766 "add\t%3, %4, %w3, sxtw #2"
12770 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
12772 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
12773 index = exact_log2 (GET_MODE_SIZE (mode));
12775 gcc_assert (index >= 0 && index <= 3);
12777 /* Need to implement table size reduction, by changing the code below. */
12778 output_asm_insn (patterns[index][0], operands);
12779 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
12780 snprintf (buf, sizeof (buf),
12781 "adr\t%%4, %s", targetm.strip_name_encoding (label));
12782 output_asm_insn (buf, operands);
12783 output_asm_insn (patterns[index][1], operands);
12784 output_asm_insn ("br\t%3", operands);
12785 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
12786 operands);
12787 assemble_label (asm_out_file, label);
12788 return "";
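/* Illustrative example (not from the original source): for a HImode
   dispatch table the sequence emitted above is roughly (operands written
   as in the patterns array)

	ldrh	%w3, [%0, %w1, uxtw #1]	// fetch the table entry
	adr	%4, .Lrtx<N>		// base address of the table
	add	%3, %4, %w3, sxth #2	// entries are label offsets scaled by 4
	br	%3
	<SLS speculation barrier, if enabled>
   .Lrtx<N>:
*/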
12792 /* Return size in bits of an arithmetic operand which is shifted/scaled and
12793 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
12794 operator. */
12797 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
12799 if (shift >= 0 && shift <= 3)
12801 int size;
12802 for (size = 8; size <= 32; size *= 2)
12804 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
12805 if (mask == bits << shift)
12806 return size;
12809 return 0;
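/* Worked example (illustrative): aarch64_uxt_size (1, 0x1fe) returns 8,
   since 0x1fe == 0xff << 1, i.e. the operand behaves like UXTB combined
   with an LSL #1.  aarch64_uxt_size (2, 0xff) returns 0, since 0xff is not
   0xff, 0xffff or 0xffffffff shifted left by 2.  */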
12812 /* Constant pools are per-function only when PC-relative
12813 literal loads are enabled or we are using the large memory
12814 model. */
12816 static inline bool
12817 aarch64_can_use_per_function_literal_pools_p (void)
12819 return (aarch64_pcrelative_literal_loads
12820 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
12823 static bool
12824 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
12826 /* We can't use blocks for constants when we're using a per-function
12827 constant pool. */
12828 return !aarch64_can_use_per_function_literal_pools_p ();
12831 /* Select appropriate section for constants depending
12832 on where we place literal pools. */
12834 static section *
12835 aarch64_select_rtx_section (machine_mode mode,
12836 rtx x,
12837 unsigned HOST_WIDE_INT align)
12839 if (aarch64_can_use_per_function_literal_pools_p ())
12840 return function_section (current_function_decl);
12842 return default_elf_select_rtx_section (mode, x, align);
12845 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
12846 void
12847 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
12848 HOST_WIDE_INT offset)
12850 /* When using per-function literal pools, we must ensure that any code
12851 section is aligned to the minimal instruction length, lest we get
12852 errors from the assembler re "unaligned instructions". */
12853 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
12854 ASM_OUTPUT_ALIGN (f, 2);
12857 /* Costs. */
12859 /* Helper function for rtx cost calculation. Strip a shift expression
12860 from X. Returns the inner operand if successful, or the original
12861 expression on failure. */
12862 static rtx
12863 aarch64_strip_shift (rtx x)
12865 rtx op = x;
12867 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
12868 we can convert both to ROR during final output. */
12869 if ((GET_CODE (op) == ASHIFT
12870 || GET_CODE (op) == ASHIFTRT
12871 || GET_CODE (op) == LSHIFTRT
12872 || GET_CODE (op) == ROTATERT
12873 || GET_CODE (op) == ROTATE)
12874 && CONST_INT_P (XEXP (op, 1)))
12875 return XEXP (op, 0);
12877 if (GET_CODE (op) == MULT
12878 && CONST_INT_P (XEXP (op, 1))
12879 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
12880 return XEXP (op, 0);
12882 return x;
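/* For example (illustrative): (ashift:DI (reg:DI x1) (const_int 3))
   strips to (reg:DI x1), and so does (mult:DI (reg:DI x1) (const_int 8)),
   since a multiplication by a power of two will be emitted as a shift.  */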
12885 /* Helper function for rtx cost calculation. Strip an extend
12886 expression from X. Returns the inner operand if successful, or the
12887 original expression on failure. We deal with a number of possible
12888 canonicalization variations here. If STRIP_SHIFT is true, then
12889 we can strip off a shift also. */
12890 static rtx
12891 aarch64_strip_extend (rtx x, bool strip_shift)
12893 scalar_int_mode mode;
12894 rtx op = x;
12896 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
12897 return op;
12899 if (GET_CODE (op) == AND
12900 && GET_CODE (XEXP (op, 0)) == MULT
12901 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
12902 && CONST_INT_P (XEXP (op, 1))
12903 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
12904 INTVAL (XEXP (op, 1))) != 0)
12905 return XEXP (XEXP (op, 0), 0);
12907 /* Now handle extended register, as this may also have an optional
12908 left shift by 1..4. */
12909 if (strip_shift
12910 && GET_CODE (op) == ASHIFT
12911 && CONST_INT_P (XEXP (op, 1))
12912 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
12913 op = XEXP (op, 0);
12915 if (GET_CODE (op) == ZERO_EXTEND
12916 || GET_CODE (op) == SIGN_EXTEND)
12917 op = XEXP (op, 0);
12919 if (op != x)
12920 return op;
12922 return x;
12925 /* Helper function for rtx cost calculation. Strip extension as well as any
12926 inner VEC_SELECT high-half from X. Returns the inner vector operand if
12927 successful, or the original expression on failure. */
12928 static rtx
12929 aarch64_strip_extend_vec_half (rtx x)
12931 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
12933 x = XEXP (x, 0);
12934 if (GET_CODE (x) == VEC_SELECT
12935 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
12936 XEXP (x, 1)))
12937 x = XEXP (x, 0);
12939 return x;
12942 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
12943 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
12944 operand if successful, or the original expression on failure. */
12945 static rtx
12946 aarch64_strip_duplicate_vec_elt (rtx x)
12948 if (GET_CODE (x) == VEC_DUPLICATE
12949 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
12951 x = XEXP (x, 0);
12952 if (GET_CODE (x) == VEC_SELECT)
12953 x = XEXP (x, 0);
12954 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
12955 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
12956 x = XEXP (XEXP (x, 0), 0);
12958 return x;
12961 /* Return true iff CODE is a shift supported in combination
12962 with arithmetic instructions. */
12964 static bool
12965 aarch64_shift_p (enum rtx_code code)
12967 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
12971 /* Return true iff X is a cheap shift without a sign extend. */
12973 static bool
12974 aarch64_cheap_mult_shift_p (rtx x)
12976 rtx op0, op1;
12978 op0 = XEXP (x, 0);
12979 op1 = XEXP (x, 1);
12981 if (!(aarch64_tune_params.extra_tuning_flags
12982 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
12983 return false;
12985 if (GET_CODE (op0) == SIGN_EXTEND)
12986 return false;
12988 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
12989 && UINTVAL (op1) <= 4)
12990 return true;
12992 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
12993 return false;
12995 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
12997 if (l2 > 0 && l2 <= 4)
12998 return true;
13000 return false;
13003 /* Helper function for rtx cost calculation. Calculate the cost of
13004 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13005 Return the calculated cost of the expression, recursing manually in to
13006 operands where needed. */
13008 static int
13009 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13011 rtx op0, op1;
13012 const struct cpu_cost_table *extra_cost
13013 = aarch64_tune_params.insn_extra_cost;
13014 int cost = 0;
13015 bool compound_p = (outer == PLUS || outer == MINUS);
13016 machine_mode mode = GET_MODE (x);
13018 gcc_checking_assert (code == MULT);
13020 op0 = XEXP (x, 0);
13021 op1 = XEXP (x, 1);
13023 if (VECTOR_MODE_P (mode))
13025 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13026 if (vec_flags & VEC_ADVSIMD)
13028 /* The select-operand-high-half versions of the instruction have the
13029 same cost as the three vector version - don't add the costs of the
13030 extension or selection into the costs of the multiply. */
13031 op0 = aarch64_strip_extend_vec_half (op0);
13032 op1 = aarch64_strip_extend_vec_half (op1);
13033 /* The by-element versions of the instruction have the same costs as
13034 the normal 3-vector version. We make an assumption that the input
13035 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13036 costing of a MUL by element pre RA is a bit optimistic. */
13037 op0 = aarch64_strip_duplicate_vec_elt (op0);
13038 op1 = aarch64_strip_duplicate_vec_elt (op1);
13040 cost += rtx_cost (op0, mode, MULT, 0, speed);
13041 cost += rtx_cost (op1, mode, MULT, 1, speed);
13042 if (speed)
13044 if (GET_CODE (x) == MULT)
13045 cost += extra_cost->vect.mult;
13046 /* This is to catch the SSRA costing currently flowing here. */
13047 else
13048 cost += extra_cost->vect.alu;
13050 return cost;
13053 /* Integer multiply/fma. */
13054 if (GET_MODE_CLASS (mode) == MODE_INT)
13056 /* The multiply will be canonicalized as a shift, cost it as such. */
13057 if (aarch64_shift_p (GET_CODE (x))
13058 || (CONST_INT_P (op1)
13059 && exact_log2 (INTVAL (op1)) > 0))
13061 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13062 || GET_CODE (op0) == SIGN_EXTEND;
13063 if (speed)
13065 if (compound_p)
13067 /* If the shift is considered cheap,
13068 then don't add any cost. */
13069 if (aarch64_cheap_mult_shift_p (x))
13071 else if (REG_P (op1))
13072 /* ARITH + shift-by-register. */
13073 cost += extra_cost->alu.arith_shift_reg;
13074 else if (is_extend)
13075 /* ARITH + extended register. We don't have a cost field
13076 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13077 cost += extra_cost->alu.extend_arith;
13078 else
13079 /* ARITH + shift-by-immediate. */
13080 cost += extra_cost->alu.arith_shift;
13082 else
13083 /* LSL (immediate). */
13084 cost += extra_cost->alu.shift;
13087 /* Strip extends as we will have costed them in the case above. */
13088 if (is_extend)
13089 op0 = aarch64_strip_extend (op0, true);
13091 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13093 return cost;
13096 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13097 compound and let the below cases handle it. After all, MNEG is a
13098 special-case alias of MSUB. */
13099 if (GET_CODE (op0) == NEG)
13101 op0 = XEXP (op0, 0);
13102 compound_p = true;
13105 /* Integer multiplies or FMAs have zero/sign extending variants. */
13106 if ((GET_CODE (op0) == ZERO_EXTEND
13107 && GET_CODE (op1) == ZERO_EXTEND)
13108 || (GET_CODE (op0) == SIGN_EXTEND
13109 && GET_CODE (op1) == SIGN_EXTEND))
13111 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13112 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13114 if (speed)
13116 if (compound_p)
13117 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13118 cost += extra_cost->mult[0].extend_add;
13119 else
13120 /* MUL/SMULL/UMULL. */
13121 cost += extra_cost->mult[0].extend;
13124 return cost;
13127 /* This is either an integer multiply or a MADD. In both cases
13128 we want to recurse and cost the operands. */
13129 cost += rtx_cost (op0, mode, MULT, 0, speed);
13130 cost += rtx_cost (op1, mode, MULT, 1, speed);
13132 if (speed)
13134 if (compound_p)
13135 /* MADD/MSUB. */
13136 cost += extra_cost->mult[mode == DImode].add;
13137 else
13138 /* MUL. */
13139 cost += extra_cost->mult[mode == DImode].simple;
13142 return cost;
13144 else
13146 if (speed)
13148 /* Floating-point FMA/FMUL can also support negations of the
13149 operands, unless the rounding mode is upward or downward, in
13150 which case FNMUL is different from FMUL with operand negation. */
13151 bool neg0 = GET_CODE (op0) == NEG;
13152 bool neg1 = GET_CODE (op1) == NEG;
13153 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13155 if (neg0)
13156 op0 = XEXP (op0, 0);
13157 if (neg1)
13158 op1 = XEXP (op1, 0);
13161 if (compound_p)
13162 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13163 cost += extra_cost->fp[mode == DFmode].fma;
13164 else
13165 /* FMUL/FNMUL. */
13166 cost += extra_cost->fp[mode == DFmode].mult;
13169 cost += rtx_cost (op0, mode, MULT, 0, speed);
13170 cost += rtx_cost (op1, mode, MULT, 1, speed);
13171 return cost;
13175 static int
13176 aarch64_address_cost (rtx x,
13177 machine_mode mode,
13178 addr_space_t as ATTRIBUTE_UNUSED,
13179 bool speed)
13181 enum rtx_code c = GET_CODE (x);
13182 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13183 struct aarch64_address_info info;
13184 int cost = 0;
13185 info.shift = 0;
13187 if (!aarch64_classify_address (&info, x, mode, false))
13189 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13191 /* This is a CONST or SYMBOL ref which will be split
13192 in a different way depending on the code model in use.
13193 Cost it through the generic infrastructure. */
13194 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13195 /* Divide through by the cost of one instruction to
13196 bring it to the same units as the address costs. */
13197 cost_symbol_ref /= COSTS_N_INSNS (1);
13198 /* The cost is then the cost of preparing the address,
13199 followed by an immediate (possibly 0) offset. */
13200 return cost_symbol_ref + addr_cost->imm_offset;
13202 else
13204 /* This is most likely a jump table from a case
13205 statement. */
13206 return addr_cost->register_offset;
13210 switch (info.type)
13212 case ADDRESS_LO_SUM:
13213 case ADDRESS_SYMBOLIC:
13214 case ADDRESS_REG_IMM:
13215 cost += addr_cost->imm_offset;
13216 break;
13218 case ADDRESS_REG_WB:
13219 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13220 cost += addr_cost->pre_modify;
13221 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13223 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13224 if (nvectors == 3)
13225 cost += addr_cost->post_modify_ld3_st3;
13226 else if (nvectors == 4)
13227 cost += addr_cost->post_modify_ld4_st4;
13228 else
13229 cost += addr_cost->post_modify;
13231 else
13232 gcc_unreachable ();
13234 break;
13236 case ADDRESS_REG_REG:
13237 cost += addr_cost->register_offset;
13238 break;
13240 case ADDRESS_REG_SXTW:
13241 cost += addr_cost->register_sextend;
13242 break;
13244 case ADDRESS_REG_UXTW:
13245 cost += addr_cost->register_zextend;
13246 break;
13248 default:
13249 gcc_unreachable ();
13253 if (info.shift > 0)
13255 /* For the sake of calculating the cost of the shifted register
13256 component, we can treat same sized modes in the same way. */
13257 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13258 cost += addr_cost->addr_scale_costs.hi;
13259 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13260 cost += addr_cost->addr_scale_costs.si;
13261 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13262 cost += addr_cost->addr_scale_costs.di;
13263 else
13264 /* We can't tell, or this is a 128-bit vector. */
13265 cost += addr_cost->addr_scale_costs.ti;
13268 return cost;
13271 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13272 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13273 to be taken. */
13276 aarch64_branch_cost (bool speed_p, bool predictable_p)
13278 /* When optimizing for speed, use the cost of unpredictable branches. */
13279 const struct cpu_branch_cost *branch_costs =
13280 aarch64_tune_params.branch_costs;
13282 if (!speed_p || predictable_p)
13283 return branch_costs->predictable;
13284 else
13285 return branch_costs->unpredictable;
13288 /* Return true if X is a zero or sign extract
13289 usable in an ADD or SUB (extended register) instruction. */
13290 static bool
13291 aarch64_rtx_arith_op_extract_p (rtx x)
13293 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13294 No shift. */
13295 if (GET_CODE (x) == SIGN_EXTEND
13296 || GET_CODE (x) == ZERO_EXTEND)
13297 return REG_P (XEXP (x, 0));
13299 return false;
13302 static bool
13303 aarch64_frint_unspec_p (unsigned int u)
13305 switch (u)
13307 case UNSPEC_FRINTZ:
13308 case UNSPEC_FRINTP:
13309 case UNSPEC_FRINTM:
13310 case UNSPEC_FRINTA:
13311 case UNSPEC_FRINTN:
13312 case UNSPEC_FRINTX:
13313 case UNSPEC_FRINTI:
13314 return true;
13316 default:
13317 return false;
13321 /* Return true iff X is an rtx that will match an extr instruction
13322 i.e. as described in the *extr<mode>5_insn family of patterns.
13323 OP0 and OP1 will be set to the operands of the shifts involved
13324 on success and will be NULL_RTX otherwise. */
13326 static bool
13327 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13329 rtx op0, op1;
13330 scalar_int_mode mode;
13331 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13332 return false;
13334 *res_op0 = NULL_RTX;
13335 *res_op1 = NULL_RTX;
13337 if (GET_CODE (x) != IOR)
13338 return false;
13340 op0 = XEXP (x, 0);
13341 op1 = XEXP (x, 1);
13343 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13344 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13346 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13347 if (GET_CODE (op1) == ASHIFT)
13348 std::swap (op0, op1);
13350 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13351 return false;
13353 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13354 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13356 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13357 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13359 *res_op0 = XEXP (op0, 0);
13360 *res_op1 = XEXP (op1, 0);
13361 return true;
13365 return false;
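/* Illustrative example: in SImode
     (ior (ashift X (const_int 8)) (lshiftrt Y (const_int 24)))
   matches because 8 + 24 == 32; *res_op0 is set to X, *res_op1 to Y, and
   the whole expression maps onto "extr w0, wX, wY, #24".  When X == Y
   this is simply a rotate (ROR #24).  */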
13368 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13369 storing it in *COST. Result is true if the total cost of the operation
13370 has now been calculated. */
13371 static bool
13372 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13374 rtx inner;
13375 rtx comparator;
13376 enum rtx_code cmpcode;
13377 const struct cpu_cost_table *extra_cost
13378 = aarch64_tune_params.insn_extra_cost;
13380 if (COMPARISON_P (op0))
13382 inner = XEXP (op0, 0);
13383 comparator = XEXP (op0, 1);
13384 cmpcode = GET_CODE (op0);
13386 else
13388 inner = op0;
13389 comparator = const0_rtx;
13390 cmpcode = NE;
13393 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13395 /* Conditional branch. */
13396 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13397 return true;
13398 else
13400 if (cmpcode == NE || cmpcode == EQ)
13402 if (comparator == const0_rtx)
13404 /* TBZ/TBNZ/CBZ/CBNZ. */
13405 if (GET_CODE (inner) == ZERO_EXTRACT)
13406 /* TBZ/TBNZ. */
13407 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13408 ZERO_EXTRACT, 0, speed);
13409 else
13410 /* CBZ/CBNZ. */
13411 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13413 return true;
13415 if (register_operand (inner, VOIDmode)
13416 && aarch64_imm24 (comparator, VOIDmode))
13418 /* SUB and SUBS. */
13419 *cost += COSTS_N_INSNS (2);
13420 if (speed)
13421 *cost += extra_cost->alu.arith * 2;
13422 return true;
13425 else if (cmpcode == LT || cmpcode == GE)
13427 /* TBZ/TBNZ. */
13428 if (comparator == const0_rtx)
13429 return true;
13433 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13435 /* CCMP. */
13436 if (GET_CODE (op1) == COMPARE)
13438 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13439 if (XEXP (op1, 1) == const0_rtx)
13440 *cost += 1;
13441 if (speed)
13443 machine_mode mode = GET_MODE (XEXP (op1, 0));
13445 if (GET_MODE_CLASS (mode) == MODE_INT)
13446 *cost += extra_cost->alu.arith;
13447 else
13448 *cost += extra_cost->fp[mode == DFmode].compare;
13450 return true;
13453 /* It's a conditional operation based on the status flags,
13454 so it must be some flavor of CSEL. */
13456 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13457 if (GET_CODE (op1) == NEG
13458 || GET_CODE (op1) == NOT
13459 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13460 op1 = XEXP (op1, 0);
13461 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13463 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13464 op1 = XEXP (op1, 0);
13465 op2 = XEXP (op2, 0);
13467 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13469 inner = XEXP (op1, 0);
13470 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13471 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13472 op1 = XEXP (inner, 0);
13475 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13476 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13477 return true;
13480 /* We don't know what this is, cost all operands. */
13481 return false;
13484 /* Check whether X is a bitfield operation of the form shift + extend that
13485 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13486 operand to which the bitfield operation is applied. Otherwise return
13487 NULL_RTX. */
13489 static rtx
13490 aarch64_extend_bitfield_pattern_p (rtx x)
13492 rtx_code outer_code = GET_CODE (x);
13493 machine_mode outer_mode = GET_MODE (x);
13495 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13496 && outer_mode != SImode && outer_mode != DImode)
13497 return NULL_RTX;
13499 rtx inner = XEXP (x, 0);
13500 rtx_code inner_code = GET_CODE (inner);
13501 machine_mode inner_mode = GET_MODE (inner);
13502 rtx op = NULL_RTX;
13504 switch (inner_code)
13506 case ASHIFT:
13507 if (CONST_INT_P (XEXP (inner, 1))
13508 && (inner_mode == QImode || inner_mode == HImode))
13509 op = XEXP (inner, 0);
13510 break;
13511 case LSHIFTRT:
13512 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13513 && (inner_mode == QImode || inner_mode == HImode))
13514 op = XEXP (inner, 0);
13515 break;
13516 case ASHIFTRT:
13517 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13518 && (inner_mode == QImode || inner_mode == HImode))
13519 op = XEXP (inner, 0);
13520 break;
13521 default:
13522 break;
13525 return op;
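/* Illustrative examples: (zero_extend:SI (ashift:QI (reg) (const_int 2)))
   and (sign_extend:SI (ashift:HI (reg) (const_int 3))) match the
   UBFIZ/SBFIZ forms, (zero_extend:SI (lshiftrt:HI (reg) (const_int 3)))
   matches UBFX, and (sign_extend:SI (ashiftrt:QI (reg) (const_int 2)))
   matches SBFX; in each case the inner shift operand is returned.  */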
13528 /* Return true if the mask and a shift amount from an RTX of the form
13529 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13530 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13532 bool
13533 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13534 rtx shft_amnt)
13536 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13537 && INTVAL (mask) > 0
13538 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13539 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13540 && (UINTVAL (mask)
13541 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
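/* Worked example (illustrative): in SImode, with SHFT_AMNT == 3 and
   MASK == 0x1f8, we have (0x1f8 >> 3) + 1 == 0x40 (a power of two) and
   the low three bits of the mask clear, so (x << 3) & 0x1f8 is accepted
   and can be emitted as a single "ubfiz w0, w0, 3, 6".  */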
13544 /* Return true if the masks and a shift amount from an RTX of the form
13545 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13546 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
13548 bool
13549 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13550 unsigned HOST_WIDE_INT mask1,
13551 unsigned HOST_WIDE_INT shft_amnt,
13552 unsigned HOST_WIDE_INT mask2)
13554 unsigned HOST_WIDE_INT t;
13556 /* Verify that there is no overlap in what bits are set in the two masks. */
13557 if (mask1 != ~mask2)
13558 return false;
13560 /* Verify that mask2 is not all zeros or ones. */
13561 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13562 return false;
13564 /* The shift amount should always be less than the mode size. */
13565 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13567 /* Verify that the mask being shifted is contiguous and would be in the
13568 least significant bits after shifting by shft_amnt. */
13569 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13570 return (t == (t & -t));
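/* Worked example (illustrative): MASK2 == 0xff00 with SHFT_AMNT == 8
   (and MASK1 == ~MASK2) gives t == 0xff00 + 0x100 == 0x10000, a power of
   two, so the inserted field is contiguous and starts at bit 8: a BFI is
   possible.  MASK2 == 0xf0f0 with SHFT_AMNT == 4 gives t == 0xf100, not
   a power of two, so it is rejected.  */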
13573 /* Calculate the cost of calculating X, storing it in *COST. Result
13574 is true if the total cost of the operation has now been calculated. */
13575 static bool
13576 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13577 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13579 rtx op0, op1, op2;
13580 const struct cpu_cost_table *extra_cost
13581 = aarch64_tune_params.insn_extra_cost;
13582 rtx_code code = GET_CODE (x);
13583 scalar_int_mode int_mode;
13585 /* By default, assume that everything has equivalent cost to the
13586 cheapest instruction. Any additional costs are applied as a delta
13587 above this default. */
13588 *cost = COSTS_N_INSNS (1);
13590 switch (code)
13592 case SET:
13593 /* The cost depends entirely on the operands to SET. */
13594 *cost = 0;
13595 op0 = SET_DEST (x);
13596 op1 = SET_SRC (x);
13598 switch (GET_CODE (op0))
13600 case MEM:
13601 if (speed)
13603 rtx address = XEXP (op0, 0);
13604 if (VECTOR_MODE_P (mode))
13605 *cost += extra_cost->ldst.storev;
13606 else if (GET_MODE_CLASS (mode) == MODE_INT)
13607 *cost += extra_cost->ldst.store;
13608 else if (mode == SFmode || mode == SDmode)
13609 *cost += extra_cost->ldst.storef;
13610 else if (mode == DFmode || mode == DDmode)
13611 *cost += extra_cost->ldst.stored;
13613 *cost +=
13614 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13615 0, speed));
13618 *cost += rtx_cost (op1, mode, SET, 1, speed);
13619 return true;
13621 case SUBREG:
13622 if (! REG_P (SUBREG_REG (op0)))
13623 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13625 /* Fall through. */
13626 case REG:
13627 /* The cost is one per vector-register copied. */
13628 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13630 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13631 *cost = COSTS_N_INSNS (nregs);
13633 /* const0_rtx is in general free, but we will use an
13634 instruction to set a register to 0. */
13635 else if (REG_P (op1) || op1 == const0_rtx)
13637 /* The cost is 1 per register copied. */
13638 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13639 *cost = COSTS_N_INSNS (nregs);
13641 else
13642 /* Cost is just the cost of the RHS of the set. */
13643 *cost += rtx_cost (op1, mode, SET, 1, speed);
13644 return true;
13646 case ZERO_EXTRACT:
13647 case SIGN_EXTRACT:
13648 /* Bit-field insertion. Strip any redundant widening of
13649 the RHS to meet the width of the target. */
13650 if (SUBREG_P (op1))
13651 op1 = SUBREG_REG (op1);
13652 if ((GET_CODE (op1) == ZERO_EXTEND
13653 || GET_CODE (op1) == SIGN_EXTEND)
13654 && CONST_INT_P (XEXP (op0, 1))
13655 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13656 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13657 op1 = XEXP (op1, 0);
13659 if (CONST_INT_P (op1))
13661 /* MOV immediate is assumed to always be cheap. */
13662 *cost = COSTS_N_INSNS (1);
13664 else
13666 /* BFM. */
13667 if (speed)
13668 *cost += extra_cost->alu.bfi;
13669 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13672 return true;
13674 default:
13675 /* We can't make sense of this, assume default cost. */
13676 *cost = COSTS_N_INSNS (1);
13677 return false;
13679 return false;
13681 case CONST_INT:
13682 /* If an instruction can incorporate a constant within the
13683 instruction, the instruction's expression avoids calling
13684 rtx_cost() on the constant. If rtx_cost() is called on a
13685 constant, then it is usually because the constant must be
13686 moved into a register by one or more instructions.
13688 The exception is constant 0, which can be expressed
13689 as XZR/WZR and is therefore free. The exception to this is
13690 if we have (set (reg) (const0_rtx)) in which case we must cost
13691 the move. However, we can catch that when we cost the SET, so
13692 we don't need to consider that here. */
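/* For example (illustrative): a DImode constant such as
   0x123456789abcdef0 needs a MOVZ plus three MOVKs and is costed as
   COSTS_N_INSNS (4), whereas 0xffff0000 can be built with a single MOVZ
   and is costed as one instruction.  */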
13693 if (x == const0_rtx)
13694 *cost = 0;
13695 else
13697 /* To a first approximation, the cost of building any other
13698 constant is proportional to the number of instructions
13699 required to build that constant. This is true whether we
13700 are compiling for SPEED or otherwise. */
13701 if (!is_a <scalar_int_mode> (mode, &int_mode))
13702 int_mode = word_mode;
13703 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13704 (NULL_RTX, x, false, int_mode));
13706 return true;
13708 case CONST_DOUBLE:
13710 /* First determine number of instructions to do the move
13711 as an integer constant. */
13712 if (!aarch64_float_const_representable_p (x)
13713 && !aarch64_can_const_movi_rtx_p (x, mode)
13714 && aarch64_float_const_rtx_p (x))
13716 unsigned HOST_WIDE_INT ival;
13717 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13718 gcc_assert (succeed);
13720 scalar_int_mode imode = (mode == HFmode
13721 ? SImode
13722 : int_mode_for_mode (mode).require ());
13723 int ncost = aarch64_internal_mov_immediate
13724 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13725 *cost += COSTS_N_INSNS (ncost);
13726 return true;
13729 if (speed)
13731 /* mov[df,sf]_aarch64. */
13732 if (aarch64_float_const_representable_p (x))
13733 /* FMOV (scalar immediate). */
13734 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
13735 else if (!aarch64_float_const_zero_rtx_p (x))
13737 /* This will be a load from memory. */
13738 if (mode == DFmode || mode == DDmode)
13739 *cost += extra_cost->ldst.loadd;
13740 else
13741 *cost += extra_cost->ldst.loadf;
13743 else
13744 /* Otherwise this is +0.0. We get this using MOVI d0, #0
13745 or MOV v0.s[0], wzr - neither of which is modeled by the
13746 cost tables. Just use the default cost. */
13751 return true;
13753 case MEM:
13754 if (speed)
13756 /* For loads we want the base cost of a load, plus an
13757 approximation for the additional cost of the addressing
13758 mode. */
13759 rtx address = XEXP (x, 0);
13760 if (VECTOR_MODE_P (mode))
13761 *cost += extra_cost->ldst.loadv;
13762 else if (GET_MODE_CLASS (mode) == MODE_INT)
13763 *cost += extra_cost->ldst.load;
13764 else if (mode == SFmode || mode == SDmode)
13765 *cost += extra_cost->ldst.loadf;
13766 else if (mode == DFmode || mode == DDmode)
13767 *cost += extra_cost->ldst.loadd;
13769 *cost +=
13770 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13771 0, speed));
13774 return true;
13776 case NEG:
13777 op0 = XEXP (x, 0);
13779 if (VECTOR_MODE_P (mode))
13781 if (speed)
13783 /* FNEG. */
13784 *cost += extra_cost->vect.alu;
13786 return false;
13789 if (GET_MODE_CLASS (mode) == MODE_INT)
13791 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
13792 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
13794 /* CSETM. */
13795 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
13796 return true;
13799 /* Cost this as SUB wzr, X. */
13800 op0 = CONST0_RTX (mode);
13801 op1 = XEXP (x, 0);
13802 goto cost_minus;
13805 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13807 /* Support (neg(fma...)) as a single instruction only if
13808 sign of zeros is unimportant. This matches the decision
13809 making in aarch64.md. */
13810 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
13812 /* FNMADD. */
13813 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13814 return true;
13816 if (GET_CODE (op0) == MULT)
13818 /* FNMUL. */
13819 *cost = rtx_cost (op0, mode, NEG, 0, speed);
13820 return true;
13822 if (speed)
13823 /* FNEG. */
13824 *cost += extra_cost->fp[mode == DFmode].neg;
13825 return false;
13828 return false;
13830 case CLRSB:
13831 case CLZ:
13832 if (speed)
13834 if (VECTOR_MODE_P (mode))
13835 *cost += extra_cost->vect.alu;
13836 else
13837 *cost += extra_cost->alu.clz;
13840 return false;
13842 case CTZ:
13843 *cost = COSTS_N_INSNS (2);
13845 if (speed)
13846 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
13847 return false;
13849 case COMPARE:
13850 op0 = XEXP (x, 0);
13851 op1 = XEXP (x, 1);
13853 if (op1 == const0_rtx
13854 && GET_CODE (op0) == AND)
13856 x = op0;
13857 mode = GET_MODE (op0);
13858 goto cost_logic;
13861 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
13863 /* TODO: A write to the CC flags possibly costs extra, this
13864 needs encoding in the cost tables. */
13866 mode = GET_MODE (op0);
13867 /* ANDS. */
13868 if (GET_CODE (op0) == AND)
13870 x = op0;
13871 goto cost_logic;
13874 if (GET_CODE (op0) == PLUS)
13876 /* ADDS (and CMN alias). */
13877 x = op0;
13878 goto cost_plus;
13881 if (GET_CODE (op0) == MINUS)
13883 /* SUBS. */
13884 x = op0;
13885 goto cost_minus;
13888 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
13889 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
13890 && CONST_INT_P (XEXP (op0, 2)))
13892 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
13893 Handle it here directly rather than going to cost_logic
13894 since we know the immediate generated for the TST is valid,
13895 so we can avoid creating an intermediate rtx for it only
13896 for costing purposes. */
13897 if (speed)
13898 *cost += extra_cost->alu.logical;
13900 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
13901 ZERO_EXTRACT, 0, speed);
13902 return true;
13905 if (GET_CODE (op1) == NEG)
13907 /* CMN. */
13908 if (speed)
13909 *cost += extra_cost->alu.arith;
13911 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
13912 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
13913 return true;
13916 /* CMP.
13918 Compare can freely swap the order of operands, and
13919 canonicalization puts the more complex operation first.
13920 But the integer MINUS logic expects the shift/extend
13921 operation in op1. */
13922 if (! (REG_P (op0)
13923 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
13925 op0 = XEXP (x, 1);
13926 op1 = XEXP (x, 0);
13928 goto cost_minus;
13931 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
13933 /* FCMP. */
13934 if (speed)
13935 *cost += extra_cost->fp[mode == DFmode].compare;
13937 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
13939 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
13940 /* FCMP supports constant 0.0 for no extra cost. */
13941 return true;
13943 return false;
13946 if (VECTOR_MODE_P (mode))
13948 /* Vector compare. */
13949 if (speed)
13950 *cost += extra_cost->vect.alu;
13952 if (aarch64_float_const_zero_rtx_p (op1))
13954 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
13955 cost. */
13956 return true;
13958 return false;
13960 return false;
13962 case MINUS:
13964 op0 = XEXP (x, 0);
13965 op1 = XEXP (x, 1);
13967 cost_minus:
13968 if (VECTOR_MODE_P (mode))
13970 /* SUBL2 and SUBW2. */
13971 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13972 if (vec_flags & VEC_ADVSIMD)
13974 /* The select-operand-high-half versions of the sub instruction
13975 have the same cost as the regular three vector version -
13976 don't add the costs of the select into the costs of the sub.  */
13978 op0 = aarch64_strip_extend_vec_half (op0);
13979 op1 = aarch64_strip_extend_vec_half (op1);
13983 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
13985 /* Detect valid immediates. */
13986 if ((GET_MODE_CLASS (mode) == MODE_INT
13987 || (GET_MODE_CLASS (mode) == MODE_CC
13988 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
13989 && CONST_INT_P (op1)
13990 && aarch64_uimm12_shift (INTVAL (op1)))
13992 if (speed)
13993 /* SUB(S) (immediate). */
13994 *cost += extra_cost->alu.arith;
13995 return true;
13998 /* Look for SUB (extended register). */
13999 if (is_a <scalar_int_mode> (mode)
14000 && aarch64_rtx_arith_op_extract_p (op1))
14002 if (speed)
14003 *cost += extra_cost->alu.extend_arith;
14005 op1 = aarch64_strip_extend (op1, true);
14006 *cost += rtx_cost (op1, VOIDmode,
14007 (enum rtx_code) GET_CODE (op1), 0, speed);
14008 return true;
14011 rtx new_op1 = aarch64_strip_extend (op1, false);
14013 /* Cost this as an FMA-alike operation. */
14014 if ((GET_CODE (new_op1) == MULT
14015 || aarch64_shift_p (GET_CODE (new_op1)))
14016 && code != COMPARE)
14018 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14019 (enum rtx_code) code,
14020 speed);
14021 return true;
14024 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14026 if (speed)
14028 if (VECTOR_MODE_P (mode))
14030 /* Vector SUB. */
14031 *cost += extra_cost->vect.alu;
14033 else if (GET_MODE_CLASS (mode) == MODE_INT)
14035 /* SUB(S). */
14036 *cost += extra_cost->alu.arith;
14038 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14040 /* FSUB. */
14041 *cost += extra_cost->fp[mode == DFmode].addsub;
14044 return true;
14047 case PLUS:
14049 rtx new_op0;
14051 op0 = XEXP (x, 0);
14052 op1 = XEXP (x, 1);
14054 cost_plus:
14055 if (VECTOR_MODE_P (mode))
14057 /* ADDL2 and ADDW2. */
14058 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14059 if (vec_flags & VEC_ADVSIMD)
14061 /* The select-operand-high-half versions of the add instruction
14062 have the same cost as the regular three vector version -
14063 don't add the costs of the select into the costs of the add.  */
14065 op0 = aarch64_strip_extend_vec_half (op0);
14066 op1 = aarch64_strip_extend_vec_half (op1);
14070 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14071 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14073 /* CSINC. */
14074 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14075 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14076 return true;
14079 if (GET_MODE_CLASS (mode) == MODE_INT
14080 && (aarch64_plus_immediate (op1, mode)
14081 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14083 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14085 if (speed)
14087 /* ADD (immediate). */
14088 *cost += extra_cost->alu.arith;
14090 /* Some tunings prefer to not use the VL-based scalar ops.
14091 Increase the cost of the poly immediate to prevent their
14092 formation. */
14093 if (GET_CODE (op1) == CONST_POLY_INT
14094 && (aarch64_tune_params.extra_tuning_flags
14095 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14096 *cost += COSTS_N_INSNS (1);
14098 return true;
14101 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14103 /* Look for ADD (extended register). */
14104 if (is_a <scalar_int_mode> (mode)
14105 && aarch64_rtx_arith_op_extract_p (op0))
14107 if (speed)
14108 *cost += extra_cost->alu.extend_arith;
14110 op0 = aarch64_strip_extend (op0, true);
14111 *cost += rtx_cost (op0, VOIDmode,
14112 (enum rtx_code) GET_CODE (op0), 0, speed);
14113 return true;
14116 /* Strip any extend, leave shifts behind as we will
14117 cost them through mult_cost. */
14118 new_op0 = aarch64_strip_extend (op0, false);
14120 if (GET_CODE (new_op0) == MULT
14121 || aarch64_shift_p (GET_CODE (new_op0)))
14123 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14124 speed);
14125 return true;
14128 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14130 if (speed)
14132 if (VECTOR_MODE_P (mode))
14134 /* Vector ADD. */
14135 *cost += extra_cost->vect.alu;
14137 else if (GET_MODE_CLASS (mode) == MODE_INT)
14139 /* ADD. */
14140 *cost += extra_cost->alu.arith;
14142 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14144 /* FADD. */
14145 *cost += extra_cost->fp[mode == DFmode].addsub;
14148 return true;
14151 case BSWAP:
14152 *cost = COSTS_N_INSNS (1);
14154 if (speed)
14156 if (VECTOR_MODE_P (mode))
14157 *cost += extra_cost->vect.alu;
14158 else
14159 *cost += extra_cost->alu.rev;
14161 return false;
14163 case IOR:
14164 if (aarch_rev16_p (x))
14166 *cost = COSTS_N_INSNS (1);
14168 if (speed)
14170 if (VECTOR_MODE_P (mode))
14171 *cost += extra_cost->vect.alu;
14172 else
14173 *cost += extra_cost->alu.rev;
14175 return true;
14178 if (aarch64_extr_rtx_p (x, &op0, &op1))
14180 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14181 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14182 if (speed)
14183 *cost += extra_cost->alu.shift;
14185 return true;
14187 /* Fall through. */
14188 case XOR:
14189 case AND:
14190 cost_logic:
14191 op0 = XEXP (x, 0);
14192 op1 = XEXP (x, 1);
14194 if (VECTOR_MODE_P (mode))
14196 if (speed)
14197 *cost += extra_cost->vect.alu;
14198 return true;
14201 if (code == AND
14202 && GET_CODE (op0) == MULT
14203 && CONST_INT_P (XEXP (op0, 1))
14204 && CONST_INT_P (op1)
14205 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14206 INTVAL (op1)) != 0)
14208 /* This is a UBFM/SBFM. */
14209 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14210 if (speed)
14211 *cost += extra_cost->alu.bfx;
14212 return true;
14215 if (is_int_mode (mode, &int_mode))
14217 if (CONST_INT_P (op1))
14219 /* We have a mask + shift version of a UBFIZ
14220 i.e. the *andim_ashift<mode>_bfiz pattern. */
14221 if (GET_CODE (op0) == ASHIFT
14222 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14223 XEXP (op0, 1)))
14225 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14226 (enum rtx_code) code, 0, speed);
14227 if (speed)
14228 *cost += extra_cost->alu.bfx;
14230 return true;
14232 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14234 /* We possibly get the immediate for free; this is not
14235 modelled. */
14236 *cost += rtx_cost (op0, int_mode,
14237 (enum rtx_code) code, 0, speed);
14238 if (speed)
14239 *cost += extra_cost->alu.logical;
14241 return true;
14244 else
14246 rtx new_op0 = op0;
14248 /* Handle ORN, EON, or BIC. */
14249 if (GET_CODE (op0) == NOT)
14250 op0 = XEXP (op0, 0);
14252 new_op0 = aarch64_strip_shift (op0);
14254 /* If we had a shift on op0 then this is a logical-shift-
14255 by-register/immediate operation. Otherwise, this is just
14256 a logical operation. */
14257 if (speed)
14259 if (new_op0 != op0)
14261 /* Shift by immediate. */
14262 if (CONST_INT_P (XEXP (op0, 1)))
14263 *cost += extra_cost->alu.log_shift;
14264 else
14265 *cost += extra_cost->alu.log_shift_reg;
14267 else
14268 *cost += extra_cost->alu.logical;
14271 /* In both cases we want to cost both operands. */
14272 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14273 0, speed);
14274 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14275 1, speed);
14277 return true;
14280 return false;
14282 case NOT:
14283 x = XEXP (x, 0);
14284 op0 = aarch64_strip_shift (x);
14286 if (VECTOR_MODE_P (mode))
14288 /* Vector NOT. */
14289 *cost += extra_cost->vect.alu;
14290 return false;
14293 /* MVN-shifted-reg. */
14294 if (op0 != x)
14296 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14298 if (speed)
14299 *cost += extra_cost->alu.log_shift;
14301 return true;
14303 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14304 Handle the second form here taking care that 'a' in the above can
14305 be a shift. */
14306 else if (GET_CODE (op0) == XOR)
14308 rtx newop0 = XEXP (op0, 0);
14309 rtx newop1 = XEXP (op0, 1);
14310 rtx op0_stripped = aarch64_strip_shift (newop0);
14312 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14313 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14315 if (speed)
14317 if (op0_stripped != newop0)
14318 *cost += extra_cost->alu.log_shift;
14319 else
14320 *cost += extra_cost->alu.logical;
14323 return true;
14325 /* MVN. */
14326 if (speed)
14327 *cost += extra_cost->alu.logical;
14329 return false;
14331 case ZERO_EXTEND:
14333 op0 = XEXP (x, 0);
14334 /* If a value is written in SI mode, then zero extended to DI
14335 mode, the operation will in general be free as a write to
14336 a 'w' register implicitly zeroes the upper bits of an 'x'
14337 register. However, if this is
14339 (set (reg) (zero_extend (reg)))
14341 we must cost the explicit register move. */
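/* For example (illustrative): in
     (set (reg:DI x0) (zero_extend:DI (plus:SI (reg:SI w1) (reg:SI w2))))
   the "add w0, w1, w2" already leaves the upper 32 bits of x0 zero, so
   the extension itself is free; a bare
     (set (reg:DI x0) (zero_extend:DI (reg:SI w1)))
   still needs a "mov w0, w1", which is the MOV cost referred to above.  */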
14342 if (mode == DImode
14343 && GET_MODE (op0) == SImode)
14345 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14347 /* If OP_COST is non-zero, then the cost of the zero extend
14348 is effectively the cost of the inner operation. Otherwise
14349 we have a MOV instruction and we take the cost from the MOV
14350 itself. This is true independently of whether we are
14351 optimizing for space or time. */
14352 if (op_cost)
14353 *cost = op_cost;
14355 return true;
14357 else if (MEM_P (op0))
14359 /* All loads can zero extend to any size for free. */
14360 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14361 return true;
14364 op0 = aarch64_extend_bitfield_pattern_p (x);
14365 if (op0)
14367 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14368 if (speed)
14369 *cost += extra_cost->alu.bfx;
14370 return true;
14373 if (speed)
14375 if (VECTOR_MODE_P (mode))
14377 /* UMOV. */
14378 *cost += extra_cost->vect.alu;
14380 else
14382 /* We generate an AND instead of UXTB/UXTH. */
14383 *cost += extra_cost->alu.logical;
14386 return false;
14388 case SIGN_EXTEND:
14389 if (MEM_P (XEXP (x, 0)))
14391 /* LDRSH. */
14392 if (speed)
14394 rtx address = XEXP (XEXP (x, 0), 0);
14395 *cost += extra_cost->ldst.load_sign_extend;
14397 *cost +=
14398 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14399 0, speed));
14401 return true;
14404 op0 = aarch64_extend_bitfield_pattern_p (x);
14405 if (op0)
14407 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14408 if (speed)
14409 *cost += extra_cost->alu.bfx;
14410 return true;
14413 if (speed)
14415 if (VECTOR_MODE_P (mode))
14416 *cost += extra_cost->vect.alu;
14417 else
14418 *cost += extra_cost->alu.extend;
14420 return false;
14422 case ASHIFT:
14423 op0 = XEXP (x, 0);
14424 op1 = XEXP (x, 1);
14426 if (CONST_INT_P (op1))
14428 if (speed)
14430 if (VECTOR_MODE_P (mode))
14432 /* Vector shift (immediate). */
14433 *cost += extra_cost->vect.alu;
14435 else
14437 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
14438 aliases. */
14439 *cost += extra_cost->alu.shift;
14443 /* We can incorporate zero/sign extend for free. */
14444 if (GET_CODE (op0) == ZERO_EXTEND
14445 || GET_CODE (op0) == SIGN_EXTEND)
14446 op0 = XEXP (op0, 0);
14448 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14449 return true;
14451 else
14453 if (VECTOR_MODE_P (mode))
14455 if (speed)
14456 /* Vector shift (register). */
14457 *cost += extra_cost->vect.alu;
14459 else
14461 if (speed)
14462 /* LSLV. */
14463 *cost += extra_cost->alu.shift_reg;
14465 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14466 && CONST_INT_P (XEXP (op1, 1))
14467 && known_eq (INTVAL (XEXP (op1, 1)),
14468 GET_MODE_BITSIZE (mode) - 1))
14470 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14471 /* We already demanded XEXP (op1, 0) to be REG_P, so
14472 don't recurse into it. */
14473 return true;
14476 return false; /* All arguments need to be in registers. */
14479 case ROTATE:
14480 case ROTATERT:
14481 case LSHIFTRT:
14482 case ASHIFTRT:
14483 op0 = XEXP (x, 0);
14484 op1 = XEXP (x, 1);
14486 if (CONST_INT_P (op1))
14488 /* ASR (immediate) and friends. */
14489 if (speed)
14491 if (VECTOR_MODE_P (mode))
14492 *cost += extra_cost->vect.alu;
14493 else
14494 *cost += extra_cost->alu.shift;
14497 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14498 return true;
14500 else
14502 if (VECTOR_MODE_P (mode))
14504 if (speed)
14505 /* Vector shift (register). */
14506 *cost += extra_cost->vect.alu;
14508 else
14510 if (speed)
14511 /* ASR (register) and friends. */
14512 *cost += extra_cost->alu.shift_reg;
14514 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14515 && CONST_INT_P (XEXP (op1, 1))
14516 && known_eq (INTVAL (XEXP (op1, 1)),
14517 GET_MODE_BITSIZE (mode) - 1))
14519 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14520 /* We already demanded XEXP (op1, 0) to be REG_P, so
14521 don't recurse into it. */
14522 return true;
14525 return false; /* All arguments need to be in registers. */
14528 case SYMBOL_REF:
14530 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14531 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14533 /* LDR. */
14534 if (speed)
14535 *cost += extra_cost->ldst.load;
14537 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14538 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14540 /* ADRP, followed by ADD. */
14541 *cost += COSTS_N_INSNS (1);
14542 if (speed)
14543 *cost += 2 * extra_cost->alu.arith;
14545 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14546 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14548 /* ADR. */
14549 if (speed)
14550 *cost += extra_cost->alu.arith;
14553 if (flag_pic)
14555 /* One extra load instruction, after accessing the GOT. */
14556 *cost += COSTS_N_INSNS (1);
14557 if (speed)
14558 *cost += extra_cost->ldst.load;
14560 return true;
14562 case HIGH:
14563 case LO_SUM:
14564 /* ADRP/ADD (immediate). */
14565 if (speed)
14566 *cost += extra_cost->alu.arith;
14567 return true;
14569 case ZERO_EXTRACT:
14570 case SIGN_EXTRACT:
14571 /* UBFX/SBFX. */
14572 if (speed)
14574 if (VECTOR_MODE_P (mode))
14575 *cost += extra_cost->vect.alu;
14576 else
14577 *cost += extra_cost->alu.bfx;
14580 /* We can trust that the immediates used will be correct (there
14581 are no by-register forms), so we need only cost op0. */
14582 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14583 return true;
14585 case MULT:
14586 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14587 /* aarch64_rtx_mult_cost always handles recursion to its
14588 operands. */
14589 return true;
14591 case MOD:
14592 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14593 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as that of
14594 an unconditional negate. This case should only ever be reached through
14595 the set_smod_pow2_cheap check in expmed.cc. */
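/* Illustrative sketch of the expansion for "x % 4" in SImode (the exact
   condition code is chosen by the expander):

	negs	w1, w0		// w1 = -x, flags set from the negation
	and	w0, w0, 3	// remainder if x was non-negative
	and	w1, w1, 3	// remainder of -x
	csneg	w0, w0, w1, mi	// select and re-negate for negative x
*/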
14596 if (CONST_INT_P (XEXP (x, 1))
14597 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14598 && (mode == SImode || mode == DImode))
14600 /* We expand to 4 instructions. Reset the baseline. */
14601 *cost = COSTS_N_INSNS (4);
14603 if (speed)
14604 *cost += 2 * extra_cost->alu.logical
14605 + 2 * extra_cost->alu.arith;
14607 return true;
14610 /* Fall-through. */
14611 case UMOD:
14612 if (speed)
14614 /* Slightly prefer UMOD over SMOD. */
14615 if (VECTOR_MODE_P (mode))
14616 *cost += extra_cost->vect.alu;
14617 else if (GET_MODE_CLASS (mode) == MODE_INT)
14618 *cost += (extra_cost->mult[mode == DImode].add
14619 + extra_cost->mult[mode == DImode].idiv
14620 + (code == MOD ? 1 : 0));
14622 return false; /* All arguments need to be in registers. */
14624 case DIV:
14625 case UDIV:
14626 case SQRT:
14627 if (speed)
14629 if (VECTOR_MODE_P (mode))
14630 *cost += extra_cost->vect.alu;
14631 else if (GET_MODE_CLASS (mode) == MODE_INT)
14632 /* There is no integer SQRT, so only DIV and UDIV can get
14633 here. */
14634 *cost += (extra_cost->mult[mode == DImode].idiv
14635 /* Slightly prefer UDIV over SDIV. */
14636 + (code == DIV ? 1 : 0));
14637 else
14638 *cost += extra_cost->fp[mode == DFmode].div;
14640 return false; /* All arguments need to be in registers. */
14642 case IF_THEN_ELSE:
14643 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14644 XEXP (x, 2), cost, speed);
14646 case EQ:
14647 case NE:
14648 case GT:
14649 case GTU:
14650 case LT:
14651 case LTU:
14652 case GE:
14653 case GEU:
14654 case LE:
14655 case LEU:
14657 return false; /* All arguments must be in registers. */
14659 case FMA:
14660 op0 = XEXP (x, 0);
14661 op1 = XEXP (x, 1);
14662 op2 = XEXP (x, 2);
14664 if (speed)
14666 if (VECTOR_MODE_P (mode))
14667 *cost += extra_cost->vect.alu;
14668 else
14669 *cost += extra_cost->fp[mode == DFmode].fma;
14672 /* FMSUB, FNMADD, and FNMSUB are free. */
14673 if (GET_CODE (op0) == NEG)
14674 op0 = XEXP (op0, 0);
14676 if (GET_CODE (op2) == NEG)
14677 op2 = XEXP (op2, 0);
14679 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14680 and the by-element operand as operand 0. */
14681 if (GET_CODE (op1) == NEG)
14682 op1 = XEXP (op1, 0);
14684 /* Catch vector-by-element operations. The by-element operand can
14685 either be (vec_duplicate (vec_select (x))) or just
14686 (vec_select (x)), depending on whether we are multiplying by
14687 a vector or a scalar.
14689 Canonicalization is not very good in these cases: FMA4 will put the
14690 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
14691 if (GET_CODE (op0) == VEC_DUPLICATE)
14692 op0 = XEXP (op0, 0);
14693 else if (GET_CODE (op1) == VEC_DUPLICATE)
14694 op1 = XEXP (op1, 0);
14696 if (GET_CODE (op0) == VEC_SELECT)
14697 op0 = XEXP (op0, 0);
14698 else if (GET_CODE (op1) == VEC_SELECT)
14699 op1 = XEXP (op1, 0);
14701 /* If the remaining parameters are not registers,
14702 get the cost to put them into registers. */
14703 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14704 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14705 *cost += rtx_cost (op2, mode, FMA, 2, speed);
14706 return true;
14708 case FLOAT:
14709 case UNSIGNED_FLOAT:
14710 if (speed)
14711 *cost += extra_cost->fp[mode == DFmode].fromint;
14712 return false;
14714 case FLOAT_EXTEND:
14715 if (speed)
14717 if (VECTOR_MODE_P (mode))
14719 /* Vector widen. */
14720 *cost += extra_cost->vect.alu;
14722 else
14723 *cost += extra_cost->fp[mode == DFmode].widen;
14725 return false;
14727 case FLOAT_TRUNCATE:
14728 if (speed)
14730 if (VECTOR_MODE_P (mode))
14732 /* Vector narrow. */
14733 *cost += extra_cost->vect.alu;
14735 else
14736 *cost += extra_cost->fp[mode == DFmode].narrow;
14738 return false;
14740 case FIX:
14741 case UNSIGNED_FIX:
14742 x = XEXP (x, 0);
14743 /* Strip the rounding part. They will all be implemented
14744 by the fcvt* family of instructions anyway. */
14745 if (GET_CODE (x) == UNSPEC)
14747 unsigned int uns_code = XINT (x, 1);
14749 if (uns_code == UNSPEC_FRINTA
14750 || uns_code == UNSPEC_FRINTM
14751 || uns_code == UNSPEC_FRINTN
14752 || uns_code == UNSPEC_FRINTP
14753 || uns_code == UNSPEC_FRINTZ)
14754 x = XVECEXP (x, 0, 0);
14757 if (speed)
14759 if (VECTOR_MODE_P (mode))
14760 *cost += extra_cost->vect.alu;
14761 else
14762 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
14765 /* We can combine fmul by a power of 2 followed by a fcvt into a single
14766 fixed-point fcvt. */
14767 if (GET_CODE (x) == MULT
14768 && ((VECTOR_MODE_P (mode)
14769 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
14770 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
14772 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
14773 0, speed);
14774 return true;
14777 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
14778 return true;
14780 case ABS:
14781 if (VECTOR_MODE_P (mode))
14783 /* ABS (vector). */
14784 if (speed)
14785 *cost += extra_cost->vect.alu;
14787 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14789 op0 = XEXP (x, 0);
14791 /* FABD, which is analogous to FADD. */
14792 if (GET_CODE (op0) == MINUS)
14794 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
14795 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
14796 if (speed)
14797 *cost += extra_cost->fp[mode == DFmode].addsub;
14799 return true;
14801 /* Simple FABS is analogous to FNEG. */
14802 if (speed)
14803 *cost += extra_cost->fp[mode == DFmode].neg;
14805 else
14807 /* Integer ABS will either be split into
14808 two arithmetic instructions, or will be an ABS
14809 (scalar), which we don't model. */
14810 *cost = COSTS_N_INSNS (2);
14811 if (speed)
14812 *cost += 2 * extra_cost->alu.arith;
14814 return false;
14816 case SMAX:
14817 case SMIN:
14818 if (speed)
14820 if (VECTOR_MODE_P (mode))
14821 *cost += extra_cost->vect.alu;
14822 else
14824 /* FMAXNM/FMINNM/FMAX/FMIN.
14825 TODO: This may not be accurate for all implementations, but
14826 we do not model this in the cost tables. */
14827 *cost += extra_cost->fp[mode == DFmode].addsub;
14830 return false;
14832 case UNSPEC:
14833 /* The floating point round to integer frint* instructions. */
14834 if (aarch64_frint_unspec_p (XINT (x, 1)))
14836 if (speed)
14837 *cost += extra_cost->fp[mode == DFmode].roundint;
14839 return false;
14842 if (XINT (x, 1) == UNSPEC_RBIT)
14844 if (speed)
14845 *cost += extra_cost->alu.rev;
14847 return false;
14849 break;
14851 case TRUNCATE:
14853 /* Decompose <su>muldi3_highpart. */
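/* That is, the high 64 bits of a 64x64->128-bit widening multiply,
   which maps to a single UMULH or SMULH instruction. */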
14854 if (/* (truncate:DI */
14855 mode == DImode
14856 /* (lshiftrt:TI */
14857 && GET_MODE (XEXP (x, 0)) == TImode
14858 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
14859 /* (mult:TI */
14860 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
14861 /* (ANY_EXTEND:TI (reg:DI))
14862 (ANY_EXTEND:TI (reg:DI))) */
14863 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
14864 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
14865 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
14866 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
14867 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
14868 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
14869 /* (const_int 64) */
14870 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
14871 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
14873 /* UMULH/SMULH. */
14874 if (speed)
14875 *cost += extra_cost->mult[mode == DImode].extend;
14876 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
14877 mode, MULT, 0, speed);
14878 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
14879 mode, MULT, 1, speed);
14880 return true;
14882 break;
14883 case CONST_VECTOR:
14885 /* Load using MOVI/MVNI. */
14886 if (aarch64_simd_valid_immediate (x, NULL))
14887 *cost = extra_cost->vect.movi;
14888 else /* Load using constant pool. */
14889 *cost = extra_cost->ldst.load;
14890 break;
14892 case VEC_CONCAT:
14893 /* Depending on the operation, either DUP or INS.
14894 For now, keep the default costing. */
14895 break;
14896 case VEC_DUPLICATE:
14897 /* Load using a DUP. */
14898 *cost = extra_cost->vect.dup;
14899 return false;
14900 case VEC_SELECT:
14902 rtx op0 = XEXP (x, 0);
14903 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
14905 /* Cost a low-part selection as free, a high-part selection as a DUP, and anything else as an extract. */
14906 rtx op1 = XEXP (x, 1);
14907 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
14909 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
14910 *cost = extra_cost->vect.dup;
14911 else
14912 *cost = extra_cost->vect.extract;
14913 return true;
14915 default:
14916 break;
14919 if (dump_file
14920 && flag_aarch64_verbose_cost)
14921 fprintf (dump_file,
14922 "\nFailed to cost RTX. Assuming default cost.\n");
14924 return true;
14927 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
14928 calculated for X. This cost is stored in *COST. Returns true
14929 if the total cost of X was calculated. */
14930 static bool
14931 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
14932 int param, int *cost, bool speed)
14934 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
14936 if (dump_file
14937 && flag_aarch64_verbose_cost)
14939 print_rtl_single (dump_file, x);
14940 fprintf (dump_file, "\n%s cost: %d (%s)\n",
14941 speed ? "Hot" : "Cold",
14942 *cost, result ? "final" : "partial");
14945 return result;
14948 static int
14949 aarch64_register_move_cost (machine_mode mode,
14950 reg_class_t from_i, reg_class_t to_i)
14952 enum reg_class from = (enum reg_class) from_i;
14953 enum reg_class to = (enum reg_class) to_i;
14954 const struct cpu_regmove_cost *regmove_cost
14955 = aarch64_tune_params.regmove_cost;
14957 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
14958 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
14959 || to == STUB_REGS)
14960 to = GENERAL_REGS;
14962 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
14963 || from == STUB_REGS)
14964 from = GENERAL_REGS;
14966 /* Make RDFFR very expensive. In particular, if we know that the FFR
14967 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
14968 as a way of obtaining a PTRUE. */
14969 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
14970 && hard_reg_set_subset_p (reg_class_contents[from_i],
14971 reg_class_contents[FFR_REGS]))
14972 return 80;
14974 /* Moving between GPR and stack cost is the same as GP2GP. */
14975 if ((from == GENERAL_REGS && to == STACK_REG)
14976 || (to == GENERAL_REGS && from == STACK_REG))
14977 return regmove_cost->GP2GP;
14979 /* To/From the stack register, we move via the gprs. */
14980 if (to == STACK_REG || from == STACK_REG)
14981 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
14982 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
14984 if (known_eq (GET_MODE_SIZE (mode), 16))
14986 /* 128-bit operations on general registers require 2 instructions. */
14987 if (from == GENERAL_REGS && to == GENERAL_REGS)
14988 return regmove_cost->GP2GP * 2;
14989 else if (from == GENERAL_REGS)
14990 return regmove_cost->GP2FP * 2;
14991 else if (to == GENERAL_REGS)
14992 return regmove_cost->FP2GP * 2;
14994 /* When AdvSIMD instructions are disabled it is not possible to move
14995 a 128-bit value directly between Q registers. This is handled in
14996 secondary reload. A general register is used as a scratch to move
14997 the upper DI value and the lower DI value is moved directly,
14998 hence the cost is the sum of three moves. */
14999 if (! TARGET_SIMD)
15000 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15002 return regmove_cost->FP2FP;
15005 if (from == GENERAL_REGS && to == GENERAL_REGS)
15006 return regmove_cost->GP2GP;
15007 else if (from == GENERAL_REGS)
15008 return regmove_cost->GP2FP;
15009 else if (to == GENERAL_REGS)
15010 return regmove_cost->FP2GP;
15012 return regmove_cost->FP2FP;
15015 /* Implements TARGET_MEMORY_MOVE_COST. */
15016 static int
15017 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15019 enum reg_class rclass = (enum reg_class) rclass_i;
15020 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15021 ? reg_classes_intersect_p (rclass, PR_REGS)
15022 : reg_class_subset_p (rclass, PR_REGS))
15023 return (in
15024 ? aarch64_tune_params.memmov_cost.load_pred
15025 : aarch64_tune_params.memmov_cost.store_pred);
15027 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15028 ? reg_classes_intersect_p (rclass, FP_REGS)
15029 : reg_class_subset_p (rclass, FP_REGS))
15030 return (in
15031 ? aarch64_tune_params.memmov_cost.load_fp
15032 : aarch64_tune_params.memmov_cost.store_fp);
15034 return (in
15035 ? aarch64_tune_params.memmov_cost.load_int
15036 : aarch64_tune_params.memmov_cost.store_int);
15039 /* Implement TARGET_INIT_BUILTINS. */
15040 static void
15041 aarch64_init_builtins ()
15043 aarch64_general_init_builtins ();
15044 aarch64_sve::init_builtins ();
15045 #ifdef SUBTARGET_INIT_BUILTINS
15046 SUBTARGET_INIT_BUILTINS;
15047 #endif
15050 /* Implement TARGET_FOLD_BUILTIN. */
15051 static tree
15052 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15054 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15055 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15056 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15057 switch (code & AARCH64_BUILTIN_CLASS)
15059 case AARCH64_BUILTIN_GENERAL:
15060 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15062 case AARCH64_BUILTIN_SVE:
15063 return NULL_TREE;
15065 gcc_unreachable ();
15068 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15069 static bool
15070 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15072 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15073 tree fndecl = gimple_call_fndecl (stmt);
15074 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15075 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15076 gimple *new_stmt = NULL;
15077 switch (code & AARCH64_BUILTIN_CLASS)
15079 case AARCH64_BUILTIN_GENERAL:
15080 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15081 break;
15083 case AARCH64_BUILTIN_SVE:
15084 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15085 break;
15088 if (!new_stmt)
15089 return false;
15091 gsi_replace (gsi, new_stmt, true);
15092 return true;
15095 /* Implement TARGET_EXPAND_BUILTIN. */
15096 static rtx
15097 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15099 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15100 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15101 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15102 switch (code & AARCH64_BUILTIN_CLASS)
15104 case AARCH64_BUILTIN_GENERAL:
15105 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15107 case AARCH64_BUILTIN_SVE:
15108 return aarch64_sve::expand_builtin (subcode, exp, target);
15110 gcc_unreachable ();
15113 /* Implement TARGET_BUILTIN_DECL. */
15114 static tree
15115 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15117 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15118 switch (code & AARCH64_BUILTIN_CLASS)
15120 case AARCH64_BUILTIN_GENERAL:
15121 return aarch64_general_builtin_decl (subcode, initialize_p);
15123 case AARCH64_BUILTIN_SVE:
15124 return aarch64_sve::builtin_decl (subcode, initialize_p);
15126 gcc_unreachable ();
15129 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15130 to optimize 1.0/sqrt. */
15132 static bool
15133 use_rsqrt_p (machine_mode mode)
15135 return (!flag_trapping_math
15136 && flag_unsafe_math_optimizations
15137 && ((aarch64_tune_params.approx_modes->recip_sqrt
15138 & AARCH64_APPROX_MODE (mode))
15139 || flag_mrecip_low_precision_sqrt));
15142 /* Function to decide when to use the approximate reciprocal square root
15143 builtin. */
15145 static tree
15146 aarch64_builtin_reciprocal (tree fndecl)
15148 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15150 if (!use_rsqrt_p (mode))
15151 return NULL_TREE;
15152 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15153 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15154 switch (code & AARCH64_BUILTIN_CLASS)
15156 case AARCH64_BUILTIN_GENERAL:
15157 return aarch64_general_builtin_rsqrt (subcode);
15159 case AARCH64_BUILTIN_SVE:
15160 return NULL_TREE;
15162 gcc_unreachable ();
15165 /* Emit code to perform the floating-point operation:
15167 DST = SRC1 * SRC2
15169 where all three operands are already known to be registers.
15170 If the operation is an SVE one, PTRUE is a suitable all-true
15171 predicate. */
15173 static void
15174 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15176 if (ptrue)
15177 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15178 dst, ptrue, src1, src2,
15179 gen_int_mode (SVE_RELAXED_GP, SImode)));
15180 else
15181 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15184 /* Emit instruction sequence to compute either the approximate square root
15185 or its approximate reciprocal, depending on the flag RECP, and return
15186 whether the sequence was emitted or not. */
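/* The expansion below follows the usual Newton-Raphson scheme for
   1/sqrt(a): starting from x0 = FRSQRTE (a), each step refines
     x' = x * FRSQRTS (a, x * x), where FRSQRTS (a, b) = (3 - a * b) / 2,
   and the square root itself is then recovered as a * (1/sqrt(a)). */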
15188 bool
15189 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15191 machine_mode mode = GET_MODE (dst);
15193 if (GET_MODE_INNER (mode) == HFmode)
15195 gcc_assert (!recp);
15196 return false;
15199 if (!recp)
15201 if (!(flag_mlow_precision_sqrt
15202 || (aarch64_tune_params.approx_modes->sqrt
15203 & AARCH64_APPROX_MODE (mode))))
15204 return false;
15206 if (!flag_finite_math_only
15207 || flag_trapping_math
15208 || !flag_unsafe_math_optimizations
15209 || optimize_function_for_size_p (cfun))
15210 return false;
15212 else
15213 /* Caller assumes we cannot fail. */
15214 gcc_assert (use_rsqrt_p (mode));
15216 rtx pg = NULL_RTX;
15217 if (aarch64_sve_mode_p (mode))
15218 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15219 machine_mode mmsk = (VECTOR_MODE_P (mode)
15220 ? related_int_vector_mode (mode).require ()
15221 : int_mode_for_mode (mode).require ());
15222 rtx xmsk = NULL_RTX;
15223 if (!recp)
15225 /* When calculating the approximate square root, compare the
15226 argument with 0.0 and create a mask. */
15227 rtx zero = CONST0_RTX (mode);
15228 if (pg)
15230 xmsk = gen_reg_rtx (GET_MODE (pg));
15231 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15232 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15233 xmsk, pg, hint, src, zero));
15235 else
15237 xmsk = gen_reg_rtx (mmsk);
15238 emit_insn (gen_rtx_SET (xmsk,
15239 gen_rtx_NEG (mmsk,
15240 gen_rtx_EQ (mmsk, src, zero))));
15244 /* Estimate the approximate reciprocal square root. */
15245 rtx xdst = gen_reg_rtx (mode);
15246 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15248 /* Iterate over the series twice for SF and thrice for DF. */
15249 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15251 /* Optionally iterate over the series once less for faster performance
15252 while sacrificing some accuracy. */
15253 if ((recp && flag_mrecip_low_precision_sqrt)
15254 || (!recp && flag_mlow_precision_sqrt))
15255 iterations--;
15257 /* Iterate over the series to calculate the approximate reciprocal square
15258 root. */
15259 rtx x1 = gen_reg_rtx (mode);
15260 while (iterations--)
15262 rtx x2 = gen_reg_rtx (mode);
15263 aarch64_emit_mult (x2, pg, xdst, xdst);
15265 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15267 if (iterations > 0)
15268 aarch64_emit_mult (xdst, pg, xdst, x1);
15271 if (!recp)
15273 if (pg)
15274 /* Multiply nonzero source values by the corresponding intermediate
15275 result elements, so that the final calculation is the approximate
15276 square root rather than its reciprocal. Select a zero result for
15277 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15278 otherwise. */
15279 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15280 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15281 else
15283 /* Qualify the approximate reciprocal square root when the
15284 argument is 0.0 by squashing the intermediate result to 0.0. */
15285 rtx xtmp = gen_reg_rtx (mmsk);
15286 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15287 gen_rtx_SUBREG (mmsk, xdst, 0)));
15288 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15290 /* Calculate the approximate square root. */
15291 aarch64_emit_mult (xdst, pg, xdst, src);
15295 /* Finalize the approximation. */
15296 aarch64_emit_mult (dst, pg, xdst, x1);
15298 return true;
15301 /* Emit the instruction sequence to compute the approximation for the division
15302 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
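/* As with the square root above, this uses Newton-Raphson iteration for
   the reciprocal: starting from x0 = FRECPE (den), each step refines
     x' = x * FRECPS (den, x), where FRECPS (a, b) = 2 - a * b,
   and the quotient is then obtained as num * (1/den). */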
15304 bool
15305 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15307 machine_mode mode = GET_MODE (quo);
15309 if (GET_MODE_INNER (mode) == HFmode)
15310 return false;
15312 bool use_approx_division_p = (flag_mlow_precision_div
15313 || (aarch64_tune_params.approx_modes->division
15314 & AARCH64_APPROX_MODE (mode)));
15316 if (!flag_finite_math_only
15317 || flag_trapping_math
15318 || !flag_unsafe_math_optimizations
15319 || optimize_function_for_size_p (cfun)
15320 || !use_approx_division_p)
15321 return false;
15323 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15324 return false;
15326 rtx pg = NULL_RTX;
15327 if (aarch64_sve_mode_p (mode))
15328 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15330 /* Estimate the approximate reciprocal. */
15331 rtx xrcp = gen_reg_rtx (mode);
15332 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15334 /* Iterate over the series twice for SF and thrice for DF. */
15335 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15337 /* Optionally iterate over the series less for faster performance,
15338 while sacrificing some accuracy. The default is 2 for DF and 1 for SF. */
15339 if (flag_mlow_precision_div)
15340 iterations = (GET_MODE_INNER (mode) == DFmode
15341 ? aarch64_double_recp_precision
15342 : aarch64_float_recp_precision);
15344 /* Iterate over the series to calculate the approximate reciprocal. */
15345 rtx xtmp = gen_reg_rtx (mode);
15346 while (iterations--)
15348 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15350 if (iterations > 0)
15351 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15354 if (num != CONST1_RTX (mode))
15356 /* As the approximate reciprocal of DEN is already calculated, only
15357 calculate the approximate division when NUM is not 1.0. */
15358 rtx xnum = force_reg (mode, num);
15359 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15362 /* Finalize the approximation. */
15363 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15364 return true;
15367 /* Return the number of instructions that can be issued per cycle. */
15368 static int
15369 aarch64_sched_issue_rate (void)
15371 return aarch64_tune_params.issue_rate;
15374 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15375 static int
15376 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15378 if (DEBUG_INSN_P (insn))
15379 return more;
15381 rtx_code code = GET_CODE (PATTERN (insn));
15382 if (code == USE || code == CLOBBER)
15383 return more;
15385 if (get_attr_type (insn) == TYPE_NO_INSN)
15386 return more;
15388 return more - 1;
15391 static int
15392 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15394 int issue_rate = aarch64_sched_issue_rate ();
15396 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15400 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15401 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15402 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15404 static int
15405 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15406 int ready_index)
15408 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15412 /* Vectorizer cost model target hooks. */
15414 /* Information about how the CPU would issue the scalar, Advanced SIMD
15415 or SVE version of a vector loop, using the scheme defined by the
15416 aarch64_base_vec_issue_info hierarchy of structures. */
15417 class aarch64_vec_op_count
15419 public:
15420 aarch64_vec_op_count () = default;
15421 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15422 unsigned int = 1);
15424 unsigned int vec_flags () const { return m_vec_flags; }
15425 unsigned int vf_factor () const { return m_vf_factor; }
15427 const aarch64_base_vec_issue_info *base_issue_info () const;
15428 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15429 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15431 fractional_cost rename_cycles_per_iter () const;
15432 fractional_cost min_nonpred_cycles_per_iter () const;
15433 fractional_cost min_pred_cycles_per_iter () const;
15434 fractional_cost min_cycles_per_iter () const;
15436 void dump () const;
15438 /* The number of individual "general" operations. See the comments
15439 in aarch64_base_vec_issue_info for details. */
15440 unsigned int general_ops = 0;
15442 /* The number of load and store operations, under the same scheme
15443 as above. */
15444 unsigned int loads = 0;
15445 unsigned int stores = 0;
15447 /* The minimum number of cycles needed to execute all loop-carried
15448 operations, which in the vector code become associated with
15449 reductions. */
15450 unsigned int reduction_latency = 0;
15452 /* The number of individual predicate operations. See the comments
15453 in aarch64_sve_vec_issue_info for details. */
15454 unsigned int pred_ops = 0;
15456 private:
15457 /* The issue information for the core. */
15458 const aarch64_vec_issue_info *m_issue_info = nullptr;
15460 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15461 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15462 Advanced SIMD code.
15463 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15464 SVE code. */
15465 unsigned int m_vec_flags = 0;
15467 /* Assume that, when the code is executing on the core described
15468 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15469 times more data than the vectorizer anticipates.
15471 This is only ever different from 1 for SVE. It allows us to consider
15472 what would happen on a 256-bit SVE target even when the -mtune
15473 parameters say that the “likely” SVE length is 128 bits. */
15474 unsigned int m_vf_factor = 1;
15477 aarch64_vec_op_count::
15478 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15479 unsigned int vec_flags, unsigned int vf_factor)
15480 : m_issue_info (issue_info),
15481 m_vec_flags (vec_flags),
15482 m_vf_factor (vf_factor)
15486 /* Return the base issue information (i.e. the parts that make sense
15487 for both scalar and vector code). Return null if we have no issue
15488 information. */
15489 const aarch64_base_vec_issue_info *
15490 aarch64_vec_op_count::base_issue_info () const
15492 if (auto *ret = simd_issue_info ())
15493 return ret;
15494 return m_issue_info->scalar;
15497 /* If the structure describes vector code and we have associated issue
15498 information, return that issue information, otherwise return null. */
15499 const aarch64_simd_vec_issue_info *
15500 aarch64_vec_op_count::simd_issue_info () const
15502 if (auto *ret = sve_issue_info ())
15503 return ret;
15504 if (m_vec_flags)
15505 return m_issue_info->advsimd;
15506 return nullptr;
15509 /* If the structure describes SVE code and we have associated issue
15510 information, return that issue information, otherwise return null. */
15511 const aarch64_sve_vec_issue_info *
15512 aarch64_vec_op_count::sve_issue_info () const
15514 if (m_vec_flags & VEC_ANY_SVE)
15515 return m_issue_info->sve;
15516 return nullptr;
15519 /* Estimate the minimum number of cycles per iteration needed to rename
15520 the instructions.
15522 ??? For now this is done inline rather than via cost tables, since it
15523 isn't clear how it should be parameterized for the general case. */
15524 fractional_cost
15525 aarch64_vec_op_count::rename_cycles_per_iter () const
15527 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15528 || sve_issue_info () == &neoversen2_sve_issue_info
15529 || sve_issue_info () == &demeter_sve_issue_info)
15530 /* + 1 for an addition. We've already counted a general op for each
15531 store, so we don't need to account for stores separately. The branch
15532 reads no registers and so does not need to be counted either.
15534 ??? This value is very much on the pessimistic side, but seems to work
15535 pretty well in practice. */
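/* In other words, the cost below assumes roughly five of these
   operations can be renamed per cycle. */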
15536 return { general_ops + loads + pred_ops + 1, 5 };
15538 return 0;
15541 /* Like min_cycles_per_iter, but excluding predicate operations. */
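/* Informally, the result below is:
     max (reduction_latency,
          stores / stores_per_cycle,
          (loads + stores) / loads_stores_per_cycle,
          general_ops / general_ops_per_cycle,
          rename_cycles_per_iter ()). */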
15542 fractional_cost
15543 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15545 auto *issue_info = base_issue_info ();
15547 fractional_cost cycles = MAX (reduction_latency, 1);
15548 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15549 cycles = std::max (cycles, { loads + stores,
15550 issue_info->loads_stores_per_cycle });
15551 cycles = std::max (cycles, { general_ops,
15552 issue_info->general_ops_per_cycle });
15553 cycles = std::max (cycles, rename_cycles_per_iter ());
15554 return cycles;
15557 /* Like min_cycles_per_iter, but including only the predicate operations. */
15558 fractional_cost
15559 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15561 if (auto *issue_info = sve_issue_info ())
15562 return { pred_ops, issue_info->pred_ops_per_cycle };
15563 return 0;
15566 /* Estimate the minimum number of cycles needed to issue the operations.
15567 This is a very simplistic model! */
15568 fractional_cost
15569 aarch64_vec_op_count::min_cycles_per_iter () const
15571 return std::max (min_nonpred_cycles_per_iter (),
15572 min_pred_cycles_per_iter ());
15575 /* Dump information about the structure. */
15576 void
15577 aarch64_vec_op_count::dump () const
15579 dump_printf_loc (MSG_NOTE, vect_location,
15580 " load operations = %d\n", loads);
15581 dump_printf_loc (MSG_NOTE, vect_location,
15582 " store operations = %d\n", stores);
15583 dump_printf_loc (MSG_NOTE, vect_location,
15584 " general operations = %d\n", general_ops);
15585 if (sve_issue_info ())
15586 dump_printf_loc (MSG_NOTE, vect_location,
15587 " predicate operations = %d\n", pred_ops);
15588 dump_printf_loc (MSG_NOTE, vect_location,
15589 " reduction latency = %d\n", reduction_latency);
15590 if (auto rcpi = rename_cycles_per_iter ())
15591 dump_printf_loc (MSG_NOTE, vect_location,
15592 " estimated cycles per iteration to rename = %f\n",
15593 rcpi.as_double ());
15594 if (auto pred_cpi = min_pred_cycles_per_iter ())
15596 dump_printf_loc (MSG_NOTE, vect_location,
15597 " estimated min cycles per iteration"
15598 " without predication = %f\n",
15599 min_nonpred_cycles_per_iter ().as_double ());
15600 dump_printf_loc (MSG_NOTE, vect_location,
15601 " estimated min cycles per iteration"
15602 " for predication = %f\n", pred_cpi.as_double ());
15604 if (auto cpi = min_cycles_per_iter ())
15605 dump_printf_loc (MSG_NOTE, vect_location,
15606 " estimated min cycles per iteration = %f\n",
15607 cpi.as_double ());
15610 /* Information about vector code that we're in the process of costing. */
15611 class aarch64_vector_costs : public vector_costs
15613 public:
15614 aarch64_vector_costs (vec_info *, bool);
15616 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15617 stmt_vec_info stmt_info, slp_tree, tree vectype,
15618 int misalign,
15619 vect_cost_model_location where) override;
15620 void finish_cost (const vector_costs *) override;
15621 bool better_main_loop_than_p (const vector_costs *other) const override;
15623 private:
15624 void record_potential_advsimd_unrolling (loop_vec_info);
15625 void analyze_loop_vinfo (loop_vec_info);
15626 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15627 aarch64_vec_op_count *);
15628 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15629 fractional_cost, unsigned int,
15630 unsigned int *, bool *);
15631 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15632 unsigned int);
15633 bool prefer_unrolled_loop () const;
15634 unsigned int determine_suggested_unroll_factor (loop_vec_info);
15636 /* True if we have performed one-time initialization based on the
15637 vec_info. */
15638 bool m_analyzed_vinfo = false;
15640 /* This loop uses an average operation that is not supported by SVE, but is
15641 supported by Advanced SIMD and SVE2. */
15642 bool m_has_avg = false;
15644 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15645 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15646 SIMD code.
15647 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15648 unsigned int m_vec_flags = 0;
15650 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15651 This means that code such as:
15653 a[0] = x;
15654 a[1] = x;
15656 will be costed as two scalar instructions and two vector instructions
15657 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15658 wins if the costs are equal, because the vector costs
15659 include constant initializations whereas the scalar costs don't.
15660 We would therefore tend to vectorize the code above, even though
15661 the scalar version can use a single STP.
15663 We should eventually fix this and model LDP and STP in the main costs;
15664 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15665 Until then, we look specifically for code that does nothing more than
15666 STP-like operations. We cost them on that basis in addition to the
15667 normal latency-based costs.
15669 If the scalar or vector code could be a sequence of STPs +
15670 initialization, this variable counts the cost of the sequence,
15671 with 2 units per instruction. The variable is ~0U for other
15672 kinds of code. */
15673 unsigned int m_stp_sequence_cost = 0;
15675 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15676 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15677 situations, we try to predict whether an Advanced SIMD implementation
15678 of the loop could be completely unrolled and become straight-line code.
15679 If so, it is generally better to use the Advanced SIMD version rather
15680 than length-agnostic SVE, since the SVE loop would execute an unknown
15681 number of times and so could not be completely unrolled in the same way.
15683 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15684 number of Advanced SIMD loop iterations that would be unrolled and
15685 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15686 in the unrolled loop. Both values are zero if we're not applying
15687 the heuristic. */
15688 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15689 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15691 /* If we're vectorizing a loop that executes a constant number of times,
15692 this variable gives the number of times that the vector loop would
15693 iterate, otherwise it is zero. */
15694 uint64_t m_num_vector_iterations = 0;
15696 /* Used only when vectorizing loops. Estimates the number and kind of
15697 operations that would be needed by one iteration of the scalar
15698 or vector loop. There is one entry for each tuning option of
15699 interest. */
15700 auto_vec<aarch64_vec_op_count, 2> m_ops;
15703 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15704 bool costing_for_scalar)
15705 : vector_costs (vinfo, costing_for_scalar),
15706 m_vec_flags (costing_for_scalar ? 0
15707 : aarch64_classify_vector_mode (vinfo->vector_mode))
15709 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15711 m_ops.quick_push ({ issue_info, m_vec_flags });
15712 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15714 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15715 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15716 vf_factor });
15721 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15722 vector_costs *
15723 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15725 return new aarch64_vector_costs (vinfo, costing_for_scalar);
15728 /* Return true if the current CPU should use the new costs defined
15729 in GCC 11. This should be removed for GCC 12 and above, with the
15730 costs applying to all CPUs instead. */
15731 static bool
15732 aarch64_use_new_vector_costs_p ()
15734 return (aarch64_tune_params.extra_tuning_flags
15735 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
15738 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
15739 static const simd_vec_cost *
15740 aarch64_simd_vec_costs (tree vectype)
15742 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15743 if (vectype != NULL
15744 && aarch64_sve_mode_p (TYPE_MODE (vectype))
15745 && costs->sve != NULL)
15746 return costs->sve;
15747 return costs->advsimd;
15750 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
15751 static const simd_vec_cost *
15752 aarch64_simd_vec_costs_for_flags (unsigned int flags)
15754 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15755 if ((flags & VEC_ANY_SVE) && costs->sve)
15756 return costs->sve;
15757 return costs->advsimd;
15760 /* If STMT_INFO is a memory reference, return the scalar memory type,
15761 otherwise return null. */
15762 static tree
15763 aarch64_dr_type (stmt_vec_info stmt_info)
15765 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
15766 return TREE_TYPE (DR_REF (dr));
15767 return NULL_TREE;
15770 /* Decide whether to use the unrolling heuristic described above
15771 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
15772 describes the loop that we're vectorizing. */
15773 void
15774 aarch64_vector_costs::
15775 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
15777 /* The heuristic only makes sense on targets that have the same
15778 vector throughput for SVE and Advanced SIMD. */
15779 if (!(aarch64_tune_params.extra_tuning_flags
15780 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
15781 return;
15783 /* We only want to apply the heuristic if LOOP_VINFO is being
15784 vectorized for SVE. */
15785 if (!(m_vec_flags & VEC_ANY_SVE))
15786 return;
15788 /* Check whether it is possible in principle to use Advanced SIMD
15789 instead. */
15790 if (aarch64_autovec_preference == 2)
15791 return;
15793 /* We don't want to apply the heuristic to outer loops, since it's
15794 harder to track two levels of unrolling. */
15795 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
15796 return;
15798 /* Only handle cases in which the number of Advanced SIMD iterations
15799 would be known at compile time but the number of SVE iterations
15800 would not. */
15801 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
15802 || aarch64_sve_vg.is_constant ())
15803 return;
15805 /* Guess how many times the Advanced SIMD loop would iterate and make
15806 sure that it is within the complete unrolling limit. Even if the
15807 number of iterations is small enough, the number of statements might
15808 not be, which is why we need to estimate the number of statements too. */
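/* For example, with 64 known scalar iterations over 32-bit elements,
   an SVE costing VF of 8 and an estimated SVE length of two quadwords,
   the Advanced SIMD VF would be CEIL (8, 2) = 4 and the fully unrolled
   Advanced SIMD loop would have 64 / 4 = 16 copies, which is then
   checked against the complete-unrolling limit. */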
15809 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
15810 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
15811 unsigned HOST_WIDE_INT unrolled_advsimd_niters
15812 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
15813 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
15814 return;
15816 /* Record that we're applying the heuristic and should try to estimate
15817 the number of statements in the Advanced SIMD loop. */
15818 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
15821 /* Do one-time initialization of the aarch64_vector_costs given that we're
15822 costing the loop vectorization described by LOOP_VINFO. */
15823 void
15824 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
15826 /* Record the number of times that the vector loop would execute,
15827 if known. */
15828 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
15829 auto scalar_niters = max_stmt_executions_int (loop);
15830 if (scalar_niters >= 0)
15832 unsigned int vf = vect_vf_for_cost (loop_vinfo);
15833 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
15834 m_num_vector_iterations = scalar_niters / vf;
15835 else
15836 m_num_vector_iterations = CEIL (scalar_niters, vf);
15839 /* Detect whether we're vectorizing for SVE and should apply the unrolling
15840 heuristic described above m_unrolled_advsimd_niters. */
15841 record_potential_advsimd_unrolling (loop_vinfo);
15843 /* Record the issue information for any SVE WHILE instructions that the
15844 loop needs. */
15845 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
15847 unsigned int num_masks = 0;
15848 rgroup_controls *rgm;
15849 unsigned int num_vectors_m1;
15850 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
15851 if (rgm->type)
15852 num_masks += num_vectors_m1 + 1;
15853 for (auto &ops : m_ops)
15854 if (auto *issue = ops.sve_issue_info ())
15855 ops.pred_ops += num_masks * issue->while_pred_ops;
15859 /* Implement targetm.vectorize.builtin_vectorization_cost. */
15860 static int
15861 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
15862 tree vectype,
15863 int misalign ATTRIBUTE_UNUSED)
15865 unsigned elements;
15866 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15867 bool fp = false;
15869 if (vectype != NULL)
15870 fp = FLOAT_TYPE_P (vectype);
15872 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
15874 switch (type_of_cost)
15876 case scalar_stmt:
15877 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
15879 case scalar_load:
15880 return costs->scalar_load_cost;
15882 case scalar_store:
15883 return costs->scalar_store_cost;
15885 case vector_stmt:
15886 return fp ? simd_costs->fp_stmt_cost
15887 : simd_costs->int_stmt_cost;
15889 case vector_load:
15890 return simd_costs->align_load_cost;
15892 case vector_store:
15893 return simd_costs->store_cost;
15895 case vec_to_scalar:
15896 return simd_costs->vec_to_scalar_cost;
15898 case scalar_to_vec:
15899 return simd_costs->scalar_to_vec_cost;
15901 case unaligned_load:
15902 case vector_gather_load:
15903 return simd_costs->unalign_load_cost;
15905 case unaligned_store:
15906 case vector_scatter_store:
15907 return simd_costs->unalign_store_cost;
15909 case cond_branch_taken:
15910 return costs->cond_taken_branch_cost;
15912 case cond_branch_not_taken:
15913 return costs->cond_not_taken_branch_cost;
15915 case vec_perm:
15916 return simd_costs->permute_cost;
15918 case vec_promote_demote:
15919 return fp ? simd_costs->fp_stmt_cost
15920 : simd_costs->int_stmt_cost;
15922 case vec_construct:
15923 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
15924 return elements / 2 + 1;
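/* I.e. a four-element vec_construct above is costed as 4/2 + 1 = 3. */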
15926 default:
15927 gcc_unreachable ();
15931 /* Check whether an access of kind KIND for STMT_INFO represents one
15932 vector of an LD[234] or ST[234] operation. Return the total number of
15933 vectors (2, 3 or 4) if so, otherwise return a value outside that range. */
15934 static int
15935 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
15937 if ((kind == vector_load
15938 || kind == unaligned_load
15939 || kind == vector_store
15940 || kind == unaligned_store)
15941 && STMT_VINFO_DATA_REF (stmt_info))
15943 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
15944 if (stmt_info
15945 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
15946 return DR_GROUP_SIZE (stmt_info);
15948 return 0;
15951 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
15952 vectors would produce a series of LDP or STP operations. KIND is the
15953 kind of statement that STMT_INFO represents. */
15954 static bool
15955 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
15956 stmt_vec_info stmt_info)
15958 switch (kind)
15960 case vector_load:
15961 case vector_store:
15962 case unaligned_load:
15963 case unaligned_store:
15964 break;
15966 default:
15967 return false;
15970 if (aarch64_tune_params.extra_tuning_flags
15971 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
15972 return false;
15974 return is_gimple_assign (stmt_info->stmt);
15977 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
15978 or multiply-subtract sequence that might be suitable for fusing into a
15979 single instruction. If VEC_FLAGS is zero, analyze the operation as
15980 a scalar one, otherwise analyze it as an operation on vectors with those
15981 VEC_* flags. */
15982 static bool
15983 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
15984 unsigned int vec_flags)
15986 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
15987 if (!assign)
15988 return false;
15989 tree_code code = gimple_assign_rhs_code (assign);
15990 if (code != PLUS_EXPR && code != MINUS_EXPR)
15991 return false;
15993 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
15994 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
15995 return false;
15997 for (int i = 1; i < 3; ++i)
15999 tree rhs = gimple_op (assign, i);
16000 /* ??? Should we try to check for a single use as well? */
16001 if (TREE_CODE (rhs) != SSA_NAME)
16002 continue;
16004 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16005 if (!def_stmt_info
16006 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16007 continue;
16008 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16009 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16010 continue;
16012 if (vec_flags & VEC_ADVSIMD)
16014 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16015 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16016 only supports MLA forms, so will require a move if the result
16017 cannot be tied to the accumulator. The most important case in
16018 which this is true is when the accumulator input is invariant. */
16019 rhs = gimple_op (assign, 3 - i);
16020 if (TREE_CODE (rhs) != SSA_NAME)
16021 return false;
16022 def_stmt_info = vinfo->lookup_def (rhs);
16023 if (!def_stmt_info
16024 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16025 return false;
16028 return true;
16030 return false;
16033 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16034 in-loop reduction that SVE supports directly, return its latency in cycles,
16035 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16036 instructions. */
16037 static unsigned int
16038 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16039 stmt_vec_info stmt_info,
16040 const sve_vec_cost *sve_costs)
16042 switch (vect_reduc_type (vinfo, stmt_info))
16044 case EXTRACT_LAST_REDUCTION:
16045 return sve_costs->clast_cost;
16047 case FOLD_LEFT_REDUCTION:
16048 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16050 case E_HFmode:
16051 case E_BFmode:
16052 return sve_costs->fadda_f16_cost;
16054 case E_SFmode:
16055 return sve_costs->fadda_f32_cost;
16057 case E_DFmode:
16058 return sve_costs->fadda_f64_cost;
16060 default:
16061 break;
16063 break;
16066 return 0;
16069 /* STMT_INFO describes a loop-carried operation in the original scalar code
16070 that we are considering implementing as a reduction. Return one of the
16071 following values, depending on VEC_FLAGS:
16073 - If VEC_FLAGS is zero, return the loop carry latency of the original
16074 scalar operation.
16076 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16077 Advanced SIMD implementation.
16079 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16080 SVE implementation. */
16081 static unsigned int
16082 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16083 unsigned int vec_flags)
16085 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16086 const sve_vec_cost *sve_costs = nullptr;
16087 if (vec_flags & VEC_ANY_SVE)
16088 sve_costs = aarch64_tune_params.vec_costs->sve;
16090 /* If the caller is asking for the SVE latency, check for forms of reduction
16091 that only SVE can handle directly. */
16092 if (sve_costs)
16094 unsigned int latency
16095 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16096 if (latency)
16097 return latency;
16100 /* Handle scalar costs. */
16101 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16102 if (vec_flags == 0)
16104 if (is_float)
16105 return vec_costs->scalar_fp_stmt_cost;
16106 return vec_costs->scalar_int_stmt_cost;
16109 /* Otherwise, the loop body just contains normal integer or FP operations,
16110 with a vector reduction outside the loop. */
16111 const simd_vec_cost *simd_costs
16112 = aarch64_simd_vec_costs_for_flags (vec_flags);
16113 if (is_float)
16114 return simd_costs->fp_stmt_cost;
16115 return simd_costs->int_stmt_cost;
16118 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16119 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16120 try to subdivide the target-independent categorization provided by KIND
16121 to get a more accurate cost. */
16122 static fractional_cost
16123 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16124 stmt_vec_info stmt_info,
16125 fractional_cost stmt_cost)
16127 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16128 the extension with the load. */
16129 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16130 return 0;
16132 return stmt_cost;
16135 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16136 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16137 when vectorized would operate on vector type VECTYPE. Try to subdivide
16138 the target-independent categorization provided by KIND to get a more
16139 accurate cost. WHERE specifies where the cost associated with KIND
16140 occurs. */
16141 static fractional_cost
16142 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16143 stmt_vec_info stmt_info, tree vectype,
16144 enum vect_cost_model_location where,
16145 fractional_cost stmt_cost)
16147 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16148 const sve_vec_cost *sve_costs = nullptr;
16149 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16150 sve_costs = aarch64_tune_params.vec_costs->sve;
16152 /* It's generally better to avoid costing inductions, since the induction
16153 will usually be hidden by other operations. This is particularly true
16154 for things like COND_REDUCTIONS. */
16155 if (is_a<gphi *> (stmt_info->stmt))
16156 return 0;
16158 /* Detect cases in which vec_to_scalar is describing the extraction of a
16159 vector element in preparation for a scalar store. The store itself is
16160 costed separately. */
16161 if (vect_is_store_elt_extraction (kind, stmt_info))
16162 return simd_costs->store_elt_extra_cost;
16164 /* Detect SVE gather loads, which are costed as a single scalar_load
16165 for each element. We therefore need to divide the full-instruction
16166 cost by the number of elements in the vector. */
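/* For example, a gather of four 32-bit elements is costed as four
   scalar_loads of gather_load_x32_cost / 4 each, i.e. roughly one
   full gather instruction in total. */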
16167 if (kind == scalar_load
16168 && sve_costs
16169 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16171 unsigned int nunits = vect_nunits_for_cost (vectype);
16172 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16173 return { sve_costs->gather_load_x64_cost, nunits };
16174 return { sve_costs->gather_load_x32_cost, nunits };
16177 /* Detect cases in which a scalar_store is really storing one element
16178 in a scatter operation. */
16179 if (kind == scalar_store
16180 && sve_costs
16181 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16182 return sve_costs->scatter_store_elt_cost;
16184 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16185 if (kind == vec_to_scalar
16186 && where == vect_body
16187 && sve_costs)
16189 unsigned int latency
16190 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16191 if (latency)
16192 return latency;
16195 /* Detect cases in which vec_to_scalar represents a single reduction
16196 instruction like FADDP or MAXV. */
16197 if (kind == vec_to_scalar
16198 && where == vect_epilogue
16199 && vect_is_reduction (stmt_info))
16200 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16202 case E_QImode:
16203 return simd_costs->reduc_i8_cost;
16205 case E_HImode:
16206 return simd_costs->reduc_i16_cost;
16208 case E_SImode:
16209 return simd_costs->reduc_i32_cost;
16211 case E_DImode:
16212 return simd_costs->reduc_i64_cost;
16214 case E_HFmode:
16215 case E_BFmode:
16216 return simd_costs->reduc_f16_cost;
16218 case E_SFmode:
16219 return simd_costs->reduc_f32_cost;
16221 case E_DFmode:
16222 return simd_costs->reduc_f64_cost;
16224 default:
16225 break;
16228 /* Otherwise stick with the original categorization. */
16229 return stmt_cost;
16232 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16233 for STMT_INFO, which has cost kind KIND and which when vectorized would
16234 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16235 targets. */
16236 static fractional_cost
16237 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16238 stmt_vec_info stmt_info, tree vectype,
16239 fractional_cost stmt_cost)
16241 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16242 vector register size or number of units. Integer promotions of this
16243 type therefore map to SXT[BHW] or UXT[BHW].
16245 Most loads have extending forms that can do the sign or zero extension
16246 on the fly. Optimistically assume that a load followed by an extension
16247 will fold to this form during combine, and that the extension therefore
16248 comes for free. */
16249 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16250 stmt_cost = 0;
16252 /* For similar reasons, vector_stmt integer truncations are a no-op,
16253 because we can just ignore the unused upper bits of the source. */
16254 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16255 stmt_cost = 0;
16257 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16258 but there are no equivalent instructions for SVE. This means that
16259 (all other things being equal) 128-bit SVE needs twice as many load
16260 and store instructions as Advanced SIMD in order to process vector pairs.
16262 Also, scalar code can often use LDP and STP to access pairs of values,
16263 so it is too simplistic to say that one SVE load or store replaces
16264 VF scalar loads and stores.
16266 Ideally we would account for this in the scalar and Advanced SIMD
16267 costs by making suitable load/store pairs as cheap as a single
16268 load/store. However, that would be a very invasive change and in
16269 practice it tends to stress other parts of the cost model too much.
16270 E.g. stores of scalar constants currently count just a store,
16271 whereas stores of vector constants count a store and a vec_init.
16272 This is an artificial distinction for AArch64, where stores of
16273 nonzero scalar constants need the same kind of register invariant
16274 as vector stores.
16276 An alternative would be to double the cost of any SVE loads and stores
16277 that could be paired in Advanced SIMD (and possibly also paired in
16278 scalar code). But this tends to stress other parts of the cost model
16279 in the same way. It also means that we can fall back to Advanced SIMD
16280 even if full-loop predication would have been useful.
16282 Here we go for a more conservative version: double the costs of SVE
16283 loads and stores if one iteration of the scalar loop processes enough
16284 elements for it to use a whole number of Advanced SIMD LDP or STP
16285 instructions. This makes it very likely that the VF would be 1 for
16286 Advanced SIMD, and so no epilogue should be needed. */
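/* For example, a contiguous group of eight 32-bit elements is 256 bits
   per scalar iteration, i.e. exactly one Advanced SIMD LDP or STP of
   two 128-bit registers, so the SVE load/store cost is doubled for
   such accesses. */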
16287 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16289 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16290 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16291 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16292 if (multiple_p (count * elt_bits, 256)
16293 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16294 stmt_cost *= 2;
16297 return stmt_cost;
16300 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16301 and which when vectorized would operate on vector type VECTYPE. Add the
16302 cost of any embedded operations. */
16303 static fractional_cost
16304 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16305 tree vectype, fractional_cost stmt_cost)
16307 if (vectype)
16309 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16311 /* Detect cases in which a vector load or store represents an
16312 LD[234] or ST[234] instruction. */
16313 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16315 case 2:
16316 stmt_cost += simd_costs->ld2_st2_permute_cost;
16317 break;
16319 case 3:
16320 stmt_cost += simd_costs->ld3_st3_permute_cost;
16321 break;
16323 case 4:
16324 stmt_cost += simd_costs->ld4_st4_permute_cost;
16325 break;
16328 if (kind == vector_stmt || kind == vec_to_scalar)
16329 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16331 if (FLOAT_TYPE_P (cmp_type))
16332 stmt_cost += simd_costs->fp_stmt_cost;
16333 else
16334 stmt_cost += simd_costs->int_stmt_cost;
16338 if (kind == scalar_stmt)
16339 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16341 if (FLOAT_TYPE_P (cmp_type))
16342 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16343 else
16344 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16347 return stmt_cost;
16350 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16351 and they describe an operation in the body of a vector loop. Record issue
16352 information relating to the vector operation in OPS. */
16353 void
16354 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16355 stmt_vec_info stmt_info,
16356 aarch64_vec_op_count *ops)
16358 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16359 if (!base_issue)
16360 return;
16361 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16362 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16364 /* Calculate the minimum cycles per iteration imposed by a reduction
16365 operation. */
16366 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16367 && vect_is_reduction (stmt_info))
16369 unsigned int base
16370 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16372 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16373 that's not yet the case. */
16374 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16377 /* Assume that multiply-adds will become a single operation. */
16378 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16379 return;
16381 /* Count the basic operation cost associated with KIND. */
16382 switch (kind)
16384 case cond_branch_taken:
16385 case cond_branch_not_taken:
16386 case vector_gather_load:
16387 case vector_scatter_store:
16388 /* We currently don't expect these to be used in a loop body. */
16389 break;
16391 case vec_perm:
16392 case vec_promote_demote:
16393 case vec_construct:
16394 case vec_to_scalar:
16395 case scalar_to_vec:
16396 case vector_stmt:
16397 case scalar_stmt:
16398 ops->general_ops += count;
16399 break;
16401 case scalar_load:
16402 case vector_load:
16403 case unaligned_load:
16404 ops->loads += count;
16405 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16406 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16407 break;
16409 case vector_store:
16410 case unaligned_store:
16411 case scalar_store:
16412 ops->stores += count;
16413 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16414 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16415 break;
16418 /* Add any embedded comparison operations. */
16419 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16420 && vect_embedded_comparison_type (stmt_info))
16421 ops->general_ops += count;
16423 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16424 have only accounted for one. */
16425 if ((kind == vector_stmt || kind == vec_to_scalar)
16426 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16427 ops->general_ops += count;
16429 /* Count the predicate operations needed by an SVE comparison. */
16430 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16431 if (tree type = vect_comparison_type (stmt_info))
16433 unsigned int base = (FLOAT_TYPE_P (type)
16434 ? sve_issue->fp_cmp_pred_ops
16435 : sve_issue->int_cmp_pred_ops);
16436 ops->pred_ops += base * count;
16439 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16440 if (simd_issue)
16441 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16443 case 2:
16444 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16445 break;
16447 case 3:
16448 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16449 break;
16451 case 4:
16452 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16453 break;
16456 /* Add any overhead associated with gather loads and scatter stores. */
16457 if (sve_issue
16458 && (kind == scalar_load || kind == scalar_store)
16459 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16461 unsigned int pairs = CEIL (count, 2);
16462 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16463 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
16467 /* Return true if STMT_INFO contains a memory access and if the constant
16468 component of the memory address is aligned to SIZE bytes. */
16469 static bool
16470 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16471 poly_uint64 size)
16473 if (!STMT_VINFO_DATA_REF (stmt_info))
16474 return false;
16476 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16477 stmt_info = first_stmt;
16478 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16479 /* Needed for gathers & scatters, for example. */
16480 if (!constant_offset)
16481 return false;
16483 return multiple_p (wi::to_poly_offset (constant_offset), size);
16486 /* Check if a scalar or vector stmt could be part of a region of code
16487 that does nothing more than store values to memory, in the scalar
16488 case using STP. Return the cost of the stmt if so, counting 2 for
16489 one instruction. Return ~0U otherwise.
16491 The arguments are a subset of those passed to add_stmt_cost. */
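/* Added illustration of the "2 per instruction" scoring used below:
   a scalar_store of a 64-bit value scores 1 per store, anticipating
   that two STRs combine into one STP; a pairable 128-bit vector_store
   scores CEIL (count, 2) * 2, i.e. one STP Q per pair of stores; and
   an integer scalar_to_vec scores 4 per copy, matching the two
   instructions needed for a GPR->SIMD dup.  */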
16492 unsigned int
16493 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16494 stmt_vec_info stmt_info, tree vectype)
16496 /* Code that stores vector constants uses a vector_load to create
16497 the constant. We don't apply the heuristic to that case for two
16498 main reasons:
16500 - At the moment, STPs are only formed via peephole2, and the
16501 constant scalar moves would often come between STRs and so
16502 prevent STP formation.
16504 - The scalar code also has to load the constant somehow, and that
16505 isn't costed. */
16506 switch (kind)
16508 case scalar_to_vec:
16509 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16510 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16512 case vec_construct:
16513 if (FLOAT_TYPE_P (vectype))
16514 /* Count 1 insn for the maximum number of FP->SIMD INS
16515 instructions. */
16516 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16518 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16519 maximum number of GPR->SIMD INS instructions. */
16520 return vect_nunits_for_cost (vectype) * 4 * count;
16522 case vector_store:
16523 case unaligned_store:
16524 /* Count 1 insn per vector if we can't form STP Q pairs. */
16525 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16526 return count * 2;
16527 if (aarch64_tune_params.extra_tuning_flags
16528 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16529 return count * 2;
16531 if (stmt_info)
16533 /* Assume we won't be able to use STP if the constant offset
16534 component of the address is misaligned. ??? This could be
16535 removed if we formed STP pairs earlier, rather than relying
16536 on peephole2. */
16537 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16538 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16539 return count * 2;
16541 return CEIL (count, 2) * 2;
16543 case scalar_store:
16544 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16546 /* Check for a mode in which STP pairs can be formed. */
16547 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16548 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16549 return ~0U;
16551 /* Assume we won't be able to use STP if the constant offset
16552 component of the address is misaligned. ??? This could be
16553 removed if we formed STP pairs earlier, rather than relying
16554 on peephole2. */
16555 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16556 return ~0U;
16558 return count;
16560 default:
16561 return ~0U;
16565 unsigned
16566 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16567 stmt_vec_info stmt_info, slp_tree,
16568 tree vectype, int misalign,
16569 vect_cost_model_location where)
16571 fractional_cost stmt_cost
16572 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16574 bool in_inner_loop_p = (where == vect_body
16575 && stmt_info
16576 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16578 /* Do one-time initialization based on the vinfo. */
16579 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16580 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16582 if (loop_vinfo)
16583 analyze_loop_vinfo (loop_vinfo);
16585 m_analyzed_vinfo = true;
16588 /* Apply the heuristic described above m_stp_sequence_cost. */
16589 if (m_stp_sequence_cost != ~0U)
16591 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16592 stmt_info, vectype);
16593 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16596 /* Try to get a more accurate cost by looking at STMT_INFO instead
16597 of just looking at KIND. */
16598 if (stmt_info && aarch64_use_new_vector_costs_p ())
16600 /* If we scalarize a strided store, the vectorizer costs one
16601 vec_to_scalar for each element. However, we can store the first
16602 element using an FP store without a separate extract step. */
16603 if (vect_is_store_elt_extraction (kind, stmt_info))
16604 count -= 1;
16606 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16607 stmt_info, stmt_cost);
16609 if (vectype && m_vec_flags)
16610 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16611 stmt_info, vectype,
16612 where, stmt_cost);
16615 /* Do any SVE-specific adjustments to the cost. */
16616 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16617 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16618 vectype, stmt_cost);
16620 if (stmt_info && aarch64_use_new_vector_costs_p ())
16622 /* Account for any extra "embedded" costs that apply additively
16623 to the base cost calculated above. */
16624 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16625 stmt_cost);
16627 /* If we're recording a nonzero vector loop body cost for the
16628 innermost loop, also estimate the operations that would need
16629 to be issued by all relevant implementations of the loop. */
16630 if (loop_vinfo
16631 && (m_costing_for_scalar || where == vect_body)
16632 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16633 && stmt_cost != 0)
16634 for (auto &ops : m_ops)
16635 count_ops (count, kind, stmt_info, &ops);
16637 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16638 estimate the number of statements in the unrolled Advanced SIMD
16639 loop. For simplicity, we assume that one iteration of the
16640 Advanced SIMD loop would need the same number of statements
16641 as one iteration of the SVE loop. */
16642 if (where == vect_body && m_unrolled_advsimd_niters)
16643 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16645 /* Detect the use of an averaging operation. */
16646 gimple *stmt = stmt_info->stmt;
16647 if (is_gimple_call (stmt)
16648 && gimple_call_internal_p (stmt))
16650 switch (gimple_call_internal_fn (stmt))
16652 case IFN_AVG_FLOOR:
16653 case IFN_AVG_CEIL:
16654 m_has_avg = true;
16655 default:
16656 break;
16660 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16663 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16664 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16665 says that we should prefer the Advanced SIMD loop. */
16666 bool
16667 aarch64_vector_costs::prefer_unrolled_loop () const
16669 if (!m_unrolled_advsimd_stmts)
16670 return false;
16672 if (dump_enabled_p ())
16673 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16674 " unrolled Advanced SIMD loop = %d\n",
16675 m_unrolled_advsimd_stmts);
16677 /* The balance here is tricky. On the one hand, we can't be sure whether
16678 the code is vectorizable with Advanced SIMD or not. However, even if
16679 it isn't vectorizable with Advanced SIMD, there's a possibility that
16680 the scalar code could also be unrolled. Some of the code might then
16681 benefit from SLP, or from using LDP and STP. We therefore apply
16682 the heuristic regardless of can_use_advsimd_p. */
16683 return (m_unrolled_advsimd_stmts
16684 && (m_unrolled_advsimd_stmts
16685 <= (unsigned int) param_max_completely_peeled_insns));
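/* Added example for prefer_unrolled_loop above, with hypothetical
   numbers: if the Advanced SIMD loop would be unrolled
   m_unrolled_advsimd_niters = 4 times and each iteration was costed
   at 30 statements, m_unrolled_advsimd_stmts is 120 and unrolling is
   preferred only if that does not exceed
   param_max_completely_peeled_insns.  */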
16688 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
16689 how fast the SVE code can be issued and compare it to the equivalent value
16690 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
16691 also compare it to the issue rate of Advanced SIMD code
16692 (ADVSIMD_CYCLES_PER_ITER).
16694 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16695 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16696 is true if we think the loop body is too expensive. */
16698 fractional_cost
16699 aarch64_vector_costs::
16700 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16701 fractional_cost scalar_cycles_per_iter,
16702 unsigned int orig_body_cost, unsigned int *body_cost,
16703 bool *should_disparage)
16705 if (dump_enabled_p ())
16706 ops->dump ();
16708 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16709 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16711 /* If the scalar version of the loop could issue at least as
16712 quickly as the predicate parts of the SVE loop, make the SVE loop
16713 prohibitively expensive. In this case vectorization is adding an
16714 overhead that the original scalar code didn't have.
16716 This is mostly intended to detect cases in which WHILELOs dominate
16717 for very tight loops, which is something that normal latency-based
16718 costs would not model. Adding this kind of cliff edge would be
16719 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16720 code in the caller handles that case in a more conservative way. */
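/* Added illustration: scalar_cycles_per_iter has already been scaled
   by the estimated VF in the caller, so it is comparable with the
   per-iteration SVE figures here.  If, hypothetically, the scalar
   code could issue in 2 cycles per vector iteration's worth of work
   while the SVE loop needs 2 cycles for its predicate operations
   alone, then sve_estimate = 3 exceeds 2 and the body cost is raised
   to at least orig_body_cost * 16 (assuming an estimated 128-bit
   vector length), making the SVE candidate prohibitively
   expensive.  */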
16721 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
16722 if (scalar_cycles_per_iter < sve_estimate)
16724 unsigned int min_cost
16725 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
16726 if (*body_cost < min_cost)
16728 if (dump_enabled_p ())
16729 dump_printf_loc (MSG_NOTE, vect_location,
16730 "Increasing body cost to %d because the"
16731 " scalar code could issue within the limit"
16732 " imposed by predicate operations\n",
16733 min_cost);
16734 *body_cost = min_cost;
16735 *should_disparage = true;
16739 return sve_cycles_per_iter;
16742 unsigned int
16743 aarch64_vector_costs::
16744 determine_suggested_unroll_factor (loop_vec_info loop_vinfo)
16746 bool sve = m_vec_flags & VEC_ANY_SVE;
16747 /* If we are trying to unroll an Advanced SIMD main loop that contains
16748 an averaging operation that we do not support with SVE and we might use a
16749 predicated epilogue, we need to be conservative and block unrolling as
16750 this might lead to a less optimal loop for the first and only epilogue
16751 using the original loop's vectorization factor.
16752 TODO: Remove this constraint when we add support for multiple epilogue
16753 vectorization. */
16754 if (!sve && !TARGET_SVE2 && m_has_avg)
16755 return 1;
16757 unsigned int max_unroll_factor = 1;
16758 auto vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
16759 for (auto vec_ops : m_ops)
16761 aarch64_simd_vec_issue_info const *vec_issue
16762 = vec_ops.simd_issue_info ();
16763 if (!vec_issue)
16764 return 1;
16765 /* Limit the unroll factor to a value adjustable by the user; the
16766 default value is 4. */
16767 unsigned int unroll_factor = MIN (aarch64_vect_unroll_limit,
16768 (int) known_alignment (vf));
16769 unsigned int factor
16770 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
16771 unsigned int temp;
16773 /* Sanity check, this should never happen. */
16774 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
16775 return 1;
16777 /* Check stores. */
16778 if (vec_ops.stores > 0)
16780 temp = CEIL (factor * vec_issue->stores_per_cycle,
16781 vec_ops.stores);
16782 unroll_factor = MIN (unroll_factor, temp);
16785 /* Check loads + stores. */
16786 if (vec_ops.loads > 0)
16788 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
16789 vec_ops.loads + vec_ops.stores);
16790 unroll_factor = MIN (unroll_factor, temp);
16793 /* Check general ops. */
16794 if (vec_ops.general_ops > 0)
16796 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
16797 vec_ops.general_ops);
16798 unroll_factor = MIN (unroll_factor, temp);
16800 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
16803 /* Make sure unroll factor is power of 2. */
16804 return 1 << ceil_log2 (max_unroll_factor);
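/* Added worked example with hypothetical issue rates: for a fixed VF
   of 4, no reduction (factor = 1), one store, one load and two
   general ops per vector iteration, and 2 stores, 3 loads+stores and
   4 general ops per cycle, the unroll factor starts at MIN (4, 4) = 4
   and is capped to CEIL (2, 1) = 2 by stores, CEIL (3, 2) = 2 by
   loads + stores and CEIL (4, 2) = 2 by general ops, so the suggested
   unroll factor is 1 << ceil_log2 (2) = 2.  */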
16807 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
16808 and return the new cost. */
16809 unsigned int
16810 aarch64_vector_costs::
16811 adjust_body_cost (loop_vec_info loop_vinfo,
16812 const aarch64_vector_costs *scalar_costs,
16813 unsigned int body_cost)
16815 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
16816 return body_cost;
16818 const auto &scalar_ops = scalar_costs->m_ops[0];
16819 const auto &vector_ops = m_ops[0];
16820 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
16821 unsigned int orig_body_cost = body_cost;
16822 bool should_disparage = false;
16824 if (dump_enabled_p ())
16825 dump_printf_loc (MSG_NOTE, vect_location,
16826 "Original vector body cost = %d\n", body_cost);
16828 fractional_cost scalar_cycles_per_iter
16829 = scalar_ops.min_cycles_per_iter () * estimated_vf;
16831 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
16833 if (dump_enabled_p ())
16835 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
16836 dump_printf_loc (MSG_NOTE, vect_location,
16837 "Vector loop iterates at most %wd times\n",
16838 m_num_vector_iterations);
16839 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
16840 scalar_ops.dump ();
16841 dump_printf_loc (MSG_NOTE, vect_location,
16842 " estimated cycles per vector iteration"
16843 " (for VF %d) = %f\n",
16844 estimated_vf, scalar_cycles_per_iter.as_double ());
16847 if (vector_ops.sve_issue_info ())
16849 if (dump_enabled_p ())
16850 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
16851 vector_cycles_per_iter
16852 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
16853 orig_body_cost, &body_cost, &should_disparage);
16855 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16857 /* Also take Neoverse V1 tuning into account, doubling the
16858 scalar and Advanced SIMD estimates to account for the
16859 doubling in SVE vector length. */
16860 if (dump_enabled_p ())
16861 dump_printf_loc (MSG_NOTE, vect_location,
16862 "Neoverse V1 estimate:\n");
16863 auto vf_factor = m_ops[1].vf_factor ();
16864 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
16865 orig_body_cost, &body_cost, &should_disparage);
16868 else
16870 if (dump_enabled_p ())
16872 dump_printf_loc (MSG_NOTE, vect_location,
16873 "Vector issue estimate:\n");
16874 vector_ops.dump ();
16878 /* Decide whether to stick to latency-based costs or whether to try to
16879 take issue rates into account. */
16880 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
16881 if (m_vec_flags & VEC_ANY_SVE)
16882 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
16884 if (m_num_vector_iterations >= 1
16885 && m_num_vector_iterations < threshold)
16887 if (dump_enabled_p ())
16888 dump_printf_loc (MSG_NOTE, vect_location,
16889 "Low iteration count, so using pure latency"
16890 " costs\n");
16892 /* Increase the cost of the vector code if it looks like the scalar code
16893 could issue more quickly. These values are only rough estimates,
16894 so minor differences should only result in minor changes. */
16895 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
16897 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
16898 scalar_cycles_per_iter);
16899 if (dump_enabled_p ())
16900 dump_printf_loc (MSG_NOTE, vect_location,
16901 "Increasing body cost to %d because scalar code"
16902 " would issue more quickly\n", body_cost);
16904 /* In general, it's expected that the proposed vector code would be able
16905 to issue more quickly than the original scalar code. This should
16906 already be reflected to some extent in the latency-based costs.
16908 However, the latency-based costs effectively assume that the scalar
16909 code and the vector code execute serially, which tends to underplay
16910 one important case: if the real (non-serialized) execution time of
16911 a scalar iteration is dominated by loop-carried dependencies,
16912 and if the vector code is able to reduce both the length of
16913 the loop-carried dependencies *and* the number of cycles needed
16914 to issue the code in general, we can be more confident that the
16915 vector code is an improvement, even if adding the other (non-loop-carried)
16916 latencies tends to hide this saving. We therefore reduce the cost of the
16917 vector loop body in proportion to the saving. */
16918 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
16919 && scalar_ops.reduction_latency == scalar_cycles_per_iter
16920 && scalar_cycles_per_iter > vector_cycles_per_iter
16921 && !should_disparage)
16923 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
16924 scalar_cycles_per_iter);
16925 if (dump_enabled_p ())
16926 dump_printf_loc (MSG_NOTE, vect_location,
16927 "Decreasing body cost to %d account for smaller"
16928 " reduction latency\n", body_cost);
16931 return body_cost;
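/* Added example for the scaling above, with hypothetical numbers: if
   the original body cost is 20, the scalar code needs 4 cycles per
   vector iteration's worth of work and the vector code needs 6 cycles
   per iteration, the cost is scaled up to 20 * 6 / 4 = 30.  The
   reduction-latency case scales the cost down by the same
   vector-to-scalar cycle ratio, which is then below 1.  */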
16934 void
16935 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
16937 auto *scalar_costs
16938 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
16939 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16940 if (loop_vinfo
16941 && m_vec_flags
16942 && aarch64_use_new_vector_costs_p ())
16944 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
16945 m_costs[vect_body]);
16946 m_suggested_unroll_factor
16947 = determine_suggested_unroll_factor (loop_vinfo);
16950 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
16951 the scalar code in the event of a tie, since there is more chance
16952 of scalar code being optimized with surrounding operations. */
16953 if (!loop_vinfo
16954 && scalar_costs
16955 && m_stp_sequence_cost != ~0U
16956 && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
16957 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
16959 vector_costs::finish_cost (scalar_costs);
16962 bool
16963 aarch64_vector_costs::
16964 better_main_loop_than_p (const vector_costs *uncast_other) const
16966 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
16968 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
16969 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
16971 if (dump_enabled_p ())
16972 dump_printf_loc (MSG_NOTE, vect_location,
16973 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
16974 GET_MODE_NAME (this_loop_vinfo->vector_mode),
16975 vect_vf_for_cost (this_loop_vinfo),
16976 GET_MODE_NAME (other_loop_vinfo->vector_mode),
16977 vect_vf_for_cost (other_loop_vinfo));
16979 /* Apply the unrolling heuristic described above
16980 m_unrolled_advsimd_niters. */
16981 if (bool (m_unrolled_advsimd_stmts)
16982 != bool (other->m_unrolled_advsimd_stmts))
16984 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
16985 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
16986 if (this_prefer_unrolled != other_prefer_unrolled)
16988 if (dump_enabled_p ())
16989 dump_printf_loc (MSG_NOTE, vect_location,
16990 "Preferring Advanced SIMD loop because"
16991 " it can be unrolled\n");
16992 return other_prefer_unrolled;
16996 for (unsigned int i = 0; i < m_ops.length (); ++i)
16998 if (dump_enabled_p ())
17000 if (i)
17001 dump_printf_loc (MSG_NOTE, vect_location,
17002 "Reconsidering with subtuning %d\n", i);
17003 dump_printf_loc (MSG_NOTE, vect_location,
17004 "Issue info for %s loop:\n",
17005 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17006 this->m_ops[i].dump ();
17007 dump_printf_loc (MSG_NOTE, vect_location,
17008 "Issue info for %s loop:\n",
17009 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17010 other->m_ops[i].dump ();
17013 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17014 * this->m_ops[i].vf_factor ());
17015 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17016 * other->m_ops[i].vf_factor ());
17018 /* If it appears that one loop could process the same amount of data
17019 in fewer cycles, prefer that loop over the other one. */
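/* Added illustration: multiplying each loop's cycles per iteration by
   the other loop's estimated VF compares cycles per element without
   division.  E.g. (hypothetically) 2 cycles/iter at VF 4 versus
   3 cycles/iter at VF 8 gives 2 * 8 = 16 against 3 * 4 = 12, so the
   VF-8 loop (0.375 cycles per element) is preferred over the VF-4
   loop (0.5 cycles per element).  */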
17020 fractional_cost this_cost
17021 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17022 fractional_cost other_cost
17023 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17024 if (dump_enabled_p ())
17026 dump_printf_loc (MSG_NOTE, vect_location,
17027 "Weighted cycles per iteration of %s loop ~= %f\n",
17028 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17029 this_cost.as_double ());
17030 dump_printf_loc (MSG_NOTE, vect_location,
17031 "Weighted cycles per iteration of %s loop ~= %f\n",
17032 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17033 other_cost.as_double ());
17035 if (this_cost != other_cost)
17037 if (dump_enabled_p ())
17038 dump_printf_loc (MSG_NOTE, vect_location,
17039 "Preferring loop with lower cycles"
17040 " per iteration\n");
17041 return this_cost < other_cost;
17044 /* If the issue rate of SVE code is limited by predicate operations
17045 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17046 and if Advanced SIMD code could issue within the limit imposed
17047 by the predicate operations, the predicate operations are adding an
17048 overhead that the original code didn't have and so we should prefer
17049 the Advanced SIMD version. */
17050 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17051 const aarch64_vec_op_count &b) -> bool
17053 if (a.pred_ops == 0
17054 && (b.min_pred_cycles_per_iter ()
17055 > b.min_nonpred_cycles_per_iter ()))
17057 if (dump_enabled_p ())
17058 dump_printf_loc (MSG_NOTE, vect_location,
17059 "Preferring Advanced SIMD loop since"
17060 " SVE loop is predicate-limited\n");
17061 return true;
17063 return false;
17065 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17066 return true;
17067 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17068 return false;
17071 return vector_costs::better_main_loop_than_p (other);
17074 static void initialize_aarch64_code_model (struct gcc_options *);
17076 /* Parse the TO_PARSE string and put the architecture struct that it
17077 selects into RES and the architectural features into ISA_FLAGS.
17078 Return an aarch64_parse_opt_result describing the parse result.
17079 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17080 When the TO_PARSE string contains an invalid extension,
17081 a copy of the string is created and stored to INVALID_EXTENSION. */
17083 static enum aarch64_parse_opt_result
17084 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17085 uint64_t *isa_flags, std::string *invalid_extension)
17087 const char *ext;
17088 const struct processor *arch;
17089 size_t len;
17091 ext = strchr (to_parse, '+');
17093 if (ext != NULL)
17094 len = ext - to_parse;
17095 else
17096 len = strlen (to_parse);
17098 if (len == 0)
17099 return AARCH64_PARSE_MISSING_ARG;
17102 /* Loop through the list of supported ARCHes to find a match. */
17103 for (arch = all_architectures; arch->name != NULL; arch++)
17105 if (strlen (arch->name) == len
17106 && strncmp (arch->name, to_parse, len) == 0)
17108 uint64_t isa_temp = arch->flags;
17110 if (ext != NULL)
17112 /* TO_PARSE string contains at least one extension. */
17113 enum aarch64_parse_opt_result ext_res
17114 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17116 if (ext_res != AARCH64_PARSE_OK)
17117 return ext_res;
17119 /* Extension parsing was successful. Confirm the result
17120 arch and ISA flags. */
17121 *res = arch;
17122 *isa_flags = isa_temp;
17123 return AARCH64_PARSE_OK;
17127 /* ARCH name not found in list. */
17128 return AARCH64_PARSE_INVALID_ARG;
17131 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
17132 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
17133 describing the parse result. If there is an error parsing, RES and
17134 ISA_FLAGS are left unchanged.
17135 When the TO_PARSE string contains an invalid extension,
17136 a copy of the string is created and stored to INVALID_EXTENSION. */
17138 static enum aarch64_parse_opt_result
17139 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17140 uint64_t *isa_flags, std::string *invalid_extension)
17142 const char *ext;
17143 const struct processor *cpu;
17144 size_t len;
17146 ext = strchr (to_parse, '+');
17148 if (ext != NULL)
17149 len = ext - to_parse;
17150 else
17151 len = strlen (to_parse);
17153 if (len == 0)
17154 return AARCH64_PARSE_MISSING_ARG;
17157 /* Loop through the list of supported CPUs to find a match. */
17158 for (cpu = all_cores; cpu->name != NULL; cpu++)
17160 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17162 uint64_t isa_temp = cpu->flags;
17165 if (ext != NULL)
17167 /* TO_PARSE string contains at least one extension. */
17168 enum aarch64_parse_opt_result ext_res
17169 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17171 if (ext_res != AARCH64_PARSE_OK)
17172 return ext_res;
17174 /* Extension parsing was successful. Confirm the result
17175 cpu and ISA flags. */
17176 *res = cpu;
17177 *isa_flags = isa_temp;
17178 return AARCH64_PARSE_OK;
17182 /* CPU name not found in list. */
17183 return AARCH64_PARSE_INVALID_ARG;
17186 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17187 Return an aarch64_parse_opt_result describing the parse result.
17188 If the parsing fails, RES does not change. */
17190 static enum aarch64_parse_opt_result
17191 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17193 const struct processor *cpu;
17195 /* Loop through the list of supported CPUs to find a match. */
17196 for (cpu = all_cores; cpu->name != NULL; cpu++)
17198 if (strcmp (cpu->name, to_parse) == 0)
17200 *res = cpu;
17201 return AARCH64_PARSE_OK;
17205 /* CPU name not found in list. */
17206 return AARCH64_PARSE_INVALID_ARG;
17209 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17210 described in FLAG. If it is, return the index bit for that fusion type.
17211 If not, error (printing OPTION_NAME) and return zero. */
17213 static unsigned int
17214 aarch64_parse_one_option_token (const char *token,
17215 size_t length,
17216 const struct aarch64_flag_desc *flag,
17217 const char *option_name)
17219 for (; flag->name != NULL; flag++)
17221 if (length == strlen (flag->name)
17222 && !strncmp (flag->name, token, length))
17223 return flag->flag;
17226 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17227 return 0;
17230 /* Parse OPTION which is a comma-separated list of flags to enable.
17231 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17232 default state we inherit from the CPU tuning structures. OPTION_NAME
17233 gives the top-level option we are parsing in the -moverride string,
17234 for use in error messages. */
17236 static unsigned int
17237 aarch64_parse_boolean_options (const char *option,
17238 const struct aarch64_flag_desc *flags,
17239 unsigned int initial_state,
17240 const char *option_name)
17242 const char separator = '.';
17243 const char* specs = option;
17244 const char* ntoken = option;
17245 unsigned int found_flags = initial_state;
17247 while ((ntoken = strchr (specs, separator)))
17249 size_t token_length = ntoken - specs;
17250 unsigned token_ops = aarch64_parse_one_option_token (specs,
17251 token_length,
17252 flags,
17253 option_name);
17254 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17255 in the token stream, reset the supported operations. So:
17257 adrp+add.cmp+branch.none.adrp+add
17259 would have the result of turning on only adrp+add fusion. */
17260 if (!token_ops)
17261 found_flags = 0;
17263 found_flags |= token_ops;
17264 specs = ++ntoken;
17267 /* We ended with a trailing separator; report the ill-formed string. */
17268 if (!(*specs))
17270 error ("%qs string ill-formed", option_name);
17271 return 0;
17274 /* We still have one more token to parse. */
17275 size_t token_length = strlen (specs);
17276 unsigned token_ops = aarch64_parse_one_option_token (specs,
17277 token_length,
17278 flags,
17279 option_name);
17280 if (!token_ops)
17281 found_flags = 0;
17283 found_flags |= token_ops;
17284 return found_flags;
17287 /* Support for overriding instruction fusion. */
17289 static void
17290 aarch64_parse_fuse_string (const char *fuse_string,
17291 struct tune_params *tune)
17293 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17294 aarch64_fusible_pairs,
17295 tune->fusible_ops,
17296 "fuse=");
17299 /* Support for overriding other tuning flags. */
17301 static void
17302 aarch64_parse_tune_string (const char *tune_string,
17303 struct tune_params *tune)
17305 tune->extra_tuning_flags
17306 = aarch64_parse_boolean_options (tune_string,
17307 aarch64_tuning_flags,
17308 tune->extra_tuning_flags,
17309 "tune=");
17312 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17313 Accept the valid SVE vector widths allowed by
17314 aarch64_sve_vector_bits_enum and use it to override sve_width
17315 in TUNE. */
17317 static void
17318 aarch64_parse_sve_width_string (const char *tune_string,
17319 struct tune_params *tune)
17321 int width = -1;
17323 int n = sscanf (tune_string, "%d", &width);
17324 if (n != 1)
17326 error ("invalid format for %<sve_width%>");
17327 return;
17329 switch (width)
17331 case SVE_128:
17332 case SVE_256:
17333 case SVE_512:
17334 case SVE_1024:
17335 case SVE_2048:
17336 break;
17337 default:
17338 error ("invalid %<sve_width%> value: %d", width);
17340 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17343 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
17344 we understand. If it is, extract the option string and hand it off to
17345 the appropriate function. */
17347 void
17348 aarch64_parse_one_override_token (const char* token,
17349 size_t length,
17350 struct tune_params *tune)
17352 const struct aarch64_tuning_override_function *fn
17353 = aarch64_tuning_override_functions;
17355 const char *option_part = strchr (token, '=');
17356 if (!option_part)
17358 error ("tuning string missing in option (%s)", token);
17359 return;
17362 /* Get the length of the option name. */
17363 length = option_part - token;
17364 /* Skip the '=' to get to the option string. */
17365 option_part++;
17367 for (; fn->name != NULL; fn++)
17369 if (!strncmp (fn->name, token, length))
17371 fn->parse_override (option_part, tune);
17372 return;
17376 error ("unknown tuning option (%s)",token);
17377 return;
17380 /* Apply the default TLS size and clamp it to what the code model allows. */
17382 static void
17383 initialize_aarch64_tls_size (struct gcc_options *opts)
17385 if (aarch64_tls_size == 0)
17386 aarch64_tls_size = 24;
17388 switch (opts->x_aarch64_cmodel_var)
17390 case AARCH64_CMODEL_TINY:
17391 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
17392 needs two instructions to address, so we clamp the size to 24. */
17393 if (aarch64_tls_size > 24)
17394 aarch64_tls_size = 24;
17395 break;
17396 case AARCH64_CMODEL_SMALL:
17397 /* The maximum TLS size allowed under small is 4G. */
17398 if (aarch64_tls_size > 32)
17399 aarch64_tls_size = 32;
17400 break;
17401 case AARCH64_CMODEL_LARGE:
17402 /* The maximum TLS size allowed under large is 16E.
17403 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
17404 if (aarch64_tls_size > 48)
17405 aarch64_tls_size = 48;
17406 break;
17407 default:
17408 gcc_unreachable ();
17411 return;
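/* Added example: under -mcmodel=tiny an explicit -mtls-size=32 is
   clamped to 24 above, since the tiny model can only address the 1M
   TLS area with its two-instruction sequence, whereas under
   -mcmodel=small the same request stays at 32.  */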
17414 /* Return the CPU corresponding to the enum CPU. */
17416 static const struct processor *
17417 aarch64_get_tune_cpu (enum aarch64_processor cpu)
17419 gcc_assert (cpu != aarch64_none);
17421 return &all_cores[cpu];
17424 /* Return the architecture corresponding to the enum ARCH. */
17426 static const struct processor *
17427 aarch64_get_arch (enum aarch64_arch arch)
17429 gcc_assert (arch != aarch64_no_arch);
17431 return &all_architectures[arch];
17434 /* Parse STRING looking for options in the format:
17435 string :: option:string
17436 option :: name=substring
17437 name :: {a-z}
17438 substring :: defined by option. */
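/* Added illustration (hypothetical command line): with
   -moverride=fuse=adrp+add.cmp+branch:sve_width=256 the string below
   is split on ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256";
   each token is then matched against aarch64_tuning_override_functions
   and its substring handed to the corresponding parser, here
   aarch64_parse_fuse_string and aarch64_parse_sve_width_string.  */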
17440 static void
17441 aarch64_parse_override_string (const char* input_string,
17442 struct tune_params* tune)
17444 const char separator = ':';
17445 size_t string_length = strlen (input_string) + 1;
17446 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17447 char *string = string_root;
17448 strncpy (string, input_string, string_length);
17449 string[string_length - 1] = '\0';
17451 char* ntoken = string;
17453 while ((ntoken = strchr (string, separator)))
17455 size_t token_length = ntoken - string;
17456 /* Make this substring look like a string. */
17457 *ntoken = '\0';
17458 aarch64_parse_one_override_token (string, token_length, tune);
17459 string = ++ntoken;
17462 /* One last option to parse. */
17463 aarch64_parse_one_override_token (string, strlen (string), tune);
17464 free (string_root);
17467 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17468 are best for a generic target with the currently-enabled architecture
17469 extensions. */
17470 static void
17471 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17473 /* Neoverse V1 is the only core that is known to benefit from
17474 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17475 point enabling it for SVE2 and above. */
17476 if (TARGET_SVE2)
17477 current_tune.extra_tuning_flags
17478 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17481 static void
17482 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17484 if (accepted_branch_protection_string)
17486 opts->x_aarch64_branch_protection_string
17487 = xstrdup (accepted_branch_protection_string);
17490 /* PR 70044: We have to be careful about being called multiple times for the
17491 same function. This means all changes should be repeatable. */
17493 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17494 Disable the frame pointer flag so the mid-end will not use a frame
17495 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17496 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17497 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17498 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17499 if (opts->x_flag_omit_frame_pointer == 0)
17500 opts->x_flag_omit_frame_pointer = 2;
17502 /* If not optimizing for size, set the default
17503 alignment to what the target wants. */
17504 if (!opts->x_optimize_size)
17506 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17507 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17508 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17509 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17510 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17511 opts->x_str_align_functions = aarch64_tune_params.function_align;
17514 /* We default to no pc-relative literal loads. */
17516 aarch64_pcrelative_literal_loads = false;
17518 /* If -mpc-relative-literal-loads is set on the command line, this
17519 implies that the user asked for PC relative literal loads. */
17520 if (opts->x_pcrelative_literal_loads == 1)
17521 aarch64_pcrelative_literal_loads = true;
17523 /* In the tiny memory model it makes no sense to disallow PC relative
17524 literal pool loads. */
17525 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17526 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17527 aarch64_pcrelative_literal_loads = true;
17529 /* When enabling the lower precision Newton series for the square root, also
17530 enable it for the reciprocal square root, since the latter is an
17531 intermediary step for the former. */
17532 if (flag_mlow_precision_sqrt)
17533 flag_mrecip_low_precision_sqrt = true;
17536 /* 'Unpack' the internal tuning structs and update the options
17537 in OPTS. The caller must have set up selected_tune and selected_arch
17538 as all the other target-specific codegen decisions are
17539 derived from them. */
17541 void
17542 aarch64_override_options_internal (struct gcc_options *opts)
17544 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17545 aarch64_tune_flags = tune->flags;
17546 aarch64_tune = tune->sched_core;
17547 /* Make a copy of the tuning parameters attached to the core, which
17548 we may later overwrite. */
17549 aarch64_tune_params = *(tune->tune);
17550 if (tune->tune == &generic_tunings)
17551 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17553 if (opts->x_aarch64_override_tune_string)
17554 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17555 &aarch64_tune_params);
17557 /* This target defaults to strict volatile bitfields. */
17558 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17559 opts->x_flag_strict_volatile_bitfields = 1;
17561 if (aarch64_stack_protector_guard == SSP_GLOBAL
17562 && opts->x_aarch64_stack_protector_guard_offset_str)
17564 error ("incompatible options %<-mstack-protector-guard=global%> and "
17565 "%<-mstack-protector-guard-offset=%s%>",
17566 aarch64_stack_protector_guard_offset_str);
17569 if (aarch64_stack_protector_guard == SSP_SYSREG
17570 && !(opts->x_aarch64_stack_protector_guard_offset_str
17571 && opts->x_aarch64_stack_protector_guard_reg_str))
17573 error ("both %<-mstack-protector-guard-offset%> and "
17574 "%<-mstack-protector-guard-reg%> must be used "
17575 "with %<-mstack-protector-guard=sysreg%>");
17578 if (opts->x_aarch64_stack_protector_guard_reg_str)
17580 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17581 error ("specify a system register with a small string length");
17584 if (opts->x_aarch64_stack_protector_guard_offset_str)
17586 char *end;
17587 const char *str = aarch64_stack_protector_guard_offset_str;
17588 errno = 0;
17589 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17590 if (!*str || *end || errno)
17591 error ("%qs is not a valid offset in %qs", str,
17592 "-mstack-protector-guard-offset=");
17593 aarch64_stack_protector_guard_offset = offs;
17596 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17597 && !fixed_regs[R18_REGNUM])
17598 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17600 initialize_aarch64_code_model (opts);
17601 initialize_aarch64_tls_size (opts);
17603 int queue_depth = 0;
17604 switch (aarch64_tune_params.autoprefetcher_model)
17606 case tune_params::AUTOPREFETCHER_OFF:
17607 queue_depth = -1;
17608 break;
17609 case tune_params::AUTOPREFETCHER_WEAK:
17610 queue_depth = 0;
17611 break;
17612 case tune_params::AUTOPREFETCHER_STRONG:
17613 queue_depth = max_insn_queue_index + 1;
17614 break;
17615 default:
17616 gcc_unreachable ();
17619 /* We don't mind passing in global_options_set here as we don't use
17620 the *options_set structs anyway. */
17621 SET_OPTION_IF_UNSET (opts, &global_options_set,
17622 param_sched_autopref_queue_depth, queue_depth);
17624 /* If using Advanced SIMD only for autovectorization disable SVE vector costs
17625 comparison. */
17626 if (aarch64_autovec_preference == 1)
17627 SET_OPTION_IF_UNSET (opts, &global_options_set,
17628 aarch64_sve_compare_costs, 0);
17630 /* Set up parameters to be used in prefetching algorithm. Do not
17631 override the defaults unless we are tuning for a core we have
17632 researched values for. */
17633 if (aarch64_tune_params.prefetch->num_slots > 0)
17634 SET_OPTION_IF_UNSET (opts, &global_options_set,
17635 param_simultaneous_prefetches,
17636 aarch64_tune_params.prefetch->num_slots);
17637 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17638 SET_OPTION_IF_UNSET (opts, &global_options_set,
17639 param_l1_cache_size,
17640 aarch64_tune_params.prefetch->l1_cache_size);
17641 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17642 SET_OPTION_IF_UNSET (opts, &global_options_set,
17643 param_l1_cache_line_size,
17644 aarch64_tune_params.prefetch->l1_cache_line_size);
17646 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17648 SET_OPTION_IF_UNSET (opts, &global_options_set,
17649 param_destruct_interfere_size,
17650 aarch64_tune_params.prefetch->l1_cache_line_size);
17651 SET_OPTION_IF_UNSET (opts, &global_options_set,
17652 param_construct_interfere_size,
17653 aarch64_tune_params.prefetch->l1_cache_line_size);
17655 else
17657 /* For a generic AArch64 target, cover the current range of cache line
17658 sizes. */
17659 SET_OPTION_IF_UNSET (opts, &global_options_set,
17660 param_destruct_interfere_size,
17661 256);
17662 SET_OPTION_IF_UNSET (opts, &global_options_set,
17663 param_construct_interfere_size,
17664 64);
17667 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17668 SET_OPTION_IF_UNSET (opts, &global_options_set,
17669 param_l2_cache_size,
17670 aarch64_tune_params.prefetch->l2_cache_size);
17671 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17672 SET_OPTION_IF_UNSET (opts, &global_options_set,
17673 param_prefetch_dynamic_strides, 0);
17674 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17675 SET_OPTION_IF_UNSET (opts, &global_options_set,
17676 param_prefetch_minimum_stride,
17677 aarch64_tune_params.prefetch->minimum_stride);
17679 /* Use the alternative scheduling-pressure algorithm by default. */
17680 SET_OPTION_IF_UNSET (opts, &global_options_set,
17681 param_sched_pressure_algorithm,
17682 SCHED_PRESSURE_MODEL);
17684 /* Validate the guard size. */
17685 int guard_size = param_stack_clash_protection_guard_size;
17687 if (guard_size != 12 && guard_size != 16)
17688 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17689 "size. Given value %d (%llu KB) is out of range",
17690 guard_size, (1ULL << guard_size) / 1024ULL);
17692 /* Enforce that interval is the same size as size so the mid-end does the
17693 right thing. */
17694 SET_OPTION_IF_UNSET (opts, &global_options_set,
17695 param_stack_clash_protection_probe_interval,
17696 guard_size);
17698 /* The maybe_set calls won't update the value if the user has explicitly set
17699 one. Which means we need to validate that probing interval and guard size
17700 are equal. */
17701 int probe_interval
17702 = param_stack_clash_protection_probe_interval;
17703 if (guard_size != probe_interval)
17704 error ("stack clash guard size %<%d%> must be equal to probing interval "
17705 "%<%d%>", guard_size, probe_interval);
17707 /* Enable sw prefetching at specified optimization level for
17708 CPUS that have prefetch. Lower optimization level threshold by 1
17709 when profiling is enabled. */
17710 if (opts->x_flag_prefetch_loop_arrays < 0
17711 && !opts->x_optimize_size
17712 && aarch64_tune_params.prefetch->default_opt_level >= 0
17713 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
17714 opts->x_flag_prefetch_loop_arrays = 1;
17716 aarch64_override_options_after_change_1 (opts);
17719 /* Print a hint with a suggestion for a core or architecture name that
17720 most closely resembles what the user passed in STR. ARCH is true if
17721 the user is asking for an architecture name. ARCH is false if the user
17722 is asking for a core name. */
17724 static void
17725 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
17727 auto_vec<const char *> candidates;
17728 const struct processor *entry = arch ? all_architectures : all_cores;
17729 for (; entry->name != NULL; entry++)
17730 candidates.safe_push (entry->name);
17732 #ifdef HAVE_LOCAL_CPU_DETECT
17733 /* Also add "native" as a possible value. */
17734 if (arch)
17735 candidates.safe_push ("native");
17736 #endif
17738 char *s;
17739 const char *hint = candidates_list_and_hint (str, s, candidates);
17740 if (hint)
17741 inform (input_location, "valid arguments are: %s;"
17742 " did you mean %qs?", s, hint);
17743 else
17744 inform (input_location, "valid arguments are: %s", s);
17746 XDELETEVEC (s);
17749 /* Print a hint with a suggestion for a core name that most closely resembles
17750 what the user passed in STR. */
17752 inline static void
17753 aarch64_print_hint_for_core (const char *str)
17755 aarch64_print_hint_for_core_or_arch (str, false);
17758 /* Print a hint with a suggestion for an architecture name that most closely
17759 resembles what the user passed in STR. */
17761 inline static void
17762 aarch64_print_hint_for_arch (const char *str)
17764 aarch64_print_hint_for_core_or_arch (str, true);
17768 /* Print a hint with a suggestion for an extension name
17769 that most closely resembles what the user passed in STR. */
17771 void
17772 aarch64_print_hint_for_extensions (const std::string &str)
17774 auto_vec<const char *> candidates;
17775 aarch64_get_all_extension_candidates (&candidates);
17776 char *s;
17777 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
17778 if (hint)
17779 inform (input_location, "valid arguments are: %s;"
17780 " did you mean %qs?", s, hint);
17781 else
17782 inform (input_location, "valid arguments are: %s", s);
17784 XDELETEVEC (s);
17787 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
17788 specified in STR and throw errors if appropriate. Put the results if
17789 they are valid in RES and ISA_FLAGS. Return whether the option is
17790 valid. */
17792 static bool
17793 aarch64_validate_mcpu (const char *str, const struct processor **res,
17794 uint64_t *isa_flags)
17796 std::string invalid_extension;
17797 enum aarch64_parse_opt_result parse_res
17798 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
17800 if (parse_res == AARCH64_PARSE_OK)
17801 return true;
17803 switch (parse_res)
17805 case AARCH64_PARSE_MISSING_ARG:
17806 error ("missing cpu name in %<-mcpu=%s%>", str);
17807 break;
17808 case AARCH64_PARSE_INVALID_ARG:
17809 error ("unknown value %qs for %<-mcpu%>", str);
17810 aarch64_print_hint_for_core (str);
17811 break;
17812 case AARCH64_PARSE_INVALID_FEATURE:
17813 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
17814 invalid_extension.c_str (), str);
17815 aarch64_print_hint_for_extensions (invalid_extension);
17816 break;
17817 default:
17818 gcc_unreachable ();
17821 return false;
17824 /* Straight line speculation indicators. */
17825 enum aarch64_sls_hardening_type
17827 SLS_NONE = 0,
17828 SLS_RETBR = 1,
17829 SLS_BLR = 2,
17830 SLS_ALL = 3,
17832 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
17834 /* Return whether we should mitigate Straight Line Speculation for the RET
17835 and BR instructions. */
17836 bool
17837 aarch64_harden_sls_retbr_p (void)
17839 return aarch64_sls_hardening & SLS_RETBR;
17842 /* Return whether we should mitigate Straight Line Speculation for the BLR
17843 instruction. */
17844 bool
17845 aarch64_harden_sls_blr_p (void)
17847 return aarch64_sls_hardening & SLS_BLR;
17850 /* For now we only allow setting these options globally; in the future we may
17851 allow setting them per function. */
17852 static void
17853 aarch64_validate_sls_mitigation (const char *const_str)
17855 char *token_save = NULL;
17856 char *str = NULL;
17858 if (strcmp (const_str, "none") == 0)
17860 aarch64_sls_hardening = SLS_NONE;
17861 return;
17863 if (strcmp (const_str, "all") == 0)
17865 aarch64_sls_hardening = SLS_ALL;
17866 return;
17869 char *str_root = xstrdup (const_str);
17870 str = strtok_r (str_root, ",", &token_save);
17871 if (!str)
17872 error ("invalid argument given to %<-mharden-sls=%>");
17874 int temp = SLS_NONE;
17875 while (str)
17877 if (strcmp (str, "blr") == 0)
17878 temp |= SLS_BLR;
17879 else if (strcmp (str, "retbr") == 0)
17880 temp |= SLS_RETBR;
17881 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
17883 error ("%qs must be by itself for %<-mharden-sls=%>", str);
17884 break;
17886 else
17888 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
17889 break;
17891 str = strtok_r (NULL, ",", &token_save);
17893 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
17894 free (str_root);
17897 /* Parses CONST_STR for branch protection features specified in
17898 aarch64_branch_protect_types, and sets any global variables required. Returns
17899 the parsing result and assigns LAST_STR to the last processed token from
17900 CONST_STR so that it can be used for error reporting. */
17902 static enum
17903 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
17904 char** last_str)
17906 char *str_root = xstrdup (const_str);
17907 char* token_save = NULL;
17908 char *str = strtok_r (str_root, "+", &token_save);
17909 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
17910 if (!str)
17911 res = AARCH64_PARSE_MISSING_ARG;
17912 else
17914 char *next_str = strtok_r (NULL, "+", &token_save);
17915 /* Reset the branch protection features to their defaults. */
17916 aarch64_handle_no_branch_protection (NULL, NULL);
17918 while (str && res == AARCH64_PARSE_OK)
17920 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
17921 bool found = false;
17922 /* Search for this type. */
17923 while (type && type->name && !found && res == AARCH64_PARSE_OK)
17925 if (strcmp (str, type->name) == 0)
17927 found = true;
17928 res = type->handler (str, next_str);
17929 str = next_str;
17930 next_str = strtok_r (NULL, "+", &token_save);
17932 else
17933 type++;
17935 if (found && res == AARCH64_PARSE_OK)
17937 bool found_subtype = true;
17938 /* Loop through each token until we find one that isn't a
17939 subtype. */
17940 while (found_subtype)
17942 found_subtype = false;
17943 const aarch64_branch_protect_type *subtype = type->subtypes;
17944 /* Search for the subtype. */
17945 while (str && subtype && subtype->name && !found_subtype
17946 && res == AARCH64_PARSE_OK)
17948 if (strcmp (str, subtype->name) == 0)
17950 found_subtype = true;
17951 res = subtype->handler (str, next_str);
17952 str = next_str;
17953 next_str = strtok_r (NULL, "+", &token_save);
17955 else
17956 subtype++;
17960 else if (!found)
17961 res = AARCH64_PARSE_INVALID_ARG;
17964 /* Copy the last processed token into the argument to pass it back.
17965 Used by option and attribute validation to print the offending token. */
17966 if (last_str)
17968 if (str) strcpy (*last_str, str);
17969 else *last_str = NULL;
17971 if (res == AARCH64_PARSE_OK)
17973 /* If needed, alloc the accepted string then copy in const_str.
17974 Used by aarch64_override_options_after_change_1. */
17975 if (!accepted_branch_protection_string)
17976 accepted_branch_protection_string = (char *) xmalloc (
17977 BRANCH_PROTECT_STR_MAX
17978 + 1);
17979 strncpy (accepted_branch_protection_string, const_str,
17980 BRANCH_PROTECT_STR_MAX + 1);
17981 /* Forcibly null-terminate. */
17982 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
17984 return res;
17987 static bool
17988 aarch64_validate_mbranch_protection (const char *const_str)
17990 char *str = (char *) xmalloc (strlen (const_str) + 1);
17991 enum aarch64_parse_opt_result res =
17992 aarch64_parse_branch_protection (const_str, &str);
17993 if (res == AARCH64_PARSE_INVALID_ARG)
17994 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
17995 else if (res == AARCH64_PARSE_MISSING_ARG)
17996 error ("missing argument for %<-mbranch-protection=%>");
17997 free (str);
17998 return res == AARCH64_PARSE_OK;
18001 /* Validate a command-line -march option. Parse the arch and extensions
18002 (if any) specified in STR and throw errors if appropriate. Put the
18003 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18004 option is valid. */
18006 static bool
18007 aarch64_validate_march (const char *str, const struct processor **res,
18008 uint64_t *isa_flags)
18010 std::string invalid_extension;
18011 enum aarch64_parse_opt_result parse_res
18012 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18014 if (parse_res == AARCH64_PARSE_OK)
18015 return true;
18017 switch (parse_res)
18019 case AARCH64_PARSE_MISSING_ARG:
18020 error ("missing arch name in %<-march=%s%>", str);
18021 break;
18022 case AARCH64_PARSE_INVALID_ARG:
18023 error ("unknown value %qs for %<-march%>", str);
18024 aarch64_print_hint_for_arch (str);
18025 break;
18026 case AARCH64_PARSE_INVALID_FEATURE:
18027 error ("invalid feature modifier %qs in %<-march=%s%>",
18028 invalid_extension.c_str (), str);
18029 aarch64_print_hint_for_extensions (invalid_extension);
18030 break;
18031 default:
18032 gcc_unreachable ();
18035 return false;
18038 /* Validate a command-line -mtune option. Parse the cpu
18039 specified in STR and throw errors if appropriate. Put the
18040 result, if it is valid, in RES. Return whether the option is
18041 valid. */
18043 static bool
18044 aarch64_validate_mtune (const char *str, const struct processor **res)
18046 enum aarch64_parse_opt_result parse_res
18047 = aarch64_parse_tune (str, res);
18049 if (parse_res == AARCH64_PARSE_OK)
18050 return true;
18052 switch (parse_res)
18054 case AARCH64_PARSE_MISSING_ARG:
18055 error ("missing cpu name in %<-mtune=%s%>", str);
18056 break;
18057 case AARCH64_PARSE_INVALID_ARG:
18058 error ("unknown value %qs for %<-mtune%>", str);
18059 aarch64_print_hint_for_core (str);
18060 break;
18061 default:
18062 gcc_unreachable ();
18064 return false;
18067 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18069 static poly_uint16
18070 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18072 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18073 on big-endian targets, so we would need to forbid subregs that convert
18074 from one to the other. By default a reinterpret sequence would then
18075 involve a store to memory in one mode and a load back in the other.
18076 Even if we optimize that sequence using reverse instructions,
18077 it would still be a significant potential overhead.
18079 For now, it seems better to generate length-agnostic code for that
18080 case instead. */
18081 if (value == SVE_SCALABLE
18082 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18083 return poly_uint16 (2, 2);
18084 else
18085 return (int) value / 64;
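/* Worked example (added commentary, not part of the original source):
   VG counts 64-bit granules in an SVE vector, so the fixed-length cases
   above are simply VALUE / 64:

     -msve-vector-bits=128       VG = 2  (but scalable on big-endian, as above)
     -msve-vector-bits=256       VG = 4
     -msve-vector-bits=512       VG = 8
     -msve-vector-bits=scalable  poly_uint16 (2, 2), i.e. 2 + 2 * X granules,
                                 where X is the number of 128-bit blocks
                                 beyond the minimum vector length.  */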
18088 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18089 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18090 tuning structs. In particular it must set selected_tune and
18091 aarch64_isa_flags that define the available ISA features and tuning
18092 decisions. It must also set selected_arch as this will be used to
18093 output the .arch asm tags for each function. */
18095 static void
18096 aarch64_override_options (void)
18098 uint64_t cpu_isa = 0;
18099 uint64_t arch_isa = 0;
18100 aarch64_isa_flags = 0;
18102 const struct processor *cpu = NULL;
18103 const struct processor *arch = NULL;
18104 const struct processor *tune = NULL;
18106 if (aarch64_harden_sls_string)
18107 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18109 if (aarch64_branch_protection_string)
18110 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
18112 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18113 If either of -march or -mtune is given, they override their
18114 respective component of -mcpu. */
18115 if (aarch64_cpu_string)
18116 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18118 if (aarch64_arch_string)
18119 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18121 if (aarch64_tune_string)
18122 aarch64_validate_mtune (aarch64_tune_string, &tune);
18124 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18125 SUBTARGET_OVERRIDE_OPTIONS;
18126 #endif
18128 if (cpu && arch)
18130 /* If both -mcpu and -march are specified, warn if they are not
18131 architecturally compatible and prefer the -march ISA flags. */
18132 if (arch->arch != cpu->arch)
18134 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18135 aarch64_cpu_string,
18136 aarch64_arch_string);
18139 selected_arch = arch->arch;
18140 aarch64_isa_flags = arch_isa;
18142 else if (cpu)
18144 selected_arch = cpu->arch;
18145 aarch64_isa_flags = cpu_isa;
18147 else if (arch)
18149 cpu = &all_cores[arch->ident];
18150 selected_arch = arch->arch;
18151 aarch64_isa_flags = arch_isa;
18153 else
18155 /* No -mcpu or -march specified, so use the default CPU. */
18156 cpu = &all_cores[TARGET_CPU_DEFAULT];
18157 selected_arch = cpu->arch;
18158 aarch64_isa_flags = cpu->flags;
18161 selected_tune = tune ? tune->ident : cpu->ident;
18163 if (aarch64_enable_bti == 2)
18165 #ifdef TARGET_ENABLE_BTI
18166 aarch64_enable_bti = 1;
18167 #else
18168 aarch64_enable_bti = 0;
18169 #endif
18172 /* Return address signing is currently not supported for ILP32 targets. For
18173 LP64 targets use the configured option in the absence of a command-line
18174 option for -mbranch-protection. */
18175 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18177 #ifdef TARGET_ENABLE_PAC_RET
18178 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
18179 #else
18180 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
18181 #endif
18184 #ifndef HAVE_AS_MABI_OPTION
18185 /* The compiler may have been configured with 2.23.* binutils, which does
18186 not have support for ILP32. */
18187 if (TARGET_ILP32)
18188 error ("assembler does not support %<-mabi=ilp32%>");
18189 #endif
18191 /* Convert -msve-vector-bits to a VG count. */
18192 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18194 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
18195 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18197 /* The pass to insert speculation tracking runs before
18198 shrink-wrapping and the latter does not know how to update the
18199 tracking status. So disable it in this case. */
18200 if (aarch64_track_speculation)
18201 flag_shrink_wrap = 0;
18203 aarch64_override_options_internal (&global_options);
18205 /* Save these options as the default ones in case we push and pop them later
18206 while processing functions with potential target attributes. */
18207 target_option_default_node = target_option_current_node
18208 = build_target_option_node (&global_options, &global_options_set);
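/* Illustrative examples (added commentary, not part of the original
   source) of how the -mcpu/-march/-mtune handling above resolves, assuming
   the option strings themselves are valid:

     -mcpu=neoverse-n1                   arch and tuning both from the CPU
     -march=armv8.4-a                    arch from -march, tuning from the
                                         core recorded for that arch entry
     -mcpu=cortex-a53 -march=armv8.2-a   conflict warning; the -march ISA
                                         flags win, tuning stays cortex-a53
     -mcpu=cortex-a53 -mtune=cortex-a72  ISA from cortex-a53, tuning from
                                         cortex-a72
     (no option given)                   TARGET_CPU_DEFAULT supplies both.  */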
18211 /* Implement targetm.override_options_after_change. */
18213 static void
18214 aarch64_override_options_after_change (void)
18216 aarch64_override_options_after_change_1 (&global_options);
18219 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18220 static char *
18221 aarch64_offload_options (void)
18223 if (TARGET_ILP32)
18224 return xstrdup ("-foffload-abi=ilp32");
18225 else
18226 return xstrdup ("-foffload-abi=lp64");
18229 static struct machine_function *
18230 aarch64_init_machine_status (void)
18232 struct machine_function *machine;
18233 machine = ggc_cleared_alloc<machine_function> ();
18234 return machine;
18237 void
18238 aarch64_init_expanders (void)
18240 init_machine_status = aarch64_init_machine_status;
18243 /* A checking mechanism for the implementation of the various code models. */
18244 static void
18245 initialize_aarch64_code_model (struct gcc_options *opts)
18247 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18248 switch (opts->x_aarch64_cmodel_var)
18250 case AARCH64_CMODEL_TINY:
18251 if (opts->x_flag_pic)
18252 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18253 break;
18254 case AARCH64_CMODEL_SMALL:
18255 if (opts->x_flag_pic)
18257 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18258 aarch64_cmodel = (flag_pic == 2
18259 ? AARCH64_CMODEL_SMALL_PIC
18260 : AARCH64_CMODEL_SMALL_SPIC);
18261 #else
18262 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18263 #endif
18265 break;
18266 case AARCH64_CMODEL_LARGE:
18267 if (opts->x_flag_pic)
18268 sorry ("code model %qs with %<-f%s%>", "large",
18269 opts->x_flag_pic > 1 ? "PIC" : "pic");
18270 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18271 sorry ("code model %qs not supported in ilp32 mode", "large");
18272 break;
18273 case AARCH64_CMODEL_TINY_PIC:
18274 case AARCH64_CMODEL_SMALL_PIC:
18275 case AARCH64_CMODEL_SMALL_SPIC:
18276 gcc_unreachable ();
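/* Illustrative mapping (added commentary, not part of the original source)
   of the cases above, assuming an assembler with HAVE_AS_SMALL_PIC_RELOCS:

     -mcmodel=tiny                -> AARCH64_CMODEL_TINY
     -mcmodel=tiny  -fpic         -> AARCH64_CMODEL_TINY_PIC
     -mcmodel=small -fpic         -> AARCH64_CMODEL_SMALL_SPIC
     -mcmodel=small -fPIC         -> AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large -fpic         -> sorry (), not supported
     -mcmodel=large -mabi=ilp32   -> sorry (), not supported  */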
18280 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18281 using the information saved in PTR. */
18283 static void
18284 aarch64_option_restore (struct gcc_options *opts,
18285 struct gcc_options * /* opts_set */,
18286 struct cl_target_option * /* ptr */)
18288 aarch64_override_options_internal (opts);
18291 /* Implement TARGET_OPTION_PRINT. */
18293 static void
18294 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18296 const struct processor *cpu
18297 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18298 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18299 std::string extension
18300 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_isa_flags,
18301 arch->flags);
18303 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18304 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18305 arch->name, extension.c_str ());
18308 static GTY(()) tree aarch64_previous_fndecl;
18310 void
18311 aarch64_reset_previous_fndecl (void)
18313 aarch64_previous_fndecl = NULL;
18316 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18317 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18318 make sure optab availability predicates are recomputed when necessary. */
18320 void
18321 aarch64_save_restore_target_globals (tree new_tree)
18323 if (TREE_TARGET_GLOBALS (new_tree))
18324 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18325 else if (new_tree == target_option_default_node)
18326 restore_target_globals (&default_target_globals);
18327 else
18328 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18331 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18332 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18333 of the function, if such exists. This function may be called multiple
18334 times on a single function so use aarch64_previous_fndecl to avoid
18335 setting up identical state. */
18337 static void
18338 aarch64_set_current_function (tree fndecl)
18340 if (!fndecl || fndecl == aarch64_previous_fndecl)
18341 return;
18343 tree old_tree = (aarch64_previous_fndecl
18344 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18345 : NULL_TREE);
18347 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18349   /* If the current function has no attributes but the previous one did,
18350      use the default node.  */
18351 if (!new_tree && old_tree)
18352 new_tree = target_option_default_node;
18354   /* If there is nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
18355      the default has already been handled by aarch64_save_restore_target_globals
18356      from aarch64_pragma_target_parse.  */
18357 if (old_tree == new_tree)
18358 return;
18360 aarch64_previous_fndecl = fndecl;
18362 /* First set the target options. */
18363 cl_target_option_restore (&global_options, &global_options_set,
18364 TREE_TARGET_OPTION (new_tree));
18366 aarch64_save_restore_target_globals (new_tree);
18369 /* Enum describing the various ways we can handle attributes.
18370 In many cases we can reuse the generic option handling machinery. */
18372 enum aarch64_attr_opt_type
18374 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18375 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18376 aarch64_attr_enum, /* Attribute sets an enum variable. */
18377 aarch64_attr_custom /* Attribute requires a custom handling function. */
18380 /* All the information needed to handle a target attribute.
18381 NAME is the name of the attribute.
18382 ATTR_TYPE specifies the type of behavior of the attribute as described
18383 in the definition of enum aarch64_attr_opt_type.
18384 ALLOW_NEG is true if the attribute supports a "no-" form.
18385    HANDLER is the function that takes the attribute string as an argument.
18386    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18387 OPT_NUM is the enum specifying the option that the attribute modifies.
18388 This is needed for attributes that mirror the behavior of a command-line
18389    option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18390 aarch64_attr_enum. */
18392 struct aarch64_attribute_info
18394 const char *name;
18395 enum aarch64_attr_opt_type attr_type;
18396 bool allow_neg;
18397 bool (*handler) (const char *);
18398 enum opt_code opt_num;
18401 /* Handle the ARCH_STR argument to the arch= target attribute. */
18403 static bool
18404 aarch64_handle_attr_arch (const char *str)
18406 const struct processor *tmp_arch = NULL;
18407 std::string invalid_extension;
18408 enum aarch64_parse_opt_result parse_res
18409 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
18411 if (parse_res == AARCH64_PARSE_OK)
18413 gcc_assert (tmp_arch);
18414 selected_arch = tmp_arch->arch;
18415 return true;
18418 switch (parse_res)
18420 case AARCH64_PARSE_MISSING_ARG:
18421 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18422 break;
18423 case AARCH64_PARSE_INVALID_ARG:
18424 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18425 aarch64_print_hint_for_arch (str);
18426 break;
18427 case AARCH64_PARSE_INVALID_FEATURE:
18428       error ("invalid feature modifier %qs of value %qs in
18429 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18430 aarch64_print_hint_for_extensions (invalid_extension);
18431 break;
18432 default:
18433 gcc_unreachable ();
18436 return false;
18439 /* Handle the argument CPU_STR to the cpu= target attribute. */
18441 static bool
18442 aarch64_handle_attr_cpu (const char *str)
18444 const struct processor *tmp_cpu = NULL;
18445 std::string invalid_extension;
18446 enum aarch64_parse_opt_result parse_res
18447 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
18449 if (parse_res == AARCH64_PARSE_OK)
18451 gcc_assert (tmp_cpu);
18452 selected_tune = tmp_cpu->ident;
18453 selected_arch = tmp_cpu->arch;
18454 return true;
18457 switch (parse_res)
18459 case AARCH64_PARSE_MISSING_ARG:
18460 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18461 break;
18462 case AARCH64_PARSE_INVALID_ARG:
18463 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18464 aarch64_print_hint_for_core (str);
18465 break;
18466 case AARCH64_PARSE_INVALID_FEATURE:
18467 error ("invalid feature modifier %qs of value %qs in "
18468 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18469 aarch64_print_hint_for_extensions (invalid_extension);
18470 break;
18471 default:
18472 gcc_unreachable ();
18475 return false;
18478 /* Handle the argument STR to the branch-protection= attribute. */
18480 static bool
18481 aarch64_handle_attr_branch_protection (const char* str)
18483 char *err_str = (char *) xmalloc (strlen (str) + 1);
18484 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
18485 &err_str);
18486 bool success = false;
18487 switch (res)
18489 case AARCH64_PARSE_MISSING_ARG:
18490 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18491 " attribute");
18492 break;
18493 case AARCH64_PARSE_INVALID_ARG:
18494 error ("invalid protection type %qs in %<target(\"branch-protection"
18495 "=\")%> pragma or attribute", err_str);
18496 break;
18497 case AARCH64_PARSE_OK:
18498 success = true;
18499 /* Fall through. */
18500 case AARCH64_PARSE_INVALID_FEATURE:
18501 break;
18502 default:
18503 gcc_unreachable ();
18505 free (err_str);
18506 return success;
18509 /* Handle the argument STR to the tune= target attribute. */
18511 static bool
18512 aarch64_handle_attr_tune (const char *str)
18514 const struct processor *tmp_tune = NULL;
18515 enum aarch64_parse_opt_result parse_res
18516 = aarch64_parse_tune (str, &tmp_tune);
18518 if (parse_res == AARCH64_PARSE_OK)
18520 gcc_assert (tmp_tune);
18521 selected_tune = tmp_tune->ident;
18522 return true;
18525 switch (parse_res)
18527 case AARCH64_PARSE_INVALID_ARG:
18528 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18529 aarch64_print_hint_for_core (str);
18530 break;
18531 default:
18532 gcc_unreachable ();
18535 return false;
18538 /* Parse an architecture extensions target attribute string specified in STR.
18539 For example "+fp+nosimd". Show any errors if needed. Return TRUE
18540 if successful. Update aarch64_isa_flags to reflect the ISA features
18541 modified. */
18543 static bool
18544 aarch64_handle_attr_isa_flags (char *str)
18546 enum aarch64_parse_opt_result parse_res;
18547 uint64_t isa_flags = aarch64_isa_flags;
18549 /* We allow "+nothing" in the beginning to clear out all architectural
18550 features if the user wants to handpick specific features. */
18551 if (strncmp ("+nothing", str, 8) == 0)
18553 isa_flags = 0;
18554 str += 8;
18557 std::string invalid_extension;
18558 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18560 if (parse_res == AARCH64_PARSE_OK)
18562 aarch64_isa_flags = isa_flags;
18563 return true;
18566 switch (parse_res)
18568 case AARCH64_PARSE_MISSING_ARG:
18569 error ("missing value in %<target()%> pragma or attribute");
18570 break;
18572 case AARCH64_PARSE_INVALID_FEATURE:
18573 error ("invalid feature modifier %qs of value %qs in "
18574 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18575 break;
18577 default:
18578 gcc_unreachable ();
18581 return false;
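/* Illustrative example (added commentary, not part of the original
   source): because the function above strips a leading "+nothing", an
   attribute such as

     __attribute__ ((target ("+nothing+fp"))) void scalar_fp_only (void);

   starts from an empty ISA flag set and then enables just the FP extension
   (and whatever it implies), whereas a plain "+crc" augments the current
   aarch64_isa_flags.  */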
18584 /* The target attributes that we support. On top of these we also support just
18585 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18586 handled explicitly in aarch64_process_one_target_attr. */
18588 static const struct aarch64_attribute_info aarch64_attributes[] =
18590 { "general-regs-only", aarch64_attr_mask, false, NULL,
18591 OPT_mgeneral_regs_only },
18592 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18593 OPT_mfix_cortex_a53_835769 },
18594 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18595 OPT_mfix_cortex_a53_843419 },
18596 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18597 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18598 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18599 OPT_momit_leaf_frame_pointer },
18600 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18601 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18602 OPT_march_ },
18603 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18604 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18605 OPT_mtune_ },
18606 { "branch-protection", aarch64_attr_custom, false,
18607 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18608 { "sign-return-address", aarch64_attr_enum, false, NULL,
18609 OPT_msign_return_address_ },
18610 { "outline-atomics", aarch64_attr_bool, true, NULL,
18611 OPT_moutline_atomics},
18612 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
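/* Usage sketch (added commentary, not part of the original source)
   showing one attribute of each ATTR_TYPE in the table above:

     __attribute__ ((target ("cpu=cortex-a57")))                custom
     int f1 (void);

     __attribute__ ((target ("no-omit-leaf-frame-pointer")))    bool, negated
     int f2 (void);

     __attribute__ ((target ("strict-align")))                  mask
     int f3 (void);

     __attribute__ ((target ("sign-return-address=non-leaf")))  enum
     int f4 (void);  */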
18615 /* Parse ARG_STR which contains the definition of one target attribute.
18616 Show appropriate errors if any or return true if the attribute is valid. */
18618 static bool
18619 aarch64_process_one_target_attr (char *arg_str)
18621 bool invert = false;
18623 size_t len = strlen (arg_str);
18625 if (len == 0)
18627 error ("malformed %<target()%> pragma or attribute");
18628 return false;
18631 char *str_to_check = (char *) alloca (len + 1);
18632 strcpy (str_to_check, arg_str);
18634 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18635 It is easier to detect and handle it explicitly here rather than going
18636 through the machinery for the rest of the target attributes in this
18637 function. */
18638 if (*str_to_check == '+')
18639 return aarch64_handle_attr_isa_flags (str_to_check);
18641 if (len > 3 && startswith (str_to_check, "no-"))
18643 invert = true;
18644 str_to_check += 3;
18646 char *arg = strchr (str_to_check, '=');
18648 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18649 and point ARG to "foo". */
18650 if (arg)
18652 *arg = '\0';
18653 arg++;
18655 const struct aarch64_attribute_info *p_attr;
18656 bool found = false;
18657 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18659 /* If the names don't match up, or the user has given an argument
18660 to an attribute that doesn't accept one, or didn't give an argument
18661 to an attribute that expects one, fail to match. */
18662 if (strcmp (str_to_check, p_attr->name) != 0)
18663 continue;
18665 found = true;
18666 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18667 || p_attr->attr_type == aarch64_attr_enum;
18669 if (attr_need_arg_p ^ (arg != NULL))
18671 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18672 return false;
18675 /* If the name matches but the attribute does not allow "no-" versions
18676 then we can't match. */
18677 if (invert && !p_attr->allow_neg)
18679 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18680 return false;
18683 switch (p_attr->attr_type)
18685 /* Has a custom handler registered.
18686 For example, cpu=, arch=, tune=. */
18687 case aarch64_attr_custom:
18688 gcc_assert (p_attr->handler);
18689 if (!p_attr->handler (arg))
18690 return false;
18691 break;
18693 /* Either set or unset a boolean option. */
18694 case aarch64_attr_bool:
18696 struct cl_decoded_option decoded;
18698 generate_option (p_attr->opt_num, NULL, !invert,
18699 CL_TARGET, &decoded);
18700 aarch64_handle_option (&global_options, &global_options_set,
18701 &decoded, input_location);
18702 break;
18704 /* Set or unset a bit in the target_flags. aarch64_handle_option
18705 should know what mask to apply given the option number. */
18706 case aarch64_attr_mask:
18708 struct cl_decoded_option decoded;
18709 /* We only need to specify the option number.
18710 aarch64_handle_option will know which mask to apply. */
18711 decoded.opt_index = p_attr->opt_num;
18712 decoded.value = !invert;
18713 aarch64_handle_option (&global_options, &global_options_set,
18714 &decoded, input_location);
18715 break;
18717 /* Use the option setting machinery to set an option to an enum. */
18718 case aarch64_attr_enum:
18720 gcc_assert (arg);
18721 bool valid;
18722 int value;
18723 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
18724 &value, CL_TARGET);
18725 if (valid)
18727 set_option (&global_options, NULL, p_attr->opt_num, value,
18728 NULL, DK_UNSPECIFIED, input_location,
18729 global_dc);
18731 else
18733 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
18735 break;
18737 default:
18738 gcc_unreachable ();
18742 /* If we reached here we either have found an attribute and validated
18743 it or didn't match any. If we matched an attribute but its arguments
18744 were malformed we will have returned false already. */
18745 return found;
18748 /* Count how many times the character C appears in
18749 NULL-terminated string STR. */
18751 static unsigned int
18752 num_occurences_in_str (char c, char *str)
18754 unsigned int res = 0;
18755 while (*str != '\0')
18757 if (*str == c)
18758 res++;
18760 str++;
18763 return res;
18766 /* Parse the tree in ARGS that contains the target attribute information
18767 and update the global target options space. */
18769 bool
18770 aarch64_process_target_attr (tree args)
18772   if (TREE_CODE (args) == TREE_LIST)
18774       do
18776 	  tree head = TREE_VALUE (args);
18777 if (head)
18779 if (!aarch64_process_target_attr (head))
18780 return false;
18782 args = TREE_CHAIN (args);
18783 } while (args);
18785 return true;
18788 if (TREE_CODE (args) != STRING_CST)
18790 error ("attribute %<target%> argument not a string");
18791 return false;
18794 size_t len = strlen (TREE_STRING_POINTER (args));
18795 char *str_to_check = (char *) alloca (len + 1);
18796 strcpy (str_to_check, TREE_STRING_POINTER (args));
18798 if (len == 0)
18800 error ("malformed %<target()%> pragma or attribute");
18801 return false;
18804   /* Used to catch empty strings between commas, e.g.
18805 attribute ((target ("attr1,,attr2"))). */
18806 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
18808 /* Handle multiple target attributes separated by ','. */
18809 char *token = strtok_r (str_to_check, ",", &str_to_check);
18811 unsigned int num_attrs = 0;
18812 while (token)
18814 num_attrs++;
18815 if (!aarch64_process_one_target_attr (token))
18817 /* Check if token is possibly an arch extension without
18818 leading '+'. */
18819 uint64_t isa_temp = 0;
18820 auto with_plus = std::string ("+") + token;
18821 enum aarch64_parse_opt_result ext_res
18822 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
18824 if (ext_res == AARCH64_PARSE_OK)
18825 error ("arch extension %<%s%> should be prefixed by %<+%>",
18826 token);
18827 else
18828 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
18829 return false;
18832 token = strtok_r (NULL, ",", &str_to_check);
18835 if (num_attrs != num_commas + 1)
18837 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
18838 return false;
18841 return true;
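/* Illustrative examples (added commentary, not part of the original
   source) of the tokenisation above:

     target ("arch=armv8-a,strict-align")     two attributes, both handled
     target ("strict-align,,strict-align")    empty entry: num_attrs is 2 but
                                              num_commas + 1 is 3, so the
                                              "malformed" error fires
     target ("crc")                           re-parsed as "+crc": "arch
                                              extension 'crc' should be
                                              prefixed by '+'"
     target ("fpmath=sse")                    unknown attribute: "pragma or
                                              attribute ... is not valid"  */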
18844 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
18845 process attribute ((target ("..."))). */
18847 static bool
18848 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
18850 struct cl_target_option cur_target;
18851 bool ret;
18852 tree old_optimize;
18853 tree new_target, new_optimize;
18854 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18856 /* If what we're processing is the current pragma string then the
18857 target option node is already stored in target_option_current_node
18858 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
18859 having to re-parse the string. This is especially useful to keep
18860 arm_neon.h compile times down since that header contains a lot
18861 of intrinsics enclosed in pragmas. */
18862 if (!existing_target && args == current_target_pragma)
18864 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
18865 return true;
18867 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
18869 old_optimize
18870 = build_optimization_node (&global_options, &global_options_set);
18871 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
18873 /* If the function changed the optimization levels as well as setting
18874 target options, start with the optimizations specified. */
18875 if (func_optimize && func_optimize != old_optimize)
18876 cl_optimization_restore (&global_options, &global_options_set,
18877 TREE_OPTIMIZATION (func_optimize));
18879 /* Save the current target options to restore at the end. */
18880 cl_target_option_save (&cur_target, &global_options, &global_options_set);
18882 /* If fndecl already has some target attributes applied to it, unpack
18883 them so that we add this attribute on top of them, rather than
18884 overwriting them. */
18885 if (existing_target)
18887 struct cl_target_option *existing_options
18888 = TREE_TARGET_OPTION (existing_target);
18890 if (existing_options)
18891 cl_target_option_restore (&global_options, &global_options_set,
18892 existing_options);
18894 else
18895 cl_target_option_restore (&global_options, &global_options_set,
18896 TREE_TARGET_OPTION (target_option_current_node));
18898 ret = aarch64_process_target_attr (args);
18900 /* Set up any additional state. */
18901 if (ret)
18903 aarch64_override_options_internal (&global_options);
18904 /* Initialize SIMD builtins if we haven't already.
18905 Set current_target_pragma to NULL for the duration so that
18906 the builtin initialization code doesn't try to tag the functions
18907 being built with the attributes specified by any current pragma, thus
18908 going into an infinite recursion. */
18909 if (TARGET_SIMD)
18911 tree saved_current_target_pragma = current_target_pragma;
18912 current_target_pragma = NULL;
18913 aarch64_init_simd_builtins ();
18914 current_target_pragma = saved_current_target_pragma;
18916 new_target = build_target_option_node (&global_options,
18917 &global_options_set);
18919 else
18920 new_target = NULL;
18922 new_optimize = build_optimization_node (&global_options,
18923 &global_options_set);
18925 if (fndecl && ret)
18927 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
18929 if (old_optimize != new_optimize)
18930 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
18933 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
18935 if (old_optimize != new_optimize)
18936 cl_optimization_restore (&global_options, &global_options_set,
18937 TREE_OPTIMIZATION (old_optimize));
18938 return ret;
18941 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
18942 tri-bool options (yes, no, don't care) and the default value is
18943 DEF, determine whether to reject inlining. */
18945 static bool
18946 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
18947 int dont_care, int def)
18949 /* If the callee doesn't care, always allow inlining. */
18950 if (callee == dont_care)
18951 return true;
18953 /* If the caller doesn't care, always allow inlining. */
18954 if (caller == dont_care)
18955 return true;
18957 /* Otherwise, allow inlining if either the callee and caller values
18958 agree, or if the callee is using the default value. */
18959 return (callee == caller || callee == def);
18962 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
18963 to inline CALLEE into CALLER based on target-specific info.
18964 Make sure that the caller and callee have compatible architectural
18965 features. Then go through the other possible target attributes
18966 and see if they can block inlining. Try not to reject always_inline
18967 callees unless they are incompatible architecturally. */
18969 static bool
18970 aarch64_can_inline_p (tree caller, tree callee)
18972 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
18973 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
18975 struct cl_target_option *caller_opts
18976 = TREE_TARGET_OPTION (caller_tree ? caller_tree
18977 : target_option_default_node);
18979 struct cl_target_option *callee_opts
18980 = TREE_TARGET_OPTION (callee_tree ? callee_tree
18981 : target_option_default_node);
18983 /* Callee's ISA flags should be a subset of the caller's. */
18984 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
18985 != callee_opts->x_aarch64_isa_flags)
18986 return false;
18988   /* Allow non-strict-aligned functions to be inlined into
18989      strict-aligned ones.  */
18990 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
18991 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
18992 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
18993 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
18994 return false;
18996 bool always_inline = lookup_attribute ("always_inline",
18997 DECL_ATTRIBUTES (callee));
18999 /* If the architectural features match up and the callee is always_inline
19000 then the other attributes don't matter. */
19001 if (always_inline)
19002 return true;
19004 if (caller_opts->x_aarch64_cmodel_var
19005 != callee_opts->x_aarch64_cmodel_var)
19006 return false;
19008 if (caller_opts->x_aarch64_tls_dialect
19009 != callee_opts->x_aarch64_tls_dialect)
19010 return false;
19012 /* Honour explicit requests to workaround errata. */
19013 if (!aarch64_tribools_ok_for_inlining_p (
19014 caller_opts->x_aarch64_fix_a53_err835769,
19015 callee_opts->x_aarch64_fix_a53_err835769,
19016 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19017 return false;
19019 if (!aarch64_tribools_ok_for_inlining_p (
19020 caller_opts->x_aarch64_fix_a53_err843419,
19021 callee_opts->x_aarch64_fix_a53_err843419,
19022 2, TARGET_FIX_ERR_A53_843419))
19023 return false;
19025 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19026      caller and callee and they don't match up, reject inlining.  */
19027 if (!aarch64_tribools_ok_for_inlining_p (
19028 caller_opts->x_flag_omit_leaf_frame_pointer,
19029 callee_opts->x_flag_omit_leaf_frame_pointer,
19030 2, 1))
19031 return false;
19033 /* If the callee has specific tuning overrides, respect them. */
19034 if (callee_opts->x_aarch64_override_tune_string != NULL
19035 && caller_opts->x_aarch64_override_tune_string == NULL)
19036 return false;
19038 /* If the user specified tuning override strings for the
19039 caller and callee and they don't match up, reject inlining.
19040      We just do a string compare here; we don't analyze the meaning
19041      of the string, as that would be too costly for little gain.  */
19042 if (callee_opts->x_aarch64_override_tune_string
19043 && caller_opts->x_aarch64_override_tune_string
19044 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19045 caller_opts->x_aarch64_override_tune_string) != 0))
19046 return false;
19048 return true;
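/* Illustrative example (added commentary, not part of the original
   source) of the ISA-subset rule above:

     __attribute__ ((target ("+sve"))) void callee_sve (void);
     void plain_caller (void);   /* compiled without SVE enabled */

   Calls from plain_caller to callee_sve are never inlined, since SVE is
   not in the caller's ISA flags; inlining in the opposite direction is
   allowed.  An always_inline callee bypasses the later cmodel and tuning
   checks but must still pass the ISA and strict-alignment tests.  */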
19051 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
19052    hasn't been already.  */
19054 unsigned int
19055 aarch64_tlsdesc_abi_id ()
19057 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19058 if (!tlsdesc_abi.initialized_p ())
19060 HARD_REG_SET full_reg_clobbers;
19061 CLEAR_HARD_REG_SET (full_reg_clobbers);
19062 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19063 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19064 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19065 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19066 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19068 return tlsdesc_abi.id ();
19071 /* Return true if SYMBOL_REF X binds locally. */
19073 static bool
19074 aarch64_symbol_binds_local_p (const_rtx x)
19076 return (SYMBOL_REF_DECL (x)
19077 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19078 : SYMBOL_REF_LOCAL_P (x));
19081 /* Return true if SYMBOL_REF X is thread-local.  */
19082 static bool
19083 aarch64_tls_symbol_p (rtx x)
19085 if (! TARGET_HAVE_TLS)
19086 return false;
19088 x = strip_salt (x);
19089 if (!SYMBOL_REF_P (x))
19090 return false;
19092 return SYMBOL_REF_TLS_MODEL (x) != 0;
19095 /* Classify a TLS symbol into one of the TLS kinds. */
19096 enum aarch64_symbol_type
19097 aarch64_classify_tls_symbol (rtx x)
19099 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19101 switch (tls_kind)
19103 case TLS_MODEL_GLOBAL_DYNAMIC:
19104 case TLS_MODEL_LOCAL_DYNAMIC:
19105 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19107 case TLS_MODEL_INITIAL_EXEC:
19108 switch (aarch64_cmodel)
19110 case AARCH64_CMODEL_TINY:
19111 case AARCH64_CMODEL_TINY_PIC:
19112 return SYMBOL_TINY_TLSIE;
19113 default:
19114 return SYMBOL_SMALL_TLSIE;
19117 case TLS_MODEL_LOCAL_EXEC:
19118 if (aarch64_tls_size == 12)
19119 return SYMBOL_TLSLE12;
19120 else if (aarch64_tls_size == 24)
19121 return SYMBOL_TLSLE24;
19122 else if (aarch64_tls_size == 32)
19123 return SYMBOL_TLSLE32;
19124 else if (aarch64_tls_size == 48)
19125 return SYMBOL_TLSLE48;
19126 else
19127 gcc_unreachable ();
19129 case TLS_MODEL_EMULATED:
19130 case TLS_MODEL_NONE:
19131 return SYMBOL_FORCE_TO_MEM;
19133 default:
19134 gcc_unreachable ();
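/* Illustrative mapping (added commentary, not part of the original
   source), assuming the default -mtls-dialect=desc so TARGET_TLS_DESC
   is true:

     global-dynamic / local-dynamic access   -> SYMBOL_SMALL_TLSDESC
     -ftls-model=initial-exec                -> SYMBOL_SMALL_TLSIE
                                                (SYMBOL_TINY_TLSIE under
                                                 -mcmodel=tiny)
     -ftls-model=local-exec -mtls-size=24    -> SYMBOL_TLSLE24
     -ftls-model=local-exec -mtls-size=48    -> SYMBOL_TLSLE48  */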
19138 /* Return the correct method for accessing X + OFFSET, where X is either
19139 a SYMBOL_REF or LABEL_REF. */
19141 enum aarch64_symbol_type
19142 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19144 x = strip_salt (x);
19146 if (LABEL_REF_P (x))
19148 switch (aarch64_cmodel)
19150 case AARCH64_CMODEL_LARGE:
19151 return SYMBOL_FORCE_TO_MEM;
19153 case AARCH64_CMODEL_TINY_PIC:
19154 case AARCH64_CMODEL_TINY:
19155 return SYMBOL_TINY_ABSOLUTE;
19157 case AARCH64_CMODEL_SMALL_SPIC:
19158 case AARCH64_CMODEL_SMALL_PIC:
19159 case AARCH64_CMODEL_SMALL:
19160 return SYMBOL_SMALL_ABSOLUTE;
19162 default:
19163 gcc_unreachable ();
19167 if (SYMBOL_REF_P (x))
19169 if (aarch64_tls_symbol_p (x))
19170 return aarch64_classify_tls_symbol (x);
19172 switch (aarch64_cmodel)
19174 case AARCH64_CMODEL_TINY_PIC:
19175 case AARCH64_CMODEL_TINY:
19176 /* With -fPIC non-local symbols use the GOT. For orthogonality
19177 always use the GOT for extern weak symbols. */
19178 if ((flag_pic || SYMBOL_REF_WEAK (x))
19179 && !aarch64_symbol_binds_local_p (x))
19180 return SYMBOL_TINY_GOT;
19182 /* When we retrieve symbol + offset address, we have to make sure
19183 the offset does not cause overflow of the final address. But
19184 we have no way of knowing the address of symbol at compile time
19185 so we can't accurately say if the distance between the PC and
19186 	     symbol + offset is outside the addressable range of +/-1MB in the
19187 TINY code model. So we limit the maximum offset to +/-64KB and
19188 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19189 If offset_within_block_p is true we allow larger offsets. */
19190 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19191 || offset_within_block_p (x, offset)))
19192 return SYMBOL_FORCE_TO_MEM;
19194 return SYMBOL_TINY_ABSOLUTE;
19197 case AARCH64_CMODEL_SMALL_SPIC:
19198 case AARCH64_CMODEL_SMALL_PIC:
19199 case AARCH64_CMODEL_SMALL:
19200 if ((flag_pic || SYMBOL_REF_WEAK (x))
19201 && !aarch64_symbol_binds_local_p (x))
19202 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19203 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19205 /* Same reasoning as the tiny code model, but the offset cap here is
19206 1MB, allowing +/-3.9GB for the offset to the symbol. */
19207 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19208 || offset_within_block_p (x, offset)))
19209 return SYMBOL_FORCE_TO_MEM;
19211 return SYMBOL_SMALL_ABSOLUTE;
19213 case AARCH64_CMODEL_LARGE:
19214 /* This is alright even in PIC code as the constant
19215 pool reference is always PC relative and within
19216 the same translation unit. */
19217 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19218 return SYMBOL_SMALL_ABSOLUTE;
19219 else
19220 return SYMBOL_FORCE_TO_MEM;
19222 default:
19223 gcc_unreachable ();
19227 /* By default push everything into the constant pool. */
19228 return SYMBOL_FORCE_TO_MEM;
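/* Worked example (added commentary, not part of the original source) of
   the offset caps above in the default small code model, for a non-weak,
   locally binding symbol "sym":

     &sym               -> SYMBOL_SMALL_ABSOLUTE
     &sym + 0x80000     -> SYMBOL_SMALL_ABSOLUTE   (within the 1MB cap)
     &sym + 0x200000    -> SYMBOL_FORCE_TO_MEM, unless offset_within_block_p
                           shows the address stays inside sym's own block

   The tiny model applies the same logic with a +/-64KB cap, reserving the
   rest of its +/-1MB direct range for the distance to the symbol.  */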
19231 bool
19232 aarch64_constant_address_p (rtx x)
19234 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19237 bool
19238 aarch64_legitimate_pic_operand_p (rtx x)
19240 poly_int64 offset;
19241 x = strip_offset_and_salt (x, &offset);
19242 if (SYMBOL_REF_P (x))
19243 return false;
19245 return true;
19248 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19249 that should be rematerialized rather than spilled. */
19251 static bool
19252 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19254 /* Support CSE and rematerialization of common constants. */
19255 if (CONST_INT_P (x)
19256 || CONST_DOUBLE_P (x))
19257 return true;
19259 /* Only accept variable-length vector constants if they can be
19260 handled directly.
19262 ??? It would be possible (but complex) to handle rematerialization
19263 of other constants via secondary reloads. */
19264 if (!GET_MODE_SIZE (mode).is_constant ())
19265 return aarch64_simd_valid_immediate (x, NULL);
19267 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19268 least be forced to memory and loaded from there. */
19269 if (CONST_VECTOR_P (x))
19270 return !targetm.cannot_force_const_mem (mode, x);
19272 /* Do not allow vector struct mode constants for Advanced SIMD.
19273 We could support 0 and -1 easily, but they need support in
19274 aarch64-simd.md. */
19275 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19276 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19277 return false;
19279 if (GET_CODE (x) == HIGH)
19280 x = XEXP (x, 0);
19282 /* Accept polynomial constants that can be calculated by using the
19283 destination of a move as the sole temporary. Constants that
19284 require a second temporary cannot be rematerialized (they can't be
19285 forced to memory and also aren't legitimate constants). */
19286 poly_int64 offset;
19287 if (poly_int_rtx_p (x, &offset))
19288 return aarch64_offset_temporaries (false, offset) <= 1;
19290 /* If an offset is being added to something else, we need to allow the
19291 base to be moved into the destination register, meaning that there
19292 are no free temporaries for the offset. */
19293 x = strip_offset_and_salt (x, &offset);
19294 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19295 return false;
19297 /* Do not allow const (plus (anchor_symbol, const_int)). */
19298 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19299 return false;
19301 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19302 so spilling them is better than rematerialization. */
19303 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19304 return true;
19306 /* Label references are always constant. */
19307 if (LABEL_REF_P (x))
19308 return true;
19310 return false;
19313 rtx
19314 aarch64_load_tp (rtx target)
19316 if (!target
19317 || GET_MODE (target) != Pmode
19318 || !register_operand (target, Pmode))
19319 target = gen_reg_rtx (Pmode);
19321 /* Can return in any reg. */
19322 emit_insn (gen_aarch64_load_tp_hard (target));
19323 return target;
19326 /* On AAPCS systems, this is the "struct __va_list". */
19327 static GTY(()) tree va_list_type;
19329 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19330 Return the type to use as __builtin_va_list.
19332 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19334 struct __va_list
19336 void *__stack;
19337 void *__gr_top;
19338 void *__vr_top;
19339 int __gr_offs;
19340 int __vr_offs;
19341 }; */
19343 static tree
19344 aarch64_build_builtin_va_list (void)
19346 tree va_list_name;
19347 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19349 /* Create the type. */
19350 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19351 /* Give it the required name. */
19352 va_list_name = build_decl (BUILTINS_LOCATION,
19353 TYPE_DECL,
19354 get_identifier ("__va_list"),
19355 va_list_type);
19356 DECL_ARTIFICIAL (va_list_name) = 1;
19357 TYPE_NAME (va_list_type) = va_list_name;
19358 TYPE_STUB_DECL (va_list_type) = va_list_name;
19360 /* Create the fields. */
19361 f_stack = build_decl (BUILTINS_LOCATION,
19362 FIELD_DECL, get_identifier ("__stack"),
19363 ptr_type_node);
19364 f_grtop = build_decl (BUILTINS_LOCATION,
19365 FIELD_DECL, get_identifier ("__gr_top"),
19366 ptr_type_node);
19367 f_vrtop = build_decl (BUILTINS_LOCATION,
19368 FIELD_DECL, get_identifier ("__vr_top"),
19369 ptr_type_node);
19370 f_groff = build_decl (BUILTINS_LOCATION,
19371 FIELD_DECL, get_identifier ("__gr_offs"),
19372 integer_type_node);
19373 f_vroff = build_decl (BUILTINS_LOCATION,
19374 FIELD_DECL, get_identifier ("__vr_offs"),
19375 integer_type_node);
19377   /* Tell the tree-stdarg pass about our internal offset fields.
19378      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19379      purposes, to identify whether the code is updating the va_list internal
19380      offset fields in an irregular way.  */
19381 va_list_gpr_counter_field = f_groff;
19382 va_list_fpr_counter_field = f_vroff;
19384 DECL_ARTIFICIAL (f_stack) = 1;
19385 DECL_ARTIFICIAL (f_grtop) = 1;
19386 DECL_ARTIFICIAL (f_vrtop) = 1;
19387 DECL_ARTIFICIAL (f_groff) = 1;
19388 DECL_ARTIFICIAL (f_vroff) = 1;
19390 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19391 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19392 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19393 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19394 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19396 TYPE_FIELDS (va_list_type) = f_stack;
19397 DECL_CHAIN (f_stack) = f_grtop;
19398 DECL_CHAIN (f_grtop) = f_vrtop;
19399 DECL_CHAIN (f_vrtop) = f_groff;
19400 DECL_CHAIN (f_groff) = f_vroff;
19402 /* Compute its layout. */
19403 layout_type (va_list_type);
19405 return va_list_type;
19408 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19409 static void
19410 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19412 const CUMULATIVE_ARGS *cum;
19413 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19414 tree stack, grtop, vrtop, groff, vroff;
19415 tree t;
19416 int gr_save_area_size = cfun->va_list_gpr_size;
19417 int vr_save_area_size = cfun->va_list_fpr_size;
19418 int vr_offset;
19420 cum = &crtl->args.info;
19421 if (cfun->va_list_gpr_size)
19422 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19423 cfun->va_list_gpr_size);
19424 if (cfun->va_list_fpr_size)
19425 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19426 * UNITS_PER_VREG, cfun->va_list_fpr_size);
19428 if (!TARGET_FLOAT)
19430 gcc_assert (cum->aapcs_nvrn == 0);
19431 vr_save_area_size = 0;
19434 f_stack = TYPE_FIELDS (va_list_type_node);
19435 f_grtop = DECL_CHAIN (f_stack);
19436 f_vrtop = DECL_CHAIN (f_grtop);
19437 f_groff = DECL_CHAIN (f_vrtop);
19438 f_vroff = DECL_CHAIN (f_groff);
19440 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19441 NULL_TREE);
19442 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19443 NULL_TREE);
19444 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19445 NULL_TREE);
19446 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19447 NULL_TREE);
19448 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19449 NULL_TREE);
19451 /* Emit code to initialize STACK, which points to the next varargs stack
19452 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19453 by named arguments. STACK is 8-byte aligned. */
19454 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19455 if (cum->aapcs_stack_size > 0)
19456 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19457 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19458 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19460 /* Emit code to initialize GRTOP, the top of the GR save area.
19461 virtual_incoming_args_rtx should have been 16 byte aligned. */
19462 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19463 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19464 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19466 /* Emit code to initialize VRTOP, the top of the VR save area.
19467 This address is gr_save_area_bytes below GRTOP, rounded
19468 down to the next 16-byte boundary. */
19469 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19470 vr_offset = ROUND_UP (gr_save_area_size,
19471 STACK_BOUNDARY / BITS_PER_UNIT);
19473 if (vr_offset)
19474 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19475 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19476 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19478 /* Emit code to initialize GROFF, the offset from GRTOP of the
19479 next GPR argument. */
19480 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19481 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19482 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19484   /* Likewise emit code to initialize VROFF, the offset from VRTOP
19485 of the next VR argument. */
19486 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19487 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19488 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
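/* Worked example (added commentary, not part of the original source) for
   the initialisation above, assuming AAPCS64 with eight 8-byte GP argument
   registers, eight 16-byte FP/SIMD argument registers and the default
   va_list_gpr/fpr_size:

     void f (int n, ...);   /* one named GP argument, no named FP arguments */

   The varargs prologue dumps x1-x7 (7 * 8 = 56 bytes) and q0-q7
   (8 * 16 = 128 bytes) just below the incoming-argument area, so va_start
   ends up with roughly:

     __stack   = first stack-passed vararg
     __gr_top  = top of the GP save area
     __vr_top  = __gr_top - 64          (56 rounded up to 16 bytes)
     __gr_offs = -56
     __vr_offs = -128  */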
19491 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19493 static tree
19494 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19495 gimple_seq *post_p ATTRIBUTE_UNUSED)
19497 tree addr;
19498 bool indirect_p;
19499 bool is_ha; /* is HFA or HVA. */
19500 bool dw_align; /* double-word align. */
19501 machine_mode ag_mode = VOIDmode;
19502 int nregs;
19503 machine_mode mode;
19505 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19506 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19507 HOST_WIDE_INT size, rsize, adjust, align;
19508 tree t, u, cond1, cond2;
19510 indirect_p = pass_va_arg_by_reference (type);
19511 if (indirect_p)
19512 type = build_pointer_type (type);
19514 mode = TYPE_MODE (type);
19516 f_stack = TYPE_FIELDS (va_list_type_node);
19517 f_grtop = DECL_CHAIN (f_stack);
19518 f_vrtop = DECL_CHAIN (f_grtop);
19519 f_groff = DECL_CHAIN (f_vrtop);
19520 f_vroff = DECL_CHAIN (f_groff);
19522 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19523 f_stack, NULL_TREE);
19524 size = int_size_in_bytes (type);
19526 unsigned int abi_break;
19527 align
19528 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
19530 dw_align = false;
19531 adjust = 0;
19532 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19533 &is_ha, false))
19535 /* No frontends can create types with variable-sized modes, so we
19536 shouldn't be asked to pass or return them. */
19537 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19539 /* TYPE passed in fp/simd registers. */
19540 if (!TARGET_FLOAT)
19541 aarch64_err_no_fpadvsimd (mode);
19543 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19544 unshare_expr (valist), f_vrtop, NULL_TREE);
19545 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19546 unshare_expr (valist), f_vroff, NULL_TREE);
19548 rsize = nregs * UNITS_PER_VREG;
19550 if (is_ha)
19552 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19553 adjust = UNITS_PER_VREG - ag_size;
19555 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19556 && size < UNITS_PER_VREG)
19558 adjust = UNITS_PER_VREG - size;
19561 else
19563 /* TYPE passed in general registers. */
19564 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19565 unshare_expr (valist), f_grtop, NULL_TREE);
19566 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19567 unshare_expr (valist), f_groff, NULL_TREE);
19568 rsize = ROUND_UP (size, UNITS_PER_WORD);
19569 nregs = rsize / UNITS_PER_WORD;
19571 if (align > 8)
19573 if (abi_break && warn_psabi)
19574 inform (input_location, "parameter passing for argument of type "
19575 "%qT changed in GCC 9.1", type);
19576 dw_align = true;
19579 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19580 && size < UNITS_PER_WORD)
19582 adjust = UNITS_PER_WORD - size;
19586 /* Get a local temporary for the field value. */
19587 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19589 /* Emit code to branch if off >= 0. */
19590 t = build2 (GE_EXPR, boolean_type_node, off,
19591 build_int_cst (TREE_TYPE (off), 0));
19592 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19594 if (dw_align)
19596 /* Emit: offs = (offs + 15) & -16. */
19597 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19598 build_int_cst (TREE_TYPE (off), 15));
19599 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19600 build_int_cst (TREE_TYPE (off), -16));
19601 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19603 else
19604 roundup = NULL;
19606 /* Update ap.__[g|v]r_offs */
19607 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19608 build_int_cst (TREE_TYPE (off), rsize));
19609 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19611 /* String up. */
19612 if (roundup)
19613 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19615 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19616 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19617 build_int_cst (TREE_TYPE (f_off), 0));
19618 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19620 /* String up: make sure the assignment happens before the use. */
19621 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19622 COND_EXPR_ELSE (cond1) = t;
19624 /* Prepare the trees handling the argument that is passed on the stack;
19625      the top-level node will be stored in ON_STACK.  */
19626 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19627 if (align > 8)
19629 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19630 t = fold_build_pointer_plus_hwi (arg, 15);
19631 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19632 build_int_cst (TREE_TYPE (t), -16));
19633 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19635 else
19636 roundup = NULL;
19637 /* Advance ap.__stack */
19638 t = fold_build_pointer_plus_hwi (arg, size + 7);
19639 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19640 build_int_cst (TREE_TYPE (t), -8));
19641 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19642 /* String up roundup and advance. */
19643 if (roundup)
19644 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19645 /* String up with arg */
19646 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19647 /* Big-endianness related address adjustment. */
19648 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19649 && size < UNITS_PER_WORD)
19651 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19652 size_int (UNITS_PER_WORD - size));
19653 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19656 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19657 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19659 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19660 t = off;
19661 if (adjust)
19662 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19663 build_int_cst (TREE_TYPE (off), adjust));
19665 t = fold_convert (sizetype, t);
19666 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19668 if (is_ha)
19670 /* type ha; // treat as "struct {ftype field[n];}"
19671 ... [computing offs]
19672 	 for (i = 0; i < nregs; ++i, offs += 16)
19673 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19674 return ha; */
19675 int i;
19676 tree tmp_ha, field_t, field_ptr_t;
19678 /* Declare a local variable. */
19679 tmp_ha = create_tmp_var_raw (type, "ha");
19680 gimple_add_tmp_var (tmp_ha);
19682 /* Establish the base type. */
19683 switch (ag_mode)
19685 case E_SFmode:
19686 field_t = float_type_node;
19687 field_ptr_t = float_ptr_type_node;
19688 break;
19689 case E_DFmode:
19690 field_t = double_type_node;
19691 field_ptr_t = double_ptr_type_node;
19692 break;
19693 case E_TFmode:
19694 field_t = long_double_type_node;
19695 field_ptr_t = long_double_ptr_type_node;
19696 break;
19697 case E_SDmode:
19698 field_t = dfloat32_type_node;
19699 field_ptr_t = build_pointer_type (dfloat32_type_node);
19700 break;
19701 case E_DDmode:
19702 field_t = dfloat64_type_node;
19703 field_ptr_t = build_pointer_type (dfloat64_type_node);
19704 break;
19705 case E_TDmode:
19706 field_t = dfloat128_type_node;
19707 field_ptr_t = build_pointer_type (dfloat128_type_node);
19708 break;
19709 case E_HFmode:
19710 field_t = aarch64_fp16_type_node;
19711 field_ptr_t = aarch64_fp16_ptr_type_node;
19712 break;
19713 case E_BFmode:
19714 field_t = aarch64_bf16_type_node;
19715 field_ptr_t = aarch64_bf16_ptr_type_node;
19716 break;
19717 case E_V2SImode:
19718 case E_V4SImode:
19720 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
19721 field_t = build_vector_type_for_mode (innertype, ag_mode);
19722 field_ptr_t = build_pointer_type (field_t);
19724 break;
19725 default:
19726 gcc_assert (0);
19729       /* *(field_ptr_t) &ha = *((field_ptr_t) vr_saved_area) */
19730 TREE_ADDRESSABLE (tmp_ha) = 1;
19731 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
19732 addr = t;
19733 t = fold_convert (field_ptr_t, addr);
19734 t = build2 (MODIFY_EXPR, field_t,
19735 build1 (INDIRECT_REF, field_t, tmp_ha),
19736 build1 (INDIRECT_REF, field_t, t));
19738 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
19739 for (i = 1; i < nregs; ++i)
19741 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
19742 u = fold_convert (field_ptr_t, addr);
19743 u = build2 (MODIFY_EXPR, field_t,
19744 build2 (MEM_REF, field_t, tmp_ha,
19745 build_int_cst (field_ptr_t,
19746 (i *
19747 int_size_in_bytes (field_t)))),
19748 build1 (INDIRECT_REF, field_t, u));
19749 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
19752 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
19753 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
19756 COND_EXPR_ELSE (cond2) = t;
19757 addr = fold_convert (build_pointer_type (type), cond1);
19758 addr = build_va_arg_indirect_ref (addr);
19760 if (indirect_p)
19761 addr = build_va_arg_indirect_ref (addr);
19763 return addr;
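/* Illustrative example (added commentary, not part of the original
   source) of the IS_HA path above.  For

     struct hfa { float a, b, c; };
     struct hfa v = va_arg (ap, struct hfa);

   aarch64_vfp_is_call_or_return_candidate reports ag_mode == SFmode and
   nregs == 3, so rsize is 3 * UNITS_PER_VREG and the gimplified code
   copies field I from *(float *) (__vr_top + offs + I * 16) into the
   local "ha" temporary, falling back to the __stack path when __vr_offs
   is already non-negative or the VR save area is exhausted.  */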
19766 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
19768 static void
19769 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
19770 const function_arg_info &arg,
19771 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
19773 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
19774 CUMULATIVE_ARGS local_cum;
19775 int gr_saved = cfun->va_list_gpr_size;
19776 int vr_saved = cfun->va_list_fpr_size;
19778 /* The caller has advanced CUM up to, but not beyond, the last named
19779 argument. Advance a local copy of CUM past the last "real" named
19780 argument, to find out how many registers are left over. */
19781 local_cum = *cum;
19782 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
19784   /* Find out how many registers we need to save.
19785      Honor the tree-stdarg analysis results.  */
19786 if (cfun->va_list_gpr_size)
19787 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
19788 cfun->va_list_gpr_size / UNITS_PER_WORD);
19789 if (cfun->va_list_fpr_size)
19790 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
19791 cfun->va_list_fpr_size / UNITS_PER_VREG);
19793 if (!TARGET_FLOAT)
19795 gcc_assert (local_cum.aapcs_nvrn == 0);
19796 vr_saved = 0;
19799 if (!no_rtl)
19801 if (gr_saved > 0)
19803 rtx ptr, mem;
19805 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
19806 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
19807 - gr_saved * UNITS_PER_WORD);
19808 mem = gen_frame_mem (BLKmode, ptr);
19809 set_mem_alias_set (mem, get_varargs_alias_set ());
19811 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
19812 mem, gr_saved);
19814 if (vr_saved > 0)
19816 /* We can't use move_block_from_reg, because it will use
19817 the wrong mode, storing D regs only. */
19818 machine_mode mode = TImode;
19819 int off, i, vr_start;
19821 /* Set OFF to the offset from virtual_incoming_args_rtx of
19822 the first vector register. The VR save area lies below
19823 the GR one, and is aligned to 16 bytes. */
19824 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
19825 STACK_BOUNDARY / BITS_PER_UNIT);
19826 off -= vr_saved * UNITS_PER_VREG;
19828 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
19829 for (i = 0; i < vr_saved; ++i)
19831 rtx ptr, mem;
19833 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
19834 mem = gen_frame_mem (mode, ptr);
19835 set_mem_alias_set (mem, get_varargs_alias_set ());
19836 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
19837 off += UNITS_PER_VREG;
19842 /* We don't save the size into *PRETEND_SIZE because we want to avoid
19843 any complication of having crtl->args.pretend_args_size changed. */
19844 cfun->machine->frame.saved_varargs_size
19845 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
19846 STACK_BOUNDARY / BITS_PER_UNIT)
19847 + vr_saved * UNITS_PER_VREG);
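/* As a worked illustration of the expression above (the figures are
   hypothetical, not taken from any particular test case): if three GP
   argument registers and two FP/vector argument registers remain to be
   saved, then with 8-byte GP slots, 16-byte vector slots and a 16-byte
   STACK_BOUNDARY the save area is

     ROUND_UP (3 * 8, 16) + 2 * 16 = 32 + 32 = 64 bytes.  */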
19850 static void
19851 aarch64_conditional_register_usage (void)
19853 int i;
19854 if (!TARGET_FLOAT)
19856 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
19858 fixed_regs[i] = 1;
19859 call_used_regs[i] = 1;
19862 if (!TARGET_SVE)
19863 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
19865 fixed_regs[i] = 1;
19866 call_used_regs[i] = 1;
19869 /* Only allow the FFR and FFRT to be accessed via special patterns. */
19870 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
19871 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
19873 /* When tracking speculation, we need a couple of call-clobbered registers
19874 to track the speculation state. It would be nice to just use
19875 IP0 and IP1, but currently there are numerous places that just
19876 assume these registers are free for other uses (eg pointer
19877 authentication). */
19878 if (aarch64_track_speculation)
19880 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
19881 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
19882 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
19883 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
19887 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
19889 bool
19890 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
19892 /* For records we're passed a FIELD_DECL, for arrays we're passed
19893 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
19894 const_tree type = TREE_TYPE (field_or_array);
19896 /* Assign BLKmode to anything that contains multiple SVE predicates.
19897 For structures, the "multiple" case is indicated by MODE being
19898 VOIDmode. */
19899 unsigned int num_zr, num_pr;
19900 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
19902 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
19903 return !simple_cst_equal (TYPE_SIZE (field_or_array),
19904 TYPE_SIZE (type));
19905 return mode == VOIDmode;
19908 return default_member_type_forces_blk (field_or_array, mode);
19911 /* Bitmasks that indicate whether earlier versions of GCC would have
19912 taken a different path through the ABI logic. This should result in
19913 a -Wpsabi warning if the earlier path led to a different ABI decision.
19915 WARN_PSABI_EMPTY_CXX17_BASE
19916 Indicates that the type includes an artificial empty C++17 base field
19917 that, prior to GCC 10.1, would prevent the type from being treated as
19918 an HFA or HVA. See PR94383 for details.
19920 WARN_PSABI_NO_UNIQUE_ADDRESS
19921 Indicates that the type includes an empty [[no_unique_address]] field
19922 that, prior to GCC 10.1, would prevent the type from being treated as
19923 an HFA or HVA. */
19924 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
19925 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
19926 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
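/* A hypothetical C++ example of the cases the first two flags describe
   (illustrative only; see PR94383 for the real test cases):

     struct empty {};
     struct s1 : empty { float x, y; };                         // empty C++17 base
     struct s2 { [[no_unique_address]] empty e; float x, y; };

   Before GCC 10.1 the artificial field for the empty base, or the
   [[no_unique_address]] member, stopped s1 and s2 from being treated
   as HFAs; afterwards both are HFAs of two floats, and -Wpsabi notes
   the change via the inform calls further down.  */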
19928 /* Walk down the type tree of TYPE counting consecutive base elements.
19929 If *MODEP is VOIDmode, then set it to the first valid floating point
19930 type. If a non-floating point type is found, or if a floating point
19931 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
19932 otherwise return the count in the sub-tree.
19934 The WARN_PSABI_FLAGS argument allows the caller to check whether this
19935 function has changed its behavior relative to earlier versions of GCC.
19936 Normally the argument should be nonnull and point to a zero-initialized
19937 variable. The function then records whether the ABI decision might
19938 be affected by a known fix to the ABI logic, setting the associated
19939 WARN_PSABI_* bits if so.
19941 When the argument is instead a null pointer, the function tries to
19942 simulate the behavior of GCC before all such ABI fixes were made.
19943 This is useful to check whether the function returns something
19944 different after the ABI fixes. */
19945 static int
19946 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
19947 unsigned int *warn_psabi_flags)
19949 machine_mode mode;
19950 HOST_WIDE_INT size;
19952 if (aarch64_sve::builtin_type_p (type))
19953 return -1;
19955 switch (TREE_CODE (type))
19957 case REAL_TYPE:
19958 mode = TYPE_MODE (type);
19959 if (mode != DFmode && mode != SFmode
19960 && mode != TFmode && mode != HFmode
19961 && mode != SDmode && mode != DDmode && mode != TDmode)
19962 return -1;
19964 if (*modep == VOIDmode)
19965 *modep = mode;
19967 if (*modep == mode)
19968 return 1;
19970 break;
19972 case COMPLEX_TYPE:
19973 mode = TYPE_MODE (TREE_TYPE (type));
19974 if (mode != DFmode && mode != SFmode
19975 && mode != TFmode && mode != HFmode)
19976 return -1;
19978 if (*modep == VOIDmode)
19979 *modep = mode;
19981 if (*modep == mode)
19982 return 2;
19984 break;
19986 case VECTOR_TYPE:
19987 /* Use V2SImode and V4SImode as representatives of all 64-bit
19988 and 128-bit vector types. */
19989 size = int_size_in_bytes (type);
19990 switch (size)
19992 case 8:
19993 mode = V2SImode;
19994 break;
19995 case 16:
19996 mode = V4SImode;
19997 break;
19998 default:
19999 return -1;
20002 if (*modep == VOIDmode)
20003 *modep = mode;
20005 /* Vector modes are considered to be opaque: two vectors are
20006 equivalent for the purposes of being homogeneous aggregates
20007 if they are the same size. */
20008 if (*modep == mode)
20009 return 1;
20011 break;
20013 case ARRAY_TYPE:
20015 int count;
20016 tree index = TYPE_DOMAIN (type);
20018 /* Can't handle incomplete types nor sizes that are not
20019 fixed. */
20020 if (!COMPLETE_TYPE_P (type)
20021 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20022 return -1;
20024 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20025 warn_psabi_flags);
20026 if (count == -1
20027 || !index
20028 || !TYPE_MAX_VALUE (index)
20029 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20030 || !TYPE_MIN_VALUE (index)
20031 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20032 || count < 0)
20033 return -1;
20035 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20036 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20038 /* There must be no padding. */
20039 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20040 count * GET_MODE_BITSIZE (*modep)))
20041 return -1;
20043 return count;
20046 case RECORD_TYPE:
20048 int count = 0;
20049 int sub_count;
20050 tree field;
20052 /* Can't handle incomplete types nor sizes that are not
20053 fixed. */
20054 if (!COMPLETE_TYPE_P (type)
20055 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20056 return -1;
20058 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20060 if (TREE_CODE (field) != FIELD_DECL)
20061 continue;
20063 if (DECL_FIELD_ABI_IGNORED (field))
20065 /* See whether this is something that earlier versions of
20066 GCC failed to ignore. */
20067 unsigned int flag;
20068 if (lookup_attribute ("no_unique_address",
20069 DECL_ATTRIBUTES (field)))
20070 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20071 else if (cxx17_empty_base_field_p (field))
20072 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20073 else
20074 /* No compatibility problem. */
20075 continue;
20077 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20078 if (warn_psabi_flags)
20080 *warn_psabi_flags |= flag;
20081 continue;
20084 /* A zero-width bitfield may affect layout in some
20085 circumstances, but adds no members. The determination
20086 of whether or not a type is an HFA is performed after
20087 layout is complete, so if the type still looks like an
20088 HFA afterwards, it is still classed as one. This is
20089 potentially an ABI break for the hard-float ABI. */
20090 else if (DECL_BIT_FIELD (field)
20091 && integer_zerop (DECL_SIZE (field)))
20093 /* Prior to GCC 12 these fields were stripped early,
20094 hiding them from the back-end entirely and
20095 resulting in the correct behaviour for argument
20096 passing. Simulate that old behaviour without
20097 generating a warning. */
20098 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20099 continue;
20100 if (warn_psabi_flags)
20102 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20103 continue;
20107 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20108 warn_psabi_flags);
20109 if (sub_count < 0)
20110 return -1;
20111 count += sub_count;
20114 /* There must be no padding. */
20115 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20116 count * GET_MODE_BITSIZE (*modep)))
20117 return -1;
20119 return count;
20122 case UNION_TYPE:
20123 case QUAL_UNION_TYPE:
20125 /* These aren't very interesting except in a degenerate case. */
20126 int count = 0;
20127 int sub_count;
20128 tree field;
20130 /* Can't handle incomplete types nor sizes that are not
20131 fixed. */
20132 if (!COMPLETE_TYPE_P (type)
20133 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20134 return -1;
20136 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20138 if (TREE_CODE (field) != FIELD_DECL)
20139 continue;
20141 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20142 warn_psabi_flags);
20143 if (sub_count < 0)
20144 return -1;
20145 count = count > sub_count ? count : sub_count;
20148 /* There must be no padding. */
20149 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20150 count * GET_MODE_BITSIZE (*modep)))
20151 return -1;
20153 return count;
20156 default:
20157 break;
20160 return -1;
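/* Some illustrative inputs and results for the function above (the
   structures are made up for exposition):

     struct { float x, y, z; }      -> 3, *modep == SFmode
     struct { double re, im; }      -> 2, *modep == DFmode
     struct { float f; double d; }  -> -1 (element modes differ)
     int32x4_t v[2]                 -> 2, *modep == V4SImode

   The caller below only treats counts in the range 1..HA_MAX_NUM_FLDS
   as homogeneous aggregates.  */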
20163 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20164 type as described in AAPCS64 \S 4.1.2.
20166 See the comment above aarch64_composite_type_p for the notes on MODE. */
20168 static bool
20169 aarch64_short_vector_p (const_tree type,
20170 machine_mode mode)
20172 poly_int64 size = -1;
20174 if (type && TREE_CODE (type) == VECTOR_TYPE)
20176 if (aarch64_sve::builtin_type_p (type))
20177 return false;
20178 size = int_size_in_bytes (type);
20180 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20181 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20183 /* The containing "else if" is too loose: it means that we look at TYPE
20184 if the type is a vector type (good), but that we otherwise ignore TYPE
20185 and look only at the mode. This is wrong because the type describes
20186 the language-level information whereas the mode is purely an internal
20187 GCC concept. We can therefore reach here for types that are not
20188 vectors in the AAPCS64 sense.
20190 We can't "fix" that for the traditional Advanced SIMD vector modes
20191 without breaking backwards compatibility. However, there's no such
20192 baggage for the structure modes, which were introduced in GCC 12. */
20193 if (aarch64_advsimd_struct_mode_p (mode))
20194 return false;
20196 /* For similar reasons, rely only on the type, not the mode, when
20197 processing SVE types. */
20198 if (type && aarch64_some_values_include_pst_objects_p (type))
20199 /* Leave later code to report an error if SVE is disabled. */
20200 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20201 else
20202 size = GET_MODE_SIZE (mode);
20204 if (known_eq (size, 8) || known_eq (size, 16))
20206 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20207 they are being treated as scalable AAPCS64 types. */
20208 gcc_assert (!aarch64_sve_mode_p (mode)
20209 && !aarch64_advsimd_struct_mode_p (mode));
20210 return true;
20212 return false;
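/* For illustration (types chosen arbitrarily): int32x4_t (V4SImode,
   16 bytes) and int32x2_t (V2SImode, 8 bytes) are short vectors in the
   AAPCS64 sense and return true here, whereas svint32_t (an SVE type)
   and a 32-byte GNU vector such as

     typedef int v8si __attribute__ ((vector_size (32)));

   both return false.  */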
20215 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20216 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20217 array types. The C99 floating-point complex types are also considered
20218 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20219 types, which are GCC extensions and out of the scope of AAPCS64, are
20220 treated as composite types here as well.
20222 Note that MODE itself is not sufficient in determining whether a type
20223 is such a composite type or not. This is because
20224 stor-layout.cc:compute_record_mode may have already changed the MODE
20225 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20226 structure with only one field may have its MODE set to the mode of the
20227 field. Also an integer mode whose size matches the size of the
20228 RECORD_TYPE type may be used to substitute the original mode
20229 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20230 solely relied on. */
20232 static bool
20233 aarch64_composite_type_p (const_tree type,
20234 machine_mode mode)
20236 if (aarch64_short_vector_p (type, mode))
20237 return false;
20239 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20240 return true;
20242 if (mode == BLKmode
20243 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20244 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20245 return true;
20247 return false;
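/* Examples (illustrative): any struct, union or array type is composite,
   as are _Complex double (MODE_COMPLEX_FLOAT) and _Complex int
   (MODE_COMPLEX_INT).  A lone double or an int32x4_t short vector is
   not, so those stay on the scalar and short-vector paths of the
   argument-passing code.  */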
20250 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20251 shall be passed or returned in simd/fp register(s) (providing these
20252 parameter passing registers are available).
20254 Upon successful return, *COUNT returns the number of needed registers,
20255 *BASE_MODE returns the mode of the individual register and when IS_HA
20256 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20257 floating-point aggregate or a homogeneous short-vector aggregate.
20259 SILENT_P is true if the function should refrain from reporting any
20260 diagnostics. This should only be used if the caller is certain that
20261 any ABI decisions would eventually come through this function with
20262 SILENT_P set to false. */
20264 static bool
20265 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20266 const_tree type,
20267 machine_mode *base_mode,
20268 int *count,
20269 bool *is_ha,
20270 bool silent_p)
20272 if (is_ha != NULL) *is_ha = false;
20274 machine_mode new_mode = VOIDmode;
20275 bool composite_p = aarch64_composite_type_p (type, mode);
20277 if ((!composite_p
20278 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20279 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
20280 || aarch64_short_vector_p (type, mode))
20282 *count = 1;
20283 new_mode = mode;
20285 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20287 if (is_ha != NULL) *is_ha = true;
20288 *count = 2;
20289 new_mode = GET_MODE_INNER (mode);
20291 else if (type && composite_p)
20293 unsigned int warn_psabi_flags = 0;
20294 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20295 &warn_psabi_flags);
20296 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20298 static unsigned last_reported_type_uid;
20299 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20300 int alt;
20301 if (!silent_p
20302 && warn_psabi
20303 && warn_psabi_flags
20304 && uid != last_reported_type_uid
20305 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20306 != ag_count))
20308 const char *url10
20309 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20310 const char *url12
20311 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20312 gcc_assert (alt == -1);
20313 last_reported_type_uid = uid;
20314 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20315 qualification. */
20316 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20317 inform (input_location, "parameter passing for argument of "
20318 "type %qT with %<[[no_unique_address]]%> members "
20319 "changed %{in GCC 10.1%}",
20320 TYPE_MAIN_VARIANT (type), url10);
20321 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20322 inform (input_location, "parameter passing for argument of "
20323 "type %qT when C++17 is enabled changed to match "
20324 "C++14 %{in GCC 10.1%}",
20325 TYPE_MAIN_VARIANT (type), url10);
20326 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20327 inform (input_location, "parameter passing for argument of "
20328 "type %qT changed %{in GCC 12.1%}",
20329 TYPE_MAIN_VARIANT (type), url12);
20332 if (is_ha != NULL) *is_ha = true;
20333 *count = ag_count;
20335 else
20336 return false;
20338 else
20339 return false;
20341 gcc_assert (!aarch64_sve_mode_p (new_mode));
20342 *base_mode = new_mode;
20343 return true;
20346 /* Implement TARGET_STRUCT_VALUE_RTX. */
20348 static rtx
20349 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20350 int incoming ATTRIBUTE_UNUSED)
20352 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20355 /* Implements target hook vector_mode_supported_p. */
20356 static bool
20357 aarch64_vector_mode_supported_p (machine_mode mode)
20359 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20360 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20363 /* Return the full-width SVE vector mode for element mode MODE, if one
20364 exists. */
20365 opt_machine_mode
20366 aarch64_full_sve_mode (scalar_mode mode)
20368 switch (mode)
20370 case E_DFmode:
20371 return VNx2DFmode;
20372 case E_SFmode:
20373 return VNx4SFmode;
20374 case E_HFmode:
20375 return VNx8HFmode;
20376 case E_BFmode:
20377 return VNx8BFmode;
20378 case E_DImode:
20379 return VNx2DImode;
20380 case E_SImode:
20381 return VNx4SImode;
20382 case E_HImode:
20383 return VNx8HImode;
20384 case E_QImode:
20385 return VNx16QImode;
20386 default:
20387 return opt_machine_mode ();
20391 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20392 if it exists. */
20393 opt_machine_mode
20394 aarch64_vq_mode (scalar_mode mode)
20396 switch (mode)
20398 case E_DFmode:
20399 return V2DFmode;
20400 case E_SFmode:
20401 return V4SFmode;
20402 case E_HFmode:
20403 return V8HFmode;
20404 case E_BFmode:
20405 return V8BFmode;
20406 case E_SImode:
20407 return V4SImode;
20408 case E_HImode:
20409 return V8HImode;
20410 case E_QImode:
20411 return V16QImode;
20412 case E_DImode:
20413 return V2DImode;
20414 default:
20415 return opt_machine_mode ();
20419 /* Return appropriate SIMD container
20420 for MODE within a vector of WIDTH bits. */
20421 static machine_mode
20422 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20424 if (TARGET_SVE
20425 && maybe_ne (width, 128)
20426 && known_eq (width, BITS_PER_SVE_VECTOR))
20427 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20429 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20430 if (TARGET_SIMD)
20432 if (known_eq (width, 128))
20433 return aarch64_vq_mode (mode).else_mode (word_mode);
20434 else
20435 switch (mode)
20437 case E_SFmode:
20438 return V2SFmode;
20439 case E_HFmode:
20440 return V4HFmode;
20441 case E_BFmode:
20442 return V4BFmode;
20443 case E_SImode:
20444 return V2SImode;
20445 case E_HImode:
20446 return V4HImode;
20447 case E_QImode:
20448 return V8QImode;
20449 default:
20450 break;
20453 return word_mode;
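/* Illustrative mappings (assuming the relevant ISA support is enabled):

     (SImode, 128)                 -> V4SImode
     (HImode, 64)                  -> V4HImode
     (SImode, BITS_PER_SVE_VECTOR) -> VNx4SImode, when TARGET_SVE and the
                                      SVE vector length is not known to be
                                      exactly 128 bits

   Anything unhandled falls back to word_mode.  */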
20456 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20457 and return whether the SVE mode should be preferred over the
20458 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20459 static bool
20460 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20462 /* Take into account the aarch64-autovec-preference param if non-zero. */
20463 bool only_asimd_p = aarch64_autovec_preference == 1;
20464 bool only_sve_p = aarch64_autovec_preference == 2;
20466 if (only_asimd_p)
20467 return false;
20468 if (only_sve_p)
20469 return true;
20471 /* The preference in case of a tie in costs. */
20472 bool prefer_asimd = aarch64_autovec_preference == 3;
20473 bool prefer_sve = aarch64_autovec_preference == 4;
20475 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20476 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20477 /* If the CPU information does not have an SVE width registered, use the
20478 generic poly_int comparison, which prefers SVE. If a preference is
20479 explicitly requested, avoid this path. */
20480 if (aarch64_tune_params.sve_width == SVE_SCALABLE
20481 && !prefer_asimd
20482 && !prefer_sve)
20483 return maybe_gt (nunits_sve, nunits_asimd);
20485 /* Otherwise estimate the runtime width of the modes involved. */
20486 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20487 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20489 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20490 is clearly wider. */
20491 if (prefer_sve)
20492 return est_sve >= est_asimd;
20493 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20494 is clearly wider. */
20495 if (prefer_asimd)
20496 return est_sve > est_asimd;
20498 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20499 return est_sve > est_asimd;
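/* Summary of the aarch64-autovec-preference --param values handled above
   and reused in aarch64_autovectorize_vector_modes below:

     0  default: compare estimated widths; for an unknown (scalable) SVE
        width prefer SVE, otherwise Advanced SIMD wins a tie
     1  use Advanced SIMD modes only
     2  use SVE modes only
     3  as the default comparison, but always break ties in favour of
        Advanced SIMD
     4  break ties in favour of SVE  */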
20502 /* Return 128-bit container as the preferred SIMD mode for MODE. */
20503 static machine_mode
20504 aarch64_preferred_simd_mode (scalar_mode mode)
20506 /* Take into account explicit auto-vectorization ISA preferences through
20507 aarch64_cmp_autovec_modes. */
20508 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20509 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20510 if (TARGET_SIMD)
20511 return aarch64_vq_mode (mode).else_mode (word_mode);
20512 return word_mode;
20515 /* Return a list of possible vector sizes for the vectorizer
20516 to iterate over. */
20517 static unsigned int
20518 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20520 static const machine_mode sve_modes[] = {
20521 /* Try using full vectors for all element types. */
20522 VNx16QImode,
20524 /* Try using 16-bit containers for 8-bit elements and full vectors
20525 for wider elements. */
20526 VNx8QImode,
20528 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20529 full vectors for wider elements. */
20530 VNx4QImode,
20532 /* Try using 64-bit containers for all element types. */
20533 VNx2QImode
20536 static const machine_mode advsimd_modes[] = {
20537 /* Try using 128-bit vectors for all element types. */
20538 V16QImode,
20540 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20541 for wider elements. */
20542 V8QImode,
20544 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20545 for wider elements.
20547 TODO: We could support a limited form of V4QImode too, so that
20548 we use 32-bit vectors for 8-bit elements. */
20549 V4HImode,
20551 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20552 for 64-bit elements.
20554 TODO: We could similarly support limited forms of V2QImode and V2HImode
20555 for this case. */
20556 V2SImode
20559 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20560 This is because:
20562 - If we can't use N-byte Advanced SIMD vectors then the placement
20563 doesn't matter; we'll just continue as though the Advanced SIMD
20564 entry didn't exist.
20566 - If an SVE main loop with N bytes ends up being cheaper than an
20567 Advanced SIMD main loop with N bytes then by default we'll replace
20568 the Advanced SIMD version with the SVE one.
20570 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20571 than an SVE main loop with N bytes then by default we'll try to
20572 use the SVE loop to vectorize the epilogue instead. */
20574 bool only_asimd_p = aarch64_autovec_preference == 1;
20575 bool only_sve_p = aarch64_autovec_preference == 2;
20577 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20578 unsigned int advsimd_i = 0;
20580 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20582 if (sve_i < ARRAY_SIZE (sve_modes)
20583 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20584 advsimd_modes[advsimd_i]))
20585 modes->safe_push (sve_modes[sve_i++]);
20586 else
20587 modes->safe_push (advsimd_modes[advsimd_i++]);
20589 while (sve_i < ARRAY_SIZE (sve_modes))
20590 modes->safe_push (sve_modes[sve_i++]);
20592 unsigned int flags = 0;
20593 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20594 can compare SVE against Advanced SIMD and so that we can compare
20595 multiple SVE vectorization approaches against each other. There's
20596 not really any point doing this for Advanced SIMD only, since the
20597 first mode that works should always be the best. */
20598 if (TARGET_SVE && aarch64_sve_compare_costs)
20599 flags |= VECT_COMPARE_COSTS;
20600 return flags;
20603 /* Implement TARGET_MANGLE_TYPE. */
20605 static const char *
20606 aarch64_mangle_type (const_tree type)
20608 /* The AArch64 ABI documents say that "__va_list" has to be
20609 mangled as if it is in the "std" namespace. */
20610 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20611 return "St9__va_list";
20613 /* Half-precision floating point types. */
20614 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20616 if (TYPE_MODE (type) == BFmode)
20617 return "u6__bf16";
20618 else
20619 return "Dh";
20622 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20623 builtin types. */
20624 if (TYPE_NAME (type) != NULL)
20626 const char *res;
20627 if ((res = aarch64_general_mangle_builtin_type (type))
20628 || (res = aarch64_sve::mangle_builtin_type (type)))
20629 return res;
20632 /* Use the default mangling. */
20633 return NULL;
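/* Concrete manglings produced by the code above:

     __builtin_va_list  ->  "St9__va_list"  (std::__va_list)
     __fp16             ->  "Dh"
     __bf16             ->  "u6__bf16"

   Other AArch64 builtin types (the Advanced SIMD and SVE vector types)
   get their manglings from the two helper calls at the end.  */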
20636 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20638 static bool
20639 aarch64_verify_type_context (location_t loc, type_context_kind context,
20640 const_tree type, bool silent_p)
20642 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20645 /* Find the first rtx_insn before insn that will generate an assembly
20646 instruction. */
20648 static rtx_insn *
20649 aarch64_prev_real_insn (rtx_insn *insn)
20651 if (!insn)
20652 return NULL;
20656 insn = prev_real_insn (insn);
20658 while (insn && recog_memoized (insn) < 0);
20660 return insn;
20663 static bool
20664 is_madd_op (enum attr_type t1)
20666 unsigned int i;
20667 /* A number of these may be AArch32 only. */
20668 enum attr_type mlatypes[] = {
20669 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20670 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20671 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20674 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
20676 if (t1 == mlatypes[i])
20677 return true;
20680 return false;
20683 /* Check if there is a register dependency between a load and the insn
20684 for which we hold recog_data. */
20686 static bool
20687 dep_between_memop_and_curr (rtx memop)
20689 rtx load_reg;
20690 int opno;
20692 gcc_assert (GET_CODE (memop) == SET);
20694 if (!REG_P (SET_DEST (memop)))
20695 return false;
20697 load_reg = SET_DEST (memop);
20698 for (opno = 1; opno < recog_data.n_operands; opno++)
20700 rtx operand = recog_data.operand[opno];
20701 if (REG_P (operand)
20702 && reg_overlap_mentioned_p (load_reg, operand))
20703 return true;
20706 return false;
20710 /* When working around the Cortex-A53 erratum 835769,
20711 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
20712 instruction and has a preceding memory instruction such that a NOP
20713 should be inserted between them. */
20715 bool
20716 aarch64_madd_needs_nop (rtx_insn* insn)
20718 enum attr_type attr_type;
20719 rtx_insn *prev;
20720 rtx body;
20722 if (!TARGET_FIX_ERR_A53_835769)
20723 return false;
20725 if (!INSN_P (insn) || recog_memoized (insn) < 0)
20726 return false;
20728 attr_type = get_attr_type (insn);
20729 if (!is_madd_op (attr_type))
20730 return false;
20732 prev = aarch64_prev_real_insn (insn);
20733 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
20734 Restore recog state to INSN to avoid state corruption. */
20735 extract_constrain_insn_cached (insn);
20737 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
20738 return false;
20740 body = single_set (prev);
20742 /* If the previous insn is a memory op and there is no dependency between
20743 it and the DImode madd, emit a NOP between them. If body is NULL then we
20744 have a complex memory operation, probably a load/store pair.
20745 Be conservative for now and emit a NOP. */
20746 if (GET_MODE (recog_data.operand[0]) == DImode
20747 && (!body || !dep_between_memop_and_curr (body)))
20748 return true;
20750 return false;
20755 /* Implement FINAL_PRESCAN_INSN. */
20757 void
20758 aarch64_final_prescan_insn (rtx_insn *insn)
20760 if (aarch64_madd_needs_nop (insn))
20761 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
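/* Schematic example of the -mfix-cortex-a53-835769 workaround (register
   choices are arbitrary): when a 64-bit multiply-accumulate immediately
   follows a memory instruction whose result it does not consume, the
   prescan above emits

       ldr     x1, [x2]
       nop     // between mem op and mult-accumulate
       madd    x0, x3, x4, x5

   so that the two instructions are no longer adjacent.  */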
20765 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
20766 instruction. */
20768 bool
20769 aarch64_sve_index_immediate_p (rtx base_or_step)
20771 return (CONST_INT_P (base_or_step)
20772 && IN_RANGE (INTVAL (base_or_step), -16, 15));
20775 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
20776 when applied to mode MODE. Negate X first if NEGATE_P is true. */
20778 bool
20779 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
20781 rtx elt = unwrap_const_vec_duplicate (x);
20782 if (!CONST_INT_P (elt))
20783 return false;
20785 HOST_WIDE_INT val = INTVAL (elt);
20786 if (negate_p)
20787 val = -val;
20788 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
20790 if (val & 0xff)
20791 return IN_RANGE (val, 0, 0xff);
20792 return IN_RANGE (val, 0, 0xff00);
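/* The test above accepts, per element, an unsigned 8-bit value that is
   optionally shifted left by 8, matching the SVE ADD/SUB immediate forms
   (after NEGATE_P is applied).  For 32-bit (.S) elements, for instance,
   #0, #255, #256 and #65280 are accepted, while #257 and #511 are not.  */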
20795 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
20796 instructions when applied to mode MODE. Negate X first if NEGATE_P
20797 is true. */
20799 bool
20800 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
20802 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
20803 return false;
20805 /* After the optional negation, the immediate must be nonnegative.
20806 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
20807 instead of SQADD Zn.B, Zn.B, #129. */
20808 rtx elt = unwrap_const_vec_duplicate (x);
20809 return negate_p == (INTVAL (elt) < 0);
20812 /* Return true if X is a valid immediate operand for an SVE logical
20813 instruction such as AND. */
20815 bool
20816 aarch64_sve_bitmask_immediate_p (rtx x)
20818 rtx elt;
20820 return (const_vec_duplicate_p (x, &elt)
20821 && CONST_INT_P (elt)
20822 && aarch64_bitmask_imm (INTVAL (elt),
20823 GET_MODE_INNER (GET_MODE (x))));
20826 /* Return true if X is a valid immediate for the SVE DUP and CPY
20827 instructions. */
20829 bool
20830 aarch64_sve_dup_immediate_p (rtx x)
20832 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
20833 if (!CONST_INT_P (x))
20834 return false;
20836 HOST_WIDE_INT val = INTVAL (x);
20837 if (val & 0xff)
20838 return IN_RANGE (val, -0x80, 0x7f);
20839 return IN_RANGE (val, -0x8000, 0x7f00);
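/* Likewise for DUP/CPY, but with a signed 8-bit payload: valid values are
   -128..127, or multiples of 256 in the range -32768..32512.  For 32-bit
   (.S) elements, for instance, #-1, #127 and #0x7f00 are accepted while
   #128 and #0x101 are not.  */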
20842 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
20843 SIGNED_P says whether the operand is signed rather than unsigned. */
20845 bool
20846 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
20848 x = unwrap_const_vec_duplicate (x);
20849 return (CONST_INT_P (x)
20850 && (signed_p
20851 ? IN_RANGE (INTVAL (x), -16, 15)
20852 : IN_RANGE (INTVAL (x), 0, 127)));
20855 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
20856 instruction. Negate X first if NEGATE_P is true. */
20858 bool
20859 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
20861 rtx elt;
20862 REAL_VALUE_TYPE r;
20864 if (!const_vec_duplicate_p (x, &elt)
20865 || !CONST_DOUBLE_P (elt))
20866 return false;
20868 r = *CONST_DOUBLE_REAL_VALUE (elt);
20870 if (negate_p)
20871 r = real_value_negate (&r);
20873 if (real_equal (&r, &dconst1))
20874 return true;
20875 if (real_equal (&r, &dconsthalf))
20876 return true;
20877 return false;
20880 /* Return true if X is a valid immediate operand for an SVE FMUL
20881 instruction. */
20883 bool
20884 aarch64_sve_float_mul_immediate_p (rtx x)
20886 rtx elt;
20888 return (const_vec_duplicate_p (x, &elt)
20889 && CONST_DOUBLE_P (elt)
20890 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
20891 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
20894 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
20895 for the Advanced SIMD operation described by WHICH and INSN. If INFO
20896 is nonnull, use it to describe valid immediates. */
20897 static bool
20898 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
20899 simd_immediate_info *info,
20900 enum simd_immediate_check which,
20901 simd_immediate_info::insn_type insn)
20903 /* Try a 4-byte immediate with LSL. */
20904 for (unsigned int shift = 0; shift < 32; shift += 8)
20905 if ((val32 & (0xff << shift)) == val32)
20907 if (info)
20908 *info = simd_immediate_info (SImode, val32 >> shift, insn,
20909 simd_immediate_info::LSL, shift);
20910 return true;
20913 /* Try a 2-byte immediate with LSL. */
20914 unsigned int imm16 = val32 & 0xffff;
20915 if (imm16 == (val32 >> 16))
20916 for (unsigned int shift = 0; shift < 16; shift += 8)
20917 if ((imm16 & (0xff << shift)) == imm16)
20919 if (info)
20920 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
20921 simd_immediate_info::LSL, shift);
20922 return true;
20925 /* Try a 4-byte immediate with MSL, except for cases that MVN
20926 can handle. */
20927 if (which == AARCH64_CHECK_MOV)
20928 for (unsigned int shift = 8; shift < 24; shift += 8)
20930 unsigned int low = (1 << shift) - 1;
20931 if (((val32 & (0xff << shift)) | low) == val32)
20933 if (info)
20934 *info = simd_immediate_info (SImode, val32 >> shift, insn,
20935 simd_immediate_info::MSL, shift);
20936 return true;
20940 return false;
20943 /* Return true if replicating VAL64 is a valid immediate for the
20944 Advanced SIMD operation described by WHICH. If INFO is nonnull,
20945 use it to describe valid immediates. */
20946 static bool
20947 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
20948 simd_immediate_info *info,
20949 enum simd_immediate_check which)
20951 unsigned int val32 = val64 & 0xffffffff;
20952 unsigned int val16 = val64 & 0xffff;
20953 unsigned int val8 = val64 & 0xff;
20955 if (val32 == (val64 >> 32))
20957 if ((which & AARCH64_CHECK_ORR) != 0
20958 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
20959 simd_immediate_info::MOV))
20960 return true;
20962 if ((which & AARCH64_CHECK_BIC) != 0
20963 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
20964 simd_immediate_info::MVN))
20965 return true;
20967 /* Try using a replicated byte. */
20968 if (which == AARCH64_CHECK_MOV
20969 && val16 == (val32 >> 16)
20970 && val8 == (val16 >> 8))
20972 if (info)
20973 *info = simd_immediate_info (QImode, val8);
20974 return true;
20978 /* Try using a bit-to-bytemask. */
20979 if (which == AARCH64_CHECK_MOV)
20981 unsigned int i;
20982 for (i = 0; i < 64; i += 8)
20984 unsigned char byte = (val64 >> i) & 0xff;
20985 if (byte != 0 && byte != 0xff)
20986 break;
20988 if (i == 64)
20990 if (info)
20991 *info = simd_immediate_info (DImode, val64);
20992 return true;
20995 return false;
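/* Two illustrative 64-bit patterns accepted above, and one that is not
   (values chosen arbitrarily):

     0x2a2a2a2a2a2a2a2a  -> replicated byte, a single MOVI of #42
     0xff0000ffff0000ff  -> every byte 0x00 or 0xff, the 64-bit
                            "bit-to-bytemask" MOVI form
     0x0102030405060708  -> matches neither form and is rejected here.  */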
20998 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
20999 instruction. If INFO is nonnull, use it to describe valid immediates. */
21001 static bool
21002 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21003 simd_immediate_info *info)
21005 scalar_int_mode mode = DImode;
21006 unsigned int val32 = val64 & 0xffffffff;
21007 if (val32 == (val64 >> 32))
21009 mode = SImode;
21010 unsigned int val16 = val32 & 0xffff;
21011 if (val16 == (val32 >> 16))
21013 mode = HImode;
21014 unsigned int val8 = val16 & 0xff;
21015 if (val8 == (val16 >> 8))
21016 mode = QImode;
21019 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21020 if (IN_RANGE (val, -0x80, 0x7f))
21022 /* DUP with no shift. */
21023 if (info)
21024 *info = simd_immediate_info (mode, val);
21025 return true;
21027 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21029 /* DUP with LSL #8. */
21030 if (info)
21031 *info = simd_immediate_info (mode, val);
21032 return true;
21034 if (aarch64_bitmask_imm (val64, mode))
21036 /* DUPM. */
21037 if (info)
21038 *info = simd_immediate_info (mode, val);
21039 return true;
21041 return false;
21044 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21046 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21048 where PATTERN is the svpattern as a CONST_INT and where ZERO
21049 is a zero constant of the required PTRUE mode (which can have
21050 fewer elements than X's mode, if zero bits are significant).
21052 If so, and if INFO is nonnull, describe the immediate in INFO. */
21053 bool
21054 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21056 if (GET_CODE (x) != CONST)
21057 return false;
21059 x = XEXP (x, 0);
21060 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21061 return false;
21063 if (info)
21065 aarch64_svpattern pattern
21066 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21067 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21068 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21069 *info = simd_immediate_info (int_mode, pattern);
21071 return true;
21074 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21075 it to describe valid immediates. */
21077 static bool
21078 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21080 if (aarch64_sve_ptrue_svpattern_p (x, info))
21081 return true;
21083 if (x == CONST0_RTX (GET_MODE (x)))
21085 if (info)
21086 *info = simd_immediate_info (DImode, 0);
21087 return true;
21090 /* Analyze the value as a VNx16BImode. This should be relatively
21091 efficient, since rtx_vector_builder has enough built-in capacity
21092 to store all VLA predicate constants without needing the heap. */
21093 rtx_vector_builder builder;
21094 if (!aarch64_get_sve_pred_bits (builder, x))
21095 return false;
21097 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21098 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21100 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21101 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21102 if (pattern != AARCH64_NUM_SVPATTERNS)
21104 if (info)
21106 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21107 *info = simd_immediate_info (int_mode, pattern);
21109 return true;
21112 return false;
21115 /* Return true if OP is a valid SIMD immediate for the operation
21116 described by WHICH. If INFO is nonnull, use it to describe valid
21117 immediates. */
21118 bool
21119 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21120 enum simd_immediate_check which)
21122 machine_mode mode = GET_MODE (op);
21123 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21124 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21125 return false;
21127 if (vec_flags & VEC_SVE_PRED)
21128 return aarch64_sve_pred_valid_immediate (op, info);
21130 scalar_mode elt_mode = GET_MODE_INNER (mode);
21131 rtx base, step;
21132 unsigned int n_elts;
21133 if (CONST_VECTOR_P (op)
21134 && CONST_VECTOR_DUPLICATE_P (op))
21135 n_elts = CONST_VECTOR_NPATTERNS (op);
21136 else if ((vec_flags & VEC_SVE_DATA)
21137 && const_vec_series_p (op, &base, &step))
21139 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21140 if (!aarch64_sve_index_immediate_p (base)
21141 || !aarch64_sve_index_immediate_p (step))
21142 return false;
21144 if (info)
21146 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21147 should yield two integer values per 128-bit block, meaning
21148 that we need to treat it in the same way as V2DI and then
21149 ignore the upper 32 bits of each element. */
21150 elt_mode = aarch64_sve_container_int_mode (mode);
21151 *info = simd_immediate_info (elt_mode, base, step);
21153 return true;
21155 else if (CONST_VECTOR_P (op)
21156 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21157 /* N_ELTS set above. */;
21158 else
21159 return false;
21161 scalar_float_mode elt_float_mode;
21162 if (n_elts == 1
21163 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21165 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21166 if (aarch64_float_const_zero_rtx_p (elt)
21167 || aarch64_float_const_representable_p (elt))
21169 if (info)
21170 *info = simd_immediate_info (elt_float_mode, elt);
21171 return true;
21175 /* If all elements in an SVE vector have the same value, we have a free
21176 choice between using the element mode and using the container mode.
21177 Using the element mode means that unused parts of the vector are
21178 duplicates of the used elements, while using the container mode means
21179 that the unused parts are an extension of the used elements. Using the
21180 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21181 for its container mode VNx4SI while 0x00000101 isn't.
21183 If not all elements in an SVE vector have the same value, we need the
21184 transition from one element to the next to occur at container boundaries.
21185 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21186 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21187 scalar_int_mode elt_int_mode;
21188 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21189 elt_int_mode = aarch64_sve_container_int_mode (mode);
21190 else
21191 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21193 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21194 if (elt_size > 8)
21195 return false;
21197 /* Expand the vector constant out into a byte vector, with the least
21198 significant byte of the register first. */
21199 auto_vec<unsigned char, 16> bytes;
21200 bytes.reserve (n_elts * elt_size);
21201 for (unsigned int i = 0; i < n_elts; i++)
21203 /* The vector is provided in gcc endian-neutral fashion.
21204 For aarch64_be Advanced SIMD, it must be laid out in the vector
21205 register in reverse order. */
21206 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21207 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21209 if (elt_mode != elt_int_mode)
21210 elt = gen_lowpart (elt_int_mode, elt);
21212 if (!CONST_INT_P (elt))
21213 return false;
21215 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21216 for (unsigned int byte = 0; byte < elt_size; byte++)
21218 bytes.quick_push (elt_val & 0xff);
21219 elt_val >>= BITS_PER_UNIT;
21223 /* The immediate must repeat every eight bytes. */
21224 unsigned int nbytes = bytes.length ();
21225 for (unsigned i = 8; i < nbytes; ++i)
21226 if (bytes[i] != bytes[i - 8])
21227 return false;
21229 /* Get the repeating 8-byte value as an integer. No endian correction
21230 is needed here because bytes is already in lsb-first order. */
21231 unsigned HOST_WIDE_INT val64 = 0;
21232 for (unsigned int i = 0; i < 8; i++)
21233 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21234 << (i * BITS_PER_UNIT));
21236 if (vec_flags & VEC_SVE_DATA)
21237 return aarch64_sve_valid_immediate (val64, info);
21238 else
21239 return aarch64_advsimd_valid_immediate (val64, info, which);
21242 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21243 has a step in the range of INDEX. Return the index expression if so,
21244 otherwise return null. */
21246 aarch64_check_zero_based_sve_index_immediate (rtx x)
21248 rtx base, step;
21249 if (const_vec_series_p (x, &base, &step)
21250 && base == const0_rtx
21251 && aarch64_sve_index_immediate_p (step))
21252 return step;
21253 return NULL_RTX;
21256 /* Check whether immediate shift constants are within range. */
21257 bool
21258 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21260 x = unwrap_const_vec_duplicate (x);
21261 if (!CONST_INT_P (x))
21262 return false;
21263 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21264 if (left)
21265 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21266 else
21267 return IN_RANGE (INTVAL (x), 1, bit_width);
21270 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21271 operation of width WIDTH at bit position POS. */
21274 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21276 gcc_assert (CONST_INT_P (width));
21277 gcc_assert (CONST_INT_P (pos));
21279 unsigned HOST_WIDE_INT mask
21280 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21281 return GEN_INT (mask << UINTVAL (pos));
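/* Example: for a zero_extract of WIDTH 8 at POS 16 this returns
   ((1 << 8) - 1) << 16 == 0x00ff0000, i.e. a mask covering exactly the
   byte being extracted.  */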
21284 bool
21285 aarch64_mov_operand_p (rtx x, machine_mode mode)
21287 if (GET_CODE (x) == HIGH
21288 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21289 return true;
21291 if (CONST_INT_P (x))
21292 return true;
21294 if (VECTOR_MODE_P (GET_MODE (x)))
21296 /* Require predicate constants to be VNx16BI before RA, so that we
21297 force everything to have a canonical form. */
21298 if (!lra_in_progress
21299 && !reload_completed
21300 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21301 && GET_MODE (x) != VNx16BImode)
21302 return false;
21304 return aarch64_simd_valid_immediate (x, NULL);
21307 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21308 x = strip_salt (x);
21310 /* GOT accesses are valid moves. */
21311 if (SYMBOL_REF_P (x)
21312 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21313 return true;
21315 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21316 return true;
21318 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21319 return true;
21321 return aarch64_classify_symbolic_expression (x)
21322 == SYMBOL_TINY_ABSOLUTE;
21325 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21326 the constant creation. */
21329 aarch64_gen_shareable_zero (machine_mode mode)
21331 machine_mode zmode = V4SImode;
21332 rtx tmp = gen_reg_rtx (zmode);
21333 emit_move_insn (tmp, CONST0_RTX (zmode));
21334 return lowpart_subreg (mode, tmp, zmode);
21337 /* Return a const_int vector of VAL. */
21339 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21341 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21342 return gen_const_vec_duplicate (mode, c);
21345 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21347 bool
21348 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21350 machine_mode vmode;
21352 vmode = aarch64_simd_container_mode (mode, 64);
21353 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21354 return aarch64_simd_valid_immediate (op_v, NULL);
21357 /* Construct and return a PARALLEL RTX vector with elements numbering the
21358 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21359 the vector - from the perspective of the architecture. This does not
21360 line up with GCC's perspective on lane numbers, so we end up with
21361 different masks depending on our target endian-ness. The diagram
21362 below may help. We must draw the distinction when building masks
21363 which select one half of the vector. An instruction selecting
21364 architectural low-lanes for a big-endian target, must be described using
21365 a mask selecting GCC high-lanes.
21367 Big-Endian Little-Endian
21369 GCC 0 1 2 3 3 2 1 0
21370 | x | x | x | x | | x | x | x | x |
21371 Architecture 3 2 1 0 3 2 1 0
21373 Low Mask: { 2, 3 } { 0, 1 }
21374 High Mask: { 0, 1 } { 2, 3 }
21376 MODE is the mode of the vector and NUNITS is the number of units in it. */
21379 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21381 rtvec v = rtvec_alloc (nunits / 2);
21382 int high_base = nunits / 2;
21383 int low_base = 0;
21384 int base;
21385 rtx t1;
21386 int i;
21388 if (BYTES_BIG_ENDIAN)
21389 base = high ? low_base : high_base;
21390 else
21391 base = high ? high_base : low_base;
21393 for (i = 0; i < nunits / 2; i++)
21394 RTVEC_ELT (v, i) = GEN_INT (base + i);
21396 t1 = gen_rtx_PARALLEL (mode, v);
21397 return t1;
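/* For example, for V4SImode (NUNITS == 4) on a little-endian target the
   function returns the mask { 2, 3 } when HIGH and { 0, 1 } otherwise;
   on big-endian the two masks are swapped, as in the diagram above.  */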
21400 /* Check OP for validity as a PARALLEL RTX vector with elements
21401 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21402 from the perspective of the architecture. See the diagram above
21403 aarch64_simd_vect_par_cnst_half for more details. */
21405 bool
21406 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21407 bool high)
21409 int nelts;
21410 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21411 return false;
21413 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21414 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21415 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21416 int i = 0;
21418 if (count_op != count_ideal)
21419 return false;
21421 for (i = 0; i < count_ideal; i++)
21423 rtx elt_op = XVECEXP (op, 0, i);
21424 rtx elt_ideal = XVECEXP (ideal, 0, i);
21426 if (!CONST_INT_P (elt_op)
21427 || INTVAL (elt_ideal) != INTVAL (elt_op))
21428 return false;
21430 return true;
21433 /* Return a PARALLEL containing NELTS elements, with element I equal
21434 to BASE + I * STEP. */
21437 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21439 rtvec vec = rtvec_alloc (nelts);
21440 for (unsigned int i = 0; i < nelts; ++i)
21441 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21442 return gen_rtx_PARALLEL (VOIDmode, vec);
21445 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21446 series with step STEP. */
21448 bool
21449 aarch64_stepped_int_parallel_p (rtx op, int step)
21451 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21452 return false;
21454 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21455 for (int i = 1; i < XVECLEN (op, 0); ++i)
21456 if (!CONST_INT_P (XVECEXP (op, 0, i))
21457 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21458 return false;
21460 return true;
21463 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21464 HIGH (exclusive). */
21465 void
21466 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21467 const_tree exp)
21469 HOST_WIDE_INT lane;
21470 gcc_assert (CONST_INT_P (operand));
21471 lane = INTVAL (operand);
21473 if (lane < low || lane >= high)
21475 if (exp)
21476 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21477 lane, low, high - 1);
21478 else
21479 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21483 /* Perform endian correction on lane number N, which indexes a vector
21484 of mode MODE, and return the result as an SImode rtx. */
21487 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21489 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
21492 /* Return TRUE if OP is a valid vector addressing mode. */
21494 bool
21495 aarch64_simd_mem_operand_p (rtx op)
21497 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21498 || REG_P (XEXP (op, 0)));
21501 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21503 bool
21504 aarch64_sve_ld1r_operand_p (rtx op)
21506 struct aarch64_address_info addr;
21507 scalar_mode mode;
21509 return (MEM_P (op)
21510 && is_a <scalar_mode> (GET_MODE (op), &mode)
21511 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21512 && addr.type == ADDRESS_REG_IMM
21513 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21516 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21517 where the size of the read data is specified by `mode` and the size of the
21518 vector elements is specified by `elem_mode`. */
21519 bool
21520 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21521 scalar_mode elem_mode)
21523 struct aarch64_address_info addr;
21524 if (!MEM_P (op)
21525 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21526 return false;
21528 if (addr.type == ADDRESS_REG_IMM)
21529 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21531 if (addr.type == ADDRESS_REG_REG)
21532 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21534 return false;
21537 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21538 bool
21539 aarch64_sve_ld1rq_operand_p (rtx op)
21541 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21542 GET_MODE_INNER (GET_MODE (op)));
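/* In practice this accepts, for a 128-bit LD1RQ such as the one generated
   for the ACLE svld1rq intrinsics, either a base register plus an
   immediate that is a multiple of 16 in the range [-128, 112], or a base
   register plus an index register scaled by the element size (e.g.
   LSL #2 for .S elements).  */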
21545 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21546 accessing a vector where the element size is specified by `elem_mode`. */
21547 bool
21548 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21550 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21553 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21554 bool
21555 aarch64_sve_ldff1_operand_p (rtx op)
21557 if (!MEM_P (op))
21558 return false;
21560 struct aarch64_address_info addr;
21561 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21562 return false;
21564 if (addr.type == ADDRESS_REG_IMM)
21565 return known_eq (addr.const_offset, 0);
21567 return addr.type == ADDRESS_REG_REG;
21570 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21571 bool
21572 aarch64_sve_ldnf1_operand_p (rtx op)
21574 struct aarch64_address_info addr;
21576 return (MEM_P (op)
21577 && aarch64_classify_address (&addr, XEXP (op, 0),
21578 GET_MODE (op), false)
21579 && addr.type == ADDRESS_REG_IMM);
21582 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21583 The conditions for STR are the same. */
21584 bool
21585 aarch64_sve_ldr_operand_p (rtx op)
21587 struct aarch64_address_info addr;
21589 return (MEM_P (op)
21590 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21591 false, ADDR_QUERY_ANY)
21592 && addr.type == ADDRESS_REG_IMM);
21595 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21596 addressing memory of mode MODE. */
21597 bool
21598 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21600 struct aarch64_address_info addr;
21601 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21602 return false;
21604 if (addr.type == ADDRESS_REG_IMM)
21605 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21607 return addr.type == ADDRESS_REG_REG;
21610 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21611 We need to be able to access the individual pieces, so the range
21612 is different from LD[234] and ST[234]. */
21613 bool
21614 aarch64_sve_struct_memory_operand_p (rtx op)
21616 if (!MEM_P (op))
21617 return false;
21619 machine_mode mode = GET_MODE (op);
21620 struct aarch64_address_info addr;
21621 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21622 ADDR_QUERY_ANY)
21623 || addr.type != ADDRESS_REG_IMM)
21624 return false;
21626 poly_int64 first = addr.const_offset;
21627 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21628 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21629 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21632 /* Emit a register copy from operand to operand, taking care not to
21633 early-clobber source registers in the process.
21635 COUNT is the number of components into which the copy needs to be
21636 decomposed. */
21637 void
21638 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21639 unsigned int count)
21641 unsigned int i;
21642 int rdest = REGNO (operands[0]);
21643 int rsrc = REGNO (operands[1]);
21645 if (!reg_overlap_mentioned_p (operands[0], operands[1])
21646 || rdest < rsrc)
21647 for (i = 0; i < count; i++)
21648 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21649 gen_rtx_REG (mode, rsrc + i));
21650 else
21651 for (i = 0; i < count; i++)
21652 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21653 gen_rtx_REG (mode, rsrc + count - i - 1));
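/* For instance, copying a two-register tuple from {v1, v2} to {v2, v3}
   overlaps and has RDEST > RSRC, so the loop above moves v3 <- v2 first
   and then v2 <- v1; copying it to {v0, v1} instead can safely proceed
   in ascending order.  */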
21656 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21657 one of VSTRUCT modes: OI, CI, or XI. */
21658 int
21659 aarch64_simd_attr_length_rglist (machine_mode mode)
21661 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21662 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21665 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21666 alignment of a vector to 128 bits. SVE predicates have an alignment of
21667 16 bits. */
21668 static HOST_WIDE_INT
21669 aarch64_simd_vector_alignment (const_tree type)
21671 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21672 be set for non-predicate vectors of booleans. Modes are the most
21673 direct way we have of identifying real SVE predicate types. */
21674 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21675 return 16;
21676 widest_int min_size
21677 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21678 return wi::umin (min_size, 128).to_uhwi ();
21681 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21682 static poly_uint64
21683 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21685 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21687 /* If the length of the vector is a fixed power of 2, try to align
21688 to that length, otherwise don't try to align at all. */
21689 HOST_WIDE_INT result;
21690 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21691 || !pow2p_hwi (result))
21692 result = TYPE_ALIGN (TREE_TYPE (type));
21693 return result;
21695 return TYPE_ALIGN (type);
21698 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21699 static bool
21700 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21702 if (is_packed)
21703 return false;
21705 /* For fixed-length vectors, check that the vectorizer will aim for
21706 full-vector alignment. This isn't true for generic GCC vectors
21707 that are wider than the ABI maximum of 128 bits. */
21708 poly_uint64 preferred_alignment =
21709 aarch64_vectorize_preferred_vector_alignment (type);
21710 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
21711 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
21712 preferred_alignment))
21713 return false;
21715 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
21716 return true;
21719 /* Return true if the vector misalignment factor is supported by the
21720 target. */
21721 static bool
21722 aarch64_builtin_support_vector_misalignment (machine_mode mode,
21723 const_tree type, int misalignment,
21724 bool is_packed)
21726 if (TARGET_SIMD && STRICT_ALIGNMENT)
21728 /* Return if movmisalign pattern is not supported for this mode. */
21729 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
21730 return false;
21732 /* Misalignment factor is unknown at compile time. */
21733 if (misalignment == -1)
21734 return false;
21736 return default_builtin_support_vector_misalignment (mode, type, misalignment,
21737 is_packed);
21740 /* If VALS is a vector constant that can be loaded into a register
21741 using DUP, generate instructions to do so and return an RTX to
21742 assign to the register. Otherwise return NULL_RTX. */
21743 static rtx
21744 aarch64_simd_dup_constant (rtx vals)
21746 machine_mode mode = GET_MODE (vals);
21747 machine_mode inner_mode = GET_MODE_INNER (mode);
21748 rtx x;
21750 if (!const_vec_duplicate_p (vals, &x))
21751 return NULL_RTX;
21753 /* We can load this constant by using DUP and a constant in a
21754 single ARM register. This will be cheaper than a vector
21755 load. */
21756 x = copy_to_mode_reg (inner_mode, x);
21757 return gen_vec_duplicate (mode, x);
21761 /* Generate code to load VALS, which is a PARALLEL containing only
21762 constants (for vec_init) or CONST_VECTOR, efficiently into a
21763 register. Returns an RTX to copy into the register, or NULL_RTX
21764 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
21765 static rtx
21766 aarch64_simd_make_constant (rtx vals)
21768 machine_mode mode = GET_MODE (vals);
21769 rtx const_dup;
21770 rtx const_vec = NULL_RTX;
21771 int n_const = 0;
21772 int i;
21774 if (CONST_VECTOR_P (vals))
21775 const_vec = vals;
21776 else if (GET_CODE (vals) == PARALLEL)
21778 /* A CONST_VECTOR must contain only CONST_INTs and
21779 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
21780 Only store valid constants in a CONST_VECTOR. */
21781 int n_elts = XVECLEN (vals, 0);
21782 for (i = 0; i < n_elts; ++i)
21784 rtx x = XVECEXP (vals, 0, i);
21785 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
21786 n_const++;
21788 if (n_const == n_elts)
21789 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
21791 else
21792 gcc_unreachable ();
21794 if (const_vec != NULL_RTX
21795 && aarch64_simd_valid_immediate (const_vec, NULL))
21796 /* Load using MOVI/MVNI. */
21797 return const_vec;
21798 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
21799 /* Loaded using DUP. */
21800 return const_dup;
21801 else if (const_vec != NULL_RTX)
21802 /* Load from constant pool. We cannot take advantage of single-cycle
21803 LD1 because we need a PC-relative addressing mode. */
21804 return const_vec;
21805 else
21806 /* A PARALLEL containing something not valid inside CONST_VECTOR.
21807 We cannot construct an initializer. */
21808 return NULL_RTX;
21811 /* Expand a vector initialisation sequence, such that TARGET is
21812 initialised to contain VALS. */
21814 void
21815 aarch64_expand_vector_init (rtx target, rtx vals)
21817 machine_mode mode = GET_MODE (target);
21818 scalar_mode inner_mode = GET_MODE_INNER (mode);
21819 /* The number of vector elements. */
21820 int n_elts = XVECLEN (vals, 0);
21821 /* The number of vector elements which are not constant. */
21822 int n_var = 0;
21823 rtx any_const = NULL_RTX;
21824 /* The first element of vals. */
21825 rtx v0 = XVECEXP (vals, 0, 0);
21826 bool all_same = true;
21828 /* This is a special vec_init<M><N> where N is not an element mode but a
21829 vector mode with half the elements of M. We expect to find two entries
21830 of mode N in VALS and we must put their concatenation into TARGET. */
21831 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
21833 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
21834 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
21835 && known_eq (GET_MODE_SIZE (mode),
21836 2 * GET_MODE_SIZE (narrow_mode)));
21837 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
21838 XVECEXP (vals, 0, 0),
21839 XVECEXP (vals, 0, 1)));
21840 return;
21843 /* Count the number of variable elements to initialise. */
21844 for (int i = 0; i < n_elts; ++i)
21846 rtx x = XVECEXP (vals, 0, i);
21847 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
21848 ++n_var;
21849 else
21850 any_const = x;
21852 all_same &= rtx_equal_p (x, v0);
21855 /* No variable elements, hand off to aarch64_simd_make_constant which knows
21856 how best to handle this. */
21857 if (n_var == 0)
21859 rtx constant = aarch64_simd_make_constant (vals);
21860 if (constant != NULL_RTX)
21862 emit_move_insn (target, constant);
21863 return;
21867 /* Splat a single non-constant element if we can. */
21868 if (all_same)
21870 rtx x = copy_to_mode_reg (inner_mode, v0);
21871 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
21872 return;
21875 enum insn_code icode = optab_handler (vec_set_optab, mode);
21876 gcc_assert (icode != CODE_FOR_nothing);
21878 /* If there are only variable elements, try to optimize
21879 the insertion using dup for the most common element
21880 followed by insertions. */
21882 /* The algorithm will fill matches[*][0] with the earliest matching element,
21883 and matches[X][1] with the count of duplicate elements (if X is the
21884 earliest element which has duplicates). */
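/* For example, for the initializer { x, y, x, x } this gives
   matches = { {0,3}, {1,1}, {0,0}, {0,0} }, so lane 0's value is the most
   common (maxv == 3): it is broadcast with DUP and only lane 1 needs a
   separate insert.  */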
21886 if (n_var == n_elts && n_elts <= 16)
21888 int matches[16][2] = {0};
21889 for (int i = 0; i < n_elts; i++)
21891 for (int j = 0; j <= i; j++)
21893 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
21895 matches[i][0] = j;
21896 matches[j][1]++;
21897 break;
21901 int maxelement = 0;
21902 int maxv = 0;
21903 for (int i = 0; i < n_elts; i++)
21904 if (matches[i][1] > maxv)
21906 maxelement = i;
21907 maxv = matches[i][1];
21910 /* Create a duplicate of the most common element, unless all elements
21911 are equally useless to us, in which case just immediately set the
21912 vector register using the first element. */
21914 if (maxv == 1)
21916 /* For vectors of two 64-bit elements, we can do even better. */
21917 if (n_elts == 2
21918 && (inner_mode == E_DImode
21919 || inner_mode == E_DFmode))
21922 rtx x0 = XVECEXP (vals, 0, 0);
21923 rtx x1 = XVECEXP (vals, 0, 1);
21924 /* Combine can pick up this case, but handling it directly
21925 here leaves clearer RTL.
21927 This is load_pair_lanes<mode>, and also gives us a clean-up
21928 for store_pair_lanes<mode>. */
21929 if (memory_operand (x0, inner_mode)
21930 && memory_operand (x1, inner_mode)
21931 && aarch64_mergeable_load_pair_p (mode, x0, x1))
21933 rtx t;
21934 if (inner_mode == DFmode)
21935 t = gen_load_pair_lanesdf (target, x0, x1);
21936 else
21937 t = gen_load_pair_lanesdi (target, x0, x1);
21938 emit_insn (t);
21939 return;
21942 /* The subreg-move sequence below will move into lane zero of the
21943 vector register. For big-endian we want that position to hold
21944 the last element of VALS. */
21945 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
21946 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
21947 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
21949 else
21951 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
21952 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
21955 /* Insert the rest. */
21956 for (int i = 0; i < n_elts; i++)
21958 rtx x = XVECEXP (vals, 0, i);
21959 if (matches[i][0] == maxelement)
21960 continue;
21961 x = copy_to_mode_reg (inner_mode, x);
21962 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
21964 return;
21967 /* Initialise a vector which is part-variable. We want to first try
21968 to build those lanes which are constant in the most efficient way we
21969 can. */
21970 if (n_var != n_elts)
21972 rtx copy = copy_rtx (vals);
21974 /* Load constant part of vector. We really don't care what goes into the
21975 parts we will overwrite, but we're more likely to be able to load the
21976 constant efficiently if it has fewer, larger, repeating parts
21977 (see aarch64_simd_valid_immediate). */
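/* For example, for { x, 1, 2, 3 } the constant copy below becomes
   { 2, 1, 2, 3 }: the variable lane borrows the constant from lane 0 ^ 2,
   giving a more repetitive (and hence cheaper) constant before lane 0 is
   overwritten with x further down.  */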
21978 for (int i = 0; i < n_elts; i++)
21980 rtx x = XVECEXP (vals, 0, i);
21981 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
21982 continue;
21983 rtx subst = any_const;
21984 for (int bit = n_elts / 2; bit > 0; bit /= 2)
21986 /* Look in the copied vector, as more elements are const. */
21987 rtx test = XVECEXP (copy, 0, i ^ bit);
21988 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
21990 subst = test;
21991 break;
21994 XVECEXP (copy, 0, i) = subst;
21996 aarch64_expand_vector_init (target, copy);
21999 /* Insert the variable lanes directly. */
22000 for (int i = 0; i < n_elts; i++)
22002 rtx x = XVECEXP (vals, 0, i);
22003 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22004 continue;
22005 x = copy_to_mode_reg (inner_mode, x);
22006 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22010 /* Emit RTL corresponding to:
22011 insr TARGET, ELEM. */
22013 static void
22014 emit_insr (rtx target, rtx elem)
22016 machine_mode mode = GET_MODE (target);
22017 scalar_mode elem_mode = GET_MODE_INNER (mode);
22018 elem = force_reg (elem_mode, elem);
22020 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22021 gcc_assert (icode != CODE_FOR_nothing);
22022 emit_insn (GEN_FCN (icode) (target, target, elem));
22025 /* Subroutine of aarch64_sve_expand_vector_init for handling
22026 trailing constants.
22027 This function works as follows:
22028 (a) Create a new vector consisting of trailing constants.
22029 (b) Initialize TARGET with the constant vector using emit_move_insn.
22030 (c) Insert remaining elements in TARGET using insr.
22031 NELTS is the total number of elements in the original vector, while
22032 NELTS_REQD is the number of elements that are actually
22033 significant.
22035 ??? The heuristic used is to do the above only if the number of constants
22036 is at least half the total number of elements. May need fine tuning. */
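/* For example, with NELTS == NELTS_REQD == 4 and BUILDER == { x, y, 1, 2 },
   TARGET is first loaded with the repeating constant vector { 1, 2, 0, 0 }
   and then y and x are shifted in with INSR, leaving the significant lanes
   equal to { x, y, 1, 2 }.  */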
22038 static bool
22039 aarch64_sve_expand_vector_init_handle_trailing_constants
22040 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22042 machine_mode mode = GET_MODE (target);
22043 scalar_mode elem_mode = GET_MODE_INNER (mode);
22044 int n_trailing_constants = 0;
22046 for (int i = nelts_reqd - 1;
22047 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22048 i--)
22049 n_trailing_constants++;
22051 if (n_trailing_constants >= nelts_reqd / 2)
22053 /* Try to use the natural pattern of BUILDER to extend the trailing
22054 constant elements to a full vector. Replace any variables in the
22055 extra elements with zeros.
22057 ??? It would be better if the builders supported "don't care"
22058 elements, with the builder filling in whichever elements
22059 give the most compact encoding. */
22060 rtx_vector_builder v (mode, nelts, 1);
22061 for (int i = 0; i < nelts; i++)
22063 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22064 if (!valid_for_const_vector_p (elem_mode, x))
22065 x = CONST0_RTX (elem_mode);
22066 v.quick_push (x);
22068 rtx const_vec = v.build ();
22069 emit_move_insn (target, const_vec);
22071 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22072 emit_insr (target, builder.elt (i));
22074 return true;
22077 return false;
22080 /* Subroutine of aarch64_sve_expand_vector_init.
22081 Works as follows:
22082 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22083 (b) Skip trailing elements from BUILDER, which are the same as
22084 element NELTS_REQD - 1.
22085 (c) Insert earlier elements in reverse order in TARGET using insr. */
22087 static void
22088 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22089 const rtx_vector_builder &builder,
22090 int nelts_reqd)
22092 machine_mode mode = GET_MODE (target);
22093 scalar_mode elem_mode = GET_MODE_INNER (mode);
22095 struct expand_operand ops[2];
22096 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22097 gcc_assert (icode != CODE_FOR_nothing);
22099 create_output_operand (&ops[0], target, mode);
22100 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22101 expand_insn (icode, 2, ops);
22103 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22104 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22105 emit_insr (target, builder.elt (i));
22108 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22109 when all trailing elements of builder are same.
22110 This works as follows:
22111 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22112 (b) Insert remaining elements in TARGET using insr.
22114 ??? The heuristic used is to do the above if the number of identical trailing
22115 elements is at least 3/4 of the total number of elements, loosely based on
22116 the heuristic from mostly_zeros_p. May need fine-tuning. */
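/* For example, { x, y, z, z, z, z, z, z } with NELTS_REQD == 8 has six
   trailing copies of z (>= 3/4 of 8), so TARGET is set with a DUP of z
   and y and x are then shifted in with INSR.  */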
22118 static bool
22119 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22120 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22122 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22123 if (ndups >= (3 * nelts_reqd) / 4)
22125 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22126 nelts_reqd - ndups + 1);
22127 return true;
22130 return false;
22133 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22134 of elements in BUILDER.
22136 The function tries to initialize TARGET from BUILDER if it fits one
22137 of the special cases outlined below.
22139 Failing that, the function divides BUILDER into two sub-vectors:
22140 v_even = even elements of BUILDER;
22141 v_odd = odd elements of BUILDER;
22143 and recursively calls itself with v_even and v_odd.
22145 if (recursive call succeeded for v_even or v_odd)
22146 TARGET = zip (v_even, v_odd)
22148 The function returns true if it managed to build TARGET from BUILDER
22149 with one of the special cases, false otherwise.
22151 Example: {a, 1, b, 2, c, 3, d, 4}
22153 The vector gets divided into:
22154 v_even = {a, b, c, d}
22155 v_odd = {1, 2, 3, 4}
22157 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22158 initializes tmp2 from the constant vector v_odd using emit_move_insn.
22160 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22161 4 elements, so we construct tmp1 from v_even using insr:
22162 tmp1 = dup(d)
22163 insr tmp1, c
22164 insr tmp1, b
22165 insr tmp1, a
22167 And finally:
22168 TARGET = zip (tmp1, tmp2)
22169 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22171 static bool
22172 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22173 int nelts, int nelts_reqd)
22175 machine_mode mode = GET_MODE (target);
22177 /* Case 1: Vector contains trailing constants. */
22179 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22180 (target, builder, nelts, nelts_reqd))
22181 return true;
22183 /* Case 2: Vector contains leading constants. */
22185 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22186 for (int i = 0; i < nelts_reqd; i++)
22187 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22188 rev_builder.finalize ();
22190 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22191 (target, rev_builder, nelts, nelts_reqd))
22193 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22194 return true;
22197 /* Case 3: Vector contains trailing same element. */
22199 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22200 (target, builder, nelts_reqd))
22201 return true;
22203 /* Case 4: Vector contains leading same element. */
22205 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22206 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22208 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22209 return true;
22212 /* Avoid recursing below 4-elements.
22213 ??? The threshold 4 may need fine-tuning. */
22215 if (nelts_reqd <= 4)
22216 return false;
22218 rtx_vector_builder v_even (mode, nelts, 1);
22219 rtx_vector_builder v_odd (mode, nelts, 1);
22221 for (int i = 0; i < nelts * 2; i += 2)
22223 v_even.quick_push (builder.elt (i));
22224 v_odd.quick_push (builder.elt (i + 1));
22227 v_even.finalize ();
22228 v_odd.finalize ();
22230 rtx tmp1 = gen_reg_rtx (mode);
22231 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22232 nelts, nelts_reqd / 2);
22234 rtx tmp2 = gen_reg_rtx (mode);
22235 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22236 nelts, nelts_reqd / 2);
22238 if (!did_even_p && !did_odd_p)
22239 return false;
22241 /* If v_even or v_odd did not match any of the special cases, initialize
22242 it using INSR, then zip v_even and v_odd together. */
22244 if (!did_even_p)
22245 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22247 if (!did_odd_p)
22248 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22250 rtvec v = gen_rtvec (2, tmp1, tmp2);
22251 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22252 return true;
22255 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22257 void
22258 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22260 machine_mode mode = GET_MODE (target);
22261 int nelts = XVECLEN (vals, 0);
22263 rtx_vector_builder v (mode, nelts, 1);
22264 for (int i = 0; i < nelts; i++)
22265 v.quick_push (XVECEXP (vals, 0, i));
22266 v.finalize ();
22268 /* If neither of the sub-vectors of v could be initialized specially,
22269 then use INSR to insert all elements from v into TARGET.
22270 ??? This might not be optimal for vectors with large
22271 initializers like 16-element or above.
22272 For nelts < 4, it probably isn't useful to handle specially. */
22274 if (nelts < 4
22275 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22276 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22279 /* Check whether VALUE is a vector constant in which every element
22280 is either a power of 2 or a negated power of 2. If so, return
22281 a constant vector of log2s, and flip CODE between PLUS and MINUS
22282 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
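/* For example, { 4, 4, 4, 4 } yields { 2, 2, 2, 2 } with CODE unchanged,
   while { -8, -8, -8, -8 } yields { 3, 3, 3, 3 } and flips CODE from PLUS
   to MINUS (or vice versa).  */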
22284 static rtx
22285 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22287 if (!CONST_VECTOR_P (value))
22288 return NULL_RTX;
22290 rtx_vector_builder builder;
22291 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22292 return NULL_RTX;
22294 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22295 /* 1 if the result of the multiplication must be negated,
22296 0 if it mustn't, or -1 if we don't yet care. */
22297 int negate = -1;
22298 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22299 for (unsigned int i = 0; i < encoded_nelts; ++i)
22301 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22302 if (!CONST_SCALAR_INT_P (elt))
22303 return NULL_RTX;
22304 rtx_mode_t val (elt, int_mode);
22305 wide_int pow2 = wi::neg (val);
22306 if (val != pow2)
22308 /* It matters whether we negate or not. Make that choice,
22309 and make sure that it's consistent with previous elements. */
22310 if (negate == !wi::neg_p (val))
22311 return NULL_RTX;
22312 negate = wi::neg_p (val);
22313 if (!negate)
22314 pow2 = val;
22316 /* POW2 is now the value that we want to be a power of 2. */
22317 int shift = wi::exact_log2 (pow2);
22318 if (shift < 0)
22319 return NULL_RTX;
22320 builder.quick_push (gen_int_mode (shift, int_mode));
22322 if (negate == -1)
22323 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22324 code = PLUS;
22325 else if (negate == 1)
22326 code = code == PLUS ? MINUS : PLUS;
22327 return builder.build ();
22330 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22331 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22332 operands array, in the same order as for fma_optab. Return true if
22333 the function emitted all the necessary instructions, false if the caller
22334 should generate the pattern normally with the new OPERANDS array. */
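/* For example, a multiply-add of the form x + y * 8 is rewritten below as
   x + (y << 3), using a vector shift followed by an add or subtract instead
   of a multiply-add.  */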
22336 bool
22337 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22339 machine_mode mode = GET_MODE (operands[0]);
22340 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22342 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22343 NULL_RTX, true, OPTAB_DIRECT);
22344 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22345 operands[3], product, operands[0], true,
22346 OPTAB_DIRECT);
22347 return true;
22349 operands[2] = force_reg (mode, operands[2]);
22350 return false;
22353 /* Likewise, but for a conditional pattern. */
22355 bool
22356 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22358 machine_mode mode = GET_MODE (operands[0]);
22359 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22361 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22362 NULL_RTX, true, OPTAB_DIRECT);
22363 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22364 operands[4], product, operands[5]));
22365 return true;
22367 operands[3] = force_reg (mode, operands[3]);
22368 return false;
22371 static unsigned HOST_WIDE_INT
22372 aarch64_shift_truncation_mask (machine_mode mode)
22374 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22375 return 0;
22376 return GET_MODE_UNIT_BITSIZE (mode) - 1;
22379 /* Select a format to encode pointers in exception handling data. */
22380 int
22381 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22383 int type;
22384 switch (aarch64_cmodel)
22386 case AARCH64_CMODEL_TINY:
22387 case AARCH64_CMODEL_TINY_PIC:
22388 case AARCH64_CMODEL_SMALL:
22389 case AARCH64_CMODEL_SMALL_PIC:
22390 case AARCH64_CMODEL_SMALL_SPIC:
22391 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22392 for everything. */
22393 type = DW_EH_PE_sdata4;
22394 break;
22395 default:
22396 /* No assumptions here. 8-byte relocs required. */
22397 type = DW_EH_PE_sdata8;
22398 break;
22400 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22403 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22405 static void
22406 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22408 if (TREE_CODE (decl) == FUNCTION_DECL)
22410 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22411 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22413 fprintf (stream, "\t.variant_pcs\t");
22414 assemble_name (stream, name);
22415 fprintf (stream, "\n");
22420 /* The last .arch and .tune assembly strings that we printed. */
22421 static std::string aarch64_last_printed_arch_string;
22422 static std::string aarch64_last_printed_tune_string;
22424 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22425 by the function fndecl. */
22427 void
22428 aarch64_declare_function_name (FILE *stream, const char* name,
22429 tree fndecl)
22431 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22433 struct cl_target_option *targ_options;
22434 if (target_parts)
22435 targ_options = TREE_TARGET_OPTION (target_parts);
22436 else
22437 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22438 gcc_assert (targ_options);
22440 const struct processor *this_arch
22441 = aarch64_get_arch (targ_options->x_selected_arch);
22443 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
22444 std::string extension
22445 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22446 this_arch->flags);
22447 /* Only update the assembler .arch string if it is distinct from the last
22448 such string we printed. */
22449 std::string to_print = this_arch->name + extension;
22450 if (to_print != aarch64_last_printed_arch_string)
22452 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22453 aarch64_last_printed_arch_string = to_print;
22456 /* Print the cpu name we're tuning for in the comments, might be
22457 useful to readers of the generated asm. Do it only when it changes
22458 from function to function and verbose assembly is requested. */
22459 const struct processor *this_tune
22460 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
22462 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22464 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22465 this_tune->name);
22466 aarch64_last_printed_tune_string = this_tune->name;
22469 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22471 /* Don't forget the type directive for ELF. */
22472 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22473 ASM_OUTPUT_LABEL (stream, name);
22475 cfun->machine->label_is_assembled = true;
22478 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
22479 the function label and emit a BTI if necessary. */
22481 void
22482 aarch64_print_patchable_function_entry (FILE *file,
22483 unsigned HOST_WIDE_INT patch_area_size,
22484 bool record_p)
22486 if (cfun->machine->label_is_assembled
22487 && aarch64_bti_enabled ()
22488 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
22490 /* Remove the BTI that follows the patch area and insert a new BTI
22491 before the patch area right after the function label. */
22492 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22493 if (insn
22494 && INSN_P (insn)
22495 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
22496 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
22497 delete_insn (insn);
22498 asm_fprintf (file, "\thint\t34 // bti c\n");
22501 default_print_patchable_function_entry (file, patch_area_size, record_p);
22504 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22506 void
22507 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22509 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22510 const char *value = IDENTIFIER_POINTER (target);
22511 aarch64_asm_output_variant_pcs (stream, decl, name);
22512 ASM_OUTPUT_DEF (stream, name, value);
22515 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22516 function symbol references. */
22518 void
22519 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22521 default_elf_asm_output_external (stream, decl, name);
22522 aarch64_asm_output_variant_pcs (stream, decl, name);
22525 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22526 Used to output the .cfi_b_key_frame directive when signing the current
22527 function with the B key. */
22529 void
22530 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22532 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22533 && aarch64_ra_sign_key == AARCH64_KEY_B)
22534 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22537 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22539 static void
22540 aarch64_start_file (void)
22542 struct cl_target_option *default_options
22543 = TREE_TARGET_OPTION (target_option_default_node);
22545 const struct processor *default_arch
22546 = aarch64_get_arch (default_options->x_selected_arch);
22547 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
22548 std::string extension
22549 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22550 default_arch->flags);
22552 aarch64_last_printed_arch_string = default_arch->name + extension;
22553 aarch64_last_printed_tune_string = "";
22554 asm_fprintf (asm_out_file, "\t.arch %s\n",
22555 aarch64_last_printed_arch_string.c_str ());
22557 default_file_start ();
22560 /* Emit load exclusive. */
22562 static void
22563 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22564 rtx mem, rtx model_rtx)
22566 if (mode == TImode)
22567 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22568 gen_highpart (DImode, rval),
22569 mem, model_rtx));
22570 else
22571 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22574 /* Emit store exclusive. */
22576 static void
22577 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22578 rtx mem, rtx rval, rtx model_rtx)
22580 if (mode == TImode)
22581 emit_insn (gen_aarch64_store_exclusive_pair
22582 (bval, mem, operand_subword (rval, 0, 0, TImode),
22583 operand_subword (rval, 1, 0, TImode), model_rtx));
22584 else
22585 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22588 /* Mark the previous jump instruction as unlikely. */
22590 static void
22591 aarch64_emit_unlikely_jump (rtx insn)
22593 rtx_insn *jump = emit_jump_insn (insn);
22594 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22597 /* We store the names of the various atomic helpers in a 5x5 array.
22598 Return the libcall function given MODE, MODEL and NAMES. */
22600 rtx
22601 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
22602 const atomic_ool_names *names)
22604 memmodel model = memmodel_from_int (INTVAL (model_rtx));
22605 int mode_idx, model_idx;
22607 switch (mode)
22609 case E_QImode:
22610 mode_idx = 0;
22611 break;
22612 case E_HImode:
22613 mode_idx = 1;
22614 break;
22615 case E_SImode:
22616 mode_idx = 2;
22617 break;
22618 case E_DImode:
22619 mode_idx = 3;
22620 break;
22621 case E_TImode:
22622 mode_idx = 4;
22623 break;
22624 default:
22625 gcc_unreachable ();
22628 switch (model)
22630 case MEMMODEL_RELAXED:
22631 model_idx = 0;
22632 break;
22633 case MEMMODEL_CONSUME:
22634 case MEMMODEL_ACQUIRE:
22635 model_idx = 1;
22636 break;
22637 case MEMMODEL_RELEASE:
22638 model_idx = 2;
22639 break;
22640 case MEMMODEL_ACQ_REL:
22641 case MEMMODEL_SEQ_CST:
22642 model_idx = 3;
22643 break;
22644 case MEMMODEL_SYNC_ACQUIRE:
22645 case MEMMODEL_SYNC_RELEASE:
22646 case MEMMODEL_SYNC_SEQ_CST:
22647 model_idx = 4;
22648 break;
22649 default:
22650 gcc_unreachable ();
22653 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
22654 VISIBILITY_HIDDEN);
22657 #define DEF0(B, N) \
22658 { "__aarch64_" #B #N "_relax", \
22659 "__aarch64_" #B #N "_acq", \
22660 "__aarch64_" #B #N "_rel", \
22661 "__aarch64_" #B #N "_acq_rel", \
22662 "__aarch64_" #B #N "_sync" }
22664 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
22665 { NULL, NULL, NULL, NULL }
22666 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
22668 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
22669 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
22670 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
22671 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
22672 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
22673 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
22675 #undef DEF0
22676 #undef DEF4
22677 #undef DEF5
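/* Illustrative, standalone sketch (not used by the port, kept under #if 0 so
   it does not affect the build): how the table above maps an access size in
   bytes and a memory-model column onto a helper name.  The helper name
   `print_ool_name' is made up for illustration; the real lookup is done by
   aarch64_atomic_ool_func via names->str.  */
#if 0
#include <stdio.h>

static void
print_ool_name (const char *base, int size, int model_idx)
{
  /* Columns follow the DEF0 order: relax, acq, rel, acq_rel, sync.  */
  static const char *const suffix[5]
    = { "_relax", "_acq", "_rel", "_acq_rel", "_sync" };
  printf ("__aarch64_%s%d%s\n", base, size, suffix[model_idx]);
}

int
main (void)
{
  print_ool_name ("cas", 4, 1);   /* __aarch64_cas4_acq */
  print_ool_name ("ldadd", 8, 3); /* __aarch64_ldadd8_acq_rel */
  return 0;
}
#endif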
22679 /* Expand a compare and swap pattern. */
22681 void
22682 aarch64_expand_compare_and_swap (rtx operands[])
22684 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
22685 machine_mode mode, r_mode;
22687 bval = operands[0];
22688 rval = operands[1];
22689 mem = operands[2];
22690 oldval = operands[3];
22691 newval = operands[4];
22692 is_weak = operands[5];
22693 mod_s = operands[6];
22694 mod_f = operands[7];
22695 mode = GET_MODE (mem);
22697 /* Normally the succ memory model must be stronger than fail, but in the
22698 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
22699 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
22700 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
22701 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
22702 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
22704 r_mode = mode;
22705 if (mode == QImode || mode == HImode)
22707 r_mode = SImode;
22708 rval = gen_reg_rtx (r_mode);
22711 if (TARGET_LSE)
22713 /* The CAS insn requires oldval and rval overlap, but we need to
22714 have a copy of oldval saved across the operation to tell if
22715 the operation is successful. */
22716 if (reg_overlap_mentioned_p (rval, oldval))
22717 rval = copy_to_mode_reg (r_mode, oldval);
22718 else
22719 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
22721 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
22722 newval, mod_s));
22723 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22725 else if (TARGET_OUTLINE_ATOMICS)
22727 /* Oldval must satisfy compare afterward. */
22728 if (!aarch64_plus_operand (oldval, mode))
22729 oldval = force_reg (mode, oldval);
22730 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
22731 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
22732 oldval, mode, newval, mode,
22733 XEXP (mem, 0), Pmode);
22734 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22736 else
22738 /* The oldval predicate varies by mode. Test it and force to reg. */
22739 insn_code code = code_for_aarch64_compare_and_swap (mode);
22740 if (!insn_data[code].operand[2].predicate (oldval, mode))
22741 oldval = force_reg (mode, oldval);
22743 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
22744 is_weak, mod_s, mod_f));
22745 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
22748 if (r_mode != mode)
22749 rval = gen_lowpart (mode, rval);
22750 emit_move_insn (operands[1], rval);
22752 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
22753 emit_insn (gen_rtx_SET (bval, x));
22756 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
22757 sequence implementing an atomic operation. */
22759 static void
22760 aarch64_emit_post_barrier (enum memmodel model)
22762 const enum memmodel base_model = memmodel_base (model);
22764 if (is_mm_sync (model)
22765 && (base_model == MEMMODEL_ACQUIRE
22766 || base_model == MEMMODEL_ACQ_REL
22767 || base_model == MEMMODEL_SEQ_CST))
22769 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
22773 /* Split a compare and swap pattern. */
22775 void
22776 aarch64_split_compare_and_swap (rtx operands[])
22778 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
22779 gcc_assert (epilogue_completed);
22781 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
22782 machine_mode mode;
22783 bool is_weak;
22784 rtx_code_label *label1, *label2;
22785 enum memmodel model;
22787 rval = operands[0];
22788 mem = operands[1];
22789 oldval = operands[2];
22790 newval = operands[3];
22791 is_weak = (operands[4] != const0_rtx);
22792 model_rtx = operands[5];
22793 scratch = operands[7];
22794 mode = GET_MODE (mem);
22795 model = memmodel_from_int (INTVAL (model_rtx));
22797 /* When OLDVAL is zero and we want the strong version we can emit a tighter
22798 loop:
22799 .label1:
22800 LD[A]XR rval, [mem]
22801 CBNZ rval, .label2
22802 ST[L]XR scratch, newval, [mem]
22803 CBNZ scratch, .label1
22804 .label2:
22805 CMP rval, 0. */
22806 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
22807 oldval == const0_rtx && mode != TImode);
22809 label1 = NULL;
22810 if (!is_weak)
22812 label1 = gen_label_rtx ();
22813 emit_label (label1);
22815 label2 = gen_label_rtx ();
22817 /* The initial load can be relaxed for a __sync operation since a final
22818 barrier will be emitted to stop code hoisting. */
22819 if (is_mm_sync (model))
22820 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
22821 else
22822 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
22824 if (strong_zero_p)
22825 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
22826 else
22828 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22829 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
22831 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
22832 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
22833 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
22835 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
22837 if (!is_weak)
22839 if (aarch64_track_speculation)
22841 /* Emit an explicit compare instruction, so that we can correctly
22842 track the condition codes. */
22843 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
22844 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
22846 else
22847 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
22849 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
22850 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
22851 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
22853 else
22854 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
22856 emit_label (label2);
22858 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
22859 to set the condition flags. If this is not used it will be removed by
22860 later passes. */
22861 if (strong_zero_p)
22862 aarch64_gen_compare_reg (NE, rval, const0_rtx);
22864 /* Emit any final barrier needed for a __sync operation. */
22865 if (is_mm_sync (model))
22866 aarch64_emit_post_barrier (model);
22869 /* Split an atomic operation. */
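/* For example, an atomic fetch-and-add expands to roughly:
	.label:
	ld[a]xr	old, [mem]
	add	new, old, value
	st[l]xr	w_cond, new, [mem]
	cbnz	w_cond, .label
   with the acquire/release forms of the exclusives chosen from MODEL_RTX.  */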
22871 void
22872 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
22873 rtx value, rtx model_rtx, rtx cond)
22875 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
22876 gcc_assert (epilogue_completed);
22878 machine_mode mode = GET_MODE (mem);
22879 machine_mode wmode = (mode == DImode ? DImode : SImode);
22880 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
22881 const bool is_sync = is_mm_sync (model);
22882 rtx_code_label *label;
22883 rtx x;
22885 /* Split the atomic operation into a sequence. */
22886 label = gen_label_rtx ();
22887 emit_label (label);
22889 if (new_out)
22890 new_out = gen_lowpart (wmode, new_out);
22891 if (old_out)
22892 old_out = gen_lowpart (wmode, old_out);
22893 else
22894 old_out = new_out;
22895 value = simplify_gen_subreg (wmode, value, mode, 0);
22897 /* The initial load can be relaxed for a __sync operation since a final
22898 barrier will be emitted to stop code hoisting. */
22899 if (is_sync)
22900 aarch64_emit_load_exclusive (mode, old_out, mem,
22901 GEN_INT (MEMMODEL_RELAXED));
22902 else
22903 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
22905 switch (code)
22907 case SET:
22908 new_out = value;
22909 break;
22911 case NOT:
22912 x = gen_rtx_AND (wmode, old_out, value);
22913 emit_insn (gen_rtx_SET (new_out, x));
22914 x = gen_rtx_NOT (wmode, new_out);
22915 emit_insn (gen_rtx_SET (new_out, x));
22916 break;
22918 case MINUS:
22919 if (CONST_INT_P (value))
22921 value = GEN_INT (-UINTVAL (value));
22922 code = PLUS;
22924 /* Fall through. */
22926 default:
22927 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
22928 emit_insn (gen_rtx_SET (new_out, x));
22929 break;
22932 aarch64_emit_store_exclusive (mode, cond, mem,
22933 gen_lowpart (mode, new_out), model_rtx);
22935 if (aarch64_track_speculation)
22937 /* Emit an explicit compare instruction, so that we can correctly
22938 track the condition codes. */
22939 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
22940 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
22942 else
22943 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
22945 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
22946 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
22947 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
22949 /* Emit any final barrier needed for a __sync operation. */
22950 if (is_sync)
22951 aarch64_emit_post_barrier (model);
22954 static void
22955 aarch64_init_libfuncs (void)
22957 /* Half-precision float operations. The compiler handles all operations
22958 with NULL libfuncs by converting to SFmode. */
22960 /* Conversions. */
22961 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
22962 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
22964 /* Arithmetic. */
22965 set_optab_libfunc (add_optab, HFmode, NULL);
22966 set_optab_libfunc (sdiv_optab, HFmode, NULL);
22967 set_optab_libfunc (smul_optab, HFmode, NULL);
22968 set_optab_libfunc (neg_optab, HFmode, NULL);
22969 set_optab_libfunc (sub_optab, HFmode, NULL);
22971 /* Comparisons. */
22972 set_optab_libfunc (eq_optab, HFmode, NULL);
22973 set_optab_libfunc (ne_optab, HFmode, NULL);
22974 set_optab_libfunc (lt_optab, HFmode, NULL);
22975 set_optab_libfunc (le_optab, HFmode, NULL);
22976 set_optab_libfunc (ge_optab, HFmode, NULL);
22977 set_optab_libfunc (gt_optab, HFmode, NULL);
22978 set_optab_libfunc (unord_optab, HFmode, NULL);
22981 /* Target hook for c_mode_for_suffix. */
22982 static machine_mode
22983 aarch64_c_mode_for_suffix (char suffix)
22985 if (suffix == 'q')
22986 return TFmode;
22988 return VOIDmode;
22991 /* We can only represent floating point constants which will fit in
22992 "quarter-precision" values. These values are characterised by
22993 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
22994 by:
22996 (-1)^s * (n/16) * 2^r
22998 Where:
22999 's' is the sign bit.
23000 'n' is an integer in the range 16 <= n <= 31.
23001 'r' is an integer in the range -3 <= r <= 4. */
23003 /* Return true iff X can be represented by a quarter-precision
23004 floating point immediate operand. Note, we cannot represent 0.0. */
23005 bool
23006 aarch64_float_const_representable_p (rtx x)
23008 /* This represents our current view of how many bits
23009 make up the mantissa. */
23010 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23011 int exponent;
23012 unsigned HOST_WIDE_INT mantissa, mask;
23013 REAL_VALUE_TYPE r, m;
23014 bool fail;
23016 x = unwrap_const_vec_duplicate (x);
23017 if (!CONST_DOUBLE_P (x))
23018 return false;
23020 if (GET_MODE (x) == VOIDmode
23021 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23022 return false;
23024 r = *CONST_DOUBLE_REAL_VALUE (x);
23026 /* We cannot represent infinities, NaNs or +/-zero. We won't
23027 know if we have +zero until we analyse the mantissa, but we
23028 can reject the other invalid values. */
23029 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23030 || REAL_VALUE_MINUS_ZERO (r))
23031 return false;
23033 /* Extract exponent. */
23034 r = real_value_abs (&r);
23035 exponent = REAL_EXP (&r);
23037 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23038 highest (sign) bit, with a fixed binary point at bit point_pos.
23039 m1 holds the low part of the mantissa, m2 the high part.
23040 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23041 bits for the mantissa, this can fail (low bits will be lost). */
23042 real_ldexp (&m, &r, point_pos - exponent);
23043 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23045 /* If the low part of the mantissa has bits set we cannot represent
23046 the value. */
23047 if (w.ulow () != 0)
23048 return false;
23049 /* We have rejected the lower HOST_WIDE_INT, so update our
23050 understanding of how many bits lie in the mantissa and
23051 look only at the high HOST_WIDE_INT. */
23052 mantissa = w.elt (1);
23053 point_pos -= HOST_BITS_PER_WIDE_INT;
23055 /* We can only represent values with a mantissa of the form 1.xxxx. */
23056 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23057 if ((mantissa & mask) != 0)
23058 return false;
23060 /* Having filtered unrepresentable values, we may now remove all
23061 but the highest 5 bits. */
23062 mantissa >>= point_pos - 5;
23064 /* We cannot represent the value 0.0, so reject it. This is handled
23065 elsewhere. */
23066 if (mantissa == 0)
23067 return false;
23069 /* Then, as bit 4 is always set, we can mask it off, leaving
23070 the mantissa in the range [0, 15]. */
23071 mantissa &= ~(1 << 4);
23072 gcc_assert (mantissa <= 15);
23074 /* GCC internally does not use IEEE754-like encoding (where normalized
23075 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.cc).
23076 Our mantissa values are shifted 4 places to the left relative to
23077 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23078 by 5 places to correct for GCC's representation. */
23079 exponent = 5 - exponent;
23081 return (exponent >= 0 && exponent <= 7);
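/* Illustrative, standalone cross-check of the rule above (not used by the
   port, kept under #if 0 so it does not affect the build): enumerate the 256
   values (-1)^s * (n/16) * 2^r directly and test a host double against them.
   The helper name is made up for illustration; aarch64_float_const_representable_p
   above implements the same test on GCC's internal REAL_VALUE_TYPE.  For
   example, 1.5 == (24/16) * 2^0 and 31.0 == (31/16) * 2^4 are representable,
   while 0.0 and 0.1 are not.  */
#if 0
#include <math.h>
#include <stdbool.h>

static bool
quarter_precision_immediate_p (double val)
{
  for (int s = 0; s <= 1; s++)
    for (int n = 16; n <= 31; n++)
      for (int r = -3; r <= 4; r++)
	if (val == (s ? -1.0 : 1.0) * ((double) n / 16.0) * ldexp (1.0, r))
	  return true;
  return false;
}
#endif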
23084 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23085 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23086 output MOVI/MVNI, ORR or BIC immediate. */
23087 char*
23088 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23089 enum simd_immediate_check which)
23091 bool is_valid;
23092 static char templ[40];
23093 const char *mnemonic;
23094 const char *shift_op;
23095 unsigned int lane_count = 0;
23096 char element_char;
23098 struct simd_immediate_info info;
23100 /* This will return true to show const_vector is legal for use as either
23101 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23102 It will also update INFO to show how the immediate should be generated.
23103 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23104 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23105 gcc_assert (is_valid);
23107 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23108 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23110 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23112 gcc_assert (info.insn == simd_immediate_info::MOV
23113 && info.u.mov.shift == 0);
23114 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23115 move immediate path. */
23116 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23117 info.u.mov.value = GEN_INT (0);
23118 else
23120 const unsigned int buf_size = 20;
23121 char float_buf[buf_size] = {'\0'};
23122 real_to_decimal_for_mode (float_buf,
23123 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23124 buf_size, buf_size, 1, info.elt_mode);
23126 if (lane_count == 1)
23127 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23128 else
23129 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23130 lane_count, element_char, float_buf);
23131 return templ;
23135 gcc_assert (CONST_INT_P (info.u.mov.value));
23137 if (which == AARCH64_CHECK_MOV)
23139 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23140 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23141 ? "msl" : "lsl");
23142 if (lane_count == 1)
23143 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23144 mnemonic, UINTVAL (info.u.mov.value));
23145 else if (info.u.mov.shift)
23146 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23147 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23148 element_char, UINTVAL (info.u.mov.value), shift_op,
23149 info.u.mov.shift);
23150 else
23151 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23152 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23153 element_char, UINTVAL (info.u.mov.value));
23155 else
23157 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23158 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23159 if (info.u.mov.shift)
23160 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23161 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23162 element_char, UINTVAL (info.u.mov.value), "lsl",
23163 info.u.mov.shift);
23164 else
23165 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23166 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23167 element_char, UINTVAL (info.u.mov.value));
23169 return templ;
23172 char*
23173 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23176 /* If a floating point number was passed and we desire to use it in an
23177 integer mode, do the conversion to integer. */
23178 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23180 unsigned HOST_WIDE_INT ival;
23181 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23182 gcc_unreachable ();
23183 immediate = gen_int_mode (ival, mode);
23186 machine_mode vmode;
23187 /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
23188 a 128-bit vector mode. */
23189 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23191 vmode = aarch64_simd_container_mode (mode, width);
23192 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23193 return aarch64_output_simd_mov_immediate (v_op, width);
23196 /* Return the output string to use for moving immediate CONST_VECTOR
23197 into an SVE register. */
23199 char *
23200 aarch64_output_sve_mov_immediate (rtx const_vector)
23202 static char templ[40];
23203 struct simd_immediate_info info;
23204 char element_char;
23206 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23207 gcc_assert (is_valid);
23209 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23211 machine_mode vec_mode = GET_MODE (const_vector);
23212 if (aarch64_sve_pred_mode_p (vec_mode))
23214 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23215 if (info.insn == simd_immediate_info::MOV)
23217 gcc_assert (info.u.mov.value == const0_rtx);
23218 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23220 else
23222 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23223 unsigned int total_bytes;
23224 if (info.u.pattern == AARCH64_SV_ALL
23225 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23226 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23227 total_bytes / GET_MODE_SIZE (info.elt_mode));
23228 else
23229 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23230 svpattern_token (info.u.pattern));
23232 return buf;
23235 if (info.insn == simd_immediate_info::INDEX)
23237 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23238 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23239 element_char, INTVAL (info.u.index.base),
23240 INTVAL (info.u.index.step));
23241 return templ;
23244 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23246 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23247 info.u.mov.value = GEN_INT (0);
23248 else
23250 const int buf_size = 20;
23251 char float_buf[buf_size] = {};
23252 real_to_decimal_for_mode (float_buf,
23253 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23254 buf_size, buf_size, 1, info.elt_mode);
23256 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23257 element_char, float_buf);
23258 return templ;
23262 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23263 element_char, INTVAL (info.u.mov.value));
23264 return templ;
23267 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
23268 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23269 pattern. */
23271 char *
23272 aarch64_output_sve_ptrues (rtx const_unspec)
23274 static char templ[40];
23276 struct simd_immediate_info info;
23277 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23278 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23280 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23281 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23282 svpattern_token (info.u.pattern));
23283 return templ;
23286 /* Split operands into moves from op[1] + op[2] into op[0]. */
23288 void
23289 aarch64_split_combinev16qi (rtx operands[3])
23291 unsigned int dest = REGNO (operands[0]);
23292 unsigned int src1 = REGNO (operands[1]);
23293 unsigned int src2 = REGNO (operands[2]);
23294 machine_mode halfmode = GET_MODE (operands[1]);
23295 unsigned int halfregs = REG_NREGS (operands[1]);
23296 rtx destlo, desthi;
23298 gcc_assert (halfmode == V16QImode);
23300 if (src1 == dest && src2 == dest + halfregs)
23302 /* No-op move. Can't split to nothing; emit something. */
23303 emit_note (NOTE_INSN_DELETED);
23304 return;
23307 /* Preserve register attributes for variable tracking. */
23308 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23309 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23310 GET_MODE_SIZE (halfmode));
23312 /* Special case of reversed high/low parts. */
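/* The three XORs below swap the contents of operands[1] and operands[2]
   in place, so no scratch register is needed even though each source
   overlaps the other half of the destination.  */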
23313 if (reg_overlap_mentioned_p (operands[2], destlo)
23314 && reg_overlap_mentioned_p (operands[1], desthi))
23316 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23317 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23318 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23320 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23322 /* Try to avoid unnecessary moves if part of the result
23323 is in the right place already. */
23324 if (src1 != dest)
23325 emit_move_insn (destlo, operands[1]);
23326 if (src2 != dest + halfregs)
23327 emit_move_insn (desthi, operands[2]);
23329 else
23331 if (src2 != dest + halfregs)
23332 emit_move_insn (desthi, operands[2]);
23333 if (src1 != dest)
23334 emit_move_insn (destlo, operands[1]);
23338 /* vec_perm support. */
23340 struct expand_vec_perm_d
23342 rtx target, op0, op1;
23343 vec_perm_indices perm;
23344 machine_mode vmode;
23345 machine_mode op_mode;
23346 unsigned int vec_flags;
23347 unsigned int op_vec_flags;
23348 bool one_vector_p;
23349 bool testing_p;
23352 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23354 /* Generate a variable permutation. */
23356 static void
23357 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23359 machine_mode vmode = GET_MODE (target);
23360 bool one_vector_p = rtx_equal_p (op0, op1);
23362 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23363 gcc_checking_assert (GET_MODE (op0) == vmode);
23364 gcc_checking_assert (GET_MODE (op1) == vmode);
23365 gcc_checking_assert (GET_MODE (sel) == vmode);
23366 gcc_checking_assert (TARGET_SIMD);
23368 if (one_vector_p)
23370 if (vmode == V8QImode)
23372 /* Expand the argument to a V16QI mode by duplicating it. */
23373 rtx pair = gen_reg_rtx (V16QImode);
23374 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23375 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23377 else
23379 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23382 else
23384 rtx pair;
23386 if (vmode == V8QImode)
23388 pair = gen_reg_rtx (V16QImode);
23389 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23390 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23392 else
23394 pair = gen_reg_rtx (V2x16QImode);
23395 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23396 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23401 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23402 NELT is the number of elements in the vector. */
23404 void
23405 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23406 unsigned int nelt)
23408 machine_mode vmode = GET_MODE (target);
23409 bool one_vector_p = rtx_equal_p (op0, op1);
23410 rtx mask;
23412 /* The TBL instruction does not use a modulo index, so we must take care
23413 of that ourselves. */
23414 mask = aarch64_simd_gen_const_vector_dup (vmode,
23415 one_vector_p ? nelt - 1 : 2 * nelt - 1);
23416 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23418 /* For big-endian, we also need to reverse the index within the vector
23419 (but not which vector). */
23420 if (BYTES_BIG_ENDIAN)
23422 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23423 if (!one_vector_p)
23424 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23425 sel = expand_simple_binop (vmode, XOR, sel, mask,
23426 NULL, 0, OPTAB_LIB_WIDEN);
23428 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
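/* Illustrative sketch (hypothetical, standalone): how a single selector
   index is normalised by the code above before the TBL expansion,
   assuming NELT is a power of two (true for the V8QI/V16QI cases
   handled here).  model_tbl_index is an invented name.  */
static inline unsigned int
model_tbl_index (unsigned int idx, unsigned int nelt,
		 int one_vector_p, int big_endian_p)
{
  /* TBL has no modulo behaviour, so wrap out-of-range indices.  */
  idx &= one_vector_p ? nelt - 1 : 2 * nelt - 1;
  /* On big-endian, reverse the lane within its vector but keep the
     choice of vector (the XOR leaves the NELT bit alone).  */
  if (big_endian_p)
    idx ^= nelt - 1;
  return idx;
}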
23431 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23433 static void
23434 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23436 emit_insn (gen_rtx_SET (target,
23437 gen_rtx_UNSPEC (GET_MODE (target),
23438 gen_rtvec (2, op0, op1), code)));
23441 /* Expand an SVE vec_perm with the given operands. */
23443 void
23444 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23446 machine_mode data_mode = GET_MODE (target);
23447 machine_mode sel_mode = GET_MODE (sel);
23448 /* Enforced by the pattern condition. */
23449 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23451 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23452 size of the two value vectors, i.e. the upper bits of the indices
23453 are effectively ignored. SVE TBL instead produces 0 for any
23454 out-of-range indices, so we need to modulo all the vec_perm indices
23455 to ensure they are all in range. */
23456 rtx sel_reg = force_reg (sel_mode, sel);
23458 /* Check if the sel only references the first values vector. */
23459 if (CONST_VECTOR_P (sel)
23460 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23462 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23463 return;
23466 /* Check if the two values vectors are the same. */
23467 if (rtx_equal_p (op0, op1))
23469 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23470 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23471 NULL, 0, OPTAB_DIRECT);
23472 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23473 return;
23476 /* Run TBL on each value vector and combine the results. */
23478 rtx res0 = gen_reg_rtx (data_mode);
23479 rtx res1 = gen_reg_rtx (data_mode);
23480 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23481 if (!CONST_VECTOR_P (sel)
23482 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23484 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23485 2 * nunits - 1);
23486 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23487 NULL, 0, OPTAB_DIRECT);
23489 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23490 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23491 NULL, 0, OPTAB_DIRECT);
23492 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23493 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23494 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23495 else
23496 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
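/* Illustrative sketch (hypothetical, standalone): the two-TBL fallback
   above relies on SVE TBL returning zero for out-of-range indices, so
   indexing each operand separately and ORing the results gives the
   full two-vector permutation.  model_sve_tbl and model_two_vector_perm
   are invented names; lanes are modeled as unsigned ints.  */
static unsigned int
model_sve_tbl (const unsigned int *op, unsigned int nelt, unsigned int idx)
{
  return idx < nelt ? op[idx] : 0;
}

static unsigned int
model_two_vector_perm (const unsigned int *op0, const unsigned int *op1,
		       unsigned int nelt, unsigned int idx)
{
  idx &= 2 * nelt - 1;	      /* The AND applied to SEL_REG above.  */
  unsigned int res0 = model_sve_tbl (op0, nelt, idx);
  /* IDX - NELT wraps to a huge value for indices below NELT, so that
     lookup reads as zero, mirroring the PLUS of -NELT above.  */
  unsigned int res1 = model_sve_tbl (op1, nelt, idx - nelt);
  return res0 | res1;	      /* Exactly one index is in range.  */
}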
23499 /* Recognize patterns suitable for the TRN instructions. */
23500 static bool
23501 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23503 HOST_WIDE_INT odd;
23504 poly_uint64 nelt = d->perm.length ();
23505 rtx out, in0, in1, x;
23506 machine_mode vmode = d->vmode;
23508 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23509 return false;
23511 /* Note that these are little-endian tests.
23512 We correct for big-endian later. */
23513 if (!d->perm[0].is_constant (&odd)
23514 || (odd != 0 && odd != 1)
23515 || !d->perm.series_p (0, 2, odd, 2)
23516 || !d->perm.series_p (1, 2, nelt + odd, 2))
23517 return false;
23519 /* Success! */
23520 if (d->testing_p)
23521 return true;
23523 in0 = d->op0;
23524 in1 = d->op1;
23525 /* We don't need a big-endian lane correction for SVE; see the comment
23526 at the head of aarch64-sve.md for details. */
23527 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23529 x = in0, in0 = in1, in1 = x;
23530 odd = !odd;
23532 out = d->target;
23534 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23535 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23536 return true;
23539 /* Try to re-encode the PERM constant so it combines odd and even elements.
23540 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23541 We retry with this new constant with the full suite of patterns. */
23542 static bool
23543 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23545 expand_vec_perm_d newd;
23546 unsigned HOST_WIDE_INT nelt;
23548 if (d->vec_flags != VEC_ADVSIMD)
23549 return false;
23551 /* Get the new mode. Always twice the size of the inner
23552 and half the elements. */
23553 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23554 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23555 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23556 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23558 if (new_mode == word_mode)
23559 return false;
23561 /* to_constant is safe since this routine is specific to Advanced SIMD
23562 vectors. */
23563 nelt = d->perm.length ().to_constant ();
23565 vec_perm_builder newpermconst;
23566 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23568 /* Convert the perm constant if we can. Require even, odd as the pairs. */
23569 for (unsigned int i = 0; i < nelt; i += 2)
23571 poly_int64 elt0 = d->perm[i];
23572 poly_int64 elt1 = d->perm[i + 1];
23573 poly_int64 newelt;
23574 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23575 return false;
23576 newpermconst.quick_push (newelt.to_constant ());
23578 newpermconst.finalize ();
23580 newd.vmode = new_mode;
23581 newd.vec_flags = VEC_ADVSIMD;
23582 newd.op_mode = newd.vmode;
23583 newd.op_vec_flags = newd.vec_flags;
23584 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23585 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23586 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23587 newd.testing_p = d->testing_p;
23588 newd.one_vector_p = d->one_vector_p;
23590 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23591 return aarch64_expand_vec_perm_const_1 (&newd);
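/* Illustrative sketch (hypothetical, standalone): the re-encoding tried
   above.  A permutation whose indices come in adjacent (even, even + 1)
   pairs can be rewritten on double-width elements, e.g. {0, 1, 4, 5}
   on V4SF becomes {0, 2} on V2DI.  model_reencode_perm is an invented
   name; it returns 0 when the pairing fails.  */
static int
model_reencode_perm (const unsigned int *perm, unsigned int nelt,
		     unsigned int *newperm)
{
  for (unsigned int i = 0; i < nelt; i += 2)
    {
      if ((perm[i] & 1) != 0 || perm[i + 1] != perm[i] + 1)
	return 0;	      /* Not an even index followed by its pair.  */
      newperm[i / 2] = perm[i] / 2;
    }
  return 1;
}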
23594 /* Recognize patterns suitable for the UZP instructions. */
23595 static bool
23596 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23598 HOST_WIDE_INT odd;
23599 rtx out, in0, in1, x;
23600 machine_mode vmode = d->vmode;
23602 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23603 return false;
23605 /* Note that these are little-endian tests.
23606 We correct for big-endian later. */
23607 if (!d->perm[0].is_constant (&odd)
23608 || (odd != 0 && odd != 1)
23609 || !d->perm.series_p (0, 1, odd, 2))
23610 return false;
23612 /* Success! */
23613 if (d->testing_p)
23614 return true;
23616 in0 = d->op0;
23617 in1 = d->op1;
23618 /* We don't need a big-endian lane correction for SVE; see the comment
23619 at the head of aarch64-sve.md for details. */
23620 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23622 x = in0, in0 = in1, in1 = x;
23623 odd = !odd;
23625 out = d->target;
23627 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23628 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23629 return true;
23632 /* Recognize patterns suitable for the ZIP instructions. */
23633 static bool
23634 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23636 unsigned int high;
23637 poly_uint64 nelt = d->perm.length ();
23638 rtx out, in0, in1, x;
23639 machine_mode vmode = d->vmode;
23641 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23642 return false;
23644 /* Note that these are little-endian tests.
23645 We correct for big-endian later. */
23646 poly_uint64 first = d->perm[0];
23647 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
23648 || !d->perm.series_p (0, 2, first, 1)
23649 || !d->perm.series_p (1, 2, first + nelt, 1))
23650 return false;
23651 high = maybe_ne (first, 0U);
23653 /* Success! */
23654 if (d->testing_p)
23655 return true;
23657 in0 = d->op0;
23658 in1 = d->op1;
23659 /* We don't need a big-endian lane correction for SVE; see the comment
23660 at the head of aarch64-sve.md for details. */
23661 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23663 x = in0, in0 = in1, in1 = x;
23664 high = !high;
23666 out = d->target;
23668 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23669 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
23670 return true;
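/* Illustrative sketch (hypothetical, standalone): the index patterns
   that the TRN, UZP and ZIP recognisers above accept, written out for
   a constant-length permutation with NELT elements per input, where
   indices 0..NELT-1 select op0 and NELT..2*NELT-1 select op1.  The
   model_* names are invented.  */
static void
model_trn_indices (unsigned int *perm, unsigned int nelt, int second_p)
{
  /* TRN1: 0, NELT, 2, NELT + 2, ...  TRN2: 1, NELT + 1, 3, ...  */
  for (unsigned int i = 0; i < nelt; i += 2)
    {
      perm[i] = i + second_p;
      perm[i + 1] = nelt + i + second_p;
    }
}

static void
model_uzp_indices (unsigned int *perm, unsigned int nelt, int second_p)
{
  /* UZP1: 0, 2, 4, ...  UZP2: 1, 3, 5, ... across both inputs.  */
  for (unsigned int i = 0; i < nelt; i++)
    perm[i] = 2 * i + second_p;
}

static void
model_zip_indices (unsigned int *perm, unsigned int nelt, int high_p)
{
  /* ZIP1 interleaves from lane 0, ZIP2 from lane NELT / 2.  */
  unsigned int base = high_p ? nelt / 2 : 0;
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      perm[2 * i] = base + i;
      perm[2 * i + 1] = nelt + base + i;
    }
}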
23673 /* Recognize patterns for the EXT insn. */
23675 static bool
23676 aarch64_evpc_ext (struct expand_vec_perm_d *d)
23678 HOST_WIDE_INT location;
23679 rtx offset;
23681 /* The first element always refers to the first vector.
23682 Check if the extracted indices are increasing by one. */
23683 if (d->vec_flags == VEC_SVE_PRED
23684 || !d->perm[0].is_constant (&location)
23685 || !d->perm.series_p (0, 1, location, 1))
23686 return false;
23688 /* Success! */
23689 if (d->testing_p)
23690 return true;
23692 /* The case where (location == 0) is a no-op for both big- and little-endian,
23693 and is removed by the mid-end at optimization levels -O1 and higher.
23695 We don't need a big-endian lane correction for SVE; see the comment
23696 at the head of aarch64-sve.md for details. */
23697 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
23699 /* After setup, we want the high elements of the first vector (stored
23700 at the LSB end of the register), and the low elements of the second
23701 vector (stored at the MSB end of the register). So swap. */
23702 std::swap (d->op0, d->op1);
23703 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
23704 to_constant () is safe since this is restricted to Advanced SIMD
23705 vectors. */
23706 location = d->perm.length ().to_constant () - location;
23709 offset = GEN_INT (location);
23710 emit_set_insn (d->target,
23711 gen_rtx_UNSPEC (d->vmode,
23712 gen_rtvec (3, d->op0, d->op1, offset),
23713 UNSPEC_EXT));
23714 return true;
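/* Illustrative sketch (hypothetical): the EXT recogniser above accepts
   any permutation whose indices increase by one from a constant
   LOCATION, i.e. a contiguous window of the 2*NELT-lane concatenation
   of op0 and op1; on big-endian Advanced SIMD the operands are swapped
   and LOCATION becomes NELT - LOCATION.  model_ext_lane is an invented
   name.  */
static inline unsigned int
model_ext_lane (unsigned int location, unsigned int i)
{
  /* Result element I comes from lane LOCATION + I of op0:op1;
     LOCATION is below NELT here, so the window stays in range.  */
  return location + i;
}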
23717 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
23718 within each 64-bit, 32-bit or 16-bit granule. */
23720 static bool
23721 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
23723 HOST_WIDE_INT diff;
23724 unsigned int i, size, unspec;
23725 machine_mode pred_mode;
23727 if (d->vec_flags == VEC_SVE_PRED
23728 || !d->one_vector_p
23729 || !d->perm[0].is_constant (&diff)
23730 || !diff)
23731 return false;
23733 if (d->vec_flags & VEC_SVE_DATA)
23734 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
23735 else
23736 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
23737 if (size == 64)
23739 unspec = UNSPEC_REV64;
23740 pred_mode = VNx2BImode;
23742 else if (size == 32)
23744 unspec = UNSPEC_REV32;
23745 pred_mode = VNx4BImode;
23747 else if (size == 16)
23749 unspec = UNSPEC_REV16;
23750 pred_mode = VNx8BImode;
23752 else
23753 return false;
23755 unsigned int step = diff + 1;
23756 for (i = 0; i < step; ++i)
23757 if (!d->perm.series_p (i, step, diff - i, step))
23758 return false;
23760 /* Success! */
23761 if (d->testing_p)
23762 return true;
23764 if (d->vec_flags & VEC_SVE_DATA)
23766 rtx pred = aarch64_ptrue_reg (pred_mode);
23767 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
23768 d->target, pred, d->op0));
23769 return true;
23771 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
23772 emit_set_insn (d->target, src);
23773 return true;
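/* Illustrative sketch (hypothetical, standalone): the REV64/REV32/REV16
   pattern above reverses elements within each granule of
   STEP = DIFF + 1 elements.  model_rev_local_index is an invented
   name; it returns the source lane for result lane I.  */
static inline unsigned int
model_rev_local_index (unsigned int i, unsigned int step)
{
  unsigned int granule = i / step;
  unsigned int lane = i % step;
  return granule * step + (step - 1 - lane);
}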
23776 /* Recognize patterns for the REV insn, which reverses elements within
23777 a full vector. */
23779 static bool
23780 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
23782 poly_uint64 nelt = d->perm.length ();
23784 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
23785 return false;
23787 if (!d->perm.series_p (0, 1, nelt - 1, -1))
23788 return false;
23790 /* Success! */
23791 if (d->testing_p)
23792 return true;
23794 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
23795 emit_set_insn (d->target, src);
23796 return true;
23799 static bool
23800 aarch64_evpc_dup (struct expand_vec_perm_d *d)
23802 rtx out = d->target;
23803 rtx in0;
23804 HOST_WIDE_INT elt;
23805 machine_mode vmode = d->vmode;
23806 rtx lane;
23808 if (d->vec_flags == VEC_SVE_PRED
23809 || d->perm.encoding ().encoded_nelts () != 1
23810 || !d->perm[0].is_constant (&elt))
23811 return false;
23813 if ((d->vec_flags & VEC_SVE_DATA)
23814 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
23815 return false;
23817 /* Success! */
23818 if (d->testing_p)
23819 return true;
23821 /* The generic preparation in aarch64_expand_vec_perm_const_1
23822 swaps the operand order and the permute indices if it finds
23823 d->perm[0] to be in the second operand. Thus, we can always
23824 use d->op0 and need not do any extra arithmetic to get the
23825 correct lane number. */
23826 in0 = d->op0;
23827 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
23829 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
23830 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
23831 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
23832 return true;
23835 static bool
23836 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
23838 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
23839 machine_mode vmode = d->vmode;
23841 /* Make sure that the indices are constant. */
23842 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
23843 for (unsigned int i = 0; i < encoded_nelts; ++i)
23844 if (!d->perm[i].is_constant ())
23845 return false;
23847 if (d->testing_p)
23848 return true;
23850 /* Generic code will try constant permutation twice. Once with the
23851 original mode and again with the elements lowered to QImode.
23852 So wait and don't do the selector expansion ourselves. */
23853 if (vmode != V8QImode && vmode != V16QImode)
23854 return false;
23856 /* to_constant is safe since this routine is specific to Advanced SIMD
23857 vectors. */
23858 unsigned int nelt = d->perm.length ().to_constant ();
23859 for (unsigned int i = 0; i < nelt; ++i)
23860 /* If big-endian and two vectors, we end up with a weird mixed-endian
23861 mode on NEON. Reverse the index within each word but not the word
23862 itself. to_constant is safe because we checked is_constant above. */
23863 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
23864 ? d->perm[i].to_constant () ^ (nelt - 1)
23865 : d->perm[i].to_constant ());
23867 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
23868 sel = force_reg (vmode, sel);
23870 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
23871 return true;
23874 /* Try to implement D using an SVE TBL instruction. */
23876 static bool
23877 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
23879 unsigned HOST_WIDE_INT nelt;
23881 /* Permuting two variable-length vectors could overflow the
23882 index range. */
23883 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
23884 return false;
23886 if (d->testing_p)
23887 return true;
23889 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
23890 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
23891 if (d->one_vector_p)
23892 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
23893 else
23894 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
23895 return true;
23898 /* Try to implement D using SVE dup instruction. */
23900 static bool
23901 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
23903 if (BYTES_BIG_ENDIAN
23904 || !d->one_vector_p
23905 || d->vec_flags != VEC_SVE_DATA
23906 || d->op_vec_flags != VEC_ADVSIMD
23907 || d->perm.encoding ().nelts_per_pattern () != 1
23908 || !known_eq (d->perm.encoding ().npatterns (),
23909 GET_MODE_NUNITS (d->op_mode))
23910 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
23911 return false;
23913 int npatterns = d->perm.encoding ().npatterns ();
23914 for (int i = 0; i < npatterns; i++)
23915 if (!known_eq (d->perm[i], i))
23916 return false;
23918 if (d->testing_p)
23919 return true;
23921 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
23922 return true;
23925 /* Try to implement D using SVE SEL instruction. */
23927 static bool
23928 aarch64_evpc_sel (struct expand_vec_perm_d *d)
23930 machine_mode vmode = d->vmode;
23931 int unit_size = GET_MODE_UNIT_SIZE (vmode);
23933 if (d->vec_flags != VEC_SVE_DATA
23934 || unit_size > 8)
23935 return false;
23937 int n_patterns = d->perm.encoding ().npatterns ();
23938 poly_int64 vec_len = d->perm.length ();
23940 for (int i = 0; i < n_patterns; ++i)
23941 if (!known_eq (d->perm[i], i)
23942 && !known_eq (d->perm[i], vec_len + i))
23943 return false;
23945 for (int i = n_patterns; i < n_patterns * 2; i++)
23946 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
23947 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
23948 return false;
23950 if (d->testing_p)
23951 return true;
23953 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
23955 /* Build a predicate that is true when op0 elements should be used. */
23956 rtx_vector_builder builder (pred_mode, n_patterns, 2);
23957 for (int i = 0; i < n_patterns * 2; i++)
23959 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
23960 : CONST0_RTX (BImode);
23961 builder.quick_push (elem);
23964 rtx const_vec = builder.build ();
23965 rtx pred = force_reg (pred_mode, const_vec);
23966 /* TARGET = PRED ? OP0 : OP1. */
23967 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
23968 return true;
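/* Illustrative sketch (hypothetical, standalone): the predicate built
   above is true for lanes taken from op0 (perm[i] == i) and false for
   lanes taken from op1 (perm[i] == VEC_LEN + i); the SEL instruction
   then picks between the two operands lane by lane.
   model_sel_predicate is an invented name, shown for the first N
   encoded elements.  */
static void
model_sel_predicate (const unsigned int *perm, unsigned int n,
		     unsigned int vec_len, unsigned char *pred)
{
  for (unsigned int i = 0; i < n; i++)
    pred[i] = (perm[i] != vec_len + i);	/* True: use op0's lane.  */
}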
23971 /* Recognize patterns suitable for the INS instructions. */
23972 static bool
23973 aarch64_evpc_ins (struct expand_vec_perm_d *d)
23975 machine_mode mode = d->vmode;
23976 unsigned HOST_WIDE_INT nelt;
23978 if (d->vec_flags != VEC_ADVSIMD)
23979 return false;
23981 /* to_constant is safe since this routine is specific to Advanced SIMD
23982 vectors. */
23983 nelt = d->perm.length ().to_constant ();
23984 rtx insv = d->op0;
23986 HOST_WIDE_INT idx = -1;
23988 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
23990 HOST_WIDE_INT elt;
23991 if (!d->perm[i].is_constant (&elt))
23992 return false;
23993 if (elt == (HOST_WIDE_INT) i)
23994 continue;
23995 if (idx != -1)
23997 idx = -1;
23998 break;
24000 idx = i;
24003 if (idx == -1)
24005 insv = d->op1;
24006 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24008 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24009 continue;
24010 if (idx != -1)
24011 return false;
24012 idx = i;
24015 if (idx == -1)
24016 return false;
24019 if (d->testing_p)
24020 return true;
24022 gcc_assert (idx != -1);
24024 unsigned extractindex = d->perm[idx].to_constant ();
24025 rtx extractv = d->op0;
24026 if (extractindex >= nelt)
24028 extractv = d->op1;
24029 extractindex -= nelt;
24031 gcc_assert (extractindex < nelt);
24033 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24034 expand_operand ops[5];
24035 create_output_operand (&ops[0], d->target, mode);
24036 create_input_operand (&ops[1], insv, mode);
24037 create_integer_operand (&ops[2], 1 << idx);
24038 create_input_operand (&ops[3], extractv, mode);
24039 create_integer_operand (&ops[4], extractindex);
24040 expand_insn (icode, 5, ops);
24042 return true;
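/* Illustrative sketch (hypothetical, standalone): the INS recogniser
   above looks for a permutation that is the identity on one operand in
   all lanes but one, so the result can be formed with a single vector
   element copy.  model_find_ins_lane is an invented name; it returns
   the lane to overwrite, or -1 if more than one lane differs (the real
   code then retries against op1).  */
static int
model_find_ins_lane (const unsigned int *perm, unsigned int nelt)
{
  int idx = -1;
  for (unsigned int i = 0; i < nelt; i++)
    {
      if (perm[i] == i)		/* Lane already comes from op0.  */
	continue;
      if (idx != -1)
	return -1;		/* A second mismatch: not an INS.  */
      idx = (int) i;
    }
  return idx;
}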
24045 static bool
24046 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24048 gcc_assert (d->op_mode != E_VOIDmode);
24050 /* The pattern matching functions above are written to look for a small
24051 number to begin the sequence (0, 1, N/2). If we begin with an index
24052 from the second operand, we can swap the operands. */
24053 poly_int64 nelt = d->perm.length ();
24054 if (known_ge (d->perm[0], nelt))
24056 d->perm.rotate_inputs (1);
24057 std::swap (d->op0, d->op1);
24060 if ((d->vec_flags == VEC_ADVSIMD
24061 || d->vec_flags == VEC_SVE_DATA
24062 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24063 || d->vec_flags == VEC_SVE_PRED)
24064 && known_gt (nelt, 1))
24066 if (d->vmode == d->op_mode)
24068 if (aarch64_evpc_rev_local (d))
24069 return true;
24070 else if (aarch64_evpc_rev_global (d))
24071 return true;
24072 else if (aarch64_evpc_ext (d))
24073 return true;
24074 else if (aarch64_evpc_dup (d))
24075 return true;
24076 else if (aarch64_evpc_zip (d))
24077 return true;
24078 else if (aarch64_evpc_uzp (d))
24079 return true;
24080 else if (aarch64_evpc_trn (d))
24081 return true;
24082 else if (aarch64_evpc_sel (d))
24083 return true;
24084 else if (aarch64_evpc_ins (d))
24085 return true;
24086 else if (aarch64_evpc_reencode (d))
24087 return true;
24089 if (d->vec_flags == VEC_SVE_DATA)
24090 return aarch64_evpc_sve_tbl (d);
24091 else if (d->vec_flags == VEC_ADVSIMD)
24092 return aarch64_evpc_tbl (d);
24094 else
24096 if (aarch64_evpc_sve_dup (d))
24097 return true;
24100 return false;
24103 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24105 static bool
24106 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24107 rtx target, rtx op0, rtx op1,
24108 const vec_perm_indices &sel)
24110 struct expand_vec_perm_d d;
24112 /* Check whether the mask can be applied to a single vector. */
24113 if (sel.ninputs () == 1
24114 || (op0 && rtx_equal_p (op0, op1)))
24115 d.one_vector_p = true;
24116 else if (sel.all_from_input_p (0))
24118 d.one_vector_p = true;
24119 op1 = op0;
24121 else if (sel.all_from_input_p (1))
24123 d.one_vector_p = true;
24124 op0 = op1;
24126 else
24127 d.one_vector_p = false;
24129 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24130 sel.nelts_per_input ());
24131 d.vmode = vmode;
24132 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24133 d.op_mode = op_mode;
24134 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
24135 d.target = target;
24136 d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
24137 if (op0 == op1)
24138 d.op1 = d.op0;
24139 else
24140 d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
24141 d.testing_p = !target;
24143 if (!d.testing_p)
24144 return aarch64_expand_vec_perm_const_1 (&d);
24146 rtx_insn *last = get_last_insn ();
24147 bool ret = aarch64_expand_vec_perm_const_1 (&d);
24148 gcc_assert (last == get_last_insn ());
24150 return ret;
24153 /* Generate a byte permute mask for a register of mode MODE,
24154 which has NUNITS units. */
24157 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24159 /* We have to reverse each vector because we don't have
24160 a permuted load that can reverse-load according to ABI rules. */
24161 rtx mask;
24162 rtvec v = rtvec_alloc (16);
24163 unsigned int i, j;
24164 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24166 gcc_assert (BYTES_BIG_ENDIAN);
24167 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24169 for (i = 0; i < nunits; i++)
24170 for (j = 0; j < usize; j++)
24171 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24172 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24173 return force_reg (V16QImode, mask);
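/* Illustrative sketch (hypothetical): the byte permute mask built
   above.  For a unit size of USIZE bytes, byte J of unit I is taken
   from byte (I + 1) * USIZE - 1 - J, i.e. bytes are reversed within
   each unit; with USIZE == 4 the first unit's mask bytes are
   3, 2, 1, 0.  model_reverse_mask_byte is an invented name.  */
static inline unsigned int
model_reverse_mask_byte (unsigned int i, unsigned int j, unsigned int usize)
{
  return (i + 1) * usize - 1 - j;
}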
24176 /* Expand an SVE integer comparison using the SVE equivalent of:
24178 (set TARGET (CODE OP0 OP1)). */
24180 void
24181 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24183 machine_mode pred_mode = GET_MODE (target);
24184 machine_mode data_mode = GET_MODE (op0);
24185 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24186 op0, op1);
24187 if (!rtx_equal_p (target, res))
24188 emit_move_insn (target, res);
24191 /* Return the UNSPEC_COND_* code for comparison CODE. */
24193 static unsigned int
24194 aarch64_unspec_cond_code (rtx_code code)
24196 switch (code)
24198 case NE:
24199 return UNSPEC_COND_FCMNE;
24200 case EQ:
24201 return UNSPEC_COND_FCMEQ;
24202 case LT:
24203 return UNSPEC_COND_FCMLT;
24204 case GT:
24205 return UNSPEC_COND_FCMGT;
24206 case LE:
24207 return UNSPEC_COND_FCMLE;
24208 case GE:
24209 return UNSPEC_COND_FCMGE;
24210 case UNORDERED:
24211 return UNSPEC_COND_FCMUO;
24212 default:
24213 gcc_unreachable ();
24217 /* Emit:
24219 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24221 where <X> is the operation associated with comparison CODE.
24222 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24224 static void
24225 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24226 bool known_ptrue_p, rtx op0, rtx op1)
24228 rtx flag = gen_int_mode (known_ptrue_p, SImode);
24229 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24230 gen_rtvec (4, pred, flag, op0, op1),
24231 aarch64_unspec_cond_code (code));
24232 emit_set_insn (target, unspec);
24235 /* Emit the SVE equivalent of:
24237 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24238 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24239 (set TARGET (ior:PRED_MODE TMP1 TMP2))
24241 where <Xi> is the operation associated with comparison CODEi.
24242 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24244 static void
24245 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24246 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24248 machine_mode pred_mode = GET_MODE (pred);
24249 rtx tmp1 = gen_reg_rtx (pred_mode);
24250 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24251 rtx tmp2 = gen_reg_rtx (pred_mode);
24252 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24253 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24256 /* Emit the SVE equivalent of:
24258 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24259 (set TARGET (not TMP))
24261 where <X> is the operation associated with comparison CODE.
24262 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24264 static void
24265 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24266 bool known_ptrue_p, rtx op0, rtx op1)
24268 machine_mode pred_mode = GET_MODE (pred);
24269 rtx tmp = gen_reg_rtx (pred_mode);
24270 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24271 aarch64_emit_unop (target, one_cmpl_optab, tmp);
24274 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24276 (set TARGET (CODE OP0 OP1))
24278 If CAN_INVERT_P is true, the caller can also handle inverted results;
24279 return true if the result is in fact inverted. */
24281 bool
24282 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24283 rtx op0, rtx op1, bool can_invert_p)
24285 machine_mode pred_mode = GET_MODE (target);
24286 machine_mode data_mode = GET_MODE (op0);
24288 rtx ptrue = aarch64_ptrue_reg (pred_mode);
24289 switch (code)
24291 case UNORDERED:
24292 /* UNORDERED has no immediate form. */
24293 op1 = force_reg (data_mode, op1);
24294 /* fall through */
24295 case LT:
24296 case LE:
24297 case GT:
24298 case GE:
24299 case EQ:
24300 case NE:
24302 /* There is native support for the comparison. */
24303 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24304 return false;
24307 case LTGT:
24308 /* This is a trapping operation (LT or GT). */
24309 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24310 return false;
24312 case UNEQ:
24313 if (!flag_trapping_math)
24315 /* This would trap for signaling NaNs. */
24316 op1 = force_reg (data_mode, op1);
24317 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24318 ptrue, true, op0, op1);
24319 return false;
24321 /* fall through */
24322 case UNLT:
24323 case UNLE:
24324 case UNGT:
24325 case UNGE:
24326 if (flag_trapping_math)
24328 /* Work out which elements are ordered. */
24329 rtx ordered = gen_reg_rtx (pred_mode);
24330 op1 = force_reg (data_mode, op1);
24331 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24332 ptrue, true, op0, op1);
24334 /* Test the opposite condition for the ordered elements,
24335 then invert the result. */
24336 if (code == UNEQ)
24337 code = NE;
24338 else
24339 code = reverse_condition_maybe_unordered (code);
24340 if (can_invert_p)
24342 aarch64_emit_sve_fp_cond (target, code,
24343 ordered, false, op0, op1);
24344 return true;
24346 aarch64_emit_sve_invert_fp_cond (target, code,
24347 ordered, false, op0, op1);
24348 return false;
24350 break;
24352 case ORDERED:
24353 /* ORDERED has no immediate form. */
24354 op1 = force_reg (data_mode, op1);
24355 break;
24357 default:
24358 gcc_unreachable ();
24361 /* There is native support for the inverse comparison. */
24362 code = reverse_condition_maybe_unordered (code);
24363 if (can_invert_p)
24365 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24366 return true;
24368 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24369 return false;
24372 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24373 of the data being selected and CMP_MODE is the mode of the values being
24374 compared. */
24376 void
24377 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24378 rtx *ops)
24380 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24381 rtx pred = gen_reg_rtx (pred_mode);
24382 if (FLOAT_MODE_P (cmp_mode))
24384 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24385 ops[4], ops[5], true))
24386 std::swap (ops[1], ops[2]);
24388 else
24389 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24391 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24392 ops[1] = force_reg (data_mode, ops[1]);
24393 /* The "false" value can only be zero if the "true" value is a constant. */
24394 if (register_operand (ops[1], data_mode)
24395 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24396 ops[2] = force_reg (data_mode, ops[2]);
24398 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24399 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24402 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24403 true. However due to issues with register allocation it is preferable
24404 to avoid tying integer scalar and FP scalar modes. Executing integer
24405 operations in general registers is better than treating them as scalar
24406 vector operations. This reduces latency and avoids redundant int<->FP
24407 moves. So tie modes if they are either the same class, or vector modes
24408 with other vector modes, vector structs or any scalar mode. */
24410 static bool
24411 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24413 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24414 != aarch64_advsimd_partial_struct_mode_p (mode2))
24415 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24416 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24417 return false;
24419 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24420 return true;
24422 /* We specifically want to allow elements of "structure" modes to
24423 be tieable to the structure. This more general condition allows
24424 other rarer situations too. The reason we don't extend this to
24425 predicate modes is that there are no predicate structure modes
24426 nor any specific instructions for extracting part of a predicate
24427 register. */
24428 if (aarch64_vector_data_mode_p (mode1)
24429 && aarch64_vector_data_mode_p (mode2))
24430 return true;
24432 /* Also allow any scalar modes with vectors. */
24433 if (aarch64_vector_mode_supported_p (mode1)
24434 || aarch64_vector_mode_supported_p (mode2))
24435 return true;
24437 return false;
24440 /* Return a new RTX holding the result of moving POINTER forward by
24441 AMOUNT bytes. */
24443 static rtx
24444 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24446 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24448 return adjust_automodify_address (pointer, GET_MODE (pointer),
24449 next, amount);
24452 /* Return a new RTX holding the result of moving POINTER forward by the
24453 size of the mode it points to. */
24455 static rtx
24456 aarch64_progress_pointer (rtx pointer)
24458 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24461 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24462 MODE bytes. */
24464 static void
24465 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24466 machine_mode mode)
24468 /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
24469 address copies using V4SImode so that we can use Q registers. */
24470 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24472 mode = V4SImode;
24473 rtx reg1 = gen_reg_rtx (mode);
24474 rtx reg2 = gen_reg_rtx (mode);
24475 /* "Cast" the pointers to the correct mode. */
24476 *src = adjust_address (*src, mode, 0);
24477 *dst = adjust_address (*dst, mode, 0);
24478 /* Emit the memcpy. */
24479 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24480 aarch64_progress_pointer (*src)));
24481 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24482 aarch64_progress_pointer (*dst), reg2));
24483 /* Move the pointers forward. */
24484 *src = aarch64_move_pointer (*src, 32);
24485 *dst = aarch64_move_pointer (*dst, 32);
24486 return;
24489 rtx reg = gen_reg_rtx (mode);
24491 /* "Cast" the pointers to the correct mode. */
24492 *src = adjust_address (*src, mode, 0);
24493 *dst = adjust_address (*dst, mode, 0);
24494 /* Emit the memcpy. */
24495 emit_move_insn (reg, *src);
24496 emit_move_insn (*dst, reg);
24497 /* Move the pointers forward. */
24498 *src = aarch64_progress_pointer (*src);
24499 *dst = aarch64_progress_pointer (*dst);
24502 /* Expand a cpymem using the MOPS extension. OPERANDS are taken
24503 from the cpymem pattern. Return true iff we succeeded. */
24504 static bool
24505 aarch64_expand_cpymem_mops (rtx *operands)
24507 if (!TARGET_MOPS)
24508 return false;
24510 /* All three registers are changed by the instruction, so each one
24511 must be a fresh pseudo. */
24512 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24513 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24514 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24515 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24516 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24517 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24519 return true;
24522 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24523 we succeed, otherwise return false, indicating that a libcall to
24524 memcpy should be emitted. */
24526 bool
24527 aarch64_expand_cpymem (rtx *operands)
24529 int mode_bits;
24530 rtx dst = operands[0];
24531 rtx src = operands[1];
24532 rtx base;
24533 machine_mode cur_mode = BLKmode;
24535 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24536 if (!CONST_INT_P (operands[2]))
24537 return aarch64_expand_cpymem_mops (operands);
24539 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24541 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24542 unsigned HOST_WIDE_INT max_copy_size
24543 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24545 bool size_p = optimize_function_for_size_p (cfun);
24547 /* Large constant-sized cpymem should go through MOPS when possible.
24548 It should be a win even for size optimization in the general case.
24549 For speed optimization the choice between MOPS and the SIMD sequence
24550 depends on the size of the copy, rather than number of instructions,
24551 alignment etc. */
24552 if (size > max_copy_size)
24553 return aarch64_expand_cpymem_mops (operands);
24555 int copy_bits = 256;
24557 /* Default to 256-bit LDP/STP on large copies; fall back to 128-bit chunks
24558 for small copies, when SIMD is unavailable, or when 256-bit LDP/STP is slow. */
24559 if (size <= 24
24560 || !TARGET_SIMD
24561 || (aarch64_tune_params.extra_tuning_flags
24562 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24563 copy_bits = 128;
24565 /* Emit an inline load+store sequence and count the number of operations
24566 involved. We use a simple count of just the loads and stores emitted
24567 rather than rtx_insn count as all the pointer adjustments and reg copying
24568 in this function will get optimized away later in the pipeline. */
24569 start_sequence ();
24570 unsigned nops = 0;
24572 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24573 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24575 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24576 src = adjust_automodify_address (src, VOIDmode, base, 0);
24578 /* Convert size to bits to make the rest of the code simpler. */
24579 int n = size * BITS_PER_UNIT;
24581 while (n > 0)
24583 /* Find the largest mode in which to do the copy without over-reading
24584 or over-writing. */
24585 opt_scalar_int_mode mode_iter;
24586 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24587 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24588 cur_mode = mode_iter.require ();
24590 gcc_assert (cur_mode != BLKmode);
24592 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24594 /* Prefer Q-register accesses for the last bytes. */
24595 if (mode_bits == 128 && copy_bits == 256)
24596 cur_mode = V4SImode;
24598 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24599 /* A single block copy is 1 load + 1 store. */
24600 nops += 2;
24601 n -= mode_bits;
24603 /* Emit trailing copies using overlapping unaligned accesses
24604 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
24605 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24607 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
24608 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24609 gcc_assert (n_bits <= mode_bits);
24610 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
24611 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24612 n = n_bits;
24615 rtx_insn *seq = get_insns ();
24616 end_sequence ();
24617 /* The MOPS sequence requires 3 instructions for the memory copying + 1 to move
24618 the constant size into a register. */
24619 unsigned mops_cost = 3 + 1;
24621 /* If MOPS is available at this point we don't consider the libcall as it's
24622 not a win even on code size. At this point only consider MOPS if
24623 optimizing for size. For speed optimizations we will have chosen between
24624 the two based on copy size already. */
24625 if (TARGET_MOPS)
24627 if (size_p && mops_cost < nops)
24628 return aarch64_expand_cpymem_mops (operands);
24629 emit_insn (seq);
24630 return true;
24633 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
24634 arguments + 1 for the call. When MOPS is not available and we're
24635 optimizing for size a libcall may be preferable. */
24636 unsigned libcall_cost = 4;
24637 if (size_p && libcall_cost < nops)
24638 return false;
24640 emit_insn (seq);
24641 return true;
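/* Illustrative sketch (hypothetical, standalone): an approximate model
   of the chunking loop above.  It returns the chunk sizes, in bits,
   that the inline expansion would use for a SIZE-byte constant memcpy
   under a COPY_BITS limit (128 or 256), including the final
   overlapping chunk, and assumes unaligned accesses are permitted and
   CHUNKS is large enough.  model_cpymem_chunks is an invented name.  */
static unsigned int
model_cpymem_chunks (unsigned int size, unsigned int copy_bits,
		     unsigned int *chunks)
{
  unsigned int nchunks = 0;
  unsigned int n = size * 8;
  while (n > 0)
    {
      /* Largest power-of-two chunk (8..COPY_BITS bits) that neither
	 over-reads nor over-writes.  */
      unsigned int bits = 8;
      while (bits * 2 <= n && bits * 2 <= copy_bits)
	bits *= 2;
      chunks[nchunks++] = bits;
      n -= bits;
      /* Emit the remainder as one overlapping access when it is less
	 than half the limit, e.g. 15 bytes -> 64 + 64 overlapping
	 rather than 64 + 32 + 16 + 8.  */
      if (n > 0 && n < copy_bits / 2)
	{
	  unsigned int tail = 8;
	  while (tail < n)
	    tail *= 2;
	  chunks[nchunks++] = tail;
	  n = 0;
	}
    }
  return nchunks;
}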
24644 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
24645 SRC is a register we have created with the duplicated value to be set. */
24646 static void
24647 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
24648 machine_mode mode)
24650 /* If we are copying 128 bits or 256 bits, we can do that straight from
24651 the SIMD register we prepared. */
24652 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24654 mode = GET_MODE (src);
24655 /* "Cast" the *dst to the correct mode. */
24656 *dst = adjust_address (*dst, mode, 0);
24657 /* Emit the memset. */
24658 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
24659 aarch64_progress_pointer (*dst), src));
24661 /* Move the pointers forward. */
24662 *dst = aarch64_move_pointer (*dst, 32);
24663 return;
24665 if (known_eq (GET_MODE_BITSIZE (mode), 128))
24667 /* "Cast" the *dst to the correct mode. */
24668 *dst = adjust_address (*dst, GET_MODE (src), 0);
24669 /* Emit the memset. */
24670 emit_move_insn (*dst, src);
24671 /* Move the pointers forward. */
24672 *dst = aarch64_move_pointer (*dst, 16);
24673 return;
24675 /* For copying less, we have to extract the right amount from src. */
24676 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
24678 /* "Cast" the *dst to the correct mode. */
24679 *dst = adjust_address (*dst, mode, 0);
24680 /* Emit the memset. */
24681 emit_move_insn (*dst, reg);
24682 /* Move the pointer forward. */
24683 *dst = aarch64_progress_pointer (*dst);
24686 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
24687 as for the setmem pattern. Return true iff we succeed. */
24688 static bool
24689 aarch64_expand_setmem_mops (rtx *operands)
24691 if (!TARGET_MOPS)
24692 return false;
24694 /* The first two registers are changed by the instruction, so both
24695 of them must be a fresh pseudo. */
24696 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24697 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24698 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
24699 rtx val = operands[2];
24700 if (val != CONST0_RTX (QImode))
24701 val = force_reg (QImode, val);
24702 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
24703 return true;
24706 /* Expand setmem, as if from a __builtin_memset. Return true if
24707 we succeed, otherwise return false. */
24709 bool
24710 aarch64_expand_setmem (rtx *operands)
24712 int n, mode_bits;
24713 unsigned HOST_WIDE_INT len;
24714 rtx dst = operands[0];
24715 rtx val = operands[2], src;
24716 rtx base;
24717 machine_mode cur_mode = BLKmode, next_mode;
24719 /* If we don't have SIMD registers or the size is variable use the MOPS
24720 inlined sequence if possible. */
24721 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
24722 return aarch64_expand_setmem_mops (operands);
24724 bool size_p = optimize_function_for_size_p (cfun);
24726 /* Default the maximum to 256 bytes when considering only a libcall vs.
24727 the SIMD broadcast sequence. */
24728 unsigned max_set_size = 256;
24730 len = INTVAL (operands[1]);
24731 if (len > max_set_size && !TARGET_MOPS)
24732 return false;
24734 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
24735 /* The MOPS sequence takes:
24736 3 instructions for the memory storing
24737 + 1 to move the constant size into a reg
24738 + 1 if VAL is a non-zero constant to move into a reg
24739 (zero constants can use XZR directly). */
24740 unsigned mops_cost = 3 + 1 + cst_val;
24741 /* A libcall to memset in the worst case takes 3 instructions to prepare
24742 the arguments + 1 for the call. */
24743 unsigned libcall_cost = 4;
24745 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
24746 when available. */
24747 if (TARGET_MOPS
24748 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
24749 return aarch64_expand_setmem_mops (operands);
24751 /* Attempt a sequence with a vector broadcast followed by stores.
24752 Count the number of operations involved to see if it's worth it
24753 against the alternatives. A simple counter simd_ops on the
24754 algorithmically-relevant operations is used rather than an rtx_insn count
24755 as all the pointer adjustments and mode reinterprets will be optimized
24756 away later. */
24757 start_sequence ();
24758 unsigned simd_ops = 0;
24760 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24761 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24763 /* Prepare the val using a DUP/MOVI v0.16B, val. */
24764 src = expand_vector_broadcast (V16QImode, val);
24765 src = force_reg (V16QImode, src);
24766 simd_ops++;
24767 /* Convert len to bits to make the rest of the code simpler. */
24768 n = len * BITS_PER_UNIT;
24770 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
24771 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
24772 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
24773 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
24774 ? GET_MODE_BITSIZE (TImode) : 256;
24776 while (n > 0)
24778 /* Find the largest mode in which to do the copy without
24779 over-writing. */
24780 opt_scalar_int_mode mode_iter;
24781 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24782 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
24783 cur_mode = mode_iter.require ();
24785 gcc_assert (cur_mode != BLKmode);
24787 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24788 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
24789 simd_ops++;
24790 n -= mode_bits;
24792 /* Do certain trailing copies as overlapping if it's going to be
24793 cheaper, i.e. fewer instructions. For instance, for a 15-byte
24794 copy it's more efficient to do two overlapping 8-byte copies than
24795 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
24796 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
24798 next_mode = smallest_mode_for_size (n, MODE_INT);
24799 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24800 gcc_assert (n_bits <= mode_bits);
24801 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24802 n = n_bits;
24805 rtx_insn *seq = get_insns ();
24806 end_sequence ();
24808 if (size_p)
24810 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
24811 a call to memset, or the MOPS expansion. */
24812 if (TARGET_MOPS
24813 && mops_cost <= libcall_cost
24814 && mops_cost <= simd_ops)
24815 return aarch64_expand_setmem_mops (operands);
24816 /* If MOPS is not available or not shorter pick a libcall if the SIMD
24817 sequence is too long. */
24818 else if (libcall_cost < simd_ops)
24819 return false;
24820 emit_insn (seq);
24821 return true;
24824 /* At this point the SIMD broadcast sequence is the best choice when
24825 optimizing for speed. */
24826 emit_insn (seq);
24827 return true;
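/* Illustrative sketch (hypothetical): the size-optimisation choice made
   above between the MOPS expansion, a memset libcall and the inline
   SIMD broadcast sequence, based on the operation counts computed in
   this function.  model_setmem_choice is an invented name; it returns
   0 for the SIMD sequence, 1 for MOPS, 2 for a libcall.  */
static int
model_setmem_choice (int have_mops, unsigned int mops_cost,
		     unsigned int libcall_cost, unsigned int simd_ops)
{
  if (have_mops && mops_cost <= libcall_cost && mops_cost <= simd_ops)
    return 1;
  if (libcall_cost < simd_ops)
    return 2;
  return 0;
}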
24831 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
24832 SImode stores. Handle the case when the constant has identical
24833 bottom and top halves. This is beneficial when the two stores can be
24834 merged into an STP and we avoid synthesising potentially expensive
24835 immediates twice. Return true if such a split is possible. */
24837 bool
24838 aarch64_split_dimode_const_store (rtx dst, rtx src)
24840 rtx lo = gen_lowpart (SImode, src);
24841 rtx hi = gen_highpart_mode (SImode, DImode, src);
24843 bool size_p = optimize_function_for_size_p (cfun);
24845 if (!rtx_equal_p (lo, hi))
24846 return false;
24848 unsigned int orig_cost
24849 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
24850 unsigned int lo_cost
24851 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
24853 /* We want to transform:
24854 MOV x1, 49370
24855 MOVK x1, 0x140, lsl 16
24856 MOVK x1, 0xc0da, lsl 32
24857 MOVK x1, 0x140, lsl 48
24858 STR x1, [x0]
24859 into:
24860 MOV w1, 49370
24861 MOVK w1, 0x140, lsl 16
24862 STP w1, w1, [x0]
24863 So we want to perform this only when we save two instructions
24864 or more. When optimizing for size, however, accept any code size
24865 savings we can. */
24866 if (size_p && orig_cost <= lo_cost)
24867 return false;
24869 if (!size_p
24870 && (orig_cost <= lo_cost + 1))
24871 return false;
24873 rtx mem_lo = adjust_address (dst, SImode, 0);
24874 if (!aarch64_mem_pair_operand (mem_lo, SImode))
24875 return false;
24877 rtx tmp_reg = gen_reg_rtx (SImode);
24878 aarch64_expand_mov_immediate (tmp_reg, lo);
24879 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
24880 /* Don't emit an explicit store pair as this may not always be profitable.
24881 Let the sched-fusion logic decide whether to merge them. */
24882 emit_move_insn (mem_lo, tmp_reg);
24883 emit_move_insn (mem_hi, tmp_reg);
24885 return true;
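/* Illustrative sketch (hypothetical, standalone): the profitability
   test above.  The split is only worthwhile when the 64-bit constant
   has identical 32-bit halves and synthesising the 32-bit half is
   sufficiently cheaper than synthesising the full value.
   model_split_dimode_profitable_p is an invented name; ORIG_COST and
   LO_COST stand for the two immediate-building instruction counts.  */
static int
model_split_dimode_profitable_p (unsigned long long val, int size_p,
				 unsigned int orig_cost,
				 unsigned int lo_cost)
{
  unsigned int lo = (unsigned int) val;
  unsigned int hi = (unsigned int) (val >> 32);
  if (lo != hi)
    return 0;
  /* When optimizing for size any saving is enough; otherwise require
     a saving of at least two instructions.  */
  return size_p ? orig_cost > lo_cost : orig_cost > lo_cost + 1;
}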
24888 /* Generate RTL for a conditional branch with rtx comparison CODE in
24889 mode CC_MODE. The destination of the unlikely conditional branch
24890 is LABEL_REF. */
24892 void
24893 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
24894 rtx label_ref)
24896 rtx x;
24897 x = gen_rtx_fmt_ee (code, VOIDmode,
24898 gen_rtx_REG (cc_mode, CC_REGNUM),
24899 const0_rtx);
24901 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24902 gen_rtx_LABEL_REF (VOIDmode, label_ref),
24903 pc_rtx);
24904 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24907 /* Generate DImode scratch registers for 128-bit (TImode) addition.
24909 OP1 represents the TImode destination operand 1
24910 OP2 represents the TImode destination operand 2
24911 LOW_DEST represents the low half (DImode) of TImode operand 0
24912 LOW_IN1 represents the low half (DImode) of TImode operand 1
24913 LOW_IN2 represents the low half (DImode) of TImode operand 2
24914 HIGH_DEST represents the high half (DImode) of TImode operand 0
24915 HIGH_IN1 represents the high half (DImode) of TImode operand 1
24916 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
24918 void
24919 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
24920 rtx *low_in1, rtx *low_in2,
24921 rtx *high_dest, rtx *high_in1,
24922 rtx *high_in2)
24924 *low_dest = gen_reg_rtx (DImode);
24925 *low_in1 = gen_lowpart (DImode, op1);
24926 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
24927 subreg_lowpart_offset (DImode, TImode));
24928 *high_dest = gen_reg_rtx (DImode);
24929 *high_in1 = gen_highpart (DImode, op1);
24930 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
24931 subreg_highpart_offset (DImode, TImode));
24934 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
24936 This function differs from 'aarch64_addti_scratch_regs' in that
24937 OP1 can be an immediate constant (zero). We must call
24938 subreg_highpart_offset with DImode and TImode arguments, otherwise
24939 VOIDmode will be used for the const_int which generates an internal
24940 error from subreg_size_highpart_offset which does not expect a size of zero.
24942 OP1 represents the TImode destination operand 1
24943 OP2 represents the TImode destination operand 2
24944 LOW_DEST represents the low half (DImode) of TImode operand 0
24945 LOW_IN1 represents the low half (DImode) of TImode operand 1
24946 LOW_IN2 represents the low half (DImode) of TImode operand 2
24947 HIGH_DEST represents the high half (DImode) of TImode operand 0
24948 HIGH_IN1 represents the high half (DImode) of TImode operand 1
24949 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
24952 void
24953 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
24954 rtx *low_in1, rtx *low_in2,
24955 rtx *high_dest, rtx *high_in1,
24956 rtx *high_in2)
24958 *low_dest = gen_reg_rtx (DImode);
24959 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
24960 subreg_lowpart_offset (DImode, TImode));
24962 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
24963 subreg_lowpart_offset (DImode, TImode));
24964 *high_dest = gen_reg_rtx (DImode);
24966 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
24967 subreg_highpart_offset (DImode, TImode));
24968 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
24969 subreg_highpart_offset (DImode, TImode));
24972 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
24974 OP0 represents the TImode destination operand 0
24975 LOW_DEST represents the low half (DImode) of TImode operand 0
24976 LOW_IN1 represents the low half (DImode) of TImode operand 1
24977 LOW_IN2 represents the low half (DImode) of TImode operand 2
24978 HIGH_DEST represents the high half (DImode) of TImode operand 0
24979 HIGH_IN1 represents the high half (DImode) of TImode operand 1
24980 HIGH_IN2 represents the high half (DImode) of TImode operand 2
24981 UNSIGNED_P is true if the operation is being performed on unsigned
24982 values. */
24983 void
24984 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
24985 rtx low_in2, rtx high_dest, rtx high_in1,
24986 rtx high_in2, bool unsigned_p)
24988 if (low_in2 == const0_rtx)
24990 low_dest = low_in1;
24991 high_in2 = force_reg (DImode, high_in2);
24992 if (unsigned_p)
24993 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
24994 else
24995 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
24997 else
24999 if (aarch64_plus_immediate (low_in2, DImode))
25000 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25001 GEN_INT (-UINTVAL (low_in2))));
25002 else
25004 low_in2 = force_reg (DImode, low_in2);
25005 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25007 high_in2 = force_reg (DImode, high_in2);
25009 if (unsigned_p)
25010 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25011 else
25012 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25015 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25016 emit_move_insn (gen_highpart (DImode, op0), high_dest);
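/* Illustrative sketch (hypothetical, standalone): the double-word
   arithmetic the expansion above implements for the unsigned case.
   The low halves are subtracted first and the borrow feeds into the
   high-half subtraction, which is what the SUBS/SBCS pair does on the
   real registers.  model_subti is an invented name.  */
static void
model_subti (unsigned long long l1, unsigned long long h1,
	     unsigned long long l2, unsigned long long h2,
	     unsigned long long *lo, unsigned long long *hi)
{
  unsigned long long borrow = l1 < l2;	/* Borrow out of the low half.  */
  *lo = l1 - l2;
  *hi = h1 - h2 - borrow;		/* SBCS consumes the carry flag.  */
}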
25020 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
25022 static unsigned HOST_WIDE_INT
25023 aarch64_asan_shadow_offset (void)
25025 if (TARGET_ILP32)
25026 return (HOST_WIDE_INT_1 << 29);
25027 else
25028 return (HOST_WIDE_INT_1 << 36);
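/* Illustrative sketch (hypothetical): how the offset returned above is
   used.  With AddressSanitizer's usual shadow scale of 3, the shadow
   byte for an address is at (ADDR >> 3) + OFFSET, where OFFSET is
   1 << 29 for ILP32 and 1 << 36 for LP64.  model_asan_shadow_addr is
   an invented name.  */
static inline unsigned long long
model_asan_shadow_addr (unsigned long long addr, int ilp32_p)
{
  unsigned long long offset = ilp32_p ? (1ULL << 29) : (1ULL << 36);
  return (addr >> 3) + offset;
}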
25031 static rtx
25032 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25033 int code, tree treeop0, tree treeop1)
25035 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25036 rtx op0, op1;
25037 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25038 insn_code icode;
25039 struct expand_operand ops[4];
25041 start_sequence ();
25042 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25044 op_mode = GET_MODE (op0);
25045 if (op_mode == VOIDmode)
25046 op_mode = GET_MODE (op1);
25048 switch (op_mode)
25050 case E_QImode:
25051 case E_HImode:
25052 case E_SImode:
25053 cmp_mode = SImode;
25054 icode = CODE_FOR_cmpsi;
25055 break;
25057 case E_DImode:
25058 cmp_mode = DImode;
25059 icode = CODE_FOR_cmpdi;
25060 break;
25062 case E_SFmode:
25063 cmp_mode = SFmode;
25064 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25065 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25066 break;
25068 case E_DFmode:
25069 cmp_mode = DFmode;
25070 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25071 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25072 break;
25074 default:
25075 end_sequence ();
25076 return NULL_RTX;
25079 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25080 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25081 if (!op0 || !op1)
25083 end_sequence ();
25084 return NULL_RTX;
25086 *prep_seq = get_insns ();
25087 end_sequence ();
25089 create_fixed_operand (&ops[0], op0);
25090 create_fixed_operand (&ops[1], op1);
25092 start_sequence ();
25093 if (!maybe_expand_insn (icode, 2, ops))
25095 end_sequence ();
25096 return NULL_RTX;
25098 *gen_seq = get_insns ();
25099 end_sequence ();
25101 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25102 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
25105 static rtx
25106 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25107 int cmp_code, tree treeop0, tree treeop1, int bit_code)
25109 rtx op0, op1, target;
25110 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25111 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25112 insn_code icode;
25113 struct expand_operand ops[6];
25114 int aarch64_cond;
25116 push_to_sequence (*prep_seq);
25117 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25119 op_mode = GET_MODE (op0);
25120 if (op_mode == VOIDmode)
25121 op_mode = GET_MODE (op1);
25123 switch (op_mode)
25125 case E_QImode:
25126 case E_HImode:
25127 case E_SImode:
25128 cmp_mode = SImode;
25129 break;
25131 case E_DImode:
25132 cmp_mode = DImode;
25133 break;
25135 case E_SFmode:
25136 cmp_mode = SFmode;
25137 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25138 break;
25140 case E_DFmode:
25141 cmp_mode = DFmode;
25142 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25143 break;
25145 default:
25146 end_sequence ();
25147 return NULL_RTX;
25150 icode = code_for_ccmp (cc_mode, cmp_mode);
25152 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25153 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25154 if (!op0 || !op1)
25156 end_sequence ();
25157 return NULL_RTX;
25159 *prep_seq = get_insns ();
25160 end_sequence ();
25162 target = gen_rtx_REG (cc_mode, CC_REGNUM);
25163 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25165 if (bit_code != AND)
25167 /* Treat the ccmp patterns as canonical and use them where possible,
25168 but fall back to ccmp_rev patterns if there's no other option. */
25169 rtx_code prev_code = GET_CODE (prev);
25170 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25171 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25172 && !(prev_code == EQ
25173 || prev_code == NE
25174 || prev_code == ORDERED
25175 || prev_code == UNORDERED))
25176 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25177 else
25179 rtx_code code = reverse_condition (prev_code);
25180 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25182 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25185 create_fixed_operand (&ops[0], XEXP (prev, 0));
25186 create_fixed_operand (&ops[1], target);
25187 create_fixed_operand (&ops[2], op0);
25188 create_fixed_operand (&ops[3], op1);
25189 create_fixed_operand (&ops[4], prev);
25190 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25192 push_to_sequence (*gen_seq);
25193 if (!maybe_expand_insn (icode, 6, ops))
25195 end_sequence ();
25196 return NULL_RTX;
25199 *gen_seq = get_insns ();
25200 end_sequence ();
25202 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
25205 #undef TARGET_GEN_CCMP_FIRST
25206 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25208 #undef TARGET_GEN_CCMP_NEXT
25209 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
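/* Illustrative sketch (not taken from generated output): for a condition
   such as

     if (a < b && c == d)

   aarch64_gen_ccmp_first expands the first comparison to a plain CMP and
   aarch64_gen_ccmp_next chains the second one onto it as a CCMP, so that a
   single conditional branch can test the combined result, roughly:

     cmp   w0, w1
     ccmp  w2, w3, #0, lt      // compare c, d only if a < b held
     b.eq  .Ltaken

   The register numbers and the #nzcv immediate here are purely for
   illustration.  */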
25211 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25212 instruction fusion of some sort. */
25214 static bool
25215 aarch64_macro_fusion_p (void)
25217 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25221 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25222 should be kept together during scheduling. */
25224 static bool
25225 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25227 rtx set_dest;
25228 rtx prev_set = single_set (prev);
25229 rtx curr_set = single_set (curr);
25230 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
25231 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25233 if (!aarch64_macro_fusion_p ())
25234 return false;
25236 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25238 /* We are trying to match:
25239 prev (mov) == (set (reg r0) (const_int imm16))
25240 curr (movk) == (set (zero_extract (reg r0)
25241 (const_int 16)
25242 (const_int 16))
25243 (const_int imm16_1)) */
25245 set_dest = SET_DEST (curr_set);
25247 if (GET_CODE (set_dest) == ZERO_EXTRACT
25248 && CONST_INT_P (SET_SRC (curr_set))
25249 && CONST_INT_P (SET_SRC (prev_set))
25250 && CONST_INT_P (XEXP (set_dest, 2))
25251 && INTVAL (XEXP (set_dest, 2)) == 16
25252 && REG_P (XEXP (set_dest, 0))
25253 && REG_P (SET_DEST (prev_set))
25254 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25256 return true;
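/* For example (purely illustrative), a 32-bit constant built as

     mov  w0, #0x1234
     movk w0, #0x5678, lsl #16

   has exactly the prev/curr shape tested above, so the two instructions are
   kept adjacent for cores that can fuse MOV/MOVK.  */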
25260 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25263 /* We're trying to match:
25264 prev (adrp) == (set (reg r1)
25265 (high (symbol_ref ("SYM"))))
25266 curr (add) == (set (reg r0)
25267 (lo_sum (reg r1)
25268 (symbol_ref ("SYM"))))
25269 Note that r0 need not necessarily be the same as r1, especially
25270 during pre-regalloc scheduling. */
25272 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25273 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25275 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25276 && REG_P (XEXP (SET_SRC (curr_set), 0))
25277 && REGNO (XEXP (SET_SRC (curr_set), 0))
25278 == REGNO (SET_DEST (prev_set))
25279 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25280 XEXP (SET_SRC (curr_set), 1)))
25281 return true;
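/* A typical matching pair (illustrative only) is the small-model address
   materialisation

     adrp x1, sym
     add  x0, x1, :lo12:sym

   where, as noted above, the destination of the ADD need not be the same
   register as the ADRP result before register allocation.  */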
25285 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25288 /* We're trying to match:
25289 prev (movk) == (set (zero_extract (reg r0)
25290 (const_int 16)
25291 (const_int 32))
25292 (const_int imm16_1))
25293 curr (movk) == (set (zero_extract (reg r0)
25294 (const_int 16)
25295 (const_int 48))
25296 (const_int imm16_2)) */
25298 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25299 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25300 && REG_P (XEXP (SET_DEST (prev_set), 0))
25301 && REG_P (XEXP (SET_DEST (curr_set), 0))
25302 && REGNO (XEXP (SET_DEST (prev_set), 0))
25303 == REGNO (XEXP (SET_DEST (curr_set), 0))
25304 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25305 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25306 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25307 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25308 && CONST_INT_P (SET_SRC (prev_set))
25309 && CONST_INT_P (SET_SRC (curr_set)))
25310 return true;
25313 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25315 /* We're trying to match:
25316 prev (adrp) == (set (reg r0)
25317 (high (symbol_ref ("SYM"))))
25318 curr (ldr) == (set (reg r1)
25319 (mem (lo_sum (reg r0)
25320 (symbol_ref ("SYM")))))
25322 curr (ldr) == (set (reg r1)
25323 (zero_extend (mem
25324 (lo_sum (reg r0)
25325 (symbol_ref ("SYM")))))) */
25326 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25327 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25329 rtx curr_src = SET_SRC (curr_set);
25331 if (GET_CODE (curr_src) == ZERO_EXTEND)
25332 curr_src = XEXP (curr_src, 0);
25334 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25335 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25336 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25337 == REGNO (SET_DEST (prev_set))
25338 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25339 XEXP (SET_SRC (prev_set), 0)))
25340 return true;
25344 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
25345 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25346 && prev_set && curr_set && any_condjump_p (curr)
25347 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25348 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25349 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25350 return true;
25352 /* Fuse flag-setting ALU instructions and conditional branch. */
25353 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25354 && any_condjump_p (curr))
25356 unsigned int condreg1, condreg2;
25357 rtx cc_reg_1;
25358 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25359 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25361 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25362 && prev
25363 && modified_in_p (cc_reg_1, prev))
25365 enum attr_type prev_type = get_attr_type (prev);
25367 /* FIXME: this misses some instructions which ThunderX considers simple
25368 arithmetic instructions.  Simple shifts are also missed here. */
25369 if (prev_type == TYPE_ALUS_SREG
25370 || prev_type == TYPE_ALUS_IMM
25371 || prev_type == TYPE_LOGICS_REG
25372 || prev_type == TYPE_LOGICS_IMM)
25373 return true;
25377 /* Fuse ALU instructions and CBZ/CBNZ. */
25378 if (prev_set
25379 && curr_set
25380 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25381 && any_condjump_p (curr))
25383 /* We're trying to match:
25384 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25385 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25386 (const_int 0))
25387 (label_ref ("SYM"))
25388 (pc)) */
25389 if (SET_DEST (curr_set) == (pc_rtx)
25390 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25391 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25392 && REG_P (SET_DEST (prev_set))
25393 && REGNO (SET_DEST (prev_set))
25394 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25396 /* Fuse ALU operations followed by conditional branch instruction. */
25397 switch (get_attr_type (prev))
25399 case TYPE_ALU_IMM:
25400 case TYPE_ALU_SREG:
25401 case TYPE_ADC_REG:
25402 case TYPE_ADC_IMM:
25403 case TYPE_ADCS_REG:
25404 case TYPE_ADCS_IMM:
25405 case TYPE_LOGIC_REG:
25406 case TYPE_LOGIC_IMM:
25407 case TYPE_CSEL:
25408 case TYPE_ADR:
25409 case TYPE_MOV_IMM:
25410 case TYPE_SHIFT_REG:
25411 case TYPE_SHIFT_IMM:
25412 case TYPE_BFM:
25413 case TYPE_RBIT:
25414 case TYPE_REV:
25415 case TYPE_EXTEND:
25416 return true;
25418 default:;
25423 return false;
25426 /* Return true iff the instruction fusion described by OP is enabled. */
25428 bool
25429 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25431 return (aarch64_tune_params.fusible_ops & op) != 0;
25434 /* If MEM is in the form of [base+offset], extract the two parts
25435 of the address into BASE and OFFSET and return true; otherwise return
25436 false after clearing BASE and OFFSET. */
25438 bool
25439 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25441 rtx addr;
25443 gcc_assert (MEM_P (mem));
25445 addr = XEXP (mem, 0);
25447 if (REG_P (addr))
25449 *base = addr;
25450 *offset = const0_rtx;
25451 return true;
25454 if (GET_CODE (addr) == PLUS
25455 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25457 *base = XEXP (addr, 0);
25458 *offset = XEXP (addr, 1);
25459 return true;
25462 *base = NULL_RTX;
25463 *offset = NULL_RTX;
25465 return false;
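/* E.g. a MEM whose address is (plus (reg x2) (const_int 16)) yields
   *BASE == (reg x2) and *OFFSET == (const_int 16), while a bare (reg x2)
   address yields an offset of zero (illustrative restatement of the
   function above).  */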
25468 /* Types for scheduling fusion. */
25469 enum sched_fusion_type
25471 SCHED_FUSION_NONE = 0,
25472 SCHED_FUSION_LD_SIGN_EXTEND,
25473 SCHED_FUSION_LD_ZERO_EXTEND,
25474 SCHED_FUSION_LD,
25475 SCHED_FUSION_ST,
25476 SCHED_FUSION_NUM
25479 /* If INSN is a load or store whose address is in the form of [base+offset],
25480 extract the two parts into BASE and OFFSET. Return the scheduling
25481 fusion type of this INSN. */
25483 static enum sched_fusion_type
25484 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25486 rtx x, dest, src;
25487 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25489 gcc_assert (INSN_P (insn));
25490 x = PATTERN (insn);
25491 if (GET_CODE (x) != SET)
25492 return SCHED_FUSION_NONE;
25494 src = SET_SRC (x);
25495 dest = SET_DEST (x);
25497 machine_mode dest_mode = GET_MODE (dest);
25499 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25500 return SCHED_FUSION_NONE;
25502 if (GET_CODE (src) == SIGN_EXTEND)
25504 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25505 src = XEXP (src, 0);
25506 if (!MEM_P (src) || GET_MODE (src) != SImode)
25507 return SCHED_FUSION_NONE;
25509 else if (GET_CODE (src) == ZERO_EXTEND)
25511 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25512 src = XEXP (src, 0);
25513 if (!MEM_P (src) || GET_MODE (src) != SImode)
25514 return SCHED_FUSION_NONE;
25517 if (MEM_P (src) && REG_P (dest))
25518 extract_base_offset_in_addr (src, base, offset);
25519 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25521 fusion = SCHED_FUSION_ST;
25522 extract_base_offset_in_addr (dest, base, offset);
25524 else
25525 return SCHED_FUSION_NONE;
25527 if (*base == NULL_RTX || *offset == NULL_RTX)
25528 fusion = SCHED_FUSION_NONE;
25530 return fusion;
25533 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25535 Currently we only support fusing ldr and str instructions, so FUSION_PRI
25536 and PRI are only calculated for these instructions. For other instructions,
25537 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
25538 types of instruction fusion can be added by returning different priorities.
25540 It's important that irrelevant instructions get the largest FUSION_PRI. */
25542 static void
25543 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25544 int *fusion_pri, int *pri)
25546 int tmp, off_val;
25547 rtx base, offset;
25548 enum sched_fusion_type fusion;
25550 gcc_assert (INSN_P (insn));
25552 tmp = max_pri - 1;
25553 fusion = fusion_load_store (insn, &base, &offset);
25554 if (fusion == SCHED_FUSION_NONE)
25556 *pri = tmp;
25557 *fusion_pri = tmp;
25558 return;
25561 /* Set FUSION_PRI according to fusion type and base register. */
25562 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25564 /* Calculate PRI. */
25565 tmp /= 2;
25567 /* INSN with smaller offset goes first. */
25568 off_val = (int)(INTVAL (offset));
25569 if (off_val >= 0)
25570 tmp -= (off_val & 0xfffff);
25571 else
25572 tmp += ((- off_val) & 0xfffff);
25574 *pri = tmp;
25575 return;
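/* A rough illustration of the priorities computed above: two SImode loads
   from the same base register, say

     ldr w0, [x1, 8]
     ldr w2, [x1, 16]

   receive the same FUSION_PRI (same fusion type, same base register) but
   different PRI values, with the smaller offset getting the larger PRI, so
   the scheduler tends to place them next to each other and in ascending
   offset order, ready for ldp formation.  */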
25578 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25579 Adjust priority of sha1h instructions so they are scheduled before
25580 other SHA1 instructions. */
25582 static int
25583 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
25585 rtx x = PATTERN (insn);
25587 if (GET_CODE (x) == SET)
25589 x = SET_SRC (x);
25591 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
25592 return priority + 10;
25595 return priority;
25598 /* If REVERSED is null, return true if memory reference *MEM2 comes
25599 immediately after memory reference *MEM1. Do not change the references
25600 in this case.
25602 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
25603 if they are, try to make them use constant offsets from the same base
25604 register. Return true on success. When returning true, set *REVERSED
25605 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
25606 static bool
25607 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
25609 if (reversed)
25610 *reversed = false;
25612 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
25613 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
25614 return false;
25616 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
25617 return false;
25619 auto size1 = MEM_SIZE (*mem1);
25620 auto size2 = MEM_SIZE (*mem2);
25622 rtx base1, base2, offset1, offset2;
25623 extract_base_offset_in_addr (*mem1, &base1, &offset1);
25624 extract_base_offset_in_addr (*mem2, &base2, &offset2);
25626 /* Make sure at least one memory is in base+offset form. */
25627 if (!(base1 && offset1) && !(base2 && offset2))
25628 return false;
25630 /* If both mems already use the same base register, just check the
25631 offsets. */
25632 if (base1 && base2 && rtx_equal_p (base1, base2))
25634 if (!offset1 || !offset2)
25635 return false;
25637 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
25638 return true;
25640 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
25642 *reversed = true;
25643 return true;
25646 return false;
25649 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
25650 guarantee that the values are consecutive. */
25651 if (MEM_EXPR (*mem1)
25652 && MEM_EXPR (*mem2)
25653 && MEM_OFFSET_KNOWN_P (*mem1)
25654 && MEM_OFFSET_KNOWN_P (*mem2))
25656 poly_int64 expr_offset1;
25657 poly_int64 expr_offset2;
25658 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
25659 &expr_offset1);
25660 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
25661 &expr_offset2);
25662 if (!expr_base1
25663 || !expr_base2
25664 || !DECL_P (expr_base1)
25665 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
25666 return false;
25668 expr_offset1 += MEM_OFFSET (*mem1);
25669 expr_offset2 += MEM_OFFSET (*mem2);
25671 if (known_eq (expr_offset1 + size1, expr_offset2))
25673 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
25674 *reversed = true;
25675 else
25676 return false;
25678 if (reversed)
25680 if (base2)
25682 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
25683 expr_offset1 - expr_offset2);
25684 *mem1 = replace_equiv_address_nv (*mem1, addr1);
25686 else
25688 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
25689 expr_offset2 - expr_offset1);
25690 *mem2 = replace_equiv_address_nv (*mem2, addr2);
25693 return true;
25696 return false;
25699 /* Return true if MEM1 and MEM2 can be combined into a single access
25700 of mode MODE, with the combined access having the same address as MEM1. */
25702 bool
25703 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
25705 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
25706 return false;
25707 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
25710 /* Given OPERANDS of consecutive load/store, check if we can merge
25711 them into ldp/stp. LOAD is true if they are load instructions.
25712 MODE is the mode of memory operands. */
25714 bool
25715 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
25716 machine_mode mode)
25718 enum reg_class rclass_1, rclass_2;
25719 rtx mem_1, mem_2, reg_1, reg_2;
25721 if (load)
25723 mem_1 = operands[1];
25724 mem_2 = operands[3];
25725 reg_1 = operands[0];
25726 reg_2 = operands[2];
25727 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
25728 if (REGNO (reg_1) == REGNO (reg_2))
25729 return false;
25730 if (reg_overlap_mentioned_p (reg_1, mem_2))
25731 return false;
25733 else
25735 mem_1 = operands[0];
25736 mem_2 = operands[2];
25737 reg_1 = operands[1];
25738 reg_2 = operands[3];
25741 /* The mems cannot be volatile. */
25742 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
25743 return false;
25745 /* If we have SImode and slow unaligned ldp,
25746 check that the alignment is at least 8 bytes. */
25747 if (mode == SImode
25748 && (aarch64_tune_params.extra_tuning_flags
25749 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
25750 && !optimize_size
25751 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
25752 return false;
25754 /* Check if the addresses are in the form of [base+offset]. */
25755 bool reversed = false;
25756 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
25757 return false;
25759 /* The operands must be of the same size. */
25760 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
25761 GET_MODE_SIZE (GET_MODE (mem_2))));
25763 /* One of the memory accesses must be a mempair operand.
25764 If it is not the first one, they need to be swapped by the
25765 peephole. */
25766 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
25767 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
25768 return false;
25770 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
25771 rclass_1 = FP_REGS;
25772 else
25773 rclass_1 = GENERAL_REGS;
25775 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
25776 rclass_2 = FP_REGS;
25777 else
25778 rclass_2 = GENERAL_REGS;
25780 /* Check if the registers are of the same class. */
25781 if (rclass_1 != rclass_2)
25782 return false;
25784 return true;
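/* As an illustrative example, the checks above accept a pair such as

     ldr w0, [x1]
     ldr w2, [x1, 4]

   (consecutive addresses, distinct destination registers of the same class,
   neither mem volatile), allowing the peephole to rewrite the pair as

     ldp w0, w2, [x1]  */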
25787 /* Given OPERANDS of consecutive load/store that can be merged,
25788 swap them if they are not in ascending order. */
25789 void
25790 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
25792 int mem_op = load ? 1 : 0;
25793 bool reversed = false;
25794 if (!aarch64_check_consecutive_mems (operands + mem_op,
25795 operands + mem_op + 2, &reversed))
25796 gcc_unreachable ();
25798 if (reversed)
25800 /* Irrespective of whether this is a load or a store,
25801 we do the same swap. */
25802 std::swap (operands[0], operands[2]);
25803 std::swap (operands[1], operands[3]);
25807 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
25808 comparison between the two. */
25810 aarch64_host_wide_int_compare (const void *x, const void *y)
25812 return wi::cmps (* ((const HOST_WIDE_INT *) x),
25813 * ((const HOST_WIDE_INT *) y));
25816 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
25817 other pointing to a REG rtx containing an offset, compare the offsets
25818 of the two pairs.
25820 Return:
25822 1 iff offset (X) > offset (Y)
25823 0 iff offset (X) == offset (Y)
25824 -1 iff offset (X) < offset (Y) */
25826 aarch64_ldrstr_offset_compare (const void *x, const void *y)
25828 const rtx * operands_1 = (const rtx *) x;
25829 const rtx * operands_2 = (const rtx *) y;
25830 rtx mem_1, mem_2, base, offset_1, offset_2;
25832 if (MEM_P (operands_1[0]))
25833 mem_1 = operands_1[0];
25834 else
25835 mem_1 = operands_1[1];
25837 if (MEM_P (operands_2[0]))
25838 mem_2 = operands_2[0];
25839 else
25840 mem_2 = operands_2[1];
25842 /* Extract the offsets. */
25843 extract_base_offset_in_addr (mem_1, &base, &offset_1);
25844 extract_base_offset_in_addr (mem_2, &base, &offset_2);
25846 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
25848 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
25851 /* Given OPERANDS of consecutive load/store, check if we can merge
25852 them into ldp/stp by adjusting the offset. LOAD is true if they
25853 are load instructions. MODE is the mode of memory operands.
25855 Given below consecutive stores:
25857 str w1, [xb, 0x100]
25858 str w1, [xb, 0x104]
25859 str w1, [xb, 0x108]
25860 str w1, [xb, 0x10c]
25862 Though the offsets are out of the range supported by stp, we can
25863 still pair them after adjusting the offset, like:
25865 add scratch, xb, 0x100
25866 stp w1, w1, [scratch]
25867 stp w1, w1, [scratch, 0x8]
25869 The peephole patterns detecting this opportunity should guarantee
25870 the scratch register is available. */
25872 bool
25873 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
25874 machine_mode mode)
25876 const int num_insns = 4;
25877 enum reg_class rclass;
25878 HOST_WIDE_INT offvals[num_insns], msize;
25879 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
25881 if (load)
25883 for (int i = 0; i < num_insns; i++)
25885 reg[i] = operands[2 * i];
25886 mem[i] = operands[2 * i + 1];
25888 gcc_assert (REG_P (reg[i]));
25891 /* Do not attempt to merge the loads if the loads clobber each other. */
25892 for (int i = 0; i < 8; i += 2)
25893 for (int j = i + 2; j < 8; j += 2)
25894 if (reg_overlap_mentioned_p (operands[i], operands[j]))
25895 return false;
25897 else
25898 for (int i = 0; i < num_insns; i++)
25900 mem[i] = operands[2 * i];
25901 reg[i] = operands[2 * i + 1];
25904 /* Skip if memory operand is by itself valid for ldp/stp. */
25905 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
25906 return false;
25908 for (int i = 0; i < num_insns; i++)
25910 /* The mems cannot be volatile. */
25911 if (MEM_VOLATILE_P (mem[i]))
25912 return false;
25914 /* Check if the addresses are in the form of [base+offset]. */
25915 extract_base_offset_in_addr (mem[i], base + i, offset + i);
25916 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
25917 return false;
25920 /* Check if the registers are of the same class. */
25921 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
25922 ? FP_REGS : GENERAL_REGS;
25924 for (int i = 1; i < num_insns; i++)
25925 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
25927 if (rclass != FP_REGS)
25928 return false;
25930 else
25932 if (rclass != GENERAL_REGS)
25933 return false;
25936 /* Only the last register in the order in which they occur
25937 may be clobbered by the load. */
25938 if (rclass == GENERAL_REGS && load)
25939 for (int i = 0; i < num_insns - 1; i++)
25940 if (reg_mentioned_p (reg[i], mem[i]))
25941 return false;
25943 /* Check if the bases are the same. */
25944 for (int i = 0; i < num_insns - 1; i++)
25945 if (!rtx_equal_p (base[i], base[i + 1]))
25946 return false;
25948 for (int i = 0; i < num_insns; i++)
25949 offvals[i] = INTVAL (offset[i]);
25951 msize = GET_MODE_SIZE (mode).to_constant ();
25953 /* Check if the offsets can be put in the right order to do a ldp/stp. */
25954 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
25955 aarch64_host_wide_int_compare);
25957 if (!(offvals[1] == offvals[0] + msize
25958 && offvals[3] == offvals[2] + msize))
25959 return false;
25961 /* Check that offsets are within range of each other. The ldp/stp
25962 instructions have 7 bit immediate offsets, so use 0x80. */
25963 if (offvals[2] - offvals[0] >= msize * 0x80)
25964 return false;
25966 /* The offsets must be aligned with respect to each other. */
25967 if (offvals[0] % msize != offvals[2] % msize)
25968 return false;
25970 /* If we have SImode and slow unaligned ldp,
25971 check that the alignment is at least 8 bytes. */
25972 if (mode == SImode
25973 && (aarch64_tune_params.extra_tuning_flags
25974 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
25975 && !optimize_size
25976 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
25977 return false;
25979 return true;
25982 /* Given OPERANDS of consecutive load/store, this function pairs them
25983 into LDP/STP after adjusting the offset. It depends on the fact
25984 that the operands can be sorted so the offsets are correct for STP.
25985 MODE is the mode of memory operands. CODE is the rtl operator
25986 which should be applied to all memory operands; it is SIGN_EXTEND,
25987 ZERO_EXTEND or UNKNOWN. */
25989 bool
25990 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
25991 machine_mode mode, RTX_CODE code)
25993 rtx base, offset_1, offset_3, t1, t2;
25994 rtx mem_1, mem_2, mem_3, mem_4;
25995 rtx temp_operands[8];
25996 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
25997 stp_off_upper_limit, stp_off_lower_limit, msize;
25999 /* We make changes on a copy as we may still bail out. */
26000 for (int i = 0; i < 8; i ++)
26001 temp_operands[i] = operands[i];
26003 /* Sort the operands. Note for cases as below:
26004 [base + 0x310] = A
26005 [base + 0x320] = B
26006 [base + 0x330] = C
26007 [base + 0x320] = D
26008 We need a stable sort, otherwise wrong data may be stored to offset 0x320.
26009 Also note that the dead store in the above case should be optimized away,
26010 but there is no guarantee of that here. */
26011 gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
26012 aarch64_ldrstr_offset_compare);
26014 /* Copy the memory operands so that if we have to bail for some
26015 reason the original addresses are unchanged. */
26016 if (load)
26018 mem_1 = copy_rtx (temp_operands[1]);
26019 mem_2 = copy_rtx (temp_operands[3]);
26020 mem_3 = copy_rtx (temp_operands[5]);
26021 mem_4 = copy_rtx (temp_operands[7]);
26023 else
26025 mem_1 = copy_rtx (temp_operands[0]);
26026 mem_2 = copy_rtx (temp_operands[2]);
26027 mem_3 = copy_rtx (temp_operands[4]);
26028 mem_4 = copy_rtx (temp_operands[6]);
26029 gcc_assert (code == UNKNOWN);
26032 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26033 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26034 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26035 && offset_3 != NULL_RTX);
26037 /* Adjust the offset so it fits in an LDP/STP instruction. */
26038 msize = GET_MODE_SIZE (mode).to_constant ();
26039 stp_off_upper_limit = msize * (0x40 - 1);
26040 stp_off_lower_limit = - msize * 0x40;
26042 off_val_1 = INTVAL (offset_1);
26043 off_val_3 = INTVAL (offset_3);
26045 /* The base offset is optimally half way between the two STP/LDP offsets. */
26046 if (msize <= 4)
26047 base_off = (off_val_1 + off_val_3) / 2;
26048 else
26049 /* However, due to issues with negative LDP/STP offset generation for
26050 larger modes (DF, DD, DI and vector modes), we must not use negative
26051 addresses smaller than 9 signed unadjusted bits can store. This
26052 provides the most range in this case. */
26053 base_off = off_val_1;
26055 /* Adjust the base so that it is aligned with the addresses but still
26056 optimal. */
26057 if (base_off % msize != off_val_1 % msize)
26058 /* Fix the offset, bearing in mind we want to make it bigger not
26059 smaller. */
26060 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26061 else if (msize <= 4)
26062 /* The negative range of LDP/STP is one larger than the positive range. */
26063 base_off += msize;
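/* Rough worked example (illustrative values): for the SImode store group
   shown in the comment further up, off_val_1 == 0x100 and
   off_val_3 == 0x108 with msize == 4, so base_off starts as
   (0x100 + 0x108) / 2 == 0x104; that is already aligned with off_val_1,
   so the branch just above bumps it to 0x108.  The new_off_1 and new_off_3
   computed below are then -8 and 0, both comfortably inside the scaled
   7-bit LDP/STP offset range.  */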
26065 /* Check if base offset is too big or too small. We can attempt to resolve
26066 this issue by setting it to the maximum value and seeing if the offsets
26067 still fit. */
26068 if (base_off >= 0x1000)
26070 base_off = 0x1000 - 1;
26071 /* We must still make sure that the base offset is aligned with respect
26072 to the address. But it may not be made any bigger. */
26073 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26076 /* Likewise for the case where the base is too small. */
26077 if (base_off <= -0x1000)
26079 base_off = -0x1000 + 1;
26080 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26083 /* Offset of the first STP/LDP. */
26084 new_off_1 = off_val_1 - base_off;
26086 /* Offset of the second STP/LDP. */
26087 new_off_3 = off_val_3 - base_off;
26089 /* The offsets must be within the range of the LDP/STP instructions. */
26090 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26091 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26092 return false;
26094 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26095 new_off_1), true);
26096 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26097 new_off_1 + msize), true);
26098 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26099 new_off_3), true);
26100 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26101 new_off_3 + msize), true);
26103 if (!aarch64_mem_pair_operand (mem_1, mode)
26104 || !aarch64_mem_pair_operand (mem_3, mode))
26105 return false;
26107 if (code == ZERO_EXTEND)
26109 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26110 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26111 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26112 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26114 else if (code == SIGN_EXTEND)
26116 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26117 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26118 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26119 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26122 if (load)
26124 operands[0] = temp_operands[0];
26125 operands[1] = mem_1;
26126 operands[2] = temp_operands[2];
26127 operands[3] = mem_2;
26128 operands[4] = temp_operands[4];
26129 operands[5] = mem_3;
26130 operands[6] = temp_operands[6];
26131 operands[7] = mem_4;
26133 else
26135 operands[0] = mem_1;
26136 operands[1] = temp_operands[1];
26137 operands[2] = mem_2;
26138 operands[3] = temp_operands[3];
26139 operands[4] = mem_3;
26140 operands[5] = temp_operands[5];
26141 operands[6] = mem_4;
26142 operands[7] = temp_operands[7];
26145 /* Emit adjusting instruction. */
26146 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26147 /* Emit ldp/stp instructions. */
26148 t1 = gen_rtx_SET (operands[0], operands[1]);
26149 t2 = gen_rtx_SET (operands[2], operands[3]);
26150 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26151 t1 = gen_rtx_SET (operands[4], operands[5]);
26152 t2 = gen_rtx_SET (operands[6], operands[7]);
26153 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26154 return true;
26157 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26158 it isn't worth branching around empty masked ops (including masked
26159 stores). */
26161 static bool
26162 aarch64_empty_mask_is_expensive (unsigned)
26164 return false;
26167 /* Return 1 if pseudo register should be created and used to hold
26168 GOT address for PIC code. */
26170 bool
26171 aarch64_use_pseudo_pic_reg (void)
26173 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26176 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26178 static int
26179 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26181 switch (XINT (x, 1))
26183 case UNSPEC_GOTSMALLPIC:
26184 case UNSPEC_GOTSMALLPIC28K:
26185 case UNSPEC_GOTTINYPIC:
26186 return 0;
26187 default:
26188 break;
26191 return default_unspec_may_trap_p (x, flags);
26195 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
26196 return the log2 of that value. Otherwise return -1. */
26199 aarch64_fpconst_pow_of_2 (rtx x)
26201 const REAL_VALUE_TYPE *r;
26203 if (!CONST_DOUBLE_P (x))
26204 return -1;
26206 r = CONST_DOUBLE_REAL_VALUE (x);
26208 if (REAL_VALUE_NEGATIVE (*r)
26209 || REAL_VALUE_ISNAN (*r)
26210 || REAL_VALUE_ISINF (*r)
26211 || !real_isinteger (r, DFmode))
26212 return -1;
26214 return exact_log2 (real_to_integer (r));
26217 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26218 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x==(1/2^n)
26219 return n. Otherwise return -1. */
26222 aarch64_fpconst_pow2_recip (rtx x)
26224 REAL_VALUE_TYPE r0;
26226 if (!CONST_DOUBLE_P (x))
26227 return -1;
26229 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26230 if (exact_real_inverse (DFmode, &r0)
26231 && !REAL_VALUE_NEGATIVE (r0))
26233 int ret = exact_log2 (real_to_integer (&r0));
26234 if (ret >= 1 && ret <= 32)
26235 return ret;
26237 return -1;
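/* Illustrative values: aarch64_fpconst_pow_of_2 maps 8.0 to 3 and returns
   -1 for 5.0 or -8.0, while aarch64_fpconst_pow2_recip maps 0.125 (1/2^3)
   to 3 and returns -1 for values whose reciprocal is not a power of 2 in
   the 1..32 range.  */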
26240 /* If X is a vector of equal CONST_DOUBLE values and that value is
26241 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26244 aarch64_vec_fpconst_pow_of_2 (rtx x)
26246 int nelts;
26247 if (!CONST_VECTOR_P (x)
26248 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26249 return -1;
26251 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26252 return -1;
26254 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26255 if (firstval <= 0)
26256 return -1;
26258 for (int i = 1; i < nelts; i++)
26259 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26260 return -1;
26262 return firstval;
26265 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26266 to float.
26268 __fp16 always promotes through this hook.
26269 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26270 through the generic excess precision logic rather than here. */
26272 static tree
26273 aarch64_promoted_type (const_tree t)
26275 if (SCALAR_FLOAT_TYPE_P (t)
26276 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26277 return float_type_node;
26279 return NULL_TREE;
26282 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26284 static bool
26285 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26286 optimization_type opt_type)
26288 switch (op)
26290 case rsqrt_optab:
26291 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26293 default:
26294 return true;
26298 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26300 static unsigned int
26301 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26302 int *offset)
26304 /* Polynomial invariant 1 == (VG / 2) - 1. */
26305 gcc_assert (i == 1);
26306 *factor = 2;
26307 *offset = 1;
26308 return AARCH64_DWARF_VG;
26311 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
26312 if MODE is HFmode, and punt to the generic implementation otherwise. */
26314 static bool
26315 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26317 return (mode == HFmode
26318 ? true
26319 : default_libgcc_floating_mode_supported_p (mode));
26322 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26323 if MODE is HFmode, and punt to the generic implementation otherwise. */
26325 static bool
26326 aarch64_scalar_mode_supported_p (scalar_mode mode)
26328 if (DECIMAL_FLOAT_MODE_P (mode))
26329 return default_decimal_float_supported_p ();
26331 return (mode == HFmode
26332 ? true
26333 : default_scalar_mode_supported_p (mode));
26336 /* Set the value of FLT_EVAL_METHOD.
26337 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26339 0: evaluate all operations and constants, whose semantic type has at
26340 most the range and precision of type float, to the range and
26341 precision of float; evaluate all other operations and constants to
26342 the range and precision of the semantic type;
26344 N, where _FloatN is a supported interchange floating type
26345 evaluate all operations and constants, whose semantic type has at
26346 most the range and precision of _FloatN type, to the range and
26347 precision of the _FloatN type; evaluate all other operations and
26348 constants to the range and precision of the semantic type;
26350 If we have the ARMv8.2-A extensions then we support _Float16 in native
26351 precision, so we should set this to 16. Otherwise, we support the type,
26352 but want to evaluate expressions in float precision, so set this to
26353 0. */
26355 static enum flt_eval_method
26356 aarch64_excess_precision (enum excess_precision_type type)
26358 switch (type)
26360 case EXCESS_PRECISION_TYPE_FAST:
26361 case EXCESS_PRECISION_TYPE_STANDARD:
26362 /* We can calculate either in 16-bit range and precision or
26363 32-bit range and precision. Make that decision based on whether
26364 we have native support for the ARMv8.2-A 16-bit floating-point
26365 instructions or not. */
26366 return (TARGET_FP_F16INST
26367 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26368 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26369 case EXCESS_PRECISION_TYPE_IMPLICIT:
26370 case EXCESS_PRECISION_TYPE_FLOAT16:
26371 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26372 default:
26373 gcc_unreachable ();
26375 return FLT_EVAL_METHOD_UNPREDICTABLE;
26378 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26379 scheduled for speculative execution. Reject the long-running division
26380 and square-root instructions. */
26382 static bool
26383 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26385 switch (get_attr_type (insn))
26387 case TYPE_SDIV:
26388 case TYPE_UDIV:
26389 case TYPE_FDIVS:
26390 case TYPE_FDIVD:
26391 case TYPE_FSQRTS:
26392 case TYPE_FSQRTD:
26393 case TYPE_NEON_FP_SQRT_S:
26394 case TYPE_NEON_FP_SQRT_D:
26395 case TYPE_NEON_FP_SQRT_S_Q:
26396 case TYPE_NEON_FP_SQRT_D_Q:
26397 case TYPE_NEON_FP_DIV_S:
26398 case TYPE_NEON_FP_DIV_D:
26399 case TYPE_NEON_FP_DIV_S_Q:
26400 case TYPE_NEON_FP_DIV_D_Q:
26401 return false;
26402 default:
26403 return true;
26407 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26409 static int
26410 aarch64_compute_pressure_classes (reg_class *classes)
26412 int i = 0;
26413 classes[i++] = GENERAL_REGS;
26414 classes[i++] = FP_REGS;
26415 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26416 registers need to go in PR_LO_REGS at some point during their
26417 lifetime. Splitting it into two halves has the effect of making
26418 all predicates count against PR_LO_REGS, so that we try whenever
26419 possible to restrict the number of live predicates to 8. This
26420 greatly reduces the amount of spilling in certain loops. */
26421 classes[i++] = PR_LO_REGS;
26422 classes[i++] = PR_HI_REGS;
26423 return i;
26426 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26428 static bool
26429 aarch64_can_change_mode_class (machine_mode from,
26430 machine_mode to, reg_class_t)
26432 unsigned int from_flags = aarch64_classify_vector_mode (from);
26433 unsigned int to_flags = aarch64_classify_vector_mode (to);
26435 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26436 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26438 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26439 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26441 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26442 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26444 bool from_full_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT));
26445 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26446 | VEC_PARTIAL));
26448 /* Don't allow changes between predicate modes and other modes.
26449 Only predicate registers can hold predicate modes and only
26450 non-predicate registers can hold non-predicate modes, so any
26451 attempt to mix them would require a round trip through memory. */
26452 if (from_pred_p != to_pred_p)
26453 return false;
26455 /* Don't allow changes between partial SVE modes and other modes.
26456 The contents of partial SVE modes are distributed evenly across
26457 the register, whereas GCC expects them to be clustered together. */
26458 if (from_partial_sve_p != to_partial_sve_p)
26459 return false;
26461 /* Similarly reject changes between partial SVE modes that have
26462 different patterns of significant and insignificant bits. */
26463 if (from_partial_sve_p
26464 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26465 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26466 return false;
26468 /* Don't allow changes between partial and full Advanced SIMD structure
26469 modes. */
26470 if (from_full_advsimd_struct_p && to_partial_advsimd_struct_p)
26471 return false;
26473 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26475 /* Don't allow changes between SVE modes and other modes that might
26476 be bigger than 128 bits. In particular, OImode, CImode and XImode
26477 divide into 128-bit quantities while SVE modes divide into
26478 BITS_PER_SVE_VECTOR quantities. */
26479 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26480 return false;
26481 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26482 return false;
26485 if (BYTES_BIG_ENDIAN)
26487 /* Don't allow changes between SVE data modes and non-SVE modes.
26488 See the comment at the head of aarch64-sve.md for details. */
26489 if (from_sve_p != to_sve_p)
26490 return false;
26492 /* Don't allow changes in element size: lane 0 of the new vector
26493 would not then be lane 0 of the old vector. See the comment
26494 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26495 description.
26497 In the worst case, this forces a register to be spilled in
26498 one mode and reloaded in the other, which handles the
26499 endianness correctly. */
26500 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26501 return false;
26503 return true;
26506 /* Implement TARGET_EARLY_REMAT_MODES. */
26508 static void
26509 aarch64_select_early_remat_modes (sbitmap modes)
26511 /* SVE values are not normally live across a call, so it should be
26512 worth doing early rematerialization even in VL-specific mode. */
26513 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26514 if (aarch64_sve_mode_p ((machine_mode) i))
26515 bitmap_set_bit (modes, i);
26518 /* Override the default target speculation_safe_value. */
26519 static rtx
26520 aarch64_speculation_safe_value (machine_mode mode,
26521 rtx result, rtx val, rtx failval)
26523 /* Maybe we should warn if falling back to hard barriers. They are
26524 likely to be noticeably more expensive than the alternative below. */
26525 if (!aarch64_track_speculation)
26526 return default_speculation_safe_value (mode, result, val, failval);
26528 if (!REG_P (val))
26529 val = copy_to_mode_reg (mode, val);
26531 if (!aarch64_reg_or_zero (failval, mode))
26532 failval = copy_to_mode_reg (mode, failval);
26534 emit_insn (gen_despeculate_copy (mode, result, val, failval));
26535 return result;
26538 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26539 Look into the tuning structure for an estimate.
26540 KIND specifies the type of requested estimate: min, max or likely.
26541 For cores with a known SVE width all three estimates are the same.
26542 For generic SVE tuning we want to distinguish the maximum estimate from
26543 the minimum and likely ones.
26544 The likely estimate is the same as the minimum in that case to give a
26545 conservative behavior of auto-vectorizing with SVE when it is a win
26546 even for 128-bit SVE.
26547 When SVE width information is available VAL.coeffs[1] is multiplied by
26548 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
26550 static HOST_WIDE_INT
26551 aarch64_estimated_poly_value (poly_int64 val,
26552 poly_value_estimate_kind kind
26553 = POLY_VALUE_LIKELY)
26555 unsigned int width_source = aarch64_tune_params.sve_width;
26557 /* If there is no core-specific information then the minimum and likely
26558 values are based on 128-bit vectors and the maximum is based on
26559 the architectural maximum of 2048 bits. */
26560 if (width_source == SVE_SCALABLE)
26561 switch (kind)
26563 case POLY_VALUE_MIN:
26564 case POLY_VALUE_LIKELY:
26565 return val.coeffs[0];
26566 case POLY_VALUE_MAX:
26567 return val.coeffs[0] + val.coeffs[1] * 15;
26570 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26571 as likely. This could be made more general if future -mtune options
26572 need it to be. */
26573 if (kind == POLY_VALUE_MAX)
26574 width_source = 1 << floor_log2 (width_source);
26575 else
26576 width_source = least_bit_hwi (width_source);
26578 /* If the core provides width information, use that. */
26579 HOST_WIDE_INT over_128 = width_source - 128;
26580 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
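/* Worked example (illustrative): for a poly_int64 equal to 2 + 2x (e.g.
   the number of DImode elements in an SVE vector), generic SVE tuning
   gives min/likely estimates of 2 and a max of 2 + 2 * 15 == 32, while a
   core with sve_width == 256 gives 2 + 2 * (256 - 128) / 128 == 4 for all
   three estimate kinds.  */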
26584 /* Return true for types that could be supported as SIMD return or
26585 argument types. */
26587 static bool
26588 supported_simd_type (tree t)
26590 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
26592 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
26593 return s == 1 || s == 2 || s == 4 || s == 8;
26595 return false;
26598 /* Return true for types that currently are supported as SIMD return
26599 or argument types. */
26601 static bool
26602 currently_supported_simd_type (tree t, tree b)
26604 if (COMPLEX_FLOAT_TYPE_P (t))
26605 return false;
26607 if (TYPE_SIZE (t) != TYPE_SIZE (b))
26608 return false;
26610 return supported_simd_type (t);
26613 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
26615 static int
26616 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
26617 struct cgraph_simd_clone *clonei,
26618 tree base_type, int num)
26620 tree t, ret_type;
26621 unsigned int elt_bits, count;
26622 unsigned HOST_WIDE_INT const_simdlen;
26623 poly_uint64 vec_bits;
26625 if (!TARGET_SIMD)
26626 return 0;
26628 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26629 constant simdlens here. */
26630 if (maybe_ne (clonei->simdlen, 0U)
26631 && clonei->simdlen.is_constant (&const_simdlen)
26632 && (const_simdlen < 2
26633 || const_simdlen > 1024
26634 || (const_simdlen & (const_simdlen - 1)) != 0))
26636 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26637 "unsupported simdlen %wd", const_simdlen);
26638 return 0;
26641 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
26642 if (TREE_CODE (ret_type) != VOID_TYPE
26643 && !currently_supported_simd_type (ret_type, base_type))
26645 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
26646 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26647 "GCC does not currently support mixed size types "
26648 "for %<simd%> functions");
26649 else if (supported_simd_type (ret_type))
26650 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26651 "GCC does not currently support return type %qT "
26652 "for %<simd%> functions", ret_type);
26653 else
26654 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26655 "unsupported return type %qT for %<simd%> functions",
26656 ret_type);
26657 return 0;
26660 int i;
26661 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
26662 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
26664 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
26665 t && t != void_list_node; t = TREE_CHAIN (t), i++)
26667 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
26669 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
26670 && !currently_supported_simd_type (arg_type, base_type))
26672 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
26673 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26674 "GCC does not currently support mixed size types "
26675 "for %<simd%> functions");
26676 else
26677 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26678 "GCC does not currently support argument type %qT "
26679 "for %<simd%> functions", arg_type);
26680 return 0;
26684 clonei->vecsize_mangle = 'n';
26685 clonei->mask_mode = VOIDmode;
26686 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
26687 if (known_eq (clonei->simdlen, 0U))
26689 count = 2;
26690 vec_bits = (num == 0 ? 64 : 128);
26691 clonei->simdlen = exact_div (vec_bits, elt_bits);
26693 else
26695 count = 1;
26696 vec_bits = clonei->simdlen * elt_bits;
26697 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26698 constant simdlens here. */
26699 if (clonei->simdlen.is_constant (&const_simdlen)
26700 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
26702 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26703 "GCC does not currently support simdlen %wd for type %qT",
26704 const_simdlen, base_type);
26705 return 0;
26708 clonei->vecsize_int = vec_bits;
26709 clonei->vecsize_float = vec_bits;
26710 return count;
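/* Illustrative example: for a declare-simd function whose base type is
   float (elt_bits == 32) and which does not specify a simdlen, the code
   above creates count == 2 clones, one using 64-bit Advanced SIMD vectors
   (simdlen 2) and one using 128-bit vectors (simdlen 4), both mangled
   with 'n'.  */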
26713 /* Implement TARGET_SIMD_CLONE_ADJUST. */
26715 static void
26716 aarch64_simd_clone_adjust (struct cgraph_node *node)
26718 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
26719 use the correct ABI. */
26721 tree t = TREE_TYPE (node->decl);
26722 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
26723 TYPE_ATTRIBUTES (t));
26726 /* Implement TARGET_SIMD_CLONE_USABLE. */
26728 static int
26729 aarch64_simd_clone_usable (struct cgraph_node *node)
26731 switch (node->simdclone->vecsize_mangle)
26733 case 'n':
26734 if (!TARGET_SIMD)
26735 return -1;
26736 return 0;
26737 default:
26738 gcc_unreachable ();
26742 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
26744 static int
26745 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
26747 auto check_attr = [&](const char *name) {
26748 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
26749 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
26750 if (!attr1 && !attr2)
26751 return true;
26753 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
26756 if (!check_attr ("aarch64_vector_pcs"))
26757 return 0;
26758 if (!check_attr ("Advanced SIMD type"))
26759 return 0;
26760 if (!check_attr ("SVE type"))
26761 return 0;
26762 if (!check_attr ("SVE sizeless type"))
26763 return 0;
26764 return 1;
26767 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
26769 static const char *
26770 aarch64_get_multilib_abi_name (void)
26772 if (TARGET_BIG_END)
26773 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
26774 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
26777 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
26778 global variable based guard use the default else
26779 return a null tree. */
26780 static tree
26781 aarch64_stack_protect_guard (void)
26783 if (aarch64_stack_protector_guard == SSP_GLOBAL)
26784 return default_stack_protect_guard ();
26786 return NULL_TREE;
26789 /* Return the diagnostic message string if conversion from FROMTYPE to
26790 TOTYPE is not allowed, NULL otherwise. */
26792 static const char *
26793 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
26795 if (element_mode (fromtype) != element_mode (totype))
26797 /* Do not allow conversions to/from BFmode scalar types. */
26798 if (TYPE_MODE (fromtype) == BFmode)
26799 return N_("invalid conversion from type %<bfloat16_t%>");
26800 if (TYPE_MODE (totype) == BFmode)
26801 return N_("invalid conversion to type %<bfloat16_t%>");
26804 /* Conversion allowed. */
26805 return NULL;
26808 /* Return the diagnostic message string if the unary operation OP is
26809 not permitted on TYPE, NULL otherwise. */
26811 static const char *
26812 aarch64_invalid_unary_op (int op, const_tree type)
26814 /* Reject all single-operand operations on BFmode except for &. */
26815 if (element_mode (type) == BFmode && op != ADDR_EXPR)
26816 return N_("operation not permitted on type %<bfloat16_t%>");
26818 /* Operation allowed. */
26819 return NULL;
26822 /* Return the diagnostic message string if the binary operation OP is
26823 not permitted on TYPE1 and TYPE2, NULL otherwise. */
26825 static const char *
26826 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
26827 const_tree type2)
26829 /* Reject all 2-operand operations on BFmode. */
26830 if (element_mode (type1) == BFmode
26831 || element_mode (type2) == BFmode)
26832 return N_("operation not permitted on type %<bfloat16_t%>");
26834 if (VECTOR_TYPE_P (type1)
26835 && VECTOR_TYPE_P (type2)
26836 && !TYPE_INDIVISIBLE_P (type1)
26837 && !TYPE_INDIVISIBLE_P (type2)
26838 && (aarch64_sve::builtin_type_p (type1)
26839 != aarch64_sve::builtin_type_p (type2)))
26840 return N_("cannot combine GNU and SVE vectors in a binary operation");
26842 /* Operation allowed. */
26843 return NULL;
26846 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
26847 compiler that we automatically ignore the top byte of our pointers, which
26848 allows using -fsanitize=hwaddress. */
26849 bool
26850 aarch64_can_tag_addresses ()
26852 return !TARGET_ILP32;
26855 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
26856 section at the end if needed. */
26857 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
26858 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
26859 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
26860 void
26861 aarch64_file_end_indicate_exec_stack ()
26863 file_end_indicate_exec_stack ();
26865 unsigned feature_1_and = 0;
26866 if (aarch64_bti_enabled ())
26867 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
26869 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
26870 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
26872 if (feature_1_and)
26874 /* Generate .note.gnu.property section. */
26875 switch_to_section (get_section (".note.gnu.property",
26876 SECTION_NOTYPE, NULL));
26878 /* PT_NOTE header: namesz, descsz, type.
26879 namesz = 4 ("GNU\0")
26880 descsz = 16 (Size of the program property array)
26881 [(12 + padding) * Number of array elements]
26882 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
26883 assemble_align (POINTER_SIZE);
26884 assemble_integer (GEN_INT (4), 4, 32, 1);
26885 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
26886 assemble_integer (GEN_INT (5), 4, 32, 1);
26888 /* PT_NOTE name. */
26889 assemble_string ("GNU", 4);
26891 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
26892 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
26893 datasz = 4
26894 data = feature_1_and. */
26895 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
26896 assemble_integer (GEN_INT (4), 4, 32, 1);
26897 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
26899 /* Pad the size of the note to the required alignment. */
26900 assemble_align (POINTER_SIZE);
26903 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
26904 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
26905 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
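/* For reference, the note emitted above corresponds roughly to the
   following layout (illustrative, assuming 64-bit pointers so the property
   entry is padded from 12 to 16 bytes):

     namesz    = 4            "GNU\0"
     descsz    = 16
     type      = 5            NT_GNU_PROPERTY_TYPE_0
     pr_type   = 0xc0000000   GNU_PROPERTY_AARCH64_FEATURE_1_AND
     pr_datasz = 4
     pr_data   = feature_1_and (BTI and/or PAC bits)  */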
26907 /* Helper function for straight line speculation.
26908 Return what barrier should be emitted for straight line speculation
26909 mitigation.
26910 When not mitigating against straight line speculation this function returns
26911 an empty string.
26912 When mitigating against straight line speculation, use:
26913 * SB when the v8.5-A SB extension is enabled.
26914 * DSB+ISB otherwise. */
26915 const char *
26916 aarch64_sls_barrier (int mitigation_required)
26918 return mitigation_required
26919 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
26920 : "";
26923 static GTY (()) tree aarch64_sls_shared_thunks[30];
26924 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
26925 const char *indirect_symbol_names[30] = {
26926 "__call_indirect_x0",
26927 "__call_indirect_x1",
26928 "__call_indirect_x2",
26929 "__call_indirect_x3",
26930 "__call_indirect_x4",
26931 "__call_indirect_x5",
26932 "__call_indirect_x6",
26933 "__call_indirect_x7",
26934 "__call_indirect_x8",
26935 "__call_indirect_x9",
26936 "__call_indirect_x10",
26937 "__call_indirect_x11",
26938 "__call_indirect_x12",
26939 "__call_indirect_x13",
26940 "__call_indirect_x14",
26941 "__call_indirect_x15",
26942 "", /* "__call_indirect_x16", */
26943 "", /* "__call_indirect_x17", */
26944 "__call_indirect_x18",
26945 "__call_indirect_x19",
26946 "__call_indirect_x20",
26947 "__call_indirect_x21",
26948 "__call_indirect_x22",
26949 "__call_indirect_x23",
26950 "__call_indirect_x24",
26951 "__call_indirect_x25",
26952 "__call_indirect_x26",
26953 "__call_indirect_x27",
26954 "__call_indirect_x28",
26955 "__call_indirect_x29",
26958 /* Function to create a BLR thunk. This thunk is used to mitigate straight
26959 line speculation. Instead of a simple BLR that can be speculated past,
26960 we emit a BL to this thunk, and this thunk contains a BR to the relevant
26961 register. These thunks have the relevant speculation barriers put after
26962 their indirect branch so that speculation is blocked.
26964 We use such a thunk so the speculation barriers are kept off the
26965 architecturally executed path in order to reduce the performance overhead.
26967 When optimizing for size we use stubs shared by the linked object.
26968 When optimizing for performance we emit stubs for each function in the hope
26969 that the branch predictor can better train on jumps specific to a given
26970 function. */
26972 aarch64_sls_create_blr_label (int regnum)
26974 gcc_assert (STUB_REGNUM_P (regnum));
26975 if (optimize_function_for_size_p (cfun))
26977 /* For the thunks shared between different functions in this compilation
26978 unit we use a named symbol -- this is just for users to more easily
26979 understand the generated assembly. */
26980 aarch64_sls_shared_thunks_needed = true;
26981 const char *thunk_name = indirect_symbol_names[regnum];
26982 if (aarch64_sls_shared_thunks[regnum] == NULL)
26984 /* Build a decl representing this function stub and record it for
26985 later. We build a decl here so we can use the GCC machinery for
26986 handling sections automatically (through `get_named_section` and
26987 `make_decl_one_only`). That saves us a lot of trouble handling
26988 the specifics of different output file formats. */
26989 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
26990 get_identifier (thunk_name),
26991 build_function_type_list (void_type_node,
26992 NULL_TREE));
26993 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
26994 NULL_TREE, void_type_node);
26995 TREE_PUBLIC (decl) = 1;
26996 TREE_STATIC (decl) = 1;
26997 DECL_IGNORED_P (decl) = 1;
26998 DECL_ARTIFICIAL (decl) = 1;
26999 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27000 resolve_unique_section (decl, 0, false);
27001 aarch64_sls_shared_thunks[regnum] = decl;
27004 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27007 if (cfun->machine->call_via[regnum] == NULL)
27008 cfun->machine->call_via[regnum]
27009 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27010 return cfun->machine->call_via[regnum];
27013 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27014 aarch64_sls_emit_shared_blr_thunks below. */
27015 static void
27016 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27018 /* Save in x16 and branch to that function so this transformation does
27019 not prevent jumping to `BTI c` instructions. */
27020 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27021 asm_fprintf (out_file, "\tbr\tx16\n");
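/* Editorial sketch (assuming regnum == 1; not part of the upstream source):
   together with the barrier that the callers below append, the stub emitted
   here looks roughly like

	mov	x16, x1
	br	x16
	dsb	sy	// or a single `sb', see aarch64_sls_barrier above
	isb

   Bouncing through x16 keeps the branch target compatible with `BTI c'
   landing pads, as noted in the comment above.  */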
27024 /* Emit all BLR stubs for this particular function.
27025 Here we emit all the BLR stubs needed for the current function. Since we
27026 emit these stubs in a consecutive block we know there will be no speculation
27027    gadgets between the stubs, and hence we only emit a speculation barrier at
27028    the end of the stub sequence.
27030 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27031 void
27032 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27034 if (! aarch64_harden_sls_blr_p ())
27035 return;
27037 bool any_functions_emitted = false;
27038 /* We must save and restore the current function section since this assembly
27039 is emitted at the end of the function. This means it can be emitted *just
27040 after* the cold section of a function. That cold part would be emitted in
27041 a different section. That switch would trigger a `.cfi_endproc` directive
27042 to be emitted in the original section and a `.cfi_startproc` directive to
27043 be emitted in the new section. Switching to the original section without
27044 restoring would mean that the `.cfi_endproc` emitted as a function ends
27045 would happen in a different section -- leaving an unmatched
27046 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27047 in the standard text section. */
27048 section *save_text_section = in_section;
27049 switch_to_section (function_section (current_function_decl));
27050 for (int regnum = 0; regnum < 30; ++regnum)
27052 rtx specu_label = cfun->machine->call_via[regnum];
27053 if (specu_label == NULL)
27054 continue;
27056 targetm.asm_out.print_operand (out_file, specu_label, 0);
27057 asm_fprintf (out_file, ":\n");
27058 aarch64_sls_emit_function_stub (out_file, regnum);
27059 any_functions_emitted = true;
27061 if (any_functions_emitted)
27062      /* We can use SB here if need be, since this stub will only be used
27063	  by the current function, and hence only for the current target.  */
27064 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27065 switch_to_section (save_text_section);
27068 /* Emit shared BLR stubs for the current compilation unit.
27069 Over the course of compiling this unit we may have converted some BLR
27070 instructions to a BL to a shared stub function. This is where we emit those
27071 stub functions.
27072 This function is for the stubs shared between different functions in this
27073 compilation unit. We share when optimizing for size instead of speed.
27075 This function is called through the TARGET_ASM_FILE_END hook. */
27076 void
27077 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27079 if (! aarch64_sls_shared_thunks_needed)
27080 return;
27082 for (int regnum = 0; regnum < 30; ++regnum)
27084 tree decl = aarch64_sls_shared_thunks[regnum];
27085 if (!decl)
27086 continue;
27088 const char *name = indirect_symbol_names[regnum];
27089 switch_to_section (get_named_section (decl, NULL, 0));
27090 ASM_OUTPUT_ALIGN (out_file, 2);
27091 targetm.asm_out.globalize_label (out_file, name);
27092        /* This only emits anything if the compiler is configured for an
27093	  assembler that can handle visibility directives.  */
27094 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27095 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27096 ASM_OUTPUT_LABEL (out_file, name);
27097 aarch64_sls_emit_function_stub (out_file, regnum);
27098        /* Use the barrier for the most conservative target, so that the stub
27099	  can always be used by any function in the translation unit.  */
27100 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27101 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
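/* Editorial sketch of what the loop above emits for x0 (directive spellings
   and the one-only section name depend on the object format and the
   configured assembler):

	.section	<one-only section for __call_indirect_x0>
	.align	2
	.global	__call_indirect_x0
	.hidden	__call_indirect_x0
	.type	__call_indirect_x0, %function
   __call_indirect_x0:
	mov	x16, x0
	br	x16
	dsb	sy
	isb
	.size	__call_indirect_x0, .-__call_indirect_x0
*/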
27105 /* Implement TARGET_ASM_FILE_END. */
27106 void
27107 aarch64_asm_file_end ()
27109 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27110 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27111 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27112 for FreeBSD) still gets called. */
27113 #ifdef TARGET_ASM_FILE_END
27114 TARGET_ASM_FILE_END ();
27115 #endif
27118 const char *
27119 aarch64_indirect_call_asm (rtx addr)
27121 gcc_assert (REG_P (addr));
27122 if (aarch64_harden_sls_blr_p ())
27124 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27125 output_asm_insn ("bl\t%0", &stub_label);
27127 else
27128 output_asm_insn ("blr\t%0", &addr);
27129 return "";
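/* Editorial illustration (register choice arbitrary): an indirect call that
   would normally be emitted as

	blr	x1

   becomes, when aarch64_harden_sls_blr_p () is true,

	bl	__call_indirect_x1

   when optimizing for size, or a BL to the per-function local stub label
   created by aarch64_sls_create_blr_label when optimizing for speed.  */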
27132 /* Target-specific selftests. */
27134 #if CHECKING_P
27136 namespace selftest {
27138 /* Selftest for the RTL loader.
27139 Verify that the RTL loader copes with a dump from
27140 print_rtx_function. This is essentially just a test that class
27141 function_reader can handle a real dump, but it also verifies
27142 that lookup_reg_by_dump_name correctly handles hard regs.
27143 The presence of hard reg names in the dump means that the test is
27144 target-specific, hence it is in this file. */
27146 static void
27147 aarch64_test_loading_full_dump ()
27149 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27151 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27153 rtx_insn *insn_1 = get_insn_by_uid (1);
27154 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27156 rtx_insn *insn_15 = get_insn_by_uid (15);
27157 ASSERT_EQ (INSN, GET_CODE (insn_15));
27158 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27160 /* Verify crtl->return_rtx. */
27161 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27162 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27163 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27166 /* Test the fractional_cost class. */
27168 static void
27169 aarch64_test_fractional_cost ()
27171 using cf = fractional_cost;
27173 ASSERT_EQ (cf (0, 20), 0);
27175 ASSERT_EQ (cf (4, 2), 2);
27176 ASSERT_EQ (3, cf (9, 3));
27178 ASSERT_NE (cf (5, 2), 2);
27179 ASSERT_NE (3, cf (8, 3));
27181 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27182 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27183 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
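/* Editorial worked check of the additions above: cf (7, 11) + cf (15, 11)
   is 22/11 == 2, and cf (2, 3) + cf (3, 5) is 10/15 + 9/15 == 19/15.  */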
27185 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27186 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27187 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27188 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27189 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27190 ASSERT_EQ (3 - cf (10, 3), 0);
27192 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27193 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27195 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27196 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27197 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27198 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27199 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27200 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27201 ASSERT_TRUE (cf (239, 240) < 1);
27202 ASSERT_FALSE (cf (240, 240) < 1);
27203 ASSERT_FALSE (cf (241, 240) < 1);
27204 ASSERT_FALSE (2 < cf (207, 104));
27205 ASSERT_FALSE (2 < cf (208, 104));
27206 ASSERT_TRUE (2 < cf (209, 104));
27208   ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27209   ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27210   ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27211   ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27212   ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27213   ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27214   ASSERT_TRUE (cf (239, 240) <= 1);
27215   ASSERT_TRUE (cf (240, 240) <= 1);
27216   ASSERT_FALSE (cf (241, 240) <= 1);
27217   ASSERT_FALSE (2 <= cf (207, 104));
27218   ASSERT_TRUE (2 <= cf (208, 104));
27219   ASSERT_TRUE (2 <= cf (209, 104));
27221 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27222 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27223 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27224 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27225 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27226 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27227 ASSERT_FALSE (cf (239, 240) >= 1);
27228 ASSERT_TRUE (cf (240, 240) >= 1);
27229 ASSERT_TRUE (cf (241, 240) >= 1);
27230 ASSERT_TRUE (2 >= cf (207, 104));
27231 ASSERT_TRUE (2 >= cf (208, 104));
27232 ASSERT_FALSE (2 >= cf (209, 104));
27234 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27235 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27236 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27237 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27238 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27239 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27240 ASSERT_FALSE (cf (239, 240) > 1);
27241 ASSERT_FALSE (cf (240, 240) > 1);
27242 ASSERT_TRUE (cf (241, 240) > 1);
27243 ASSERT_TRUE (2 > cf (207, 104));
27244 ASSERT_FALSE (2 > cf (208, 104));
27245 ASSERT_FALSE (2 > cf (209, 104));
27247 ASSERT_EQ (cf (1, 2).ceil (), 1);
27248 ASSERT_EQ (cf (11, 7).ceil (), 2);
27249 ASSERT_EQ (cf (20, 1).ceil (), 20);
27250 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27251 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27252 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27253 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27254 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27256 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27259 /* Run all target-specific selftests. */
27261 static void
27262 aarch64_run_selftests (void)
27264 aarch64_test_loading_full_dump ();
27265 aarch64_test_fractional_cost ();
27268 } // namespace selftest
27270 #endif /* #if CHECKING_P */
27272 #undef TARGET_STACK_PROTECT_GUARD
27273 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27275 #undef TARGET_ADDRESS_COST
27276 #define TARGET_ADDRESS_COST aarch64_address_cost
27278 /* This hook determines whether unnamed bitfields affect the alignment
27279 of the containing structure. The hook returns true if the structure
27280 should inherit the alignment requirements of an unnamed bitfield's
27281 type. */
27282 #undef TARGET_ALIGN_ANON_BITFIELD
27283 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
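/* Editorial illustration (hypothetical example; the actual layout is governed
   by the AAPCS64): with the hook returning true, the unnamed bit-field below
   is expected to give the whole struct the alignment of its declared type,
   even though no named member requires it:

     struct s { char c; long long : 1; };
*/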
27285 #undef TARGET_ASM_ALIGNED_DI_OP
27286 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27288 #undef TARGET_ASM_ALIGNED_HI_OP
27289 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27291 #undef TARGET_ASM_ALIGNED_SI_OP
27292 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27294 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27295 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27296 hook_bool_const_tree_hwi_hwi_const_tree_true
27298 #undef TARGET_ASM_FILE_START
27299 #define TARGET_ASM_FILE_START aarch64_start_file
27301 #undef TARGET_ASM_OUTPUT_MI_THUNK
27302 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27304 #undef TARGET_ASM_SELECT_RTX_SECTION
27305 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27307 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27308 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27310 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27311 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27313 #undef TARGET_BUILD_BUILTIN_VA_LIST
27314 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27316 #undef TARGET_CALLEE_COPIES
27317 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27319 #undef TARGET_CAN_ELIMINATE
27320 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27322 #undef TARGET_CAN_INLINE_P
27323 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27325 #undef TARGET_CANNOT_FORCE_CONST_MEM
27326 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27328 #undef TARGET_CASE_VALUES_THRESHOLD
27329 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27331 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27332 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27334 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27335 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27337 /* Only the least significant bit is used for initialization guard
27338 variables. */
27339 #undef TARGET_CXX_GUARD_MASK_BIT
27340 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27342 #undef TARGET_C_MODE_FOR_SUFFIX
27343 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27345 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27346 #undef TARGET_DEFAULT_TARGET_FLAGS
27347 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27348 #endif
27350 #undef TARGET_CLASS_MAX_NREGS
27351 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27353 #undef TARGET_BUILTIN_DECL
27354 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27356 #undef TARGET_BUILTIN_RECIPROCAL
27357 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27359 #undef TARGET_C_EXCESS_PRECISION
27360 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27362 #undef TARGET_EXPAND_BUILTIN
27363 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27365 #undef TARGET_EXPAND_BUILTIN_VA_START
27366 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27368 #undef TARGET_FOLD_BUILTIN
27369 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27371 #undef TARGET_FUNCTION_ARG
27372 #define TARGET_FUNCTION_ARG aarch64_function_arg
27374 #undef TARGET_FUNCTION_ARG_ADVANCE
27375 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27377 #undef TARGET_FUNCTION_ARG_BOUNDARY
27378 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27380 #undef TARGET_FUNCTION_ARG_PADDING
27381 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27383 #undef TARGET_GET_RAW_RESULT_MODE
27384 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27385 #undef TARGET_GET_RAW_ARG_MODE
27386 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27388 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27389 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27391 #undef TARGET_FUNCTION_VALUE
27392 #define TARGET_FUNCTION_VALUE aarch64_function_value
27394 #undef TARGET_FUNCTION_VALUE_REGNO_P
27395 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27397 #undef TARGET_GIMPLE_FOLD_BUILTIN
27398 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27400 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27401 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27403 #undef TARGET_INIT_BUILTINS
27404 #define TARGET_INIT_BUILTINS aarch64_init_builtins
27406 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27407 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27408 aarch64_ira_change_pseudo_allocno_class
27410 #undef TARGET_LEGITIMATE_ADDRESS_P
27411 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27413 #undef TARGET_LEGITIMATE_CONSTANT_P
27414 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27416 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27417 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27418 aarch64_legitimize_address_displacement
27420 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27421 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27423 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27424 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27425 aarch64_libgcc_floating_mode_supported_p
27427 #undef TARGET_MANGLE_TYPE
27428 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27430 #undef TARGET_INVALID_CONVERSION
27431 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
27433 #undef TARGET_INVALID_UNARY_OP
27434 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
27436 #undef TARGET_INVALID_BINARY_OP
27437 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27439 #undef TARGET_VERIFY_TYPE_CONTEXT
27440 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27442 #undef TARGET_MEMORY_MOVE_COST
27443 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27445 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27446 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27448 #undef TARGET_MUST_PASS_IN_STACK
27449 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27451 /* This target hook should return true if accesses to volatile bitfields
27452 should use the narrowest mode possible. It should return false if these
27453 accesses should use the bitfield container type. */
27454 #undef TARGET_NARROW_VOLATILE_BITFIELD
27455 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
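/* Editorial illustration (hypothetical example): with the hook returning
   false, the read of `p->flag' below is expected to use the `unsigned int'
   container type, i.e. a 32-bit access, rather than the narrowest mode that
   covers the bit-field:

     struct dev_regs { volatile unsigned int flag : 1, rest : 31; };
     unsigned int get_flag (struct dev_regs *p) { return p->flag; }
*/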
27457 #undef TARGET_OPTION_OVERRIDE
27458 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27460 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27461 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27462 aarch64_override_options_after_change
27464 #undef TARGET_OFFLOAD_OPTIONS
27465 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27467 #undef TARGET_OPTION_RESTORE
27468 #define TARGET_OPTION_RESTORE aarch64_option_restore
27470 #undef TARGET_OPTION_PRINT
27471 #define TARGET_OPTION_PRINT aarch64_option_print
27473 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27474 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27476 #undef TARGET_SET_CURRENT_FUNCTION
27477 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27479 #undef TARGET_PASS_BY_REFERENCE
27480 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27482 #undef TARGET_PREFERRED_RELOAD_CLASS
27483 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27485 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27486 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27488 #undef TARGET_PROMOTED_TYPE
27489 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27491 #undef TARGET_SECONDARY_RELOAD
27492 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27494 #undef TARGET_SHIFT_TRUNCATION_MASK
27495 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27497 #undef TARGET_SETUP_INCOMING_VARARGS
27498 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27500 #undef TARGET_STRUCT_VALUE_RTX
27501 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27503 #undef TARGET_REGISTER_MOVE_COST
27504 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27506 #undef TARGET_RETURN_IN_MEMORY
27507 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27509 #undef TARGET_RETURN_IN_MSB
27510 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27512 #undef TARGET_RTX_COSTS
27513 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27515 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27516 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27518 #undef TARGET_SCHED_ISSUE_RATE
27519 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27521 #undef TARGET_SCHED_VARIABLE_ISSUE
27522 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27524 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27525 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27526 aarch64_sched_first_cycle_multipass_dfa_lookahead
27528 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27529 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27530 aarch64_first_cycle_multipass_dfa_lookahead_guard
27532 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27533 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27534 aarch64_get_separate_components
27536 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27537 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27538 aarch64_components_for_bb
27540 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27541 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27542 aarch64_disqualify_components
27544 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27545 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27546 aarch64_emit_prologue_components
27548 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27549 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27550 aarch64_emit_epilogue_components
27552 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27553 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27554 aarch64_set_handled_components
27556 #undef TARGET_TRAMPOLINE_INIT
27557 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27559 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27560 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27562 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27563 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27565 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
27566 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
27568 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
27569 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
27570 aarch64_builtin_support_vector_misalignment
27572 #undef TARGET_ARRAY_MODE
27573 #define TARGET_ARRAY_MODE aarch64_array_mode
27575 #undef TARGET_ARRAY_MODE_SUPPORTED_P
27576 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
27578 #undef TARGET_VECTORIZE_CREATE_COSTS
27579 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
27581 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
27582 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
27583 aarch64_builtin_vectorization_cost
27585 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
27586 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
27588 #undef TARGET_VECTORIZE_BUILTINS
27589 #define TARGET_VECTORIZE_BUILTINS
27591 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
27592 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
27593 aarch64_builtin_vectorized_function
27595 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
27596 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
27597 aarch64_autovectorize_vector_modes
27599 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
27600 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
27601 aarch64_atomic_assign_expand_fenv
27603 /* Section anchor support. */
27605 #undef TARGET_MIN_ANCHOR_OFFSET
27606 #define TARGET_MIN_ANCHOR_OFFSET -256
27608 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
27609 byte offset; we can do much more for larger data types, but have no way
27610 to determine the size of the access. We assume accesses are aligned. */
27611 #undef TARGET_MAX_ANCHOR_OFFSET
27612 #define TARGET_MAX_ANCHOR_OFFSET 4095
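/* Editorial illustration (symbol names and offsets are made up): with section
   anchors, several nearby globals are addressed from one anchor, e.g.

	adrp	x0, .LANCHOR0
	add	x0, x0, :lo12:.LANCHOR0
	ldr	w1, [x0, 8]
	ldr	w2, [x0, 12]

   so each object's offset from its anchor must lie within the [-256, 4095]
   range declared above.  */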
27614 #undef TARGET_VECTOR_ALIGNMENT
27615 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
27617 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
27618 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
27619 aarch64_vectorize_preferred_vector_alignment
27620 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
27621 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
27622 aarch64_simd_vector_alignment_reachable
27624 /* vec_perm support. */
27626 #undef TARGET_VECTORIZE_VEC_PERM_CONST
27627 #define TARGET_VECTORIZE_VEC_PERM_CONST \
27628 aarch64_vectorize_vec_perm_const
27630 #undef TARGET_VECTORIZE_RELATED_MODE
27631 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
27632 #undef TARGET_VECTORIZE_GET_MASK_MODE
27633 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
27634 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
27635 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
27636 aarch64_empty_mask_is_expensive
27637 #undef TARGET_PREFERRED_ELSE_VALUE
27638 #define TARGET_PREFERRED_ELSE_VALUE \
27639 aarch64_preferred_else_value
27641 #undef TARGET_INIT_LIBFUNCS
27642 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
27644 #undef TARGET_FIXED_CONDITION_CODE_REGS
27645 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
27647 #undef TARGET_FLAGS_REGNUM
27648 #define TARGET_FLAGS_REGNUM CC_REGNUM
27650 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
27651 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
27653 #undef TARGET_ASAN_SHADOW_OFFSET
27654 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
27656 #undef TARGET_LEGITIMIZE_ADDRESS
27657 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
27659 #undef TARGET_SCHED_CAN_SPECULATE_INSN
27660 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
27662 #undef TARGET_CAN_USE_DOLOOP_P
27663 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
27665 #undef TARGET_SCHED_ADJUST_PRIORITY
27666 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
27668 #undef TARGET_SCHED_MACRO_FUSION_P
27669 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
27671 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
27672 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
27674 #undef TARGET_SCHED_FUSION_PRIORITY
27675 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
27677 #undef TARGET_UNSPEC_MAY_TRAP_P
27678 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
27680 #undef TARGET_USE_PSEUDO_PIC_REG
27681 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
27683 #undef TARGET_PRINT_OPERAND
27684 #define TARGET_PRINT_OPERAND aarch64_print_operand
27686 #undef TARGET_PRINT_OPERAND_ADDRESS
27687 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
27689 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
27690 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
27692 #undef TARGET_OPTAB_SUPPORTED_P
27693 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
27695 #undef TARGET_OMIT_STRUCT_RETURN_REG
27696 #define TARGET_OMIT_STRUCT_RETURN_REG true
27698 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
27699 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
27700 aarch64_dwarf_poly_indeterminate_value
27702 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
27703 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
27704 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
27706 #undef TARGET_HARD_REGNO_NREGS
27707 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
27708 #undef TARGET_HARD_REGNO_MODE_OK
27709 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
27711 #undef TARGET_MODES_TIEABLE_P
27712 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
27714 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
27715 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
27716 aarch64_hard_regno_call_part_clobbered
27718 #undef TARGET_INSN_CALLEE_ABI
27719 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
27721 #undef TARGET_CONSTANT_ALIGNMENT
27722 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
27724 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
27725 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
27726 aarch64_stack_clash_protection_alloca_probe_range
27728 #undef TARGET_COMPUTE_PRESSURE_CLASSES
27729 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
27731 #undef TARGET_CAN_CHANGE_MODE_CLASS
27732 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
27734 #undef TARGET_SELECT_EARLY_REMAT_MODES
27735 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
27737 #undef TARGET_SPECULATION_SAFE_VALUE
27738 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
27740 #undef TARGET_ESTIMATED_POLY_VALUE
27741 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
27743 #undef TARGET_ATTRIBUTE_TABLE
27744 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
27746 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
27747 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
27748 aarch64_simd_clone_compute_vecsize_and_simdlen
27750 #undef TARGET_SIMD_CLONE_ADJUST
27751 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
27753 #undef TARGET_SIMD_CLONE_USABLE
27754 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
27756 #undef TARGET_COMP_TYPE_ATTRIBUTES
27757 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
27759 #undef TARGET_GET_MULTILIB_ABI_NAME
27760 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
27762 #undef TARGET_FNTYPE_ABI
27763 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
27765 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
27766 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
27768 #if CHECKING_P
27769 #undef TARGET_RUN_TARGET_SELFTESTS
27770 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
27771 #endif /* #if CHECKING_P */
27773 #undef TARGET_ASM_POST_CFI_STARTPROC
27774 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
27776 #undef TARGET_STRICT_ARGUMENT_NAMING
27777 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
27779 #undef TARGET_MD_ASM_ADJUST
27780 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
27782 #undef TARGET_ASM_FILE_END
27783 #define TARGET_ASM_FILE_END aarch64_asm_file_end
27785 #undef TARGET_ASM_FUNCTION_EPILOGUE
27786 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
27788 #undef TARGET_HAVE_SHADOW_CALL_STACK
27789 #define TARGET_HAVE_SHADOW_CALL_STACK true
27791 struct gcc_target targetm = TARGET_INITIALIZER;
27793 #include "gt-aarch64.h"