/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"

/* This file should be included last.  */
#include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
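/* Informal example (not from the original source): with the default LP64
   ABI, POINTER_SIZE is 64 and BITS_PER_UNIT is 8, so POINTER_BYTES
   evaluates to 8; under -mabi=ilp32 it would be 4 instead.  */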
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
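/* A rough illustration (operands chosen for illustration only) of how these
   constructors correspond to instruction forms:

     simd_immediate_info (SImode, 0x55, MOV, LSL, 8)
       describes an immediate such as "movi v0.4s, #0x55, lsl #8", while

     simd_immediate_info (DImode, base, step)
       describes an SVE INDEX constant whose element I is BASE + I * STEP.  */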
/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");

  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64_pod offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
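/* A worked example, illustrative only: an argument of type svfloat32x2_t is
   a PST made up of a single piece with num_zr == 2 and num_pr == 0, whereas
   a struct containing an svbool_t followed by an svint32_t would analyze as
   two pieces in memory order, one with num_pr == 1 and one with
   num_zr == 1.  */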
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;

static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
353 /* Tuning parameters. */
355 static const struct cpu_addrcost_table generic_addrcost_table
=
365 0, /* post_modify_ld3_st3 */
366 0, /* post_modify_ld4_st4 */
367 0, /* register_offset */
368 0, /* register_sextend */
369 0, /* register_zextend */
373 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
383 0, /* post_modify_ld3_st3 */
384 0, /* post_modify_ld4_st4 */
385 1, /* register_offset */
386 1, /* register_sextend */
387 2, /* register_zextend */
391 static const struct cpu_addrcost_table xgene1_addrcost_table
=
401 1, /* post_modify_ld3_st3 */
402 1, /* post_modify_ld4_st4 */
403 0, /* register_offset */
404 1, /* register_sextend */
405 1, /* register_zextend */
409 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
419 0, /* post_modify_ld3_st3 */
420 0, /* post_modify_ld4_st4 */
421 2, /* register_offset */
422 3, /* register_sextend */
423 3, /* register_zextend */
427 static const struct cpu_addrcost_table thunderx3t110_addrcost_table
=
437 0, /* post_modify_ld3_st3 */
438 0, /* post_modify_ld4_st4 */
439 2, /* register_offset */
440 3, /* register_sextend */
441 3, /* register_zextend */
445 static const struct cpu_addrcost_table tsv110_addrcost_table
=
455 0, /* post_modify_ld3_st3 */
456 0, /* post_modify_ld4_st4 */
457 0, /* register_offset */
458 1, /* register_sextend */
459 1, /* register_zextend */
463 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
473 1, /* post_modify_ld3_st3 */
474 1, /* post_modify_ld4_st4 */
475 3, /* register_offset */
476 3, /* register_sextend */
477 3, /* register_zextend */
481 static const struct cpu_addrcost_table a64fx_addrcost_table
=
491 0, /* post_modify_ld3_st3 */
492 0, /* post_modify_ld4_st4 */
493 2, /* register_offset */
494 3, /* register_sextend */
495 3, /* register_zextend */
499 static const struct cpu_addrcost_table neoversev1_addrcost_table
=
509 3, /* post_modify_ld3_st3 */
510 3, /* post_modify_ld4_st4 */
511 0, /* register_offset */
512 0, /* register_sextend */
513 0, /* register_zextend */
517 static const struct cpu_regmove_cost generic_regmove_cost
=
520 /* Avoid the use of slow int<->fp moves for spilling by setting
521 their cost higher than memmov_cost. */
527 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
530 /* Avoid the use of slow int<->fp moves for spilling by setting
531 their cost higher than memmov_cost. */
537 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
540 /* Avoid the use of slow int<->fp moves for spilling by setting
541 their cost higher than memmov_cost. */
547 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
550 /* Avoid the use of slow int<->fp moves for spilling by setting
551 their cost higher than memmov_cost (actual, 4 and 9). */
557 static const struct cpu_regmove_cost thunderx_regmove_cost
=
565 static const struct cpu_regmove_cost xgene1_regmove_cost
=
568 /* Avoid the use of slow int<->fp moves for spilling by setting
569 their cost higher than memmov_cost. */
575 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
578 /* Avoid the use of int<->fp moves for spilling. */
584 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
587 /* Avoid the use of int<->fp moves for spilling. */
593 static const struct cpu_regmove_cost thunderx3t110_regmove_cost
=
596 /* Avoid the use of int<->fp moves for spilling. */
602 static const struct cpu_regmove_cost tsv110_regmove_cost
=
605 /* Avoid the use of slow int<->fp moves for spilling by setting
606 their cost higher than memmov_cost. */
612 static const struct cpu_regmove_cost a64fx_regmove_cost
=
615 /* Avoid the use of slow int<->fp moves for spilling by setting
616 their cost higher than memmov_cost. */
622 /* Generic costs for Advanced SIMD vector operations. */
623 static const advsimd_vec_cost generic_advsimd_vector_cost
=
625 1, /* int_stmt_cost */
626 1, /* fp_stmt_cost */
627 0, /* ld2_st2_permute_cost */
628 0, /* ld3_st3_permute_cost */
629 0, /* ld4_st4_permute_cost */
630 2, /* permute_cost */
631 2, /* reduc_i8_cost */
632 2, /* reduc_i16_cost */
633 2, /* reduc_i32_cost */
634 2, /* reduc_i64_cost */
635 2, /* reduc_f16_cost */
636 2, /* reduc_f32_cost */
637 2, /* reduc_f64_cost */
638 2, /* store_elt_extra_cost */
639 2, /* vec_to_scalar_cost */
640 1, /* scalar_to_vec_cost */
641 1, /* align_load_cost */
642 1, /* unalign_load_cost */
643 1, /* unalign_store_cost */
647 /* Generic costs for SVE vector operations. */
648 static const sve_vec_cost generic_sve_vector_cost
=
651 1, /* int_stmt_cost */
652 1, /* fp_stmt_cost */
653 0, /* ld2_st2_permute_cost */
654 0, /* ld3_st3_permute_cost */
655 0, /* ld4_st4_permute_cost */
656 2, /* permute_cost */
657 2, /* reduc_i8_cost */
658 2, /* reduc_i16_cost */
659 2, /* reduc_i32_cost */
660 2, /* reduc_i64_cost */
661 2, /* reduc_f16_cost */
662 2, /* reduc_f32_cost */
663 2, /* reduc_f64_cost */
664 2, /* store_elt_extra_cost */
665 2, /* vec_to_scalar_cost */
666 1, /* scalar_to_vec_cost */
667 1, /* align_load_cost */
668 1, /* unalign_load_cost */
669 1, /* unalign_store_cost */
673 2, /* fadda_f16_cost */
674 2, /* fadda_f32_cost */
675 2, /* fadda_f64_cost */
676 1 /* scatter_store_elt_cost */
679 /* Generic costs for vector insn classes. */
680 static const struct cpu_vector_cost generic_vector_cost
=
682 1, /* scalar_int_stmt_cost */
683 1, /* scalar_fp_stmt_cost */
684 1, /* scalar_load_cost */
685 1, /* scalar_store_cost */
686 3, /* cond_taken_branch_cost */
687 1, /* cond_not_taken_branch_cost */
688 &generic_advsimd_vector_cost
, /* advsimd */
689 &generic_sve_vector_cost
, /* sve */
690 nullptr /* issue_info */
693 static const advsimd_vec_cost a64fx_advsimd_vector_cost
=
695 2, /* int_stmt_cost */
696 5, /* fp_stmt_cost */
697 0, /* ld2_st2_permute_cost */
698 0, /* ld3_st3_permute_cost */
699 0, /* ld4_st4_permute_cost */
700 3, /* permute_cost */
701 13, /* reduc_i8_cost */
702 13, /* reduc_i16_cost */
703 13, /* reduc_i32_cost */
704 13, /* reduc_i64_cost */
705 13, /* reduc_f16_cost */
706 13, /* reduc_f32_cost */
707 13, /* reduc_f64_cost */
708 13, /* store_elt_extra_cost */
709 13, /* vec_to_scalar_cost */
710 4, /* scalar_to_vec_cost */
711 6, /* align_load_cost */
712 6, /* unalign_load_cost */
713 1, /* unalign_store_cost */
717 static const sve_vec_cost a64fx_sve_vector_cost
=
720 2, /* int_stmt_cost */
721 5, /* fp_stmt_cost */
722 0, /* ld2_st2_permute_cost */
723 0, /* ld3_st3_permute_cost */
724 0, /* ld4_st4_permute_cost */
725 3, /* permute_cost */
726 13, /* reduc_i8_cost */
727 13, /* reduc_i16_cost */
728 13, /* reduc_i32_cost */
729 13, /* reduc_i64_cost */
730 13, /* reduc_f16_cost */
731 13, /* reduc_f32_cost */
732 13, /* reduc_f64_cost */
733 13, /* store_elt_extra_cost */
734 13, /* vec_to_scalar_cost */
735 4, /* scalar_to_vec_cost */
736 6, /* align_load_cost */
737 6, /* unalign_load_cost */
738 1, /* unalign_store_cost */
742 13, /* fadda_f16_cost */
743 13, /* fadda_f32_cost */
744 13, /* fadda_f64_cost */
745 1 /* scatter_store_elt_cost */
748 static const struct cpu_vector_cost a64fx_vector_cost
=
750 1, /* scalar_int_stmt_cost */
751 5, /* scalar_fp_stmt_cost */
752 4, /* scalar_load_cost */
753 1, /* scalar_store_cost */
754 3, /* cond_taken_branch_cost */
755 1, /* cond_not_taken_branch_cost */
756 &a64fx_advsimd_vector_cost
, /* advsimd */
757 &a64fx_sve_vector_cost
, /* sve */
758 nullptr /* issue_info */
761 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost
=
763 1, /* int_stmt_cost */
764 3, /* fp_stmt_cost */
765 0, /* ld2_st2_permute_cost */
766 0, /* ld3_st3_permute_cost */
767 0, /* ld4_st4_permute_cost */
768 2, /* permute_cost */
769 1, /* reduc_i8_cost */
770 1, /* reduc_i16_cost */
771 1, /* reduc_i32_cost */
772 1, /* reduc_i64_cost */
773 1, /* reduc_f16_cost */
774 1, /* reduc_f32_cost */
775 1, /* reduc_f64_cost */
776 1, /* store_elt_extra_cost */
777 1, /* vec_to_scalar_cost */
778 1, /* scalar_to_vec_cost */
779 1, /* align_load_cost */
780 1, /* unalign_load_cost */
781 1, /* unalign_store_cost */
785 /* QDF24XX costs for vector insn classes. */
786 static const struct cpu_vector_cost qdf24xx_vector_cost
=
788 1, /* scalar_int_stmt_cost */
789 1, /* scalar_fp_stmt_cost */
790 1, /* scalar_load_cost */
791 1, /* scalar_store_cost */
792 3, /* cond_taken_branch_cost */
793 1, /* cond_not_taken_branch_cost */
794 &qdf24xx_advsimd_vector_cost
, /* advsimd */
796 nullptr /* issue_info */
800 static const advsimd_vec_cost thunderx_advsimd_vector_cost
=
802 4, /* int_stmt_cost */
803 1, /* fp_stmt_cost */
804 0, /* ld2_st2_permute_cost */
805 0, /* ld3_st3_permute_cost */
806 0, /* ld4_st4_permute_cost */
807 4, /* permute_cost */
808 2, /* reduc_i8_cost */
809 2, /* reduc_i16_cost */
810 2, /* reduc_i32_cost */
811 2, /* reduc_i64_cost */
812 2, /* reduc_f16_cost */
813 2, /* reduc_f32_cost */
814 2, /* reduc_f64_cost */
815 2, /* store_elt_extra_cost */
816 2, /* vec_to_scalar_cost */
817 2, /* scalar_to_vec_cost */
818 3, /* align_load_cost */
819 5, /* unalign_load_cost */
820 5, /* unalign_store_cost */
824 /* ThunderX costs for vector insn classes. */
825 static const struct cpu_vector_cost thunderx_vector_cost
=
827 1, /* scalar_int_stmt_cost */
828 1, /* scalar_fp_stmt_cost */
829 3, /* scalar_load_cost */
830 1, /* scalar_store_cost */
831 3, /* cond_taken_branch_cost */
832 3, /* cond_not_taken_branch_cost */
833 &thunderx_advsimd_vector_cost
, /* advsimd */
835 nullptr /* issue_info */
838 static const advsimd_vec_cost tsv110_advsimd_vector_cost
=
840 2, /* int_stmt_cost */
841 2, /* fp_stmt_cost */
842 0, /* ld2_st2_permute_cost */
843 0, /* ld3_st3_permute_cost */
844 0, /* ld4_st4_permute_cost */
845 2, /* permute_cost */
846 3, /* reduc_i8_cost */
847 3, /* reduc_i16_cost */
848 3, /* reduc_i32_cost */
849 3, /* reduc_i64_cost */
850 3, /* reduc_f16_cost */
851 3, /* reduc_f32_cost */
852 3, /* reduc_f64_cost */
853 3, /* store_elt_extra_cost */
854 3, /* vec_to_scalar_cost */
855 2, /* scalar_to_vec_cost */
856 5, /* align_load_cost */
857 5, /* unalign_load_cost */
858 1, /* unalign_store_cost */
862 static const struct cpu_vector_cost tsv110_vector_cost
=
864 1, /* scalar_int_stmt_cost */
865 1, /* scalar_fp_stmt_cost */
866 5, /* scalar_load_cost */
867 1, /* scalar_store_cost */
868 1, /* cond_taken_branch_cost */
869 1, /* cond_not_taken_branch_cost */
870 &tsv110_advsimd_vector_cost
, /* advsimd */
872 nullptr /* issue_info */
875 static const advsimd_vec_cost cortexa57_advsimd_vector_cost
=
877 2, /* int_stmt_cost */
878 2, /* fp_stmt_cost */
879 0, /* ld2_st2_permute_cost */
880 0, /* ld3_st3_permute_cost */
881 0, /* ld4_st4_permute_cost */
882 3, /* permute_cost */
883 8, /* reduc_i8_cost */
884 8, /* reduc_i16_cost */
885 8, /* reduc_i32_cost */
886 8, /* reduc_i64_cost */
887 8, /* reduc_f16_cost */
888 8, /* reduc_f32_cost */
889 8, /* reduc_f64_cost */
890 8, /* store_elt_extra_cost */
891 8, /* vec_to_scalar_cost */
892 8, /* scalar_to_vec_cost */
893 4, /* align_load_cost */
894 4, /* unalign_load_cost */
895 1, /* unalign_store_cost */
899 /* Cortex-A57 costs for vector insn classes. */
900 static const struct cpu_vector_cost cortexa57_vector_cost
=
902 1, /* scalar_int_stmt_cost */
903 1, /* scalar_fp_stmt_cost */
904 4, /* scalar_load_cost */
905 1, /* scalar_store_cost */
906 1, /* cond_taken_branch_cost */
907 1, /* cond_not_taken_branch_cost */
908 &cortexa57_advsimd_vector_cost
, /* advsimd */
910 nullptr /* issue_info */
913 static const advsimd_vec_cost exynosm1_advsimd_vector_cost
=
915 3, /* int_stmt_cost */
916 3, /* fp_stmt_cost */
917 0, /* ld2_st2_permute_cost */
918 0, /* ld3_st3_permute_cost */
919 0, /* ld4_st4_permute_cost */
920 3, /* permute_cost */
921 3, /* reduc_i8_cost */
922 3, /* reduc_i16_cost */
923 3, /* reduc_i32_cost */
924 3, /* reduc_i64_cost */
925 3, /* reduc_f16_cost */
926 3, /* reduc_f32_cost */
927 3, /* reduc_f64_cost */
928 3, /* store_elt_extra_cost */
929 3, /* vec_to_scalar_cost */
930 3, /* scalar_to_vec_cost */
931 5, /* align_load_cost */
932 5, /* unalign_load_cost */
933 1, /* unalign_store_cost */
937 static const struct cpu_vector_cost exynosm1_vector_cost
=
939 1, /* scalar_int_stmt_cost */
940 1, /* scalar_fp_stmt_cost */
941 5, /* scalar_load_cost */
942 1, /* scalar_store_cost */
943 1, /* cond_taken_branch_cost */
944 1, /* cond_not_taken_branch_cost */
945 &exynosm1_advsimd_vector_cost
, /* advsimd */
947 nullptr /* issue_info */
950 static const advsimd_vec_cost xgene1_advsimd_vector_cost
=
952 2, /* int_stmt_cost */
953 2, /* fp_stmt_cost */
954 0, /* ld2_st2_permute_cost */
955 0, /* ld3_st3_permute_cost */
956 0, /* ld4_st4_permute_cost */
957 2, /* permute_cost */
958 4, /* reduc_i8_cost */
959 4, /* reduc_i16_cost */
960 4, /* reduc_i32_cost */
961 4, /* reduc_i64_cost */
962 4, /* reduc_f16_cost */
963 4, /* reduc_f32_cost */
964 4, /* reduc_f64_cost */
965 4, /* store_elt_extra_cost */
966 4, /* vec_to_scalar_cost */
967 4, /* scalar_to_vec_cost */
968 10, /* align_load_cost */
969 10, /* unalign_load_cost */
970 2, /* unalign_store_cost */
/* X-Gene 1 costs for vector insn classes.  */
975 static const struct cpu_vector_cost xgene1_vector_cost
=
977 1, /* scalar_int_stmt_cost */
978 1, /* scalar_fp_stmt_cost */
979 5, /* scalar_load_cost */
980 1, /* scalar_store_cost */
981 2, /* cond_taken_branch_cost */
982 1, /* cond_not_taken_branch_cost */
983 &xgene1_advsimd_vector_cost
, /* advsimd */
985 nullptr /* issue_info */
988 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost
=
990 4, /* int_stmt_cost */
991 5, /* fp_stmt_cost */
992 0, /* ld2_st2_permute_cost */
993 0, /* ld3_st3_permute_cost */
994 0, /* ld4_st4_permute_cost */
995 10, /* permute_cost */
996 6, /* reduc_i8_cost */
997 6, /* reduc_i16_cost */
998 6, /* reduc_i32_cost */
999 6, /* reduc_i64_cost */
1000 6, /* reduc_f16_cost */
1001 6, /* reduc_f32_cost */
1002 6, /* reduc_f64_cost */
1003 6, /* store_elt_extra_cost */
1004 6, /* vec_to_scalar_cost */
1005 5, /* scalar_to_vec_cost */
1006 4, /* align_load_cost */
1007 4, /* unalign_load_cost */
1008 1, /* unalign_store_cost */
1012 /* Costs for vector insn classes for Vulcan. */
1013 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
1015 1, /* scalar_int_stmt_cost */
1016 6, /* scalar_fp_stmt_cost */
1017 4, /* scalar_load_cost */
1018 1, /* scalar_store_cost */
1019 2, /* cond_taken_branch_cost */
1020 1, /* cond_not_taken_branch_cost */
1021 &thunderx2t99_advsimd_vector_cost
, /* advsimd */
1023 nullptr /* issue_info */
1026 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost
=
1028 5, /* int_stmt_cost */
1029 5, /* fp_stmt_cost */
1030 0, /* ld2_st2_permute_cost */
1031 0, /* ld3_st3_permute_cost */
1032 0, /* ld4_st4_permute_cost */
1033 10, /* permute_cost */
1034 5, /* reduc_i8_cost */
1035 5, /* reduc_i16_cost */
1036 5, /* reduc_i32_cost */
1037 5, /* reduc_i64_cost */
1038 5, /* reduc_f16_cost */
1039 5, /* reduc_f32_cost */
1040 5, /* reduc_f64_cost */
1041 5, /* store_elt_extra_cost */
1042 5, /* vec_to_scalar_cost */
1043 5, /* scalar_to_vec_cost */
1044 4, /* align_load_cost */
1045 4, /* unalign_load_cost */
1046 4, /* unalign_store_cost */
1050 static const struct cpu_vector_cost thunderx3t110_vector_cost
=
1052 1, /* scalar_int_stmt_cost */
1053 5, /* scalar_fp_stmt_cost */
1054 4, /* scalar_load_cost */
1055 1, /* scalar_store_cost */
1056 2, /* cond_taken_branch_cost */
1057 1, /* cond_not_taken_branch_cost */
1058 &thunderx3t110_advsimd_vector_cost
, /* advsimd */
1060 nullptr /* issue_info */
1064 /* Generic costs for branch instructions. */
1065 static const struct cpu_branch_cost generic_branch_cost
=
1067 1, /* Predictable. */
1068 3 /* Unpredictable. */
1071 /* Generic approximation modes. */
1072 static const cpu_approx_modes generic_approx_modes
=
1074 AARCH64_APPROX_NONE
, /* division */
1075 AARCH64_APPROX_NONE
, /* sqrt */
1076 AARCH64_APPROX_NONE
/* recip_sqrt */
1079 /* Approximation modes for Exynos M1. */
1080 static const cpu_approx_modes exynosm1_approx_modes
=
1082 AARCH64_APPROX_NONE
, /* division */
1083 AARCH64_APPROX_ALL
, /* sqrt */
1084 AARCH64_APPROX_ALL
/* recip_sqrt */
1087 /* Approximation modes for X-Gene 1. */
1088 static const cpu_approx_modes xgene1_approx_modes
=
1090 AARCH64_APPROX_NONE
, /* division */
1091 AARCH64_APPROX_NONE
, /* sqrt */
1092 AARCH64_APPROX_ALL
/* recip_sqrt */
1095 /* Generic prefetch settings (which disable prefetch). */
1096 static const cpu_prefetch_tune generic_prefetch_tune
=
1099 -1, /* l1_cache_size */
1100 -1, /* l1_cache_line_size */
1101 -1, /* l2_cache_size */
1102 true, /* prefetch_dynamic_strides */
1103 -1, /* minimum_stride */
1104 -1 /* default_opt_level */
1107 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
1110 -1, /* l1_cache_size */
1111 64, /* l1_cache_line_size */
1112 -1, /* l2_cache_size */
1113 true, /* prefetch_dynamic_strides */
1114 -1, /* minimum_stride */
1115 -1 /* default_opt_level */
1118 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
1121 32, /* l1_cache_size */
1122 64, /* l1_cache_line_size */
1123 512, /* l2_cache_size */
1124 false, /* prefetch_dynamic_strides */
1125 2048, /* minimum_stride */
1126 3 /* default_opt_level */
1129 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
1132 32, /* l1_cache_size */
1133 128, /* l1_cache_line_size */
1134 16*1024, /* l2_cache_size */
1135 true, /* prefetch_dynamic_strides */
1136 -1, /* minimum_stride */
1137 3 /* default_opt_level */
1140 static const cpu_prefetch_tune thunderx_prefetch_tune
=
1143 32, /* l1_cache_size */
1144 128, /* l1_cache_line_size */
1145 -1, /* l2_cache_size */
1146 true, /* prefetch_dynamic_strides */
1147 -1, /* minimum_stride */
1148 -1 /* default_opt_level */
1151 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
1154 32, /* l1_cache_size */
1155 64, /* l1_cache_line_size */
1156 256, /* l2_cache_size */
1157 true, /* prefetch_dynamic_strides */
1158 -1, /* minimum_stride */
1159 -1 /* default_opt_level */
1162 static const cpu_prefetch_tune thunderx3t110_prefetch_tune
=
1165 32, /* l1_cache_size */
1166 64, /* l1_cache_line_size */
1167 256, /* l2_cache_size */
1168 true, /* prefetch_dynamic_strides */
1169 -1, /* minimum_stride */
1170 -1 /* default_opt_level */
1173 static const cpu_prefetch_tune tsv110_prefetch_tune
=
1176 64, /* l1_cache_size */
1177 64, /* l1_cache_line_size */
1178 512, /* l2_cache_size */
1179 true, /* prefetch_dynamic_strides */
1180 -1, /* minimum_stride */
1181 -1 /* default_opt_level */
1184 static const cpu_prefetch_tune xgene1_prefetch_tune
=
1187 32, /* l1_cache_size */
1188 64, /* l1_cache_line_size */
1189 256, /* l2_cache_size */
1190 true, /* prefetch_dynamic_strides */
1191 -1, /* minimum_stride */
1192 -1 /* default_opt_level */
1195 static const cpu_prefetch_tune a64fx_prefetch_tune
=
1198 64, /* l1_cache_size */
1199 256, /* l1_cache_line_size */
1200 32768, /* l2_cache_size */
1201 true, /* prefetch_dynamic_strides */
1202 -1, /* minimum_stride */
1203 -1 /* default_opt_level */
1206 static const struct tune_params generic_tunings
=
1208 &cortexa57_extra_costs
,
1209 &generic_addrcost_table
,
1210 &generic_regmove_cost
,
1211 &generic_vector_cost
,
1212 &generic_branch_cost
,
1213 &generic_approx_modes
,
1214 SVE_NOT_IMPLEMENTED
, /* sve_width */
1215 4, /* memmov_cost */
1217 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
), /* fusible_ops */
1218 "16:12", /* function_align. */
1219 "4", /* jump_align. */
1220 "8", /* loop_align. */
1221 2, /* int_reassoc_width. */
1222 4, /* fp_reassoc_width. */
1223 1, /* vec_reassoc_width. */
1224 2, /* min_div_recip_mul_sf. */
1225 2, /* min_div_recip_mul_df. */
1226 0, /* max_case_values. */
1227 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1228 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1229 Neoverse V1. It does not have a noticeable effect on A64FX and should
1230 have at most a very minor effect on SVE2 cores. */
1231 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
), /* tune_flags. */
1232 &generic_prefetch_tune
1235 static const struct tune_params cortexa35_tunings
=
1237 &cortexa53_extra_costs
,
1238 &generic_addrcost_table
,
1239 &cortexa53_regmove_cost
,
1240 &generic_vector_cost
,
1241 &generic_branch_cost
,
1242 &generic_approx_modes
,
1243 SVE_NOT_IMPLEMENTED
, /* sve_width */
1244 4, /* memmov_cost */
1246 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1247 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
1248 "16", /* function_align. */
1249 "4", /* jump_align. */
1250 "8", /* loop_align. */
1251 2, /* int_reassoc_width. */
1252 4, /* fp_reassoc_width. */
1253 1, /* vec_reassoc_width. */
1254 2, /* min_div_recip_mul_sf. */
1255 2, /* min_div_recip_mul_df. */
1256 0, /* max_case_values. */
1257 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1258 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1259 &generic_prefetch_tune
1262 static const struct tune_params cortexa53_tunings
=
1264 &cortexa53_extra_costs
,
1265 &generic_addrcost_table
,
1266 &cortexa53_regmove_cost
,
1267 &generic_vector_cost
,
1268 &generic_branch_cost
,
1269 &generic_approx_modes
,
1270 SVE_NOT_IMPLEMENTED
, /* sve_width */
1271 4, /* memmov_cost */
1273 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1274 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
1275 "16", /* function_align. */
1276 "4", /* jump_align. */
1277 "8", /* loop_align. */
1278 2, /* int_reassoc_width. */
1279 4, /* fp_reassoc_width. */
1280 1, /* vec_reassoc_width. */
1281 2, /* min_div_recip_mul_sf. */
1282 2, /* min_div_recip_mul_df. */
1283 0, /* max_case_values. */
1284 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1285 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1286 &generic_prefetch_tune
1289 static const struct tune_params cortexa57_tunings
=
1291 &cortexa57_extra_costs
,
1292 &generic_addrcost_table
,
1293 &cortexa57_regmove_cost
,
1294 &cortexa57_vector_cost
,
1295 &generic_branch_cost
,
1296 &generic_approx_modes
,
1297 SVE_NOT_IMPLEMENTED
, /* sve_width */
1298 4, /* memmov_cost */
1300 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1301 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1302 "16", /* function_align. */
1303 "4", /* jump_align. */
1304 "8", /* loop_align. */
1305 2, /* int_reassoc_width. */
1306 4, /* fp_reassoc_width. */
1307 1, /* vec_reassoc_width. */
1308 2, /* min_div_recip_mul_sf. */
1309 2, /* min_div_recip_mul_df. */
1310 0, /* max_case_values. */
1311 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1312 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
1313 &generic_prefetch_tune
1316 static const struct tune_params cortexa72_tunings
=
1318 &cortexa57_extra_costs
,
1319 &generic_addrcost_table
,
1320 &cortexa57_regmove_cost
,
1321 &cortexa57_vector_cost
,
1322 &generic_branch_cost
,
1323 &generic_approx_modes
,
1324 SVE_NOT_IMPLEMENTED
, /* sve_width */
1325 4, /* memmov_cost */
1327 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1328 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
1329 "16", /* function_align. */
1330 "4", /* jump_align. */
1331 "8", /* loop_align. */
1332 2, /* int_reassoc_width. */
1333 4, /* fp_reassoc_width. */
1334 1, /* vec_reassoc_width. */
1335 2, /* min_div_recip_mul_sf. */
1336 2, /* min_div_recip_mul_df. */
1337 0, /* max_case_values. */
1338 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1339 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1340 &generic_prefetch_tune
1343 static const struct tune_params cortexa73_tunings
=
1345 &cortexa57_extra_costs
,
1346 &generic_addrcost_table
,
1347 &cortexa57_regmove_cost
,
1348 &cortexa57_vector_cost
,
1349 &generic_branch_cost
,
1350 &generic_approx_modes
,
1351 SVE_NOT_IMPLEMENTED
, /* sve_width */
1352 4, /* memmov_cost. */
1353 2, /* issue_rate. */
1354 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1355 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
1356 "16", /* function_align. */
1357 "4", /* jump_align. */
1358 "8", /* loop_align. */
1359 2, /* int_reassoc_width. */
1360 4, /* fp_reassoc_width. */
1361 1, /* vec_reassoc_width. */
1362 2, /* min_div_recip_mul_sf. */
1363 2, /* min_div_recip_mul_df. */
1364 0, /* max_case_values. */
1365 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1366 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1367 &generic_prefetch_tune
1372 static const struct tune_params exynosm1_tunings
=
1374 &exynosm1_extra_costs
,
1375 &exynosm1_addrcost_table
,
1376 &exynosm1_regmove_cost
,
1377 &exynosm1_vector_cost
,
1378 &generic_branch_cost
,
1379 &exynosm1_approx_modes
,
1380 SVE_NOT_IMPLEMENTED
, /* sve_width */
1381 4, /* memmov_cost */
1383 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
1384 "4", /* function_align. */
1385 "4", /* jump_align. */
1386 "4", /* loop_align. */
1387 2, /* int_reassoc_width. */
1388 4, /* fp_reassoc_width. */
1389 1, /* vec_reassoc_width. */
1390 2, /* min_div_recip_mul_sf. */
1391 2, /* min_div_recip_mul_df. */
1392 48, /* max_case_values. */
1393 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1394 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1395 &exynosm1_prefetch_tune
1398 static const struct tune_params thunderxt88_tunings
=
1400 &thunderx_extra_costs
,
1401 &generic_addrcost_table
,
1402 &thunderx_regmove_cost
,
1403 &thunderx_vector_cost
,
1404 &generic_branch_cost
,
1405 &generic_approx_modes
,
1406 SVE_NOT_IMPLEMENTED
, /* sve_width */
1407 6, /* memmov_cost */
1409 AARCH64_FUSE_ALU_BRANCH
, /* fusible_ops */
1410 "8", /* function_align. */
1411 "8", /* jump_align. */
1412 "8", /* loop_align. */
1413 2, /* int_reassoc_width. */
1414 4, /* fp_reassoc_width. */
1415 1, /* vec_reassoc_width. */
1416 2, /* min_div_recip_mul_sf. */
1417 2, /* min_div_recip_mul_df. */
1418 0, /* max_case_values. */
1419 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1420 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
1421 &thunderxt88_prefetch_tune
1424 static const struct tune_params thunderx_tunings
=
1426 &thunderx_extra_costs
,
1427 &generic_addrcost_table
,
1428 &thunderx_regmove_cost
,
1429 &thunderx_vector_cost
,
1430 &generic_branch_cost
,
1431 &generic_approx_modes
,
1432 SVE_NOT_IMPLEMENTED
, /* sve_width */
1433 6, /* memmov_cost */
1435 AARCH64_FUSE_ALU_BRANCH
, /* fusible_ops */
1436 "8", /* function_align. */
1437 "8", /* jump_align. */
1438 "8", /* loop_align. */
1439 2, /* int_reassoc_width. */
1440 4, /* fp_reassoc_width. */
1441 1, /* vec_reassoc_width. */
1442 2, /* min_div_recip_mul_sf. */
1443 2, /* min_div_recip_mul_df. */
1444 0, /* max_case_values. */
1445 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1446 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1447 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
1448 &thunderx_prefetch_tune
1451 static const struct tune_params tsv110_tunings
=
1453 &tsv110_extra_costs
,
1454 &tsv110_addrcost_table
,
1455 &tsv110_regmove_cost
,
1456 &tsv110_vector_cost
,
1457 &generic_branch_cost
,
1458 &generic_approx_modes
,
1459 SVE_NOT_IMPLEMENTED
, /* sve_width */
1460 4, /* memmov_cost */
1462 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_ALU_BRANCH
1463 | AARCH64_FUSE_ALU_CBZ
), /* fusible_ops */
1464 "16", /* function_align. */
1465 "4", /* jump_align. */
1466 "8", /* loop_align. */
1467 2, /* int_reassoc_width. */
1468 4, /* fp_reassoc_width. */
1469 1, /* vec_reassoc_width. */
1470 2, /* min_div_recip_mul_sf. */
1471 2, /* min_div_recip_mul_df. */
1472 0, /* max_case_values. */
1473 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1474 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1475 &tsv110_prefetch_tune
1478 static const struct tune_params xgene1_tunings
=
1480 &xgene1_extra_costs
,
1481 &xgene1_addrcost_table
,
1482 &xgene1_regmove_cost
,
1483 &xgene1_vector_cost
,
1484 &generic_branch_cost
,
1485 &xgene1_approx_modes
,
1486 SVE_NOT_IMPLEMENTED
, /* sve_width */
1487 6, /* memmov_cost */
1489 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1490 "16", /* function_align. */
1491 "16", /* jump_align. */
1492 "16", /* loop_align. */
1493 2, /* int_reassoc_width. */
1494 4, /* fp_reassoc_width. */
1495 1, /* vec_reassoc_width. */
1496 2, /* min_div_recip_mul_sf. */
1497 2, /* min_div_recip_mul_df. */
1498 17, /* max_case_values. */
1499 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1500 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1501 &xgene1_prefetch_tune
1504 static const struct tune_params emag_tunings
=
1506 &xgene1_extra_costs
,
1507 &xgene1_addrcost_table
,
1508 &xgene1_regmove_cost
,
1509 &xgene1_vector_cost
,
1510 &generic_branch_cost
,
1511 &xgene1_approx_modes
,
1512 SVE_NOT_IMPLEMENTED
,
1513 6, /* memmov_cost */
1515 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1516 "16", /* function_align. */
1517 "16", /* jump_align. */
1518 "16", /* loop_align. */
1519 2, /* int_reassoc_width. */
1520 4, /* fp_reassoc_width. */
1521 1, /* vec_reassoc_width. */
1522 2, /* min_div_recip_mul_sf. */
1523 2, /* min_div_recip_mul_df. */
1524 17, /* max_case_values. */
1525 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1526 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1527 &xgene1_prefetch_tune
1530 static const struct tune_params qdf24xx_tunings
=
1532 &qdf24xx_extra_costs
,
1533 &qdf24xx_addrcost_table
,
1534 &qdf24xx_regmove_cost
,
1535 &qdf24xx_vector_cost
,
1536 &generic_branch_cost
,
1537 &generic_approx_modes
,
1538 SVE_NOT_IMPLEMENTED
, /* sve_width */
1539 4, /* memmov_cost */
1541 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1542 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1543 "16", /* function_align. */
1544 "8", /* jump_align. */
1545 "16", /* loop_align. */
1546 2, /* int_reassoc_width. */
1547 4, /* fp_reassoc_width. */
1548 1, /* vec_reassoc_width. */
1549 2, /* min_div_recip_mul_sf. */
1550 2, /* min_div_recip_mul_df. */
1551 0, /* max_case_values. */
1552 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1553 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1554 &qdf24xx_prefetch_tune
/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
1559 static const struct tune_params saphira_tunings
=
1561 &generic_extra_costs
,
1562 &generic_addrcost_table
,
1563 &generic_regmove_cost
,
1564 &generic_vector_cost
,
1565 &generic_branch_cost
,
1566 &generic_approx_modes
,
1567 SVE_NOT_IMPLEMENTED
, /* sve_width */
1568 4, /* memmov_cost */
1570 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1571 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1572 "16", /* function_align. */
1573 "8", /* jump_align. */
1574 "16", /* loop_align. */
1575 2, /* int_reassoc_width. */
1576 4, /* fp_reassoc_width. */
1577 1, /* vec_reassoc_width. */
1578 2, /* min_div_recip_mul_sf. */
1579 2, /* min_div_recip_mul_df. */
1580 0, /* max_case_values. */
1581 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1582 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1583 &generic_prefetch_tune
1586 static const struct tune_params thunderx2t99_tunings
=
1588 &thunderx2t99_extra_costs
,
1589 &thunderx2t99_addrcost_table
,
1590 &thunderx2t99_regmove_cost
,
1591 &thunderx2t99_vector_cost
,
1592 &generic_branch_cost
,
1593 &generic_approx_modes
,
1594 SVE_NOT_IMPLEMENTED
, /* sve_width */
1595 4, /* memmov_cost. */
1596 4, /* issue_rate. */
1597 (AARCH64_FUSE_ALU_BRANCH
| AARCH64_FUSE_AES_AESMC
1598 | AARCH64_FUSE_ALU_CBZ
), /* fusible_ops */
1599 "16", /* function_align. */
1600 "8", /* jump_align. */
1601 "16", /* loop_align. */
1602 3, /* int_reassoc_width. */
1603 2, /* fp_reassoc_width. */
1604 2, /* vec_reassoc_width. */
1605 2, /* min_div_recip_mul_sf. */
1606 2, /* min_div_recip_mul_df. */
1607 0, /* max_case_values. */
1608 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1609 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1610 &thunderx2t99_prefetch_tune
1613 static const struct tune_params thunderx3t110_tunings
=
1615 &thunderx3t110_extra_costs
,
1616 &thunderx3t110_addrcost_table
,
1617 &thunderx3t110_regmove_cost
,
1618 &thunderx3t110_vector_cost
,
1619 &generic_branch_cost
,
1620 &generic_approx_modes
,
1621 SVE_NOT_IMPLEMENTED
, /* sve_width */
1622 4, /* memmov_cost. */
1623 6, /* issue_rate. */
1624 (AARCH64_FUSE_ALU_BRANCH
| AARCH64_FUSE_AES_AESMC
1625 | AARCH64_FUSE_ALU_CBZ
), /* fusible_ops */
1626 "16", /* function_align. */
1627 "8", /* jump_align. */
1628 "16", /* loop_align. */
1629 3, /* int_reassoc_width. */
1630 2, /* fp_reassoc_width. */
1631 2, /* vec_reassoc_width. */
1632 2, /* min_div_recip_mul_sf. */
1633 2, /* min_div_recip_mul_df. */
1634 0, /* max_case_values. */
1635 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1636 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1637 &thunderx3t110_prefetch_tune
1640 static const struct tune_params neoversen1_tunings
=
1642 &cortexa76_extra_costs
,
1643 &generic_addrcost_table
,
1644 &generic_regmove_cost
,
1645 &cortexa57_vector_cost
,
1646 &generic_branch_cost
,
1647 &generic_approx_modes
,
1648 SVE_NOT_IMPLEMENTED
, /* sve_width */
1649 4, /* memmov_cost */
1651 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
), /* fusible_ops */
1652 "32:16", /* function_align. */
1653 "4", /* jump_align. */
1654 "32:16", /* loop_align. */
1655 2, /* int_reassoc_width. */
1656 4, /* fp_reassoc_width. */
1657 2, /* vec_reassoc_width. */
1658 2, /* min_div_recip_mul_sf. */
1659 2, /* min_div_recip_mul_df. */
1660 0, /* max_case_values. */
1661 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1662 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1663 &generic_prefetch_tune
1666 static const advsimd_vec_cost neoversev1_advsimd_vector_cost
=
1668 2, /* int_stmt_cost */
1669 2, /* fp_stmt_cost */
1670 4, /* ld2_st2_permute_cost */
1671 4, /* ld3_st3_permute_cost */
1672 5, /* ld4_st4_permute_cost */
1673 3, /* permute_cost */
1674 4, /* reduc_i8_cost */
1675 4, /* reduc_i16_cost */
1676 2, /* reduc_i32_cost */
1677 2, /* reduc_i64_cost */
1678 6, /* reduc_f16_cost */
1679 3, /* reduc_f32_cost */
1680 2, /* reduc_f64_cost */
1681 2, /* store_elt_extra_cost */
1682 /* This value is just inherited from the Cortex-A57 table. */
1683 8, /* vec_to_scalar_cost */
1684 /* This depends very much on what the scalar value is and
1685 where it comes from. E.g. some constants take two dependent
1686 instructions or a load, while others might be moved from a GPR.
1687 4 seems to be a reasonable compromise in practice. */
1688 4, /* scalar_to_vec_cost */
1689 4, /* align_load_cost */
1690 4, /* unalign_load_cost */
1691 /* Although stores have a latency of 2 and compete for the
1692 vector pipes, in practice it's better not to model that. */
1693 1, /* unalign_store_cost */
1697 static const sve_vec_cost neoversev1_sve_vector_cost
=
1700 2, /* int_stmt_cost */
1701 2, /* fp_stmt_cost */
1702 4, /* ld2_st2_permute_cost */
1703 7, /* ld3_st3_permute_cost */
1704 8, /* ld4_st4_permute_cost */
1705 3, /* permute_cost */
1706 /* Theoretically, a reduction involving 31 scalar ADDs could
1707 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
1708 completes in 14 cycles, so give it a cost of 31 + 5. */
1709 36, /* reduc_i8_cost */
1710 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
1711 22, /* reduc_i16_cost */
1712 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
1713 14, /* reduc_i32_cost */
1714 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
1715 11, /* reduc_i64_cost */
1716 /* Theoretically, a reduction involving 15 scalar FADDs could
1717 complete in ~9 cycles and would have a cost of 30. FADDV
1718 completes in 13 cycles, so give it a cost of 30 + 4. */
1719 34, /* reduc_f16_cost */
1720 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
1721 19, /* reduc_f32_cost */
1722 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
1723 11, /* reduc_f64_cost */
1724 2, /* store_elt_extra_cost */
1725 /* This value is just inherited from the Cortex-A57 table. */
1726 8, /* vec_to_scalar_cost */
1727 /* See the comment above the Advanced SIMD versions. */
1728 4, /* scalar_to_vec_cost */
1729 4, /* align_load_cost */
1730 4, /* unalign_load_cost */
1731 /* Although stores have a latency of 2 and compete for the
1732 vector pipes, in practice it's better not to model that. */
1733 1, /* unalign_store_cost */
1737 19, /* fadda_f16_cost */
1738 11, /* fadda_f32_cost */
1739 8, /* fadda_f64_cost */
1740 3 /* scatter_store_elt_cost */
1743 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info
=
1745 3, /* loads_stores_per_cycle */
1746 2, /* stores_per_cycle */
1747 4, /* general_ops_per_cycle */
1748 0, /* fp_simd_load_general_ops */
1749 1 /* fp_simd_store_general_ops */
1752 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info
=
1755 3, /* loads_stores_per_cycle */
1756 2, /* stores_per_cycle */
1757 4, /* general_ops_per_cycle */
1758 0, /* fp_simd_load_general_ops */
1759 1 /* fp_simd_store_general_ops */
1761 2, /* ld2_st2_general_ops */
1762 2, /* ld3_st3_general_ops */
1763 3 /* ld4_st4_general_ops */
1766 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info
=
1770 2, /* loads_per_cycle */
1771 2, /* stores_per_cycle */
1772 2, /* general_ops_per_cycle */
1773 0, /* fp_simd_load_general_ops */
1774 1 /* fp_simd_store_general_ops */
1776 2, /* ld2_st2_general_ops */
1777 2, /* ld3_st3_general_ops */
1778 3 /* ld4_st4_general_ops */
1780 1, /* pred_ops_per_cycle */
1781 2, /* while_pred_ops */
1782 2, /* int_cmp_pred_ops */
1783 1, /* fp_cmp_pred_ops */
1784 1, /* gather_scatter_pair_general_ops */
1785 1 /* gather_scatter_pair_pred_ops */
1788 static const aarch64_vec_issue_info neoversev1_vec_issue_info
=
1790 &neoversev1_scalar_issue_info
,
1791 &neoversev1_advsimd_issue_info
,
1792 &neoversev1_sve_issue_info
1795 /* Neoverse V1 costs for vector insn classes. */
1796 static const struct cpu_vector_cost neoversev1_vector_cost
=
1798 1, /* scalar_int_stmt_cost */
1799 2, /* scalar_fp_stmt_cost */
1800 4, /* scalar_load_cost */
1801 1, /* scalar_store_cost */
1802 1, /* cond_taken_branch_cost */
1803 1, /* cond_not_taken_branch_cost */
1804 &neoversev1_advsimd_vector_cost
, /* advsimd */
1805 &neoversev1_sve_vector_cost
, /* sve */
1806 &neoversev1_vec_issue_info
/* issue_info */
1809 static const struct tune_params neoversev1_tunings
=
1811 &cortexa76_extra_costs
,
1812 &neoversev1_addrcost_table
,
1813 &generic_regmove_cost
,
1814 &neoversev1_vector_cost
,
1815 &generic_branch_cost
,
1816 &generic_approx_modes
,
1817 SVE_256
, /* sve_width */
1818 4, /* memmov_cost */
1820 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
), /* fusible_ops */
1821 "32:16", /* function_align. */
1822 "4", /* jump_align. */
1823 "32:16", /* loop_align. */
1824 2, /* int_reassoc_width. */
1825 4, /* fp_reassoc_width. */
1826 2, /* vec_reassoc_width. */
1827 2, /* min_div_recip_mul_sf. */
1828 2, /* min_div_recip_mul_df. */
1829 0, /* max_case_values. */
1830 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1831 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
1832 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
1833 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
), /* tune_flags. */
1834 &generic_prefetch_tune
1837 static const struct tune_params neoversen2_tunings
=
1839 &cortexa76_extra_costs
,
1840 &generic_addrcost_table
,
1841 &generic_regmove_cost
,
1842 &cortexa57_vector_cost
,
1843 &generic_branch_cost
,
1844 &generic_approx_modes
,
1845 SVE_128
, /* sve_width */
1846 4, /* memmov_cost */
1848 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
), /* fusible_ops */
1849 "32:16", /* function_align. */
1850 "4", /* jump_align. */
1851 "32:16", /* loop_align. */
1852 2, /* int_reassoc_width. */
1853 4, /* fp_reassoc_width. */
1854 2, /* vec_reassoc_width. */
1855 2, /* min_div_recip_mul_sf. */
1856 2, /* min_div_recip_mul_df. */
1857 0, /* max_case_values. */
1858 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1859 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1860 &generic_prefetch_tune
1863 static const struct tune_params a64fx_tunings
=
1866 &a64fx_addrcost_table
,
1867 &a64fx_regmove_cost
,
1869 &generic_branch_cost
,
1870 &generic_approx_modes
,
1871 SVE_512
, /* sve_width */
1872 4, /* memmov_cost */
1874 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
), /* fusible_ops */
1875 "32", /* function_align. */
1876 "16", /* jump_align. */
1877 "32", /* loop_align. */
1878 4, /* int_reassoc_width. */
1879 2, /* fp_reassoc_width. */
1880 2, /* vec_reassoc_width. */
1881 2, /* min_div_recip_mul_sf. */
1882 2, /* min_div_recip_mul_df. */
1883 0, /* max_case_values. */
1884 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1885 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1886 &a64fx_prefetch_tune
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const uint64_t flags;
  const struct tune_params *const tune;
};
/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
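/* Illustrative only: a hypothetical aarch64-cores.def entry such as

     AARCH64_CORE ("example-cpu", examplecpu, cortexa57, 8_2A,
		   AARCH64_FL_FOR_ARCH8_2, examplecpu, 0x00, 0x000, -1)

   would expand via the AARCH64_CORE macro above into a table row whose
   architecture_version is looked up in all_architectures and whose tuning
   parameters come from examplecpu_tunings.  The core names here are made
   up; see aarch64-cores.def for the real entries.  */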
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  gcc_unreachable ();
}
1984 static const struct attribute_spec aarch64_attribute_table
[] =
1986 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1987 affects_type_identity, handler, exclude } */
1988 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1989 handle_aarch64_vector_pcs_attribute
, NULL
},
1990 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
1991 aarch64_sve::handle_arm_sve_vector_bits_attribute
,
1993 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL
, NULL
},
1994 { "SVE type", 3, 3, false, true, false, true, NULL
, NULL
},
1995 { "SVE sizeless type", 0, 0, false, true, false, true, NULL
, NULL
},
1996 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
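/* Usage sketch (user code, not part of this file): the first two entries
   correspond to declarations along the lines of

     void f (float *x) __attribute__ ((aarch64_vector_pcs));
     typedef svint32_t fixed_int32_t
       __attribute__ ((arm_sve_vector_bits (256)));

   The remaining entries are internal attributes that the compiler attaches
   to Advanced SIMD and SVE built-in types.  */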
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
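/* For example: the architectural condition codes come in complementary
   pairs that differ only in bit 0, so flipping that bit gives the inverse
   condition:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */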
struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
       should print an error.
     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
       own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};
static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
  aarch64_enable_bti = 0;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch64_enable_bti = 1;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
				   char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
			     char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
			      char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
			       char* rest ATTRIBUTE_UNUSED)
{
  aarch64_enable_bti = 1;
  return AARCH64_PARSE_OK;
}

static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", aarch64_handle_bti_protection, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};
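
/* Illustration (a hypothetical walk-through, not part of the tables above):
   parsing -mbranch-protection=pac-ret+leaf+b-key starts in
   aarch64_branch_protect_types, matches "pac-ret" and calls
   aarch64_handle_pac_ret_protection, then looks up "leaf" and "b-key" in
   aarch64_pac_ret_subtypes and calls their handlers in turn.  The net
   effect is aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL and
   aarch64_ra_sign_key == AARCH64_KEY_B.  */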

/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};

/* Return the assembly token for svpattern value VALUE.  */

static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}

/* Return the location of a piece that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
					 unsigned int first_pr) const
{
  gcc_assert (VECTOR_MODE_P (mode)
	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);

  if (num_zr > 0 && num_pr == 0)
    return gen_rtx_REG (mode, first_zr);

  if (num_zr == 0 && num_pr == 1)
    return gen_rtx_REG (mode, first_pr);

  gcc_unreachable ();
}

/* Return the total number of vector registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_zr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_zr;
  return res;
}

/* Return the total number of predicate registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_pr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_pr;
  return res;
}

/* Return the location of a PST that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
				  unsigned int first_zr,
				  unsigned int first_pr) const
{
  /* Try to return a single REG if possible.  This leads to better
     code generation; it isn't required for correctness.  */
  if (mode == pieces[0].mode)
    {
      gcc_assert (pieces.length () == 1);
      return pieces[0].get_rtx (first_zr, first_pr);
    }

  /* Build up a PARALLEL that contains the individual pieces.  */
  rtvec rtxes = rtvec_alloc (pieces.length ());
  for (unsigned int i = 0; i < pieces.length (); ++i)
    {
      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
      first_zr += pieces[i].num_zr;
      first_pr += pieces[i].num_pr;
    }
  return gen_rtx_PARALLEL (mode, rtxes);
}
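
/* Illustration (a hypothetical PST, not taken from the sources): for a
   structure containing an svfloat32_t member followed by an svbool_t
   member, the two pieces cannot be folded into one mode, so the value is
   described as a PARALLEL of two EXPR_LISTs, roughly

     (parallel [(expr_list (reg:VNx4SF <first_zr>) (const_int 0))
		(expr_list (reg:VNx16BI <first_pr>) <byte offset>)])

   whereas a lone svfloat32_t takes the single-REG fast path above.  */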

/* Analyze whether TYPE is a Pure Scalable Type according to the rules
   of the AAPCS64.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
  /* Prevent accidental reuse.  */
  gcc_assert (pieces.is_empty ());

  /* No code will be generated for erroneous types, so we won't establish
     an ABI mapping.  */
  if (type == error_mark_node)
    return NO_ABI_IDENTITY;

  /* Zero-sized types disappear in the language->ABI mapping.  */
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return NO_ABI_IDENTITY;

  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
  piece p = {};
  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
    {
      machine_mode mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));

      p.mode = p.orig_mode = mode;
      add_piece (p);
      return IS_PST;
    }

  /* Check for user-defined PSTs.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return analyze_array (type);
  if (TREE_CODE (type) == RECORD_TYPE)
    return analyze_record (type);

  return ISNT_PST;
}

/* Analyze a type that is known not to be passed or returned in memory.
   Return true if it has an ABI identity and is a Pure Scalable Type.  */

bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
  analysis_result result = analyze (type);
  gcc_assert (result != DOESNT_MATTER);
  return result == IS_PST;
}

/* Subroutine of analyze for handling ARRAY_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
  /* Analyze the element type.  */
  pure_scalable_type_info element_info;
  analysis_result result = element_info.analyze (TREE_TYPE (type));
  if (result != IS_PST)
    return result;

  /* An array of unknown, flexible or variable length will be passed and
     returned by reference whatever we do.  */
  tree nelts_minus_one = array_type_nelts (type);
  if (!tree_fits_uhwi_p (nelts_minus_one))
    return DOESNT_MATTER;

  /* Likewise if the array is constant-sized but too big to be interesting.
     The double checks against MAX_PIECES are to protect against overflow.  */
  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
  if (count > MAX_PIECES)
    return DOESNT_MATTER;
  count += 1;
  if (count * element_info.pieces.length () > MAX_PIECES)
    return DOESNT_MATTER;

  /* The above checks should have weeded out elements of unknown size.  */
  poly_uint64 element_bytes;
  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
    gcc_unreachable ();

  /* Build up the list of individual vectors and predicates.  */
  gcc_assert (!element_info.pieces.is_empty ());
  for (unsigned int i = 0; i < count; ++i)
    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
      {
	piece p = element_info.pieces[j];
	p.offset += i * element_bytes;
	add_piece (p);
      }
  return IS_PST;
}
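
/* Illustration (a hypothetical input, not from the sources): for an array
   of three PST elements whose analysis yields a single VNx4SI piece each,
   the loop above replicates that piece at offsets 0, 1 and 2 times the
   element size, giving a three-piece PST that add_piece may later fold
   into a single x3-tuple piece such as VNx12SI.  */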

/* Subroutine of analyze for handling RECORD_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
    {
      if (TREE_CODE (field) != FIELD_DECL)
	continue;

      /* Zero-sized fields disappear in the language->ABI mapping.  */
      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
	continue;

      /* All fields with an ABI identity must be PSTs for the record as
	 a whole to be a PST.  If any individual field is too big to be
	 interesting then the record is too.  */
      pure_scalable_type_info field_info;
      analysis_result subresult = field_info.analyze (TREE_TYPE (field));
      if (subresult == NO_ABI_IDENTITY)
	continue;
      if (subresult != IS_PST)
	return subresult;

      /* Since all previous fields are PSTs, we ought to be able to track
	 the field offset using poly_ints.  */
      tree bitpos = bit_position (field);
      gcc_assert (poly_int_tree_p (bitpos));

      /* For the same reason, it shouldn't be possible to create a PST field
	 whose offset isn't byte-aligned.  */
      poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
						BITS_PER_UNIT);

      /* Punt if the record is too big to be interesting.  */
      poly_uint64 bytepos;
      if (!wide_bytepos.to_uhwi (&bytepos)
	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
	return DOESNT_MATTER;

      /* Add the individual vectors and predicates in the field to the
	 record's list.  */
      gcc_assert (!field_info.pieces.is_empty ());
      for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
	{
	  piece p = field_info.pieces[i];
	  p.offset += bytepos;
	  add_piece (p);
	}
    }

  /* Empty structures disappear in the language->ABI mapping.  */
  return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}

/* Add P to the list of pieces in the type.  */

void
pure_scalable_type_info::add_piece (const piece &p)
{
  /* Try to fold the new piece into the previous one to form a
     single-mode PST.  For example, if we see three consecutive vectors
     of the same mode, we can represent them using the corresponding
     3-tuple mode.

     This is purely an optimization.  */
  if (!pieces.is_empty ())
    {
      piece &prev = pieces.last ();
      gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
      unsigned int nelems1, nelems2;
      if (prev.orig_mode == p.orig_mode
	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
	  && targetm.array_mode (p.orig_mode,
				 nelems1 + nelems2).exists (&prev.mode))
	{
	  prev.num_zr += p.num_zr;
	  prev.num_pr += p.num_pr;
	  return;
	}
    }
  pieces.quick_push (p);
}
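
/* Illustration (hypothetical, not from the sources): if the previous piece
   is a VNx4SI at offset 0 and the new piece is a VNx4SI at offset
   BYTES_PER_SVE_VECTOR, the pieces share the same original mode and are
   contiguous, so targetm.array_mode (VNx4SImode, 2) upgrades the previous
   piece to the x2 tuple mode VNx8SI instead of pushing a second entry.  */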

/* Return true if at least one possible value of type TYPE includes at
   least one object of Pure Scalable Type, in the sense of the AAPCS64.

   This is a relatively expensive test for some types, so it should
   generally be made as late as possible.  */

static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return false;

  if (aarch64_sve::builtin_type_p (type))
    return true;

  if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
    return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));

  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      if (TREE_CODE (field) == FIELD_DECL
	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
	return true;

  return false;
}

/* Return the descriptor of the SIMD ABI.  */

static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}

/* Return the descriptor of the SVE PCS.  */

static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}
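
/* Illustration (descriptive only, not from the sources): relative to the
   base PCS, the two descriptors above differ in what a call is assumed to
   clobber.  A call to an aarch64_vector_pcs function is assumed to
   preserve Q8-Q23 in full rather than just the low 64 bits of V8-V15,
   and a call to an SVE function additionally preserves Z8-Z23 and the
   predicate registers P4-P15.  */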

/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
   wraps, otherwise return X itself.  */

static rtx
strip_salt (rtx x)
{
  rtx search = x;
  if (GET_CODE (search) == CONST)
    search = XEXP (search, 0);
  if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
    x = XVECEXP (search, 0, 0);
  return x;
}

/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
   expression.  */

static rtx
strip_offset_and_salt (rtx addr, poly_int64 *offset)
{
  return strip_salt (strip_offset (addr, offset));
}

/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
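
/* Illustration (hypothetical output, not from the sources): if a
   compare-and-branch such as "cbnz x0, <target>" is out of range, the
   caller passes the inverted short form (e.g. "cbz\t%0, ") as
   BRANCH_FORMAT, and the code above emits something like

     cbz	x0, .Llocal
     b	.Lfar_target
   .Llocal:

   i.e. a short branch around an unconditional branch that can reach the
   real destination.  */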

void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}

/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */
static void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");

  reported_p = true;
}

/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}

/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.
*/

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}

static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Avoid reassociating floating point addition so we emit more FMAs.  */
  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}

/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
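
/* Illustration (assumed DWARF numbering, for reference only): x7 maps to
   DWARF register 7, sp to 31, v3 to 64 + 3 = 67, p2 to 48 + 2 = 50 and
   the SVE vector-granule register VG to 46.  Anything else (for example
   the condition flags) reports DWARF_FRAME_REGISTERS, meaning "no DWARF
   equivalent".  */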

/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}

/* Return an estimate for the number of quadwords in an SVE vector.  This is
   equivalent to the number of Advanced SIMD vectors in an SVE vector.  */
static unsigned int
aarch64_estimated_sve_vq ()
{
  return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
}
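
/* Illustration (assumed command-line behaviour): with
   -msve-vector-bits=256 the estimate above is 256 / 128 = 2, i.e. each
   SVE vector is treated as two 128-bit Advanced SIMD vectors; with the
   default scalable setting, the estimate instead comes from the tuning
   target's expected vector length.  */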

/* Return true if MODE is any of the Advanced SIMD structure modes.  */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}

/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
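
/* Illustration (hypothetical queries, not from the sources): with these
   flags, aarch64_classify_vector_mode below describes V16QImode as
   VEC_ADVSIMD, OImode (an Advanced SIMD 2-vector tuple) as
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode as VEC_SVE_DATA, VNx2SImode as
   VEC_SVE_DATA | VEC_PARTIAL and VNx16BImode as VEC_SVE_PRED, assuming
   the corresponding target features are enabled.  */
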
2681 /* Return a set of flags describing the vector properties of mode MODE.
2682 Ignore modes that are not supported by the current target. */
2684 aarch64_classify_vector_mode (machine_mode mode
)
2686 if (aarch64_advsimd_struct_mode_p (mode
))
2687 return VEC_ADVSIMD
| VEC_STRUCT
;
2689 if (aarch64_sve_pred_mode_p (mode
))
2690 return VEC_SVE_PRED
;
2692 /* Make the decision based on the mode's enum value rather than its
2693 properties, so that we keep the correct classification regardless
2694 of -msve-vector-bits. */
2697 /* Partial SVE QI vectors. */
2701 /* Partial SVE HI vectors. */
2704 /* Partial SVE SI vector. */
2706 /* Partial SVE HF vectors. */
2709 /* Partial SVE BF vectors. */
2712 /* Partial SVE SF vector. */
2714 return TARGET_SVE
? VEC_SVE_DATA
| VEC_PARTIAL
: 0;
2724 return TARGET_SVE
? VEC_SVE_DATA
: 0;
2726 /* x2 SVE vectors. */
2735 /* x3 SVE vectors. */
2744 /* x4 SVE vectors. */
2753 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
2755 /* 64-bit Advanced SIMD vectors. */
2759 /* ...E_V1DImode doesn't exist. */
2764 /* 128-bit Advanced SIMD vectors. */
2773 return TARGET_SIMD
? VEC_ADVSIMD
: 0;

/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is any form of SVE mode, including predicates,
   vectors and structures.  */
bool
aarch64_sve_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}

/* Return the number of defined bytes in one constituent vector of
   SVE mode MODE, which has vector flags VEC_FLAGS.  */
static poly_int64
aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
{
  if (vec_flags & VEC_PARTIAL)
    /* A single partial vector.  */
    return GET_MODE_SIZE (mode);

  if (vec_flags & VEC_SVE_DATA)
    /* A single vector or a tuple.  */
    return BYTES_PER_SVE_VECTOR;

  /* A single predicate.  */
  gcc_assert (vec_flags & VEC_SVE_PRED);
  return BYTES_PER_SVE_PRED;
}
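
/* Illustration (hypothetical inputs): for the partial vector mode
   VNx2SImode the result is that mode's own (smaller) size, for a full
   vector or tuple such as VNx4SImode or VNx8SImode it is
   BYTES_PER_SVE_VECTOR, and for a predicate mode such as VNx16BImode it
   is BYTES_PER_SVE_PRED.  */
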
2822 /* Implement target hook TARGET_ARRAY_MODE. */
2823 static opt_machine_mode
2824 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
2826 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
2827 && IN_RANGE (nelems
, 2, 4))
2828 return mode_for_vector (GET_MODE_INNER (mode
),
2829 GET_MODE_NUNITS (mode
) * nelems
);
2831 return opt_machine_mode ();
2834 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2836 aarch64_array_mode_supported_p (machine_mode mode
,
2837 unsigned HOST_WIDE_INT nelems
)
2840 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
2841 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
2842 && (nelems
>= 2 && nelems
<= 4))
2848 /* MODE is some form of SVE vector mode. For data modes, return the number
2849 of vector register bits that each element of MODE occupies, such as 64
2850 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2851 in a 64-bit container). For predicate modes, return the number of
2852 data bits controlled by each significant predicate bit. */
2855 aarch64_sve_container_bits (machine_mode mode
)
2857 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
2858 poly_uint64 vector_bits
= (vec_flags
& (VEC_PARTIAL
| VEC_SVE_PRED
)
2859 ? BITS_PER_SVE_VECTOR
2860 : GET_MODE_BITSIZE (mode
));
2861 return vector_element_size (vector_bits
, GET_MODE_NUNITS (mode
));
2864 /* Return the SVE predicate mode to use for elements that have
2865 ELEM_NBYTES bytes, if such a mode exists. */
2868 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
2872 if (elem_nbytes
== 1)
2874 if (elem_nbytes
== 2)
2876 if (elem_nbytes
== 4)
2878 if (elem_nbytes
== 8)
2881 return opt_machine_mode ();
2884 /* Return the SVE predicate mode that should be used to control
2888 aarch64_sve_pred_mode (machine_mode mode
)
2890 unsigned int bits
= aarch64_sve_container_bits (mode
);
2891 return aarch64_sve_pred_mode (bits
/ BITS_PER_UNIT
).require ();
2894 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2896 static opt_machine_mode
2897 aarch64_get_mask_mode (machine_mode mode
)
2899 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
2900 if (vec_flags
& VEC_SVE_DATA
)
2901 return aarch64_sve_pred_mode (mode
);
2903 return default_get_mask_mode (mode
);
2906 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2909 aarch64_sve_data_mode (scalar_mode inner_mode
, poly_uint64 nunits
)
2911 enum mode_class mclass
= (is_a
<scalar_float_mode
> (inner_mode
)
2912 ? MODE_VECTOR_FLOAT
: MODE_VECTOR_INT
);
2914 FOR_EACH_MODE_IN_CLASS (mode
, mclass
)
2915 if (inner_mode
== GET_MODE_INNER (mode
)
2916 && known_eq (nunits
, GET_MODE_NUNITS (mode
))
2917 && aarch64_sve_data_mode_p (mode
))
2919 return opt_machine_mode ();
2922 /* Return the integer element mode associated with SVE mode MODE. */
2924 static scalar_int_mode
2925 aarch64_sve_element_int_mode (machine_mode mode
)
2927 poly_uint64 vector_bits
= (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2928 ? BITS_PER_SVE_VECTOR
2929 : GET_MODE_BITSIZE (mode
));
2930 unsigned int elt_bits
= vector_element_size (vector_bits
,
2931 GET_MODE_NUNITS (mode
));
2932 return int_mode_for_size (elt_bits
, 0).require ();
2935 /* Return an integer element mode that contains exactly
2936 aarch64_sve_container_bits (MODE) bits. This is wider than
2937 aarch64_sve_element_int_mode if MODE is a partial vector,
2938 otherwise it's the same. */
2940 static scalar_int_mode
2941 aarch64_sve_container_int_mode (machine_mode mode
)
2943 return int_mode_for_size (aarch64_sve_container_bits (mode
), 0).require ();
2946 /* Return the integer vector mode associated with SVE mode MODE.
2947 Unlike related_int_vector_mode, this can handle the case in which
2948 MODE is a predicate (and thus has a different total size). */
2951 aarch64_sve_int_mode (machine_mode mode
)
2953 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
2954 return aarch64_sve_data_mode (int_mode
, GET_MODE_NUNITS (mode
)).require ();
2957 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2959 static opt_machine_mode
2960 aarch64_vectorize_related_mode (machine_mode vector_mode
,
2961 scalar_mode element_mode
,
2964 unsigned int vec_flags
= aarch64_classify_vector_mode (vector_mode
);
2966 /* If we're operating on SVE vectors, try to return an SVE mode. */
2967 poly_uint64 sve_nunits
;
2968 if ((vec_flags
& VEC_SVE_DATA
)
2969 && multiple_p (BYTES_PER_SVE_VECTOR
,
2970 GET_MODE_SIZE (element_mode
), &sve_nunits
))
2972 machine_mode sve_mode
;
2973 if (maybe_ne (nunits
, 0U))
2975 /* Try to find a full or partial SVE mode with exactly
2977 if (multiple_p (sve_nunits
, nunits
)
2978 && aarch64_sve_data_mode (element_mode
,
2979 nunits
).exists (&sve_mode
))
2984 /* Take the preferred number of units from the number of bytes
2985 that fit in VECTOR_MODE. We always start by "autodetecting"
2986 a full vector mode with preferred_simd_mode, so vectors
2987 chosen here will also be full vector modes. Then
2988 autovectorize_vector_modes tries smaller starting modes
2989 and thus smaller preferred numbers of units. */
2990 sve_nunits
= ordered_min (sve_nunits
, GET_MODE_SIZE (vector_mode
));
2991 if (aarch64_sve_data_mode (element_mode
,
2992 sve_nunits
).exists (&sve_mode
))
2997 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2998 if ((vec_flags
& VEC_ADVSIMD
)
2999 && known_eq (nunits
, 0U)
3000 && known_eq (GET_MODE_BITSIZE (vector_mode
), 64U)
3001 && maybe_ge (GET_MODE_BITSIZE (element_mode
)
3002 * GET_MODE_NUNITS (vector_mode
), 128U))
3004 machine_mode res
= aarch64_simd_container_mode (element_mode
, 128);
3005 if (VECTOR_MODE_P (res
))
3009 return default_vectorize_related_mode (vector_mode
, element_mode
, nunits
);
3012 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3013 prefer to use the first arithmetic operand as the else value if
3014 the else value doesn't matter, since that exactly matches the SVE
3015 destructive merging form. For ternary operations we could either
3016 pick the first operand and use FMAD-like instructions or the last
3017 operand and use FMLA-like instructions; the latter seems more
3021 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
3023 return nops
== 3 ? ops
[2] : ops
[0];
3026 /* Implement TARGET_HARD_REGNO_NREGS. */
3029 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
3031 /* ??? Logically we should only need to provide a value when
3032 HARD_REGNO_MODE_OK says that the combination is valid,
3033 but at the moment we need to handle all modes. Just ignore
3034 any runtime parts for registers that can't store them. */
3035 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
3036 switch (aarch64_regno_regclass (regno
))
3042 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
3043 if (vec_flags
& VEC_SVE_DATA
)
3044 return exact_div (GET_MODE_SIZE (mode
),
3045 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
3046 return CEIL (lowest_size
, UNITS_PER_VREG
);
3052 case PR_AND_FFR_REGS
:
3055 return CEIL (lowest_size
, UNITS_PER_WORD
);
3060 /* Implement TARGET_HARD_REGNO_MODE_OK. */
3063 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
3065 if (GET_MODE_CLASS (mode
) == MODE_CC
)
3066 return regno
== CC_REGNUM
;
3068 if (regno
== VG_REGNUM
)
3069 /* This must have the same size as _Unwind_Word. */
3070 return mode
== DImode
;
3072 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
3073 if (vec_flags
& VEC_SVE_PRED
)
3074 return pr_or_ffr_regnum_p (regno
);
3076 if (pr_or_ffr_regnum_p (regno
))
3079 if (regno
== SP_REGNUM
)
3080 /* The purpose of comparing with ptr_mode is to support the
3081 global register variable associated with the stack pointer
3082 register via the syntax of asm ("wsp") in ILP32. */
3083 return mode
== Pmode
|| mode
== ptr_mode
;
3085 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
3086 return mode
== Pmode
;
3088 if (GP_REGNUM_P (regno
))
3090 if (vec_flags
& VEC_ANY_SVE
)
3092 if (known_le (GET_MODE_SIZE (mode
), 8))
3094 if (known_le (GET_MODE_SIZE (mode
), 16))
3095 return (regno
& 1) == 0;
3097 else if (FP_REGNUM_P (regno
))
3099 if (vec_flags
& VEC_STRUCT
)
3100 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
3102 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
3108 /* Return true if a function with type FNTYPE returns its value in
3109 SVE vector or predicate registers. */
3112 aarch64_returns_value_in_sve_regs_p (const_tree fntype
)
3114 tree return_type
= TREE_TYPE (fntype
);
3116 pure_scalable_type_info pst_info
;
3117 switch (pst_info
.analyze (return_type
))
3119 case pure_scalable_type_info::IS_PST
:
3120 return (pst_info
.num_zr () <= NUM_FP_ARG_REGS
3121 && pst_info
.num_pr () <= NUM_PR_ARG_REGS
);
3123 case pure_scalable_type_info::DOESNT_MATTER
:
3124 gcc_assert (aarch64_return_in_memory_1 (return_type
));
3127 case pure_scalable_type_info::NO_ABI_IDENTITY
:
3128 case pure_scalable_type_info::ISNT_PST
:
3134 /* Return true if a function with type FNTYPE takes arguments in
3135 SVE vector or predicate registers. */
3138 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype
)
3140 CUMULATIVE_ARGS args_so_far_v
;
3141 aarch64_init_cumulative_args (&args_so_far_v
, NULL_TREE
, NULL_RTX
,
3142 NULL_TREE
, 0, true);
3143 cumulative_args_t args_so_far
= pack_cumulative_args (&args_so_far_v
);
3145 for (tree chain
= TYPE_ARG_TYPES (fntype
);
3146 chain
&& chain
!= void_list_node
;
3147 chain
= TREE_CHAIN (chain
))
3149 tree arg_type
= TREE_VALUE (chain
);
3150 if (arg_type
== error_mark_node
)
3153 function_arg_info
arg (arg_type
, /*named=*/true);
3154 apply_pass_by_reference_rules (&args_so_far_v
, arg
);
3155 pure_scalable_type_info pst_info
;
3156 if (pst_info
.analyze_registers (arg
.type
))
3158 unsigned int end_zr
= args_so_far_v
.aapcs_nvrn
+ pst_info
.num_zr ();
3159 unsigned int end_pr
= args_so_far_v
.aapcs_nprn
+ pst_info
.num_pr ();
3160 gcc_assert (end_zr
<= NUM_FP_ARG_REGS
&& end_pr
<= NUM_PR_ARG_REGS
);
3164 targetm
.calls
.function_arg_advance (args_so_far
, arg
);
3169 /* Implement TARGET_FNTYPE_ABI. */
3171 static const predefined_function_abi
&
3172 aarch64_fntype_abi (const_tree fntype
)
3174 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)))
3175 return aarch64_simd_abi ();
3177 if (aarch64_returns_value_in_sve_regs_p (fntype
)
3178 || aarch64_takes_arguments_in_sve_regs_p (fntype
))
3179 return aarch64_sve_abi ();
3181 return default_function_abi
;
3184 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
3187 aarch64_compatible_vector_types_p (const_tree type1
, const_tree type2
)
3189 return (aarch64_sve::builtin_type_p (type1
)
3190 == aarch64_sve::builtin_type_p (type2
));
3193 /* Return true if we should emit CFI for register REGNO. */
3196 aarch64_emit_cfi_for_reg_p (unsigned int regno
)
3198 return (GP_REGNUM_P (regno
)
3199 || !default_function_abi
.clobbers_full_reg_p (regno
));
3202 /* Return the mode we should use to save and restore register REGNO. */
3205 aarch64_reg_save_mode (unsigned int regno
)
3207 if (GP_REGNUM_P (regno
))
3210 if (FP_REGNUM_P (regno
))
3211 switch (crtl
->abi
->id ())
3213 case ARM_PCS_AAPCS64
:
3214 /* Only the low 64 bits are saved by the base PCS. */
3218 /* The vector PCS saves the low 128 bits (which is the full
3219 register on non-SVE targets). */
3223 /* Use vectors of DImode for registers that need frame
3224 information, so that the first 64 bytes of the save slot
3225 are always the equivalent of what storing D<n> would give. */
3226 if (aarch64_emit_cfi_for_reg_p (regno
))
3229 /* Use vectors of bytes otherwise, so that the layout is
3230 endian-agnostic, and so that we can use LDR and STR for
3231 big-endian targets. */
3234 case ARM_PCS_TLSDESC
:
3235 case ARM_PCS_UNKNOWN
:
3239 if (PR_REGNUM_P (regno
))
3240 /* Save the full predicate register. */
3246 /* Implement TARGET_INSN_CALLEE_ABI. */
3248 const predefined_function_abi
&
3249 aarch64_insn_callee_abi (const rtx_insn
*insn
)
3251 rtx pat
= PATTERN (insn
);
3252 gcc_assert (GET_CODE (pat
) == PARALLEL
);
3253 rtx unspec
= XVECEXP (pat
, 0, 1);
3254 gcc_assert (GET_CODE (unspec
) == UNSPEC
3255 && XINT (unspec
, 1) == UNSPEC_CALLEE_ABI
);
3256 return function_abis
[INTVAL (XVECEXP (unspec
, 0, 0))];
3259 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
3260 the lower 64 bits of a 128-bit register. Tell the compiler the callee
3261 clobbers the top 64 bits when restoring the bottom 64 bits. */
3264 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id
,
3268 if (FP_REGNUM_P (regno
) && abi_id
!= ARM_PCS_SVE
)
3270 poly_int64 per_register_size
= GET_MODE_SIZE (mode
);
3271 unsigned int nregs
= hard_regno_nregs (regno
, mode
);
3273 per_register_size
= exact_div (per_register_size
, nregs
);
3274 if (abi_id
== ARM_PCS_SIMD
|| abi_id
== ARM_PCS_TLSDESC
)
3275 return maybe_gt (per_register_size
, 16);
3276 return maybe_gt (per_register_size
, 8);
3281 /* Implement REGMODE_NATURAL_SIZE. */
3283 aarch64_regmode_natural_size (machine_mode mode
)
3285 /* The natural size for SVE data modes is one SVE data vector,
3286 and similarly for predicates. We can't independently modify
3287 anything smaller than that. */
3288 /* ??? For now, only do this for variable-width SVE registers.
3289 Doing it for constant-sized registers breaks lower-subreg.c. */
3290 /* ??? And once that's fixed, we should probably have similar
3291 code for Advanced SIMD. */
3292 if (!aarch64_sve_vg
.is_constant ())
3294 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
3295 if (vec_flags
& VEC_SVE_PRED
)
3296 return BYTES_PER_SVE_PRED
;
3297 if (vec_flags
& VEC_SVE_DATA
)
3298 return BYTES_PER_SVE_VECTOR
;
3300 return UNITS_PER_WORD
;
3303 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
3305 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
3308 /* The predicate mode determines which bits are significant and
3309 which are "don't care". Decreasing the number of lanes would
3310 lose data while increasing the number of lanes would make bits
3311 unnecessarily significant. */
3312 if (PR_REGNUM_P (regno
))
3314 if (known_ge (GET_MODE_SIZE (mode
), 4))
3320 /* Return true if I's bits are consecutive ones from the MSB. */
3322 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
3324 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
3327 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
3328 that strcpy from constants will be faster. */
3330 static HOST_WIDE_INT
3331 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
3333 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
3334 return MAX (align
, BITS_PER_WORD
);
3338 /* Return true if calls to DECL should be treated as
3339 long-calls (ie called via a register). */
3341 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
3346 /* Return true if calls to symbol-ref SYM should be treated as
3347 long-calls (ie called via a register). */
3349 aarch64_is_long_call_p (rtx sym
)
3351 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
3354 /* Return true if calls to symbol-ref SYM should not go through
3358 aarch64_is_noplt_call_p (rtx sym
)
3360 const_tree decl
= SYMBOL_REF_DECL (sym
);
3365 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
3366 && !targetm
.binds_local_p (decl
))
3372 /* Emit an insn that's a simple single-set. Both the operands must be
3373 known to be valid. */
3374 inline static rtx_insn
*
3375 emit_set_insn (rtx x
, rtx y
)
3377 return emit_insn (gen_rtx_SET (x
, y
));
3380 /* X and Y are two things to compare using CODE. Emit the compare insn and
3381 return the rtx for register 0 in the proper mode. */
3383 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
3385 machine_mode cmp_mode
= GET_MODE (x
);
3386 machine_mode cc_mode
;
3389 if (cmp_mode
== TImode
)
3391 gcc_assert (code
== NE
);
3394 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
3396 rtx x_lo
= operand_subword (x
, 0, 0, TImode
);
3397 rtx y_lo
= operand_subword (y
, 0, 0, TImode
);
3398 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x_lo
, y_lo
));
3400 rtx x_hi
= operand_subword (x
, 1, 0, TImode
);
3401 rtx y_hi
= operand_subword (y
, 1, 0, TImode
);
3402 emit_insn (gen_ccmpccdi (cc_reg
, cc_reg
, x_hi
, y_hi
,
3403 gen_rtx_EQ (cc_mode
, cc_reg
, const0_rtx
),
3404 GEN_INT (AARCH64_EQ
)));
3408 cc_mode
= SELECT_CC_MODE (code
, x
, y
);
3409 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
3410 emit_set_insn (cc_reg
, gen_rtx_COMPARE (cc_mode
, x
, y
));
3415 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
3418 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
3419 machine_mode y_mode
)
3421 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
3423 if (CONST_INT_P (y
))
3425 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
3431 machine_mode cc_mode
;
3433 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
3434 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
3435 cc_mode
= CC_SWPmode
;
3436 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
3437 emit_set_insn (cc_reg
, t
);
3442 if (!aarch64_plus_operand (y
, y_mode
))
3443 y
= force_reg (y_mode
, y
);
3445 return aarch64_gen_compare_reg (code
, x
, y
);
3448 /* Build the SYMBOL_REF for __tls_get_addr. */
3450 static GTY(()) rtx tls_get_addr_libfunc
;
3453 aarch64_tls_get_addr (void)
3455 if (!tls_get_addr_libfunc
)
3456 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
3457 return tls_get_addr_libfunc
;
3460 /* Return the TLS model to use for ADDR. */
3462 static enum tls_model
3463 tls_symbolic_operand_type (rtx addr
)
3465 enum tls_model tls_kind
= TLS_MODEL_NONE
;
3467 addr
= strip_offset_and_salt (addr
, &offset
);
3468 if (SYMBOL_REF_P (addr
))
3469 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
3474 /* We'll allow lo_sum's in addresses in our legitimate addresses
3475 so that combine would take care of combining addresses where
3476 necessary, but for generation purposes, we'll generate the address
3479 tmp = hi (symbol_ref); adrp x1, foo
3480 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
3484 adrp x1, :got:foo adrp tmp, :tlsgd:foo
3485 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
3489 Load TLS symbol, depending on TLS mechanism and TLS access model.
3491 Global Dynamic - Traditional TLS:
3492 adrp tmp, :tlsgd:imm
3493 add dest, tmp, #:tlsgd_lo12:imm
3496 Global Dynamic - TLS Descriptors:
3497 adrp dest, :tlsdesc:imm
3498 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3499 add dest, dest, #:tlsdesc_lo12:imm
3506 adrp tmp, :gottprel:imm
3507 ldr dest, [tmp, #:gottprel_lo12:imm]
3512 add t0, tp, #:tprel_hi12:imm, lsl #12
3513 add t0, t0, #:tprel_lo12_nc:imm
3517 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
3518 enum aarch64_symbol_type type
)
3522 case SYMBOL_SMALL_ABSOLUTE
:
3524 /* In ILP32, the mode of dest can be either SImode or DImode. */
3526 machine_mode mode
= GET_MODE (dest
);
3528 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
3530 if (can_create_pseudo_p ())
3531 tmp_reg
= gen_reg_rtx (mode
);
3533 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
3534 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
3538 case SYMBOL_TINY_ABSOLUTE
:
3539 emit_insn (gen_rtx_SET (dest
, imm
));
3542 case SYMBOL_SMALL_GOT_28K
:
3544 machine_mode mode
= GET_MODE (dest
);
3545 rtx gp_rtx
= pic_offset_table_rtx
;
3549 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
3550 here before rtl expand. Tree IVOPT will generate rtl pattern to
3551 decide rtx costs, in which case pic_offset_table_rtx is not
3552 initialized. For that case no need to generate the first adrp
3553 instruction as the final cost for global variable access is
3557 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
3558 using the page base as GOT base, the first page may be wasted,
3559 in the worst scenario, there is only 28K space for GOT).
3561 The generate instruction sequence for accessing global variable
3564 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3566 Only one instruction needed. But we must initialize
3567 pic_offset_table_rtx properly. We generate initialize insn for
3568 every global access, and allow CSE to remove all redundant.
3570 The final instruction sequences will look like the following
3571 for multiply global variables access.
3573 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3575 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3576 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3577 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3580 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
3581 crtl
->uses_pic_offset_table
= 1;
3582 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
3584 if (mode
!= GET_MODE (gp_rtx
))
3585 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
3589 if (mode
== ptr_mode
)
3592 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
3594 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
3596 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
3600 gcc_assert (mode
== Pmode
);
3602 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
3603 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
3606 /* The operand is expected to be MEM. Whenever the related insn
3607 pattern changed, above code which calculate mem should be
3609 gcc_assert (MEM_P (mem
));
3610 MEM_READONLY_P (mem
) = 1;
3611 MEM_NOTRAP_P (mem
) = 1;
3616 case SYMBOL_SMALL_GOT_4G
:
3618 /* In ILP32, the mode of dest can be either SImode or DImode,
3619 while the got entry is always of SImode size. The mode of
3620 dest depends on how dest is used: if dest is assigned to a
3621 pointer (e.g. in the memory), it has SImode; it may have
3622 DImode if dest is dereferenced to access the memeory.
3623 This is why we have to handle three different ldr_got_small
3624 patterns here (two patterns for ILP32). */
3629 machine_mode mode
= GET_MODE (dest
);
3631 if (can_create_pseudo_p ())
3632 tmp_reg
= gen_reg_rtx (mode
);
3634 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
3635 if (mode
== ptr_mode
)
3638 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
3640 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
3642 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
3646 gcc_assert (mode
== Pmode
);
3648 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
3649 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
3652 gcc_assert (MEM_P (mem
));
3653 MEM_READONLY_P (mem
) = 1;
3654 MEM_NOTRAP_P (mem
) = 1;
3659 case SYMBOL_SMALL_TLSGD
:
3662 /* The return type of __tls_get_addr is the C pointer type
3664 rtx result
= gen_rtx_REG (ptr_mode
, R0_REGNUM
);
3667 if (GET_MODE (dest
) != ptr_mode
)
3668 tmp_reg
= can_create_pseudo_p () ? gen_reg_rtx (ptr_mode
) : result
;
3671 if (ptr_mode
== SImode
)
3672 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
3674 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
3675 insns
= get_insns ();
3678 RTL_CONST_CALL_P (insns
) = 1;
3679 emit_libcall_block (insns
, tmp_reg
, result
, imm
);
3680 /* Convert back to the mode of the dest adding a zero_extend
3681 from SImode (ptr_mode) to DImode (Pmode). */
3682 if (dest
!= tmp_reg
)
3683 convert_move (dest
, tmp_reg
, true);
3687 case SYMBOL_SMALL_TLSDESC
:
3689 machine_mode mode
= GET_MODE (dest
);
3690 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
3693 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
3695 /* In ILP32, the got entry is always of SImode size. Unlike
3696 small GOT, the dest is fixed at reg 0. */
3698 emit_insn (gen_tlsdesc_small_si (imm
));
3700 emit_insn (gen_tlsdesc_small_di (imm
));
3701 tp
= aarch64_load_tp (NULL
);
3704 tp
= gen_lowpart (mode
, tp
);
3706 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
3708 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3712 case SYMBOL_SMALL_TLSIE
:
3714 /* In ILP32, the mode of dest can be either SImode or DImode,
3715 while the got entry is always of SImode size. The mode of
3716 dest depends on how dest is used: if dest is assigned to a
3717 pointer (e.g. in the memory), it has SImode; it may have
3718 DImode if dest is dereferenced to access the memeory.
3719 This is why we have to handle three different tlsie_small
3720 patterns here (two patterns for ILP32). */
3721 machine_mode mode
= GET_MODE (dest
);
3722 rtx tmp_reg
= gen_reg_rtx (mode
);
3723 rtx tp
= aarch64_load_tp (NULL
);
3725 if (mode
== ptr_mode
)
3728 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
3731 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
3732 tp
= gen_lowpart (mode
, tp
);
3737 gcc_assert (mode
== Pmode
);
3738 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
3741 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
3743 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3747 case SYMBOL_TLSLE12
:
3748 case SYMBOL_TLSLE24
:
3749 case SYMBOL_TLSLE32
:
3750 case SYMBOL_TLSLE48
:
3752 machine_mode mode
= GET_MODE (dest
);
3753 rtx tp
= aarch64_load_tp (NULL
);
3756 tp
= gen_lowpart (mode
, tp
);
3760 case SYMBOL_TLSLE12
:
3761 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
3764 case SYMBOL_TLSLE24
:
3765 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
3768 case SYMBOL_TLSLE32
:
3769 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
3771 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
3774 case SYMBOL_TLSLE48
:
3775 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
3777 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
3785 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3789 case SYMBOL_TINY_GOT
:
3792 machine_mode mode
= GET_MODE (dest
);
3794 if (mode
== ptr_mode
)
3795 insn
= gen_ldr_got_tiny (mode
, dest
, imm
);
3798 gcc_assert (mode
== Pmode
);
3799 insn
= gen_ldr_got_tiny_sidi (dest
, imm
);
3806 case SYMBOL_TINY_TLSIE
:
3808 machine_mode mode
= GET_MODE (dest
);
3809 rtx tp
= aarch64_load_tp (NULL
);
3811 if (mode
== ptr_mode
)
3814 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
3817 tp
= gen_lowpart (mode
, tp
);
3818 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
3823 gcc_assert (mode
== Pmode
);
3824 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
3828 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3837 /* Emit a move from SRC to DEST. Assume that the move expanders can
3838 handle all moves if !can_create_pseudo_p (). The distinction is
3839 important because, unlike emit_move_insn, the move expanders know
3840 how to force Pmode objects into the constant pool even when the
3841 constant pool address is not itself legitimate. */
3843 aarch64_emit_move (rtx dest
, rtx src
)
3845 return (can_create_pseudo_p ()
3846 ? emit_move_insn (dest
, src
)
3847 : emit_move_insn_1 (dest
, src
));
3850 /* Apply UNOPTAB to OP and store the result in DEST. */
3853 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
3855 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
3857 emit_move_insn (dest
, tmp
);
3860 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3863 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
3865 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
3868 emit_move_insn (dest
, tmp
);
3871 /* Split a 128-bit move operation into two 64-bit move operations,
3872 taking care to handle partial overlap of register to register
3873 copies. Special cases are needed when moving between GP regs and
3874 FP regs. SRC can be a register, constant or memory; DST a register
3875 or memory. If either operand is memory it must not have any side
3878 aarch64_split_128bit_move (rtx dst
, rtx src
)
3883 machine_mode mode
= GET_MODE (dst
);
3885 gcc_assert (mode
== TImode
|| mode
== TFmode
);
3886 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
3887 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
3889 if (REG_P (dst
) && REG_P (src
))
3891 int src_regno
= REGNO (src
);
3892 int dst_regno
= REGNO (dst
);
3894 /* Handle FP <-> GP regs. */
3895 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
3897 src_lo
= gen_lowpart (word_mode
, src
);
3898 src_hi
= gen_highpart (word_mode
, src
);
3900 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
3901 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
3904 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
3906 dst_lo
= gen_lowpart (word_mode
, dst
);
3907 dst_hi
= gen_highpart (word_mode
, dst
);
3909 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
3910 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
3915 dst_lo
= gen_lowpart (word_mode
, dst
);
3916 dst_hi
= gen_highpart (word_mode
, dst
);
3917 src_lo
= gen_lowpart (word_mode
, src
);
3918 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
3920 /* At most one pairing may overlap. */
3921 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
3923 aarch64_emit_move (dst_hi
, src_hi
);
3924 aarch64_emit_move (dst_lo
, src_lo
);
3928 aarch64_emit_move (dst_lo
, src_lo
);
3929 aarch64_emit_move (dst_hi
, src_hi
);
3933 /* Return true if we should split a move from 128-bit value SRC
3934 to 128-bit register DEST. */
3937 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
3939 if (FP_REGNUM_P (REGNO (dst
)))
3940 return REG_P (src
) && !FP_REGNUM_P (REGNO (src
));
3941 /* All moves to GPRs need to be split. */
3945 /* Split a complex SIMD combine. */
3948 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
3950 machine_mode src_mode
= GET_MODE (src1
);
3951 machine_mode dst_mode
= GET_MODE (dst
);
3953 gcc_assert (VECTOR_MODE_P (dst_mode
));
3954 gcc_assert (register_operand (dst
, dst_mode
)
3955 && register_operand (src1
, src_mode
)
3956 && register_operand (src2
, src_mode
));
3958 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
3962 /* Split a complex SIMD move. */
3965 aarch64_split_simd_move (rtx dst
, rtx src
)
3967 machine_mode src_mode
= GET_MODE (src
);
3968 machine_mode dst_mode
= GET_MODE (dst
);
3970 gcc_assert (VECTOR_MODE_P (dst_mode
));
3972 if (REG_P (dst
) && REG_P (src
))
3974 gcc_assert (VECTOR_MODE_P (src_mode
));
3975 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
3980 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
3981 machine_mode ymode
, rtx y
)
3983 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
3984 gcc_assert (r
!= NULL
);
3985 return rtx_equal_p (x
, r
);
3988 /* Return TARGET if it is nonnull and a register of mode MODE.
3989 Otherwise, return a fresh register of mode MODE if we can,
3990 or TARGET reinterpreted as MODE if we can't. */
3993 aarch64_target_reg (rtx target
, machine_mode mode
)
3995 if (target
&& REG_P (target
) && GET_MODE (target
) == mode
)
3997 if (!can_create_pseudo_p ())
3999 gcc_assert (target
);
4000 return gen_lowpart (mode
, target
);
4002 return gen_reg_rtx (mode
);
4005 /* Return a register that contains the constant in BUILDER, given that
4006 the constant is a legitimate move operand. Use TARGET as the register
4007 if it is nonnull and convenient. */
4010 aarch64_emit_set_immediate (rtx target
, rtx_vector_builder
&builder
)
4012 rtx src
= builder
.build ();
4013 target
= aarch64_target_reg (target
, GET_MODE (src
));
4014 emit_insn (gen_rtx_SET (target
, src
));
4019 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
4021 if (can_create_pseudo_p ())
4022 return force_reg (mode
, value
);
4026 aarch64_emit_move (x
, value
);
4031 /* Return true if predicate value X is a constant in which every element
4032 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
4033 value, i.e. as a predicate in which all bits are significant. */
4036 aarch64_get_sve_pred_bits (rtx_vector_builder
&builder
, rtx x
)
4038 if (GET_CODE (x
) != CONST_VECTOR
)
4041 unsigned int factor
= vector_element_size (GET_MODE_NUNITS (VNx16BImode
),
4042 GET_MODE_NUNITS (GET_MODE (x
)));
4043 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (x
) * factor
;
4044 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (x
);
4045 builder
.new_vector (VNx16BImode
, npatterns
, nelts_per_pattern
);
4047 unsigned int nelts
= const_vector_encoded_nelts (x
);
4048 for (unsigned int i
= 0; i
< nelts
; ++i
)
4050 rtx elt
= CONST_VECTOR_ENCODED_ELT (x
, i
);
4051 if (!CONST_INT_P (elt
))
4054 builder
.quick_push (elt
);
4055 for (unsigned int j
= 1; j
< factor
; ++j
)
4056 builder
.quick_push (const0_rtx
);
4058 builder
.finalize ();
4062 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
4063 widest predicate element size it can have (that is, the largest size
4064 for which each element would still be 0 or 1). */
4067 aarch64_widest_sve_pred_elt_size (rtx_vector_builder
&builder
)
4069 /* Start with the most optimistic assumption: that we only need
4070 one bit per pattern. This is what we will use if only the first
4071 bit in each pattern is ever set. */
4072 unsigned int mask
= GET_MODE_SIZE (DImode
);
4073 mask
|= builder
.npatterns ();
4075 /* Look for set bits. */
4076 unsigned int nelts
= builder
.encoded_nelts ();
4077 for (unsigned int i
= 1; i
< nelts
; ++i
)
4078 if (INTVAL (builder
.elt (i
)) != 0)
4084 return mask
& -mask
;
4087 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
4088 return that predicate mode, otherwise return opt_machine_mode (). */
4091 aarch64_ptrue_all_mode (rtx x
)
4093 gcc_assert (GET_MODE (x
) == VNx16BImode
);
4094 if (GET_CODE (x
) != CONST_VECTOR
4095 || !CONST_VECTOR_DUPLICATE_P (x
)
4096 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x
, 0))
4097 || INTVAL (CONST_VECTOR_ENCODED_ELT (x
, 0)) == 0)
4098 return opt_machine_mode ();
4100 unsigned int nelts
= const_vector_encoded_nelts (x
);
4101 for (unsigned int i
= 1; i
< nelts
; ++i
)
4102 if (CONST_VECTOR_ENCODED_ELT (x
, i
) != const0_rtx
)
4103 return opt_machine_mode ();
4105 return aarch64_sve_pred_mode (nelts
);
4108 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
4109 that the constant would have with predicate element size ELT_SIZE
4110 (ignoring the upper bits in each element) and return:
4112 * -1 if all bits are set
4113 * N if the predicate has N leading set bits followed by all clear bits
4114 * 0 if the predicate does not have any of these forms. */
4117 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
4118 unsigned int elt_size
)
4120 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
4121 followed by set bits. */
4122 if (builder
.nelts_per_pattern () == 3)
4125 /* Skip over leading set bits. */
4126 unsigned int nelts
= builder
.encoded_nelts ();
4128 for (; i
< nelts
; i
+= elt_size
)
4129 if (INTVAL (builder
.elt (i
)) == 0)
4131 unsigned int vl
= i
/ elt_size
;
4133 /* Check for the all-true case. */
4137 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
4138 repeating pattern of set bits followed by clear bits. */
4139 if (builder
.nelts_per_pattern () != 2)
4142 /* We have a "foreground" value and a duplicated "background" value.
4143 If the background might repeat and the last set bit belongs to it,
4144 we might have set bits followed by clear bits followed by set bits. */
4145 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
4148 /* Make sure that the rest are all clear. */
4149 for (; i
< nelts
; i
+= elt_size
)
4150 if (INTVAL (builder
.elt (i
)) != 0)
/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */

static aarch64_svpattern
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
{
  if (vl < 0)
    return AARCH64_SV_ALL;

  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  int max_vl;
  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
    {
      if (vl == (max_vl / 3) * 3)
	return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
	return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
	return AARCH64_SV_POW2;
      if (vl == max_vl)
	return AARCH64_SV_ALL;
    }
  return AARCH64_NUM_SVPATTERNS;
}

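/* Illustrative mappings (added for exposition, not from the original
   sources): VL == -1 selects AARCH64_SV_ALL, VL == 7 selects
   AARCH64_SV_VL7, and VL == 64 selects AARCH64_SV_VL64 (a power of two
   in [16, 256]).  VL == 12 has no dedicated pattern, so it only maps to
   MUL3/MUL4/POW2/ALL when the mode has a known constant number of
   elements that makes one of those checks match; otherwise the function
   returns AARCH64_NUM_SVPATTERNS.  */
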
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */

rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}

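/* Example encoding (added for exposition, not from the original sources):
   aarch64_ptrue_all (4) builds the repeating VNx16BI pattern
   { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the all-true predicate for .S
   elements viewed as VNx16BI, with the upper three bits of each element
   explicitly zero.  */
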
/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
   for it.  PRED2[0] is the predicate for the instruction whose result
   is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
   for it.  Return true if we can prove that the two predicates are
   equivalent for PTEST purposes; that is, if we can replace PRED2[0]
   with PRED1[0] without changing behavior.  */

bool
aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
{
  machine_mode mode = GET_MODE (pred1[0]);
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
	      && mode == GET_MODE (pred2[0])
	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
	      && aarch64_sve_ptrue_flag (pred2[1], SImode));

  bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
  bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
  return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}

/* Emit a comparison CMP between OP1 and OP2, both of which have mode
   DATA_MODE, and return the result in a predicate of mode PRED_MODE.
   Use TARGET as the target register if nonnull and convenient.  */

static rtx
aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
			  machine_mode data_mode, rtx op1, rtx op2)
{
  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
  expand_operand ops[5];
  create_output_operand (&ops[0], target, pred_mode);
  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
  create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
  create_input_operand (&ops[3], op1, data_mode);
  create_input_operand (&ops[4], op2, data_mode);
  expand_insn (icode, 5, ops);
  return ops[0].value;
}

/* Use a comparison to convert integer vector SRC into MODE, which is
   the corresponding SVE predicate mode.  Use TARGET for the result
   if it's nonnull and convenient.  */

rtx
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
				   src, CONST0_RTX (src_mode));
}

/* Return the assembly token for svprfop value PRFOP.  */

static const char *
svprfop_token (enum aarch64_svprfop prfop)
{
  switch (prfop)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPRFOP (CASE)
#undef CASE
    case AARCH64_NUM_SVPRFOPS:
      break;
    }
  gcc_unreachable ();
}

/* Return the assembly string for an SVE prefetch operation with
   mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
   and that SUFFIX is the format for the remaining operands.  */

char *
aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
			     const char *suffix)
{
  static char buffer[128];
  aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
  unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
				   mnemonic, svprfop_token (prfop), suffix);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

/* Check whether we can calculate the number of elements in PATTERN
   at compile time, given that there are NELTS_PER_VQ elements per
   128-bit block.  Return the value if so, otherwise return -1.  */

static HOST_WIDE_INT
aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
{
  unsigned int vl, const_vg;
  if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
    vl = 1 + (pattern - AARCH64_SV_VL1);
  else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
    vl = 16 << (pattern - AARCH64_SV_VL16);
  else if (aarch64_sve_vg.is_constant (&const_vg))
    {
      /* There are two vector granules per quadword.  */
      unsigned int nelts = (const_vg / 2) * nelts_per_vq;
      switch (pattern)
	{
	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
	case AARCH64_SV_MUL4: return nelts & -4;
	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
	case AARCH64_SV_ALL: return nelts;
	default: gcc_unreachable ();
	}
    }
  else
    return -1;

  /* There are two vector granules per quadword.  */
  poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
  if (known_le (vl, nelts_all))
    return vl;

  /* Requesting more elements than are available results in a PFALSE.  */
  if (known_gt (vl, nelts_all))
    return 0;

  return -1;
}

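/* Worked example (added for exposition, not from the original sources):
   with NELTS_PER_VQ == 4 (.S elements) and a fixed 256-bit vector
   length (aarch64_sve_vg == 4), there are 8 elements per vector.
   AARCH64_SV_ALL then folds to 8, AARCH64_SV_MUL3 to 6 and
   AARCH64_SV_VL5 to 5, while AARCH64_SV_VL16 folds to 0 because more
   elements are requested than are available.  */
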
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
	  && IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}

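/* Illustrative values (added for exposition, not from the original
   sources): the poly_int64 (2, 2) -- one CNTD -- passes the test, as
   does (32, 32) -- CNTB with MUL #2.  (3, 3) fails because the
   coefficient is odd, and (2, 4) fails because the two coefficients
   differ.  */
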
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  PATTERN is the pattern to use.  FACTOR is the
   number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
   in each quadword.  If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  aarch64_svpattern pattern,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (pattern == AARCH64_SV_ALL && factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
			prefix, suffix, operands, svpattern_token (pattern));
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
			prefix, suffix, operands, svpattern_token (pattern),
			factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

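/* Example outputs (added for exposition, not from the original sources),
   assuming PREFIX "cnt" and OPERANDS "%x0": PATTERN == AARCH64_SV_ALL,
   FACTOR == 2, NELTS_PER_VQ == 2 prints "cntd\t%x0", while FACTOR == 8
   with the same element size prints "cntd\t%x0, all, mul #4"; FACTOR is
   scaled down by the element size before being printed.  */
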
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx; we need to convert this into an "all"
   pattern with a multiplier.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
					   value.coeffs[1], 0);
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  CNT_PAT[0..2] are the operands of the
   UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */

char *
aarch64_output_sve_cnt_pat_immediate (const char *prefix,
				      const char *operands, rtx *cnt_pat)
{
  aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
  unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
  unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
  return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
					   factor, nelts_per_vq);
}

/* Return true if we can add X using a single SVE INC or DEC instruction.  */

bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && (aarch64_sve_cnt_immediate_p (value)
	      || aarch64_sve_cnt_immediate_p (-value)));
}

/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
   register operand 0.  */

char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
					     offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
					     -offset_value.coeffs[1], 0);
}

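/* Illustrative outputs (added for exposition, not from the original
   sources): an OFFSET whose poly_int64 value is (8, 8) -- the number of
   .H elements in a vector -- prints as "inch\t%x0", while (-2, -2)
   prints as "decd\t%x0".  */
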
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}

/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}

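/* Illustrative values (added for exposition, not from the original
   sources): FACTOR == 16 is one full vector and prints as
   "addvl\t%x0, %x1, #1"; FACTOR == 2 is one predicate width and prints
   as "addpl\t%x0, %x1, #1"; FACTOR == -32 prints as
   "addvl\t%x0, %x1, #-2".  */
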
4524 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4525 instruction. If it is, store the number of elements in each vector
4526 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4527 factor in *FACTOR_OUT (if nonnull). */
4530 aarch64_sve_vector_inc_dec_immediate_p (rtx x
, int *factor_out
,
4531 unsigned int *nelts_per_vq_out
)
4536 if (!const_vec_duplicate_p (x
, &elt
)
4537 || !poly_int_rtx_p (elt
, &value
))
4540 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
4541 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
4542 /* There's no vector INCB. */
4545 HOST_WIDE_INT factor
= value
.coeffs
[0];
4546 if (value
.coeffs
[1] != factor
)
4549 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4550 if ((factor
% nelts_per_vq
) != 0
4551 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
4555 *factor_out
= factor
;
4556 if (nelts_per_vq_out
)
4557 *nelts_per_vq_out
= nelts_per_vq
;
4561 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4565 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
4567 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */

char *
aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
{
  int factor;
  unsigned int nelts_per_vq;
  if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
    gcc_unreachable ();
  if (factor < 0)
    return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
					     -factor, nelts_per_vq);
  else
    return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
					     factor, nelts_per_vq);
}

4590 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
4591 scalar_int_mode mode
)
4594 unsigned HOST_WIDE_INT val
, val2
, mask
;
4595 int one_match
, zero_match
;
4600 if (aarch64_move_imm (val
, mode
))
4603 emit_insn (gen_rtx_SET (dest
, imm
));
4607 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4608 (with XXXX non-zero). In that case check to see if the move can be done in
4610 val2
= val
& 0xffffffff;
4612 && aarch64_move_imm (val2
, SImode
)
4613 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
4616 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
4618 /* Check if we have to emit a second instruction by checking to see
4619 if any of the upper 32 bits of the original DI mode value is set. */
4623 i
= (val
>> 48) ? 48 : 32;
4626 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4627 GEN_INT ((val
>> i
) & 0xffff)));
4632 if ((val
>> 32) == 0 || mode
== SImode
)
4636 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
4638 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
4639 GEN_INT ((val
>> 16) & 0xffff)));
4641 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
4642 GEN_INT ((val
>> 16) & 0xffff)));
4647 /* Remaining cases are all for DImode. */
4650 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
4651 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
4652 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
4653 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
4655 if (zero_match
!= 2 && one_match
!= 2)
4657 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4658 For a 64-bit bitmask try whether changing 16 bits to all ones or
4659 zeroes creates a valid bitmask. To check any repeated bitmask,
4660 try using 16 bits from the other 32-bit half of val. */
4662 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
4665 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
4668 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
4670 val2
= val2
& ~mask
;
4671 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
4672 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
4679 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
4680 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4681 GEN_INT ((val
>> i
) & 0xffff)));
4687 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4688 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4689 otherwise skip zero bits. */
4693 val2
= one_match
> zero_match
? ~val
: val
;
4694 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
4697 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
4698 ? (val
| ~(mask
<< i
))
4699 : (val
& (mask
<< i
)))));
4700 for (i
+= 16; i
< 64; i
+= 16)
4702 if ((val2
& (mask
<< i
)) == 0)
4705 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
4706 GEN_INT ((val
>> i
) & 0xffff)));
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */
bool
aarch64_mov128_immediate (rtx imm)
{
  if (CONST_INT_P (imm))
    return true;

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
}

/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return absu_hwi (offset) < 0x1000000 ? 0 : 1;
}

4740 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4741 a non-polynomial OFFSET. MODE is the mode of the addition.
4742 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4743 be set and CFA adjustments added to the generated instructions.
4745 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4746 temporary if register allocation is already complete. This temporary
4747 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4748 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4749 the immediate again.
4751 Since this function may be used to adjust the stack pointer, we must
4752 ensure that it cannot cause transient stack deallocation (for example
4753 by first incrementing SP and then decrementing when adjusting by a
4754 large immediate). */
4757 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
4758 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
4759 bool frame_related_p
, bool emit_move_imm
)
4761 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
4762 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
4764 unsigned HOST_WIDE_INT moffset
= absu_hwi (offset
);
4769 if (!rtx_equal_p (dest
, src
))
4771 insn
= emit_insn (gen_rtx_SET (dest
, src
));
4772 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4777 /* Single instruction adjustment. */
4778 if (aarch64_uimm12_shift (moffset
))
4780 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
4781 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4785 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4788 a) the offset cannot be loaded by a 16-bit move or
4789 b) there is no spare register into which we can move it. */
4790 if (moffset
< 0x1000000
4791 && ((!temp1
&& !can_create_pseudo_p ())
4792 || !aarch64_move_imm (moffset
, mode
)))
4794 HOST_WIDE_INT low_off
= moffset
& 0xfff;
4796 low_off
= offset
< 0 ? -low_off
: low_off
;
4797 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
4798 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4799 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
4800 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4804 /* Emit a move immediate if required and an addition/subtraction. */
4807 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
4808 temp1
= aarch64_force_temporary (mode
, temp1
,
4809 gen_int_mode (moffset
, mode
));
4811 insn
= emit_insn (offset
< 0
4812 ? gen_sub3_insn (dest
, src
, temp1
)
4813 : gen_add3_insn (dest
, src
, temp1
));
4814 if (frame_related_p
)
4816 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4817 rtx adj
= plus_constant (mode
, src
, offset
);
4818 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
4822 /* Return the number of temporary registers that aarch64_add_offset
4823 would need to move OFFSET into a register or add OFFSET to a register;
4824 ADD_P is true if we want the latter rather than the former. */
4827 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
4829 /* This follows the same structure as aarch64_add_offset. */
4830 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
4833 unsigned int count
= 0;
4834 HOST_WIDE_INT factor
= offset
.coeffs
[1];
4835 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
4836 poly_int64
poly_offset (factor
, factor
);
4837 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
4838 /* Need one register for the ADDVL/ADDPL result. */
4840 else if (factor
!= 0)
4842 factor
= abs (factor
);
4843 if (factor
> 16 * (factor
& -factor
))
4844 /* Need one register for the CNT result and one for the multiplication
4845 factor. If necessary, the second temporary can be reused for the
4846 constant part of the offset. */
4848 /* Need one register for the CNT result (which might then
4852 return count
+ aarch64_add_offset_1_temporaries (constant
);
4855 /* If X can be represented as a poly_int64, return the number
4856 of temporaries that are required to add it to a register.
4857 Return -1 otherwise. */
4860 aarch64_add_offset_temporaries (rtx x
)
4863 if (!poly_int_rtx_p (x
, &offset
))
4865 return aarch64_offset_temporaries (true, offset
);
4868 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4869 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4870 be set and CFA adjustments added to the generated instructions.
4872 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4873 temporary if register allocation is already complete. This temporary
4874 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4875 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4876 false to avoid emitting the immediate again.
4878 TEMP2, if nonnull, is a second temporary register that doesn't
4879 overlap either DEST or REG.
4881 Since this function may be used to adjust the stack pointer, we must
4882 ensure that it cannot cause transient stack deallocation (for example
4883 by first incrementing SP and then decrementing when adjusting by a
4884 large immediate). */
4887 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4888 poly_int64 offset
, rtx temp1
, rtx temp2
,
4889 bool frame_related_p
, bool emit_move_imm
= true)
4891 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
4892 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
4893 gcc_assert (temp1
== NULL_RTX
4895 || !reg_overlap_mentioned_p (temp1
, dest
));
4896 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
4898 /* Try using ADDVL or ADDPL to add the whole value. */
4899 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
4901 rtx offset_rtx
= gen_int_mode (offset
, mode
);
4902 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4903 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4907 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4908 SVE vector register, over and above the minimum size of 128 bits.
4909 This is equivalent to half the value returned by CNTD with a
4910 vector shape of ALL. */
4911 HOST_WIDE_INT factor
= offset
.coeffs
[1];
4912 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
4914 /* Try using ADDVL or ADDPL to add the VG-based part. */
4915 poly_int64
poly_offset (factor
, factor
);
4916 if (src
!= const0_rtx
4917 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
4919 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
4920 if (frame_related_p
)
4922 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4923 RTX_FRAME_RELATED_P (insn
) = true;
4928 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
4929 src
= aarch64_force_temporary (mode
, temp1
, addr
);
4934 /* Otherwise use a CNT-based sequence. */
4935 else if (factor
!= 0)
4937 /* Use a subtraction if we have a negative factor. */
4938 rtx_code code
= PLUS
;
4945 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4946 into the multiplication. */
4950 /* Use a right shift by 1. */
4954 HOST_WIDE_INT low_bit
= factor
& -factor
;
4955 if (factor
<= 16 * low_bit
)
4957 if (factor
> 16 * 8)
4959 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4960 the value with the minimum multiplier and shift it into
4962 int extra_shift
= exact_log2 (low_bit
);
4963 shift
+= extra_shift
;
4964 factor
>>= extra_shift
;
4966 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
4970 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4971 directly, since that should increase the chances of being
4972 able to use a shift and add sequence. If LOW_BIT itself
4973 is out of range, just use CNTD. */
4974 if (low_bit
<= 16 * 8)
4979 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
4980 val
= aarch64_force_temporary (mode
, temp1
, val
);
4982 if (can_create_pseudo_p ())
4984 rtx coeff1
= gen_int_mode (factor
, mode
);
4985 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, true, true);
4989 /* Go back to using a negative multiplication factor if we have
4990 no register from which to subtract. */
4991 if (code
== MINUS
&& src
== const0_rtx
)
4996 rtx coeff1
= gen_int_mode (factor
, mode
);
4997 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
4998 val
= gen_rtx_MULT (mode
, val
, coeff1
);
5004 /* Multiply by 1 << SHIFT. */
5005 val
= aarch64_force_temporary (mode
, temp1
, val
);
5006 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
5008 else if (shift
== -1)
5011 val
= aarch64_force_temporary (mode
, temp1
, val
);
5012 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
5015 /* Calculate SRC +/- CNTD * FACTOR / 2. */
5016 if (src
!= const0_rtx
)
5018 val
= aarch64_force_temporary (mode
, temp1
, val
);
5019 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
5021 else if (code
== MINUS
)
5023 val
= aarch64_force_temporary (mode
, temp1
, val
);
5024 val
= gen_rtx_NEG (mode
, val
);
5027 if (constant
== 0 || frame_related_p
)
5029 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
5030 if (frame_related_p
)
5032 RTX_FRAME_RELATED_P (insn
) = true;
5033 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
5034 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
5043 src
= aarch64_force_temporary (mode
, temp1
, val
);
5048 emit_move_imm
= true;
5051 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
5052 frame_related_p
, emit_move_imm
);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
			  rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
		      temp1, temp2, false);
}

/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
		      temp1, temp2, true, emit_move_imm);
}

/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  EMIT_MOVE_IMM is false if TEMP1 already contains
   abs (DELTA).  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
		bool emit_move_imm = true)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
		      temp1, temp2, frame_related_p, emit_move_imm);
}

/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}

5106 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5107 register of mode MODE. Use TARGET for the result if it's nonnull
5110 The two vector modes must have the same element mode. The behavior
5111 is to duplicate architectural lane N of SRC into architectural lanes
5112 N + I * STEP of the result. On big-endian targets, architectural
5113 lane 0 of an Advanced SIMD vector is the last element of the vector
5114 in memory layout, so for big-endian targets this operation has the
5115 effect of reversing SRC before duplicating it. Callers need to
5116 account for this. */
5119 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
5121 machine_mode src_mode
= GET_MODE (src
);
5122 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
5123 insn_code icode
= (BYTES_BIG_ENDIAN
5124 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
5125 : code_for_aarch64_vec_duplicate_vq_le (mode
));
5128 expand_operand ops
[3];
5129 create_output_operand (&ops
[i
++], target
, mode
);
5130 create_output_operand (&ops
[i
++], src
, src_mode
);
5131 if (BYTES_BIG_ENDIAN
)
5133 /* Create a PARALLEL describing the reversal of SRC. */
5134 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
5135 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
5136 nelts_per_vq
- 1, -1);
5137 create_fixed_operand (&ops
[i
++], sel
);
5139 expand_insn (icode
, i
, ops
);
5140 return ops
[0].value
;
5143 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5144 the memory image into DEST. Return true on success. */
5147 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
5149 src
= force_const_mem (GET_MODE (src
), src
);
5153 /* Make sure that the address is legitimate. */
5154 if (!aarch64_sve_ld1rq_operand_p (src
))
5156 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
5157 src
= replace_equiv_address (src
, addr
);
5160 machine_mode mode
= GET_MODE (dest
);
5161 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5162 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
5163 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
5167 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5168 by N "background" values. Try to move it into TARGET using:
5170 PTRUE PRED.<T>, VL<N>
5171 MOV TRUE.<T>, #<foreground>
5172 MOV FALSE.<T>, #<background>
5173 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5175 The PTRUE is always a single instruction but the MOVs might need a
5176 longer sequence. If the background value is zero (as it often is),
5177 the sequence can sometimes collapse to a PTRUE followed by a
5178 zero-predicated move.
5180 Return the target on success, otherwise return null. */
5183 aarch64_expand_sve_const_vector_sel (rtx target
, rtx src
)
5185 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src
) == 2);
5187 /* Make sure that the PTRUE is valid. */
5188 machine_mode mode
= GET_MODE (src
);
5189 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5190 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5191 if (aarch64_svpattern_for_vl (pred_mode
, npatterns
)
5192 == AARCH64_NUM_SVPATTERNS
)
5195 rtx_vector_builder
pred_builder (pred_mode
, npatterns
, 2);
5196 rtx_vector_builder
true_builder (mode
, npatterns
, 1);
5197 rtx_vector_builder
false_builder (mode
, npatterns
, 1);
5198 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5200 true_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5201 pred_builder
.quick_push (CONST1_RTX (BImode
));
5203 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5205 false_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
+ npatterns
));
5206 pred_builder
.quick_push (CONST0_RTX (BImode
));
5208 expand_operand ops
[4];
5209 create_output_operand (&ops
[0], target
, mode
);
5210 create_input_operand (&ops
[1], true_builder
.build (), mode
);
5211 create_input_operand (&ops
[2], false_builder
.build (), mode
);
5212 create_input_operand (&ops
[3], pred_builder
.build (), pred_mode
);
5213 expand_insn (code_for_vcond_mask (mode
, mode
), 4, ops
);
5217 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5218 SVE data mode and isn't a legitimate constant. Use TARGET for the
5219 result if convenient.
5221 The returned register can have whatever mode seems most natural
5222 given the contents of SRC. */
5225 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
5227 machine_mode mode
= GET_MODE (src
);
5228 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5229 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
5230 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
5231 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
5232 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
5233 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
5235 if (nelts_per_pattern
== 1
5236 && encoded_bits
<= 128
5237 && container_bits
!= elt_bits
)
5239 /* We have a partial vector mode and a constant whose full-vector
5240 equivalent would occupy a repeating 128-bit sequence. Build that
5241 full-vector equivalent instead, so that we have the option of
5242 using LD1RQ and Advanced SIMD operations. */
5243 unsigned int repeat
= container_bits
/ elt_bits
;
5244 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
5245 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
5246 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5247 for (unsigned int j
= 0; j
< repeat
; ++j
)
5248 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5249 target
= aarch64_target_reg (target
, full_mode
);
5250 return aarch64_expand_sve_const_vector (target
, builder
.build ());
5253 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
5255 /* The constant is a duplicated quadword but can't be narrowed
5256 beyond a quadword. Get the memory image of the first quadword
5257 as a 128-bit vector and try using LD1RQ to load it from memory.
5259 The effect for both endiannesses is to load memory lane N into
5260 architectural lanes N + I * STEP of the result. On big-endian
5261 targets, the layout of the 128-bit vector in an Advanced SIMD
5262 register would be different from its layout in an SVE register,
5263 but this 128-bit vector is a memory value only. */
5264 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
5265 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
5266 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
5270 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
5272 /* The vector is a repeating sequence of 64 bits or fewer.
5273 See if we can load them using an Advanced SIMD move and then
5274 duplicate it to fill a vector. This is better than using a GPR
5275 move because it keeps everything in the same register file. */
5276 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
5277 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
5278 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5280 /* We want memory lane N to go into architectural lane N,
5281 so reverse for big-endian targets. The DUP .Q pattern
5282 has a compensating reverse built-in. */
5283 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
5284 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
5286 rtx vq_src
= builder
.build ();
5287 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
5289 vq_src
= force_reg (vq_mode
, vq_src
);
5290 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
5293 /* Get an integer representation of the repeating part of Advanced
5294 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5295 which for big-endian targets is lane-swapped wrt a normal
5296 Advanced SIMD vector. This means that for both endiannesses,
5297 memory lane N of SVE vector SRC corresponds to architectural
5298 lane N of a register holding VQ_SRC. This in turn means that
5299 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5300 as a single 128-bit value) and thus that memory lane 0 of SRC is
5301 in the lsb of the integer. Duplicating the integer therefore
5302 ensures that memory lane N of SRC goes into architectural lane
5303 N + I * INDEX of the SVE register. */
5304 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
5305 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
5308 /* Pretend that we had a vector of INT_MODE to start with. */
5309 elt_mode
= int_mode
;
5310 mode
= aarch64_full_sve_mode (int_mode
).require ();
5312 /* If the integer can be moved into a general register by a
5313 single instruction, do that and duplicate the result. */
5314 if (CONST_INT_P (elt_value
)
5315 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
5317 elt_value
= force_reg (elt_mode
, elt_value
);
5318 return expand_vector_broadcast (mode
, elt_value
);
5321 else if (npatterns
== 1)
5322 /* We're duplicating a single value, but can't do better than
5323 force it to memory and load from there. This handles things
5324 like symbolic constants. */
5325 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
5329 /* Load the element from memory if we can, otherwise move it into
5330 a register and use a DUP. */
5331 rtx op
= force_const_mem (elt_mode
, elt_value
);
5333 op
= force_reg (elt_mode
, elt_value
);
5334 return expand_vector_broadcast (mode
, op
);
5338 /* Try using INDEX. */
5340 if (const_vec_series_p (src
, &base
, &step
))
5342 aarch64_expand_vec_series (target
, base
, step
);
5346 /* From here on, it's better to force the whole constant to memory
5348 if (GET_MODE_NUNITS (mode
).is_constant ())
5351 if (nelts_per_pattern
== 2)
5352 if (rtx res
= aarch64_expand_sve_const_vector_sel (target
, src
))
5355 /* Expand each pattern individually. */
5356 gcc_assert (npatterns
> 1);
5357 rtx_vector_builder builder
;
5358 auto_vec
<rtx
, 16> vectors (npatterns
);
5359 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5361 builder
.new_vector (mode
, 1, nelts_per_pattern
);
5362 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
5363 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
5364 vectors
.quick_push (force_reg (mode
, builder
.build ()));
5367 /* Use permutes to interleave the separate vectors. */
5368 while (npatterns
> 1)
5371 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5373 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
5374 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
5375 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
5379 gcc_assert (vectors
[0] == target
);
5383 /* Use WHILE to set a predicate register of mode MODE in which the first
5384 VL bits are set and the rest are clear. Use TARGET for the register
5385 if it's nonnull and convenient. */
5388 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
5391 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
5392 target
= aarch64_target_reg (target
, mode
);
5393 emit_insn (gen_while (UNSPEC_WHILELO
, DImode
, mode
,
5394 target
, const0_rtx
, limit
));
5399 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
5401 /* BUILDER is a constant predicate in which the index of every set bit
5402 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5403 by inverting every element at a multiple of ELT_SIZE and EORing the
5404 result with an ELT_SIZE PTRUE.
5406 Return a register that contains the constant on success, otherwise
5407 return null. Use TARGET as the register if it is nonnull and
5411 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
5412 unsigned int elt_size
)
5414 /* Invert every element at a multiple of ELT_SIZE, keeping the
5416 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
5417 builder
.nelts_per_pattern ());
5418 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
5419 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
5420 inv_builder
.quick_push (const1_rtx
);
5422 inv_builder
.quick_push (const0_rtx
);
5423 inv_builder
.finalize ();
5425 /* See if we can load the constant cheaply. */
5426 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
5430 /* EOR the result with an ELT_SIZE PTRUE. */
5431 rtx mask
= aarch64_ptrue_all (elt_size
);
5432 mask
= force_reg (VNx16BImode
, mask
);
5433 inv
= gen_lowpart (VNx16BImode
, inv
);
5434 target
= aarch64_target_reg (target
, VNx16BImode
);
5435 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
5439 /* BUILDER is a constant predicate in which the index of every set bit
5440 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5441 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5442 register on success, otherwise return null. Use TARGET as the register
5443 if nonnull and convenient. */
5446 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
5447 unsigned int elt_size
,
5448 unsigned int permute_size
)
5450 /* We're going to split the constant into two new constants A and B,
5451 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5452 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5454 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5455 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5457 where _ indicates elements that will be discarded by the permute.
5459 First calculate the ELT_SIZEs for A and B. */
5460 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
5461 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
5462 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
5463 if (INTVAL (builder
.elt (i
)) != 0)
5465 if (i
& permute_size
)
5466 b_elt_size
|= i
- permute_size
;
5470 a_elt_size
&= -a_elt_size
;
5471 b_elt_size
&= -b_elt_size
;
5473 /* Now construct the vectors themselves. */
5474 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
5475 builder
.nelts_per_pattern ());
5476 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
5477 builder
.nelts_per_pattern ());
5478 unsigned int nelts
= builder
.encoded_nelts ();
5479 for (unsigned int i
= 0; i
< nelts
; ++i
)
5480 if (i
& (elt_size
- 1))
5482 a_builder
.quick_push (const0_rtx
);
5483 b_builder
.quick_push (const0_rtx
);
5485 else if ((i
& permute_size
) == 0)
5487 /* The A and B elements are significant. */
5488 a_builder
.quick_push (builder
.elt (i
));
5489 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
5493 /* The A and B elements are going to be discarded, so pick whatever
5494 is likely to give a nice constant. We are targeting element
5495 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5496 with the aim of each being a sequence of ones followed by
5497 a sequence of zeros. So:
5499 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5500 duplicate the last X_ELT_SIZE element, to extend the
5501 current sequence of ones or zeros.
5503 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5504 zero, so that the constant really does have X_ELT_SIZE and
5505 not a smaller size. */
5506 if (a_elt_size
> permute_size
)
5507 a_builder
.quick_push (const0_rtx
);
5509 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
5510 if (b_elt_size
> permute_size
)
5511 b_builder
.quick_push (const0_rtx
);
5513 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
5515 a_builder
.finalize ();
5516 b_builder
.finalize ();
5518 /* Try loading A into a register. */
5519 rtx_insn
*last
= get_last_insn ();
5520 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
5524 /* Try loading B into a register. */
5526 if (a_builder
!= b_builder
)
5528 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
5531 delete_insns_since (last
);
5536 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5537 operands but permutes them as though they had mode MODE. */
5538 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
5539 target
= aarch64_target_reg (target
, GET_MODE (a
));
5540 rtx type_reg
= CONST0_RTX (mode
);
5541 emit_insn (gen_aarch64_sve_trn1_conv (mode
, target
, a
, b
, type_reg
));
5545 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5546 constant in BUILDER into an SVE predicate register. Return the register
5547 on success, otherwise return null. Use TARGET for the register if
5548 nonnull and convenient.
5550 ALLOW_RECURSE_P is true if we can use methods that would call this
5551 function recursively. */
5554 aarch64_expand_sve_const_pred_1 (rtx target
, rtx_vector_builder
&builder
,
5555 bool allow_recurse_p
)
5557 if (builder
.encoded_nelts () == 1)
5558 /* A PFALSE or a PTRUE .B ALL. */
5559 return aarch64_emit_set_immediate (target
, builder
);
5561 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
5562 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
5564 /* If we can load the constant using PTRUE, use it as-is. */
5565 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
5566 if (aarch64_svpattern_for_vl (mode
, vl
) != AARCH64_NUM_SVPATTERNS
)
5567 return aarch64_emit_set_immediate (target
, builder
);
5569 /* Otherwise use WHILE to set the first VL bits. */
5570 return aarch64_sve_move_pred_via_while (target
, mode
, vl
);
5573 if (!allow_recurse_p
)
5576 /* Try inverting the vector in element size ELT_SIZE and then EORing
5577 the result with an ELT_SIZE PTRUE. */
5578 if (INTVAL (builder
.elt (0)) == 0)
5579 if (rtx res
= aarch64_expand_sve_const_pred_eor (target
, builder
,
5583 /* Try using TRN1 to permute two simpler constants. */
5584 for (unsigned int i
= elt_size
; i
<= 8; i
*= 2)
5585 if (rtx res
= aarch64_expand_sve_const_pred_trn (target
, builder
,
5592 /* Return an SVE predicate register that contains the VNx16BImode
5593 constant in BUILDER, without going through the move expanders.
5595 The returned register can have whatever mode seems most natural
5596 given the contents of BUILDER. Use TARGET for the result if
5600 aarch64_expand_sve_const_pred (rtx target
, rtx_vector_builder
&builder
)
5602 /* Try loading the constant using pure predicate operations. */
5603 if (rtx res
= aarch64_expand_sve_const_pred_1 (target
, builder
, true))
5606 /* Try forcing the constant to memory. */
5607 if (builder
.full_nelts ().is_constant ())
5608 if (rtx mem
= force_const_mem (VNx16BImode
, builder
.build ()))
5610 target
= aarch64_target_reg (target
, VNx16BImode
);
5611 emit_move_insn (target
, mem
);
  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
5618 rtx_vector_builder
int_builder (VNx16QImode
, builder
.npatterns (),
5619 builder
.nelts_per_pattern ());
5620 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
5621 int_builder
.quick_push (INTVAL (builder
.elt (i
))
5622 ? constm1_rtx
: const0_rtx
);
5623 return aarch64_convert_sve_data_to_pred (target
, VNx16BImode
,
5624 int_builder
.build ());
5627 /* Set DEST to immediate IMM. */
5630 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
5632 machine_mode mode
= GET_MODE (dest
);
5634 /* Check on what type of symbol it is. */
5635 scalar_int_mode int_mode
;
5636 if ((SYMBOL_REF_P (imm
)
5637 || LABEL_REF_P (imm
)
5638 || GET_CODE (imm
) == CONST
5639 || GET_CODE (imm
) == CONST_POLY_INT
)
5640 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
5644 HOST_WIDE_INT const_offset
;
5645 enum aarch64_symbol_type sty
;
5647 /* If we have (const (plus symbol offset)), separate out the offset
5648 before we start classifying the symbol. */
5649 rtx base
= strip_offset (imm
, &offset
);
5651 /* We must always add an offset involving VL separately, rather than
5652 folding it into the relocation. */
5653 if (!offset
.is_constant (&const_offset
))
5657 aarch64_report_sve_required ();
5660 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
5661 emit_insn (gen_rtx_SET (dest
, imm
));
5664 /* Do arithmetic on 32-bit values if the result is smaller
5666 if (partial_subreg_p (int_mode
, SImode
))
5668 /* It is invalid to do symbol calculations in modes
5669 narrower than SImode. */
5670 gcc_assert (base
== const0_rtx
);
5671 dest
= gen_lowpart (SImode
, dest
);
5674 if (base
!= const0_rtx
)
5676 base
= aarch64_force_temporary (int_mode
, dest
, base
);
5677 aarch64_add_offset (int_mode
, dest
, base
, offset
,
5678 NULL_RTX
, NULL_RTX
, false);
5681 aarch64_add_offset (int_mode
, dest
, base
, offset
,
5682 dest
, NULL_RTX
, false);
5687 sty
= aarch64_classify_symbol (base
, const_offset
);
5690 case SYMBOL_FORCE_TO_MEM
:
5691 if (int_mode
!= ptr_mode
)
5692 imm
= convert_memory_address (ptr_mode
, imm
);
5694 if (const_offset
!= 0
5695 && targetm
.cannot_force_const_mem (ptr_mode
, imm
))
5697 gcc_assert (can_create_pseudo_p ());
5698 base
= aarch64_force_temporary (int_mode
, dest
, base
);
5699 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
5700 NULL_RTX
, NULL_RTX
, false);
5704 mem
= force_const_mem (ptr_mode
, imm
);
5707 /* If we aren't generating PC relative literals, then
5708 we need to expand the literal pool access carefully.
5709 This is something that needs to be done in a number
5710 of places, so could well live as a separate function. */
5711 if (!aarch64_pcrelative_literal_loads
)
5713 gcc_assert (can_create_pseudo_p ());
5714 base
= gen_reg_rtx (ptr_mode
);
5715 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
5716 if (ptr_mode
!= Pmode
)
5717 base
= convert_memory_address (Pmode
, base
);
5718 mem
= gen_rtx_MEM (ptr_mode
, base
);
5721 if (int_mode
!= ptr_mode
)
5722 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
5724 emit_insn (gen_rtx_SET (dest
, mem
));
5728 case SYMBOL_SMALL_TLSGD
:
5729 case SYMBOL_SMALL_TLSDESC
:
5730 case SYMBOL_SMALL_TLSIE
:
5731 case SYMBOL_SMALL_GOT_28K
:
5732 case SYMBOL_SMALL_GOT_4G
:
5733 case SYMBOL_TINY_GOT
:
5734 case SYMBOL_TINY_TLSIE
:
5735 if (const_offset
!= 0)
5737 gcc_assert(can_create_pseudo_p ());
5738 base
= aarch64_force_temporary (int_mode
, dest
, base
);
5739 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
5740 NULL_RTX
, NULL_RTX
, false);
5745 case SYMBOL_SMALL_ABSOLUTE
:
5746 case SYMBOL_TINY_ABSOLUTE
:
5747 case SYMBOL_TLSLE12
:
5748 case SYMBOL_TLSLE24
:
5749 case SYMBOL_TLSLE32
:
5750 case SYMBOL_TLSLE48
:
5751 aarch64_load_symref_appropriately (dest
, imm
, sty
);
5759 if (!CONST_INT_P (imm
))
5761 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
5763 /* Only the low bit of each .H, .S and .D element is defined,
5764 so we can set the upper bits to whatever we like. If the
5765 predicate is all-true in MODE, prefer to set all the undefined
5766 bits as well, so that we can share a single .B predicate for
5768 if (imm
== CONSTM1_RTX (mode
))
5769 imm
= CONSTM1_RTX (VNx16BImode
);
5771 /* All methods for constructing predicate modes wider than VNx16BI
5772 will set the upper bits of each element to zero. Expose this
5773 by moving such constants as a VNx16BI, so that all bits are
5774 significant and so that constants for different modes can be
5775 shared. The wider constant will still be available as a
5777 rtx_vector_builder builder
;
5778 if (aarch64_get_sve_pred_bits (builder
, imm
))
5780 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
5782 emit_move_insn (dest
, gen_lowpart (mode
, res
));
5787 if (GET_CODE (imm
) == HIGH
5788 || aarch64_simd_valid_immediate (imm
, NULL
))
5790 emit_insn (gen_rtx_SET (dest
, imm
));
5794 if (GET_CODE (imm
) == CONST_VECTOR
&& aarch64_sve_data_mode_p (mode
))
5795 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
5798 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
5802 rtx mem
= force_const_mem (mode
, imm
);
5804 emit_move_insn (dest
, mem
);
5808 aarch64_internal_mov_immediate (dest
, imm
, true,
5809 as_a
<scalar_int_mode
> (mode
));
5812 /* Return the MEM rtx that provides the canary value that should be used
5813 for stack-smashing protection. MODE is the mode of the memory.
5814 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
5815 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
5816 indicates whether the caller is performing a SET or a TEST operation. */
5819 aarch64_stack_protect_canary_mem (machine_mode mode
, rtx decl_rtl
,
5820 aarch64_salt_type salt_type
)
5823 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
5825 gcc_assert (MEM_P (decl_rtl
));
5826 addr
= XEXP (decl_rtl
, 0);
5828 rtx base
= strip_offset_and_salt (addr
, &offset
);
5829 if (!SYMBOL_REF_P (base
))
5832 rtvec v
= gen_rtvec (2, base
, GEN_INT (salt_type
));
5833 addr
= gen_rtx_UNSPEC (Pmode
, v
, UNSPEC_SALT_ADDR
);
5834 addr
= gen_rtx_CONST (Pmode
, addr
);
5835 addr
= plus_constant (Pmode
, addr
, offset
);
5839 /* Calculate the address from the system register. */
5840 rtx salt
= GEN_INT (salt_type
);
5841 addr
= gen_reg_rtx (mode
);
5843 emit_insn (gen_reg_stack_protect_address_di (addr
, salt
));
5846 emit_insn (gen_reg_stack_protect_address_si (addr
, salt
));
5847 addr
= convert_memory_address (Pmode
, addr
);
5849 addr
= plus_constant (Pmode
, addr
, aarch64_stack_protector_guard_offset
);
5851 return gen_rtx_MEM (mode
, force_reg (Pmode
, addr
));
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}

5869 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5870 operand is in memory. In this case we need to use the predicated LD1
5871 and ST1 instead of LDR and STR, both for correctness on big-endian
5872 targets and because LD1 and ST1 support a wider range of addressing modes.
5873 PRED_MODE is the mode of the predicate.
5875 See the comment at the head of aarch64-sve.md for details about the
5876 big-endian handling. */
5879 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
5881 machine_mode mode
= GET_MODE (dest
);
5882 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
5883 if (!register_operand (src
, mode
)
5884 && !register_operand (dest
, mode
))
5886 rtx tmp
= gen_reg_rtx (mode
);
5888 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
5890 emit_move_insn (tmp
, src
);
5893 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
     R1: { [0],      [1],      [2],      [3],     ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

         msb                                      lsb
     R2: ...... [1].high  [1].low   [0].high  [0].low
     R1: ...... [3]       [2]       [1]       [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */

bool
aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
{
  gcc_assert (BYTES_BIG_ENDIAN);

  /* Do not try to optimize subregs that LRA has created for matched
     reloads.  These subregs only exist as a temporary measure to make
     the RTL well-formed, but they are exempt from the usual
     TARGET_CAN_CHANGE_MODE_CLASS rules.

     For example, if we have:

       (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))

     and the constraints require R1 and R2 to be in the same register,
     LRA may need to create RTL such as:

       (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
       (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
       (set (reg:VNx8HI R1) (reg:VNx8HI TMP))

     which forces both the input and output of the original instruction
     to use the same hard register.  But for this to work, the normal
     rules have to be suppressed on the subreg input, otherwise LRA
     would need to reload that input too, meaning that the process
     would never terminate.  To compensate for this, the normal rules
     are also suppressed for the subreg output of the first move.
     Ignoring the special case and handling the first move normally
     would therefore generate wrong code: we would reverse the elements
     for the first subreg but not reverse them back for the second subreg.  */
  if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
    dest = SUBREG_REG (dest);
  if (SUBREG_P (src) && !LRA_SUBREG_P (src))
    src = SUBREG_REG (src);

  /* The optimization handles two single SVE REGs with different element
     sizes.  */
  if (!REG_P (dest)
      || !REG_P (src)
      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
    return false;

  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
  rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
			       UNSPEC_REV_SUBREG);
  emit_insn (gen_rtx_SET (dest, unspec));
  return true;
}
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    default: gcc_unreachable ();
    }
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
  machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
			       dest, ptrue, src));
}
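/* Illustrative example (a sketch of typical output, not a guarantee):
   splitting the big-endian subreg move

	(set (reg:VNx8HI z0) (subreg:VNx8HI (reg:VNx16QI z1) 0))

   picks the HI containers as the wider mode and a byte-wide reverse,
   so it ends up as something like

	revb	z0.h, p0/m, z1.h

   where p0 stands for the PTRUE operand passed to this function.  */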
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */

static bool
aarch64_function_ok_for_sibcall (tree, tree exp)
{
  if (crtl->abi->id () != expr_callee_abi (exp).id ())
    return false;

  return true;
}
/* Subroutine of aarch64_pass_by_reference for arguments that are not
   passed in SVE registers.  */

static bool
aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
			     const function_arg_info &arg)
{
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (arg.mode == BLKmode && arg.type)
    size = int_size_in_bytes (arg.type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (arg.mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (arg.aggregate_type_p ())
    size = int_size_in_bytes (arg.type);

  /* Variable sized arguments are always returned by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
					       &dummymode, &nregs, NULL,
					       !pcum || pcum->silent_p))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogenous floating point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
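/* Two illustrative data points for the rule above (restating the usual
   AAPCS64 behaviour rather than anything computed here): a plain
   32-byte struct exceeds 2 * UNITS_PER_WORD and is therefore passed by
   reference, while struct { double a, b, c, d; } is an HFA, is caught
   by the candidate check, and stays in V registers.  */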
/* Implement TARGET_PASS_BY_REFERENCE.  */

static bool
aarch64_pass_by_reference (cumulative_args_t pcum_v,
			   const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);

  if (!arg.type)
    return aarch64_pass_by_reference_1 (pcum, arg);

  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (arg.type))
    {
    case pure_scalable_type_info::IS_PST:
      if (pcum && !pcum->silent_p && !TARGET_SVE)
	/* We can't gracefully recover at this point, so make this a
	   fatal error.  */
	fatal_error (input_location, "arguments of type %qT require"
		     " the SVE ISA extension", arg.type);

      /* Variadic SVE types are passed by reference.  Normal non-variadic
	 arguments are too if we've run out of registers.  */
      return (!arg.named
	      || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
	      || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
      return true;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return aarch64_pass_by_reference_1 (pcum, arg);
    }
  gcc_unreachable ();
}
/* Return TRUE if VALTYPE is padded to its least significant bits.  */
static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
					       &dummy_mode, &dummy_int, NULL,
					       false))
    return false;

  /* Likewise pure scalable types for SVE vector and predicate registers.  */
  pure_scalable_type_info pst_info;
  if (pst_info.analyze_registers (valtype))
    return false;

  return true;
}
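/* For example (big-endian only, and purely illustrative): a
   struct { char c; } return value is a 1-byte composite that is not an
   HFA/HVA or a pure scalable type, so it is padded to its least
   significant bits and the data comes back in the most significant
   byte of the return register rather than the least significant one.  */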
/* Implement TARGET_FUNCTION_VALUE.
   Define how to find the value returned by a function.  */

static rtx
aarch64_function_value (const_tree type, const_tree func,
			bool outgoing ATTRIBUTE_UNUSED)
{
  machine_mode mode;
  int unsignedp;

  mode = TYPE_MODE (type);
  if (INTEGRAL_TYPE_P (type))
    mode = promote_function_mode (type, mode, &unsignedp, func, 1);

  pure_scalable_type_info pst_info;
  if (type && pst_info.analyze_registers (type))
    return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);

  /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
     are returned in memory, not by value.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool sve_p = (vec_flags & VEC_ANY_SVE);

  if (aarch64_return_in_msb (type))
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);

      if (size % UNITS_PER_WORD != 0)
	{
	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
	}
    }

  int count;
  machine_mode ag_mode;
  if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
					       NULL, false))
    {
      gcc_assert (!sve_p);
      if (!aarch64_composite_type_p (type, mode))
	{
	  gcc_assert (count == 1 && mode == ag_mode);
	  return gen_rtx_REG (mode, V0_REGNUM);
	}
      else
	{
	  int i;
	  rtx par;

	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
	  for (i = 0; i < count; i++)
	    {
	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
	      XVECEXP (par, 0, i) = tmp;
	    }
	  return par;
	}
    }
  else
    {
      if (sve_p)
	{
	  /* Vector types can acquire a partial SVE mode using things like
	     __attribute__((vector_size(N))), and this is potentially useful.
	     However, the choice of mode doesn't affect the type's ABI
	     identity, so we should treat the types as though they had
	     the associated integer mode, just like they did before SVE
	     was introduced.

	     We know that the vector must be 128 bits or smaller,
	     otherwise we'd have returned it in memory instead.  */
	  gcc_assert (type
		      && (aarch64_some_values_include_pst_objects_p (type)
			  || (vec_flags & VEC_PARTIAL)));

	  scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
	  rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
	  rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
	  return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
	}
      return gen_rtx_REG (mode, R0_REGNUM);
    }
}
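/* Worked example (illustrative): for an HFA such as
   struct { double a, b; }, COUNT is 2 and AG_MODE is DFmode, so the
   code above builds a (parallel [...]) that places "a" in V0 at byte
   offset 0 and "b" in V1 at byte offset 8.  */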
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  return false;
}
/* Subroutine for aarch64_return_in_memory for types that are not returned
   in SVE registers.  */

static bool
aarch64_return_in_memory_1 (const_tree type)
{
  HOST_WIDE_INT size;
  machine_mode ag_mode;
  int count;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types always returned in registers.  */
    return false;

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
					       &ag_mode, &count, NULL, false))
    return false;

  /* Types larger than 2 registers returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
}
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that
     void func (T arg)
   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

static bool
aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
{
  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (type))
    {
    case pure_scalable_type_info::IS_PST:
      return (pst_info.num_zr () > NUM_FP_ARG_REGS
	      || pst_info.num_pr () > NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_return_in_memory_1 (type));
      return true;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return aarch64_return_in_memory_1 (type);
    }
  gcc_unreachable ();
}
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
			       const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
						  &pcum->aapcs_vfp_rmode,
						  nregs, NULL, pcum->silent_p);
}
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S
   4.1).  ABI_BREAK is set to true if the alignment was incorrectly
   calculated in versions of GCC prior to GCC-9.  This is a helper
   function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type,
				unsigned int *abi_break)
{
  *abi_break = 0;
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;
  unsigned int bitfield_alignment = 0;
  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      {
	/* Note that we explicitly consider zero-sized fields here,
	   even though they don't map to AAPCS64 machine types.
	   For example:

	     struct __attribute__((aligned(8))) empty {};

	     struct s {
	       [[no_unique_address]] empty e;
	       int i;
	     };

	   "s" contains only one Fundamental Data Type (the int field)
	   but gains 8-byte alignment and size thanks to "e".  */
	alignment = std::max (alignment, DECL_ALIGN (field));
	if (DECL_BIT_FIELD_TYPE (field))
	  bitfield_alignment
	    = std::max (bitfield_alignment,
			TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
      }

  if (bitfield_alignment > alignment)
    {
      *abi_break = alignment;
      return bitfield_alignment;
    }

  return alignment;
}
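/* Example of the "natural alignment" rule above (an assumption about a
   typical use of this hook, not code taken from elsewhere): for

     typedef int overaligned_int __attribute__ ((aligned (16)));

   an argument of type overaligned_int still reports the 32-bit
   alignment of the main variant "int", because user-requested
   over-alignment of a scalar does not change its AAPCS64 class.  */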
/* Layout a function argument according to the AAPCS64 rules.  The rule
   numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
   mode that was originally given to us by the target hook, whereas the
   mode in ARG might be the result of replacing partial SVE modes with
   the equivalent integer mode.  */

static void
aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  tree type = arg.type;
  machine_mode mode = arg.mode;
  int ncrn, nvrn, nregs;
  bool allocate_ncrn, allocate_nvrn;
  HOST_WIDE_INT size;
  unsigned int abi_break;

  /* We need to do this once per argument.  */
  if (pcum->aapcs_arg_processed)
    return;

  pcum->aapcs_arg_processed = true;

  pure_scalable_type_info pst_info;
  if (type && pst_info.analyze_registers (type))
    {
      /* The PCS says that it is invalid to pass an SVE value to an
	 unprototyped function.  There is no ABI-defined location we
	 can return in this case, so we have no real choice but to raise
	 an error immediately, even though this is only a query function.  */
      if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
	{
	  gcc_assert (!pcum->silent_p);
	  error ("SVE type %qT cannot be passed to an unprototyped function",
		 arg.type);
	  /* Avoid repeating the message, and avoid tripping the assert
	     below.  */
	  pcum->pcs_variant = ARM_PCS_SVE;
	}

      /* We would have converted the argument into pass-by-reference
	 form if it didn't fit in registers.  */
      pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
      pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
      gcc_assert (arg.named
		  && pcum->pcs_variant == ARM_PCS_SVE
		  && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
		  && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
      pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
					  P0_REGNUM + pcum->aapcs_nprn);
      return;
    }

  /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
     are passed by reference, not by value.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool sve_p = (vec_flags & VEC_ANY_SVE);
  if (sve_p)
    /* Vector types can acquire a partial SVE mode using things like
       __attribute__((vector_size(N))), and this is potentially useful.
       However, the choice of mode doesn't affect the type's ABI
       identity, so we should treat the types as though they had
       the associated integer mode, just like they did before SVE
       was introduced.

       We know that the vector must be 128 bits or smaller,
       otherwise we'd have passed it in memory instead.  */
    gcc_assert (type
		&& (aarch64_some_values_include_pst_objects_p (type)
		    || (vec_flags & VEC_PARTIAL)));

  /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
  if (type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();
  size = ROUND_UP (size, UNITS_PER_WORD);

  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
						 mode,
						 type,
						 &nregs);
  gcc_assert (!sve_p || !allocate_nvrn);

  /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
     The following code thus handles passing by SIMD/FP registers first.  */

  nvrn = pcum->aapcs_nvrn;

  /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
     and homogenous short-vector aggregates (HVA).  */
  if (allocate_nvrn)
    {
      if (!pcum->silent_p && !TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode);

      if (nvrn + nregs <= NUM_FP_ARG_REGS)
	{
	  pcum->aapcs_nextnvrn = nvrn + nregs;
	  if (!aarch64_composite_type_p (type, mode))
	    {
	      gcc_assert (nregs == 1);
	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
	    }
	  else
	    {
	      rtx par;
	      int i;
	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
	      for (i = 0; i < nregs; i++)
		{
		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
					 V0_REGNUM + nvrn + i);
		  rtx offset = gen_int_mode
		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
		  XVECEXP (par, 0, i) = tmp;
		}
	      pcum->aapcs_reg = par;
	    }
	  return;
	}
      else
	{
	  /* C.3 NSRN is set to 8.  */
	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
	  goto on_stack;
	}
    }

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9.  though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
    {
      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
	 rounded up to the next even number.  */
      if (nregs == 2
	  && ncrn % 2
	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
	     comparison is there because for > 16 * BITS_PER_UNIT
	     alignment nregs should be > 2 and therefore it should be
	     passed by reference rather than value.  */
	  && (aarch64_function_arg_alignment (mode, type, &abi_break)
	      == 16 * BITS_PER_UNIT))
	{
	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
	    inform (input_location, "parameter passing for argument of type "
		    "%qT changed in GCC 9.1", type);
	  ++ncrn;
	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
	}

      /* If an argument with an SVE mode needs to be shifted up to the
	 high part of the register, treat it as though it had an integer mode.
	 Using the normal (parallel [...]) would suppress the shifting.  */
      if (sve_p
	  && BYTES_BIG_ENDIAN
	  && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
	  && aarch64_pad_reg_upward (mode, type, false))
	{
	  mode = int_mode_for_mode (mode).require ();
	  sve_p = false;
	}

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
	 A reg is still generated for it, but the caller should be smart
	 enough not to use it.  */
      if (nregs == 0
	  || (nregs == 1 && !sve_p)
	  || GET_MODE_CLASS (mode) == MODE_INT)
	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
      else
	{
	  rtx par;
	  int i;

	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
	  for (i = 0; i < nregs; i++)
	    {
	      scalar_int_mode reg_mode = word_mode;
	      if (nregs == 1)
		reg_mode = int_mode_for_mode (mode).require ();
	      rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
				       GEN_INT (i * UNITS_PER_WORD));
	      XVECEXP (par, 0, i) = tmp;
	    }
	  pcum->aapcs_reg = par;
	}

      pcum->aapcs_nextncrn = ncrn + nregs;
      return;
    }

  /* C.11  */
  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
on_stack:
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;

  if (aarch64_function_arg_alignment (mode, type, &abi_break)
      == 16 * BITS_PER_UNIT)
    {
      int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
      if (pcum->aapcs_stack_size != new_size)
	{
	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
	    inform (input_location, "parameter passing for argument of type "
		    "%qT changed in GCC 9.1", type);
	  pcum->aapcs_stack_size = new_size;
	}
    }
  return;
}
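/* Illustrative example of rule C.8 above (an assumed typical layout,
   not an exhaustive description): if NGRN is 1 because x0 is already
   in use and the next argument is a 16-byte, 16-byte-aligned structure,
   NCRN is bumped to 2 so the argument occupies the even/odd pair x2/x3
   rather than straddling x1/x2.  */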
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
	      || pcum->pcs_variant == ARM_PCS_SIMD
	      || pcum->pcs_variant == ARM_PCS_SVE);

  if (arg.end_marker_p ())
    return gen_int_mode (pcum->pcs_variant, DImode);

  aarch64_layout_arg (pcum_v, arg);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
			      const_tree fntype,
			      rtx libname ATTRIBUTE_UNUSED,
			      const_tree fndecl ATTRIBUTE_UNUSED,
			      unsigned n_named ATTRIBUTE_UNUSED,
			      bool silent_p)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nprn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->aapcs_nextnprn = 0;
  if (fntype)
    pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
  else
    pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;
  pcum->silent_p = silent_p;

  if (!silent_p
      && !TARGET_FLOAT
      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
						   &mode, &nregs, NULL, false))
	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
    }

  if (!silent_p
      && !TARGET_SVE
      && pcum->pcs_variant == ARM_PCS_SVE)
    {
      /* We can't gracefully recover at this point, so make this a
	 fatal error.  */
      if (fndecl)
	fatal_error (input_location, "%qE requires the SVE ISA extension",
		     fndecl);
      else
	fatal_error (input_location, "calls to functions of type %qT require"
		     " the SVE ISA extension", fntype);
    }
}
static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
			      const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64
      || pcum->pcs_variant == ARM_PCS_SIMD
      || pcum->pcs_variant == ARM_PCS_SVE)
    {
      aarch64_layout_arg (pcum_v, arg);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
		  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_nprn = pcum->aapcs_nextnprn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   16 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int abi_break;
  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
							    &abi_break);
  alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
  if (abi_break && warn_psabi)
    {
      abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
      if (alignment != abi_break)
	inform (input_location, "parameter passing for argument of type "
		"%qT changed in GCC 9.1", type);
    }

  return alignment;
}
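/* Worked numbers for the clamp above (the usual AArch64 values, stated
   as an assumption): PARM_BOUNDARY is 64 and STACK_BOUNDARY is 128, so
   a type with 8-bit alignment is promoted to a 64-bit boundary and a
   type requesting 256-bit alignment is capped at 128 bits.  */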
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
	 || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
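/* Illustrative example: on a big-endian target a "short" passed on the
   stack is padded downward, so in its 8-byte slot the value occupies
   the two highest-addressed bytes and the first six bytes are padding.
   On little-endian targets everything is padded upward instead.  */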
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
			bool first ATTRIBUTE_UNUSED)
{
  /* Aside from pure scalable types, small composite types are always
     padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
	size = int_size_in_bytes (type);
      else
	/* No frontends can create types with variable-sized modes, so we
	   shouldn't be asked to pass or return them.  */
	size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
	{
	  pure_scalable_type_info pst_info;
	  if (pst_info.analyze_registers (type))
	    return false;
	  return true;
	}
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif
6846 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6847 inclusive. These are offsets from the current stack pointer. */
6850 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
6853 if (!poly_size
.is_constant (&size
))
6855 sorry ("stack probes for SVE frames");
6859 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REGNUM
);
6861 /* See the same assertion on PROBE_INTERVAL above. */
6862 gcc_assert ((first
% ARITH_FACTOR
) == 0);
6864 /* See if we have a constant small number of probes to generate. If so,
6865 that's the easy case. */
6866 if (size
<= PROBE_INTERVAL
)
6868 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
6870 emit_set_insn (reg1
,
6871 plus_constant (Pmode
,
6872 stack_pointer_rtx
, -(first
+ base
)));
6873 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
6876 /* The run-time loop is made up of 8 insns in the generic case while the
6877 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
6878 else if (size
<= 4 * PROBE_INTERVAL
)
6880 HOST_WIDE_INT i
, rem
;
6882 emit_set_insn (reg1
,
6883 plus_constant (Pmode
,
6885 -(first
+ PROBE_INTERVAL
)));
6886 emit_stack_probe (reg1
);
6888 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6889 it exceeds SIZE. If only two probes are needed, this will not
6890 generate any code. Then probe at FIRST + SIZE. */
6891 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
6893 emit_set_insn (reg1
,
6894 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
6895 emit_stack_probe (reg1
);
6898 rem
= size
- (i
- PROBE_INTERVAL
);
6901 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
6903 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
6904 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
6907 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
6910 /* Otherwise, do the same as above, but in a loop. Note that we must be
6911 extra careful with variables wrapping around because we might be at
6912 the very top (or the very bottom) of the address space and we have
6913 to be able to handle this case properly; in particular, we use an
6914 equality test for the loop condition. */
6917 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REGNUM
);
6919 /* Step 1: round SIZE to the previous multiple of the interval. */
6921 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
6924 /* Step 2: compute initial and final value of the loop counter. */
6926 /* TEST_ADDR = SP + FIRST. */
6927 emit_set_insn (reg1
,
6928 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
6930 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
6931 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
6932 if (! aarch64_uimm12_shift (adjustment
))
6934 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
6936 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
6939 emit_set_insn (reg2
,
6940 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
6946 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6949 while (TEST_ADDR != LAST_ADDR)
6951 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6952 until it is equal to ROUNDED_SIZE. */
6954 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
6957 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6958 that SIZE is equal to ROUNDED_SIZE. */
6960 if (size
!= rounded_size
)
6962 HOST_WIDE_INT rem
= size
- rounded_size
;
6966 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
6968 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
6969 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
6972 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
6976 /* Make sure nothing is scheduled before we are done. */
6977 emit_insn (gen_blockage ());
6980 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6981 absolute addresses. */
6984 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
6986 static int labelno
= 0;
6990 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
6993 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
6995 HOST_WIDE_INT stack_clash_probe_interval
6996 = 1 << param_stack_clash_protection_guard_size
;
6998 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7000 HOST_WIDE_INT interval
;
7001 if (flag_stack_clash_protection
)
7002 interval
= stack_clash_probe_interval
;
7004 interval
= PROBE_INTERVAL
;
7006 gcc_assert (aarch64_uimm12_shift (interval
));
7007 xops
[1] = GEN_INT (interval
);
7009 output_asm_insn ("sub\t%0, %0, %1", xops
);
7011 /* If doing stack clash protection then we probe up by the ABI specified
7012 amount. We do this because we're dropping full pages at a time in the
7013 loop. But if we're doing non-stack clash probing, probe at SP 0. */
7014 if (flag_stack_clash_protection
)
7015 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
7017 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
7019 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7020 by this amount for each iteration. */
7021 output_asm_insn ("str\txzr, [%0, %1]", xops
);
7023 /* Test if TEST_ADDR == LAST_ADDR. */
7025 output_asm_insn ("cmp\t%0, %1", xops
);
7028 fputs ("\tb.ne\t", asm_out_file
);
7029 assemble_name_raw (asm_out_file
, loop_lab
);
7030 fputc ('\n', asm_out_file
);
7035 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7036 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7037 of GUARD_SIZE. When a probe is emitted it is done at most
7038 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7039 at most MIN_PROBE_THRESHOLD. By the end of this function
7040 BASE = BASE - ADJUSTMENT. */
7043 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
7044 rtx min_probe_threshold
, rtx guard_size
)
7046 /* This function is not allowed to use any instruction generation function
7047 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7048 so instead emit the code you want using output_asm_insn. */
7049 gcc_assert (flag_stack_clash_protection
);
7050 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
7051 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
7053 /* The minimum required allocation before the residual requires probing. */
7054 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
7056 /* Clamp the value down to the nearest value that can be used with a cmp. */
7057 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
7058 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
7060 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
7061 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
7063 static int labelno
= 0;
7064 char loop_start_lab
[32];
7065 char loop_end_lab
[32];
7068 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
7069 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
7071 /* Emit loop start label. */
7072 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
7074 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7075 xops
[0] = adjustment
;
7076 xops
[1] = probe_offset_value_rtx
;
7077 output_asm_insn ("cmp\t%0, %1", xops
);
7079 /* Branch to end if not enough adjustment to probe. */
7080 fputs ("\tb.lt\t", asm_out_file
);
7081 assemble_name_raw (asm_out_file
, loop_end_lab
);
7082 fputc ('\n', asm_out_file
);
7084 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7086 xops
[1] = probe_offset_value_rtx
;
7087 output_asm_insn ("sub\t%0, %0, %1", xops
);
7089 /* Probe at BASE. */
7090 xops
[1] = const0_rtx
;
7091 output_asm_insn ("str\txzr, [%0, %1]", xops
);
7093 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7094 xops
[0] = adjustment
;
7095 xops
[1] = probe_offset_value_rtx
;
7096 output_asm_insn ("sub\t%0, %0, %1", xops
);
7098 /* Branch to start if still more bytes to allocate. */
7099 fputs ("\tb\t", asm_out_file
);
7100 assemble_name_raw (asm_out_file
, loop_start_lab
);
7101 fputc ('\n', asm_out_file
);
7103 /* No probe leave. */
7104 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
7106 /* BASE = BASE - ADJUSTMENT. */
7108 xops
[1] = adjustment
;
7109 output_asm_insn ("sub\t%0, %0, %1", xops
);
/* Determine whether a frame chain needs to be generated.  */
static bool
aarch64_needs_frame_chain (void)
{
  /* Force a frame chain for EH returns so the return address is at FP+8.  */
  if (frame_pointer_needed || crtl->calls_eh_return)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
7132 /* Mark the registers that need to be saved by the callee and calculate
7133 the size of the callee-saved registers area and frame record (both FP
7134 and LR may be omitted). */
7136 aarch64_layout_frame (void)
7138 poly_int64 offset
= 0;
7139 int regno
, last_fp_reg
= INVALID_REGNUM
;
7140 machine_mode vector_save_mode
= aarch64_reg_save_mode (V8_REGNUM
);
7141 poly_int64 vector_save_size
= GET_MODE_SIZE (vector_save_mode
);
7142 bool frame_related_fp_reg_p
= false;
7143 aarch64_frame
&frame
= cfun
->machine
->frame
;
7145 frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
7147 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7148 the mid-end is doing. */
7149 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
7151 #define SLOT_NOT_REQUIRED (-2)
7152 #define SLOT_REQUIRED (-1)
7154 frame
.wb_candidate1
= INVALID_REGNUM
;
7155 frame
.wb_candidate2
= INVALID_REGNUM
;
7156 frame
.spare_pred_reg
= INVALID_REGNUM
;
7158 /* First mark all the registers that really need to be saved... */
7159 for (regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
7160 frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
7162 /* ... that includes the eh data registers (if needed)... */
7163 if (crtl
->calls_eh_return
)
7164 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
7165 frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)] = SLOT_REQUIRED
;
7167 /* ... and any callee saved register that dataflow says is live. */
7168 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
7169 if (df_regs_ever_live_p (regno
)
7170 && !fixed_regs
[regno
]
7171 && (regno
== R30_REGNUM
7172 || !crtl
->abi
->clobbers_full_reg_p (regno
)))
7173 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7175 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7176 if (df_regs_ever_live_p (regno
)
7177 && !fixed_regs
[regno
]
7178 && !crtl
->abi
->clobbers_full_reg_p (regno
))
7180 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7181 last_fp_reg
= regno
;
7182 if (aarch64_emit_cfi_for_reg_p (regno
))
7183 frame_related_fp_reg_p
= true;
7186 /* Big-endian SVE frames need a spare predicate register in order
7187 to save Z8-Z15. Decide which register they should use. Prefer
7188 an unused argument register if possible, so that we don't force P4
7189 to be saved unnecessarily. */
7190 if (frame_related_fp_reg_p
7191 && crtl
->abi
->id () == ARM_PCS_SVE
7192 && BYTES_BIG_ENDIAN
)
7194 bitmap live1
= df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
7195 bitmap live2
= df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun
));
7196 for (regno
= P0_REGNUM
; regno
<= P7_REGNUM
; regno
++)
7197 if (!bitmap_bit_p (live1
, regno
) && !bitmap_bit_p (live2
, regno
))
7199 gcc_assert (regno
<= P7_REGNUM
);
7200 frame
.spare_pred_reg
= regno
;
7201 df_set_regs_ever_live (regno
, true);
7204 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
7205 if (df_regs_ever_live_p (regno
)
7206 && !fixed_regs
[regno
]
7207 && !crtl
->abi
->clobbers_full_reg_p (regno
))
7208 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7210 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
7211 LR counts as an implicit probe which allows us to maintain the invariant
7212 described in the comment at expand_prologue. */
7213 gcc_assert (crtl
->is_leaf
7214 || maybe_ne (frame
.reg_offset
[R30_REGNUM
], SLOT_NOT_REQUIRED
));
7216 /* Now assign stack slots for the registers. Start with the predicate
7217 registers, since predicate LDR and STR have a relatively small
7218 offset range. These saves happen below the hard frame pointer. */
7219 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
7220 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7222 frame
.reg_offset
[regno
] = offset
;
7223 offset
+= BYTES_PER_SVE_PRED
;
7226 if (maybe_ne (offset
, 0))
7228 /* If we have any vector registers to save above the predicate registers,
7229 the offset of the vector register save slots need to be a multiple
7230 of the vector size. This lets us use the immediate forms of LDR/STR
7231 (or LD1/ST1 for big-endian).
7233 A vector register is 8 times the size of a predicate register,
7234 and we need to save a maximum of 12 predicate registers, so the
7235 first vector register will be at either #1, MUL VL or #2, MUL VL.
7237 If we don't have any vector registers to save, and we know how
7238 big the predicate save area is, we can just round it up to the
7239 next 16-byte boundary. */
7240 if (last_fp_reg
== (int) INVALID_REGNUM
&& offset
.is_constant ())
7241 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7244 if (known_le (offset
, vector_save_size
))
7245 offset
= vector_save_size
;
7246 else if (known_le (offset
, vector_save_size
* 2))
7247 offset
= vector_save_size
* 2;
7253 /* If we need to save any SVE vector registers, add them next. */
7254 if (last_fp_reg
!= (int) INVALID_REGNUM
&& crtl
->abi
->id () == ARM_PCS_SVE
)
7255 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7256 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7258 frame
.reg_offset
[regno
] = offset
;
7259 offset
+= vector_save_size
;
7262 /* OFFSET is now the offset of the hard frame pointer from the bottom
7263 of the callee save area. */
7264 bool saves_below_hard_fp_p
= maybe_ne (offset
, 0);
7265 frame
.below_hard_fp_saved_regs_size
= offset
;
7266 if (frame
.emit_frame_chain
)
7268 /* FP and LR are placed in the linkage record. */
7269 frame
.reg_offset
[R29_REGNUM
] = offset
;
7270 frame
.wb_candidate1
= R29_REGNUM
;
7271 frame
.reg_offset
[R30_REGNUM
] = offset
+ UNITS_PER_WORD
;
7272 frame
.wb_candidate2
= R30_REGNUM
;
7273 offset
+= 2 * UNITS_PER_WORD
;
7276 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
7277 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7279 frame
.reg_offset
[regno
] = offset
;
7280 if (frame
.wb_candidate1
== INVALID_REGNUM
)
7281 frame
.wb_candidate1
= regno
;
7282 else if (frame
.wb_candidate2
== INVALID_REGNUM
)
7283 frame
.wb_candidate2
= regno
;
7284 offset
+= UNITS_PER_WORD
;
7287 poly_int64 max_int_offset
= offset
;
7288 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7289 bool has_align_gap
= maybe_ne (offset
, max_int_offset
);
7291 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7292 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7294 /* If there is an alignment gap between integer and fp callee-saves,
7295 allocate the last fp register to it if possible. */
7296 if (regno
== last_fp_reg
7298 && known_eq (vector_save_size
, 8)
7299 && multiple_p (offset
, 16))
7301 frame
.reg_offset
[regno
] = max_int_offset
;
7305 frame
.reg_offset
[regno
] = offset
;
7306 if (frame
.wb_candidate1
== INVALID_REGNUM
)
7307 frame
.wb_candidate1
= regno
;
7308 else if (frame
.wb_candidate2
== INVALID_REGNUM
7309 && frame
.wb_candidate1
>= V0_REGNUM
)
7310 frame
.wb_candidate2
= regno
;
7311 offset
+= vector_save_size
;
7314 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7316 frame
.saved_regs_size
= offset
;
7318 poly_int64 varargs_and_saved_regs_size
= offset
+ frame
.saved_varargs_size
;
7320 poly_int64 above_outgoing_args
7321 = aligned_upper_bound (varargs_and_saved_regs_size
7322 + get_frame_size (),
7323 STACK_BOUNDARY
/ BITS_PER_UNIT
);
7325 frame
.hard_fp_offset
7326 = above_outgoing_args
- frame
.below_hard_fp_saved_regs_size
;
7328 /* Both these values are already aligned. */
7329 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
7330 STACK_BOUNDARY
/ BITS_PER_UNIT
));
7331 frame
.frame_size
= above_outgoing_args
+ crtl
->outgoing_args_size
;
7333 frame
.locals_offset
= frame
.saved_varargs_size
;
7335 frame
.initial_adjust
= 0;
7336 frame
.final_adjust
= 0;
7337 frame
.callee_adjust
= 0;
7338 frame
.sve_callee_adjust
= 0;
7339 frame
.callee_offset
= 0;
7341 HOST_WIDE_INT max_push_offset
= 0;
7342 if (frame
.wb_candidate2
!= INVALID_REGNUM
)
7343 max_push_offset
= 512;
7344 else if (frame
.wb_candidate1
!= INVALID_REGNUM
)
7345 max_push_offset
= 256;
7347 HOST_WIDE_INT const_size
, const_outgoing_args_size
, const_fp_offset
;
7348 HOST_WIDE_INT const_saved_regs_size
;
7349 if (frame
.frame_size
.is_constant (&const_size
)
7350 && const_size
< max_push_offset
7351 && known_eq (frame
.hard_fp_offset
, const_size
))
7353 /* Simple, small frame with no outgoing arguments:
7355 stp reg1, reg2, [sp, -frame_size]!
7356 stp reg3, reg4, [sp, 16] */
7357 frame
.callee_adjust
= const_size
;
7359 else if (crtl
->outgoing_args_size
.is_constant (&const_outgoing_args_size
)
7360 && frame
.saved_regs_size
.is_constant (&const_saved_regs_size
)
7361 && const_outgoing_args_size
+ const_saved_regs_size
< 512
7362 /* We could handle this case even with outgoing args, provided
7363 that the number of args left us with valid offsets for all
7364 predicate and vector save slots. It's such a rare case that
7365 it hardly seems worth the effort though. */
7366 && (!saves_below_hard_fp_p
|| const_outgoing_args_size
== 0)
7367 && !(cfun
->calls_alloca
7368 && frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
7369 && const_fp_offset
< max_push_offset
))
7371 /* Frame with small outgoing arguments:
7373 sub sp, sp, frame_size
7374 stp reg1, reg2, [sp, outgoing_args_size]
7375 stp reg3, reg4, [sp, outgoing_args_size + 16] */
7376 frame
.initial_adjust
= frame
.frame_size
;
7377 frame
.callee_offset
= const_outgoing_args_size
;
7379 else if (saves_below_hard_fp_p
7380 && known_eq (frame
.saved_regs_size
,
7381 frame
.below_hard_fp_saved_regs_size
))
7383 /* Frame in which all saves are SVE saves:
7385 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
7386 save SVE registers relative to SP
7387 sub sp, sp, outgoing_args_size */
7388 frame
.initial_adjust
= (frame
.hard_fp_offset
7389 + frame
.below_hard_fp_saved_regs_size
);
7390 frame
.final_adjust
= crtl
->outgoing_args_size
;
7392 else if (frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
7393 && const_fp_offset
< max_push_offset
)
7395 /* Frame with large outgoing arguments or SVE saves, but with
7398 stp reg1, reg2, [sp, -hard_fp_offset]!
7399 stp reg3, reg4, [sp, 16]
7400 [sub sp, sp, below_hard_fp_saved_regs_size]
7401 [save SVE registers relative to SP]
7402 sub sp, sp, outgoing_args_size */
7403 frame
.callee_adjust
= const_fp_offset
;
7404 frame
.sve_callee_adjust
= frame
.below_hard_fp_saved_regs_size
;
7405 frame
.final_adjust
= crtl
->outgoing_args_size
;
7409 /* Frame with large local area and outgoing arguments or SVE saves,
7410 using frame pointer:
7412 sub sp, sp, hard_fp_offset
7413 stp x29, x30, [sp, 0]
7415 stp reg3, reg4, [sp, 16]
7416 [sub sp, sp, below_hard_fp_saved_regs_size]
7417 [save SVE registers relative to SP]
7418 sub sp, sp, outgoing_args_size */
7419 frame
.initial_adjust
= frame
.hard_fp_offset
;
7420 frame
.sve_callee_adjust
= frame
.below_hard_fp_saved_regs_size
;
7421 frame
.final_adjust
= crtl
->outgoing_args_size
;
7424 /* Make sure the individual adjustments add up to the full frame size. */
7425 gcc_assert (known_eq (frame
.initial_adjust
7426 + frame
.callee_adjust
7427 + frame
.sve_callee_adjust
7428 + frame
.final_adjust
, frame
.frame_size
));
7430 if (!frame
.emit_frame_chain
&& frame
.callee_adjust
== 0)
7432 /* We've decided not to associate any register saves with the initial
7433 stack allocation. */
7434 frame
.wb_candidate1
= INVALID_REGNUM
;
7435 frame
.wb_candidate2
= INVALID_REGNUM
;
7438 frame
.laid_out
= true;
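/* A small worked example of the frame-layout choices above (an assumed
   typical outcome, not an exhaustive description): a function whose
   only saves are x29/x30, with no SVE saves and no outgoing arguments,
   takes the "simple, small frame" path, so the whole allocation is
   folded into a single

	stp	x29, x30, [sp, -frame_size]!

   whereas frames with large outgoing-argument areas fall through to the
   later cases, which first "sub sp, sp, hard_fp_offset" and only then
   store the callee saves.  */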
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return known_ge (cfun->machine->frame.reg_offset[regno], 0);
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_TFmode:
      return gen_storewb_pairtf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_VREG - adjustment));
    default:
      gcc_unreachable ();
    }
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = aarch64_reg_save_mode (regno1);

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_TFmode:
      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_VREG));
    default:
      gcc_unreachable ();
    }
}
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = aarch64_reg_save_mode (regno1);
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}

/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
			rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    case E_TFmode:
      return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);

    case E_V4SImode:
      return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);

    case E_V16QImode:
      return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
		       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    case E_TFmode:
      return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);

    case E_V4SImode:
      return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* Turn return address signing off in any function that uses
     __builtin_eh_return.  The address passed to __builtin_eh_return
     is not signed so either it has to be signed (with original sp)
     or the code path that uses it has to avoid authenticating it.
     Currently eh return introduces a return to anywhere gadget, no
     matter what we do here since it uses ret with user provided
     address.  An ideal fix for that is to use indirect branch which
     can be protected with BTI j (to some extent).  */
  if (crtl->calls_eh_return)
    return false;

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
}

/* Return TRUE if Branch Target Identification Mechanism is enabled.  */
bool
aarch64_bti_enabled (void)
{
  return (aarch64_enable_bti == 1);
}
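/* Usage note (an assumption about the generated code rather than
   something enforced here): when this predicate is true the prologue
   and epilogue emit PACIASP/AUTIASP, encoded in the HINT space so the
   code still runs on cores without pointer authentication, and
   aarch64_bti_enabled () similarly gates the BTI landing pads placed at
   function entries and indirect-branch targets.  */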
/* The caller is going to use ST1D or LD1D to save or restore an SVE
   register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
   the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:

     (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
	 or LD1D address

     (2) setting PRED to a valid predicate register for the ST1D or LD1D,
	 if the variable isn't already nonnull

   (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
   Handle this case using a temporary base register that is suitable for
   all offsets in that range.  Use ANCHOR_REG as this base register if it
   is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */

static void
aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
				     rtx &anchor_reg, poly_int64 &offset,
				     rtx &ptrue)
{
  if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
    {
      /* This is the maximum valid offset of the anchor from the base.
	 Lower values would be valid too.  */
      poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
      if (!anchor_reg)
	{
	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
				    gen_int_mode (anchor_offset, Pmode)));
	}
      base_rtx = anchor_reg;
      offset -= anchor_offset;
    }
  if (!ptrue)
    {
      int pred_reg = cfun->machine->frame.spare_pred_reg;
      emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
		      CONSTM1_RTX (VNx16BImode));
      ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
    }
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (GET_MODE (reg),
			   plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
}

/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
   is true if the hard frame pointer has been set up.  */

static void
aarch64_save_callee_saves (poly_int64 start_offset,
                           unsigned start, unsigned limit, bool skip_wb,
                           bool hard_fp_valid_p)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;
  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);

      if (skip_wb
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))
        continue;

      if (cfun->machine->reg_is_wrapped_separately[regno])
        continue;

      machine_mode mode = aarch64_reg_save_mode (regno);
      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      rtx base_rtx = stack_pointer_rtx;
      poly_int64 sp_offset = offset;

      HOST_WIDE_INT const_offset;
      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
        aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
                                             offset, ptrue);
      else if (GP_REGNUM_P (regno)
               && (!offset.is_constant (&const_offset) || const_offset >= 512))
        {
          gcc_assert (known_eq (start_offset, 0));
          poly_int64 fp_offset
            = cfun->machine->frame.below_hard_fp_saved_regs_size;
          if (hard_fp_valid_p)
            base_rtx = hard_frame_pointer_rtx;
          else
            {
              if (!anchor_reg)
                {
                  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
                  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
                                            gen_int_mode (fp_offset, Pmode)));
                }
              base_rtx = anchor_reg;
            }
          offset -= fp_offset;
        }
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
      bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);

      if (!aarch64_sve_mode_p (mode)
          && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
          && !cfun->machine->reg_is_wrapped_separately[regno2]
          && known_eq (GET_MODE_SIZE (mode),
                       cfun->machine->frame.reg_offset[regno2]
                       - cfun->machine->frame.reg_offset[regno]))
        {
          rtx mem2;
          rtx reg2 = gen_rtx_REG (mode, regno2);

          offset += GET_MODE_SIZE (mode);
          mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
          insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
                                                    reg2));

          /* The first part of a frame-related parallel insn is
             always assumed to be relevant to the frame
             calculations; subsequent parts are only
             frame-related if explicitly marked.  */
          if (aarch64_emit_cfi_for_reg_p (regno2))
            {
              if (need_cfa_note_p)
                aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
                                            sp_offset + GET_MODE_SIZE (mode));
              else
                RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
            }

          regno = regno2;
        }
      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
        {
          insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
          need_cfa_note_p = true;
        }
      else if (aarch64_sve_mode_p (mode))
        insn = emit_insn (gen_rtx_SET (mem, reg));
      else
        insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      if (frame_related_p && need_cfa_note_p)
        aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
    }
}
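
/* For instance, if x19 and x20 are saved to adjacent slots 8 bytes apart,
   the code above merges the two saves into a single store pair
   (stp x19, x20, [sp, #offset]) and, when CFI is needed for the second
   register, marks that half of the parallel as frame-related too.  */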

/* Emit code to restore the callee registers from register number START
   up to and including LIMIT.  Restore from the stack offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  Write the
   appropriate REG_CFA_RESTORE notes into CFI_OPS.  */

static void
aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
                              unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;
  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
      if (cfun->machine->reg_is_wrapped_separately[regno])
        continue;

      rtx reg, mem;

      if (skip_wb
          && (regno == cfun->machine->frame.wb_candidate1
              || regno == cfun->machine->frame.wb_candidate2))
        continue;

      machine_mode mode = aarch64_reg_save_mode (regno);
      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      rtx base_rtx = stack_pointer_rtx;
      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
        aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
                                             offset, ptrue);
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      if (!aarch64_sve_mode_p (mode)
          && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
          && !cfun->machine->reg_is_wrapped_separately[regno2]
          && known_eq (GET_MODE_SIZE (mode),
                       cfun->machine->frame.reg_offset[regno2]
                       - cfun->machine->frame.reg_offset[regno]))
        {
          rtx reg2 = gen_rtx_REG (mode, regno2);
          rtx mem2;

          offset += GET_MODE_SIZE (mode);
          mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
          emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

          *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
          regno = regno2;
        }
      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
        emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
      else if (aarch64_sve_mode_p (mode))
        emit_insn (gen_rtx_SET (reg, mem));
      else
        emit_move_insn (reg, mem);
      if (frame_related_p)
        *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}

/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is a signed 6-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -32, 31));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
                                       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
          && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
          && IN_RANGE (multiple, 0, 4095));
}
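
/* As a concrete example, for DImode (8-byte) accesses the 7-bit signed
   scaled test above accepts multiples of 8 in [-512, 504] (the LDP/STP
   immediate range), while the 12-bit unsigned scaled test accepts
   multiples of 8 in [0, 32760] (the unsigned LDR/STR immediate range).  */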

/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
        /* Punt on saves and restores that use ST1D and LD1D.  We could
           try to be smarter, but it would involve making sure that the
           spare predicate register itself is safe to use at the save
           and restore points.  Also, when a frame pointer is being used,
           the slots are often out of reach of ST1D and LD1D anyway.  */
        machine_mode mode = aarch64_reg_save_mode (regno);
        if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
          continue;

        poly_int64 offset = cfun->machine->frame.reg_offset[regno];

        /* If the register is saved in the first SVE save slot, we use
           it as a stack probe for -fstack-clash-protection.  */
        if (flag_stack_clash_protection
            && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
            && known_eq (offset, 0))
          continue;

        /* Get the offset relative to the register we'll use.  */
        if (frame_pointer_needed)
          offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
        else
          offset += crtl->outgoing_args_size;

        /* Check that we can access the stack slot of the register with one
           direct load with no adjustments needed.  */
        if (aarch64_sve_mode_p (mode)
            ? offset_9bit_signed_scaled_p (mode, offset)
            : offset_12bit_unsigned_scaled_p (mode, offset))
          bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  /* If the spare predicate register used by big-endian SVE code
     is call-preserved, it must be saved in the main prologue
     before any saves that use it.  */
  if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
    bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If registers have been chosen to be stored/restored with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}

/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* Clobbered registers don't generate values in any meaningful sense,
     since nothing after the clobber can rely on their value.  And we can't
     say that partially-clobbered registers are unconditionally killed,
     because whether they're killed or not depends on the mode of the
     value they're holding.  Thus partially call-clobbered registers
     appear in neither the kill set nor the gen set.

     Check manually for any calls that clobber more of a register than the
     current function can.  */
  function_abi_aggregator callee_abis;
  rtx_insn *insn;
  FOR_BB_INSNS (bb, insn)
    if (CALL_P (insn))
      callee_abis.note_callee_abi (insn_callee_abi (insn));
  HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (!fixed_regs[regno]
        && !crtl->abi->clobbers_full_reg_p (regno)
        && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
            || bitmap_bit_p (in, regno)
            || bitmap_bit_p (gen, regno)
            || bitmap_bit_p (kill, regno)))
      {
        bitmap_set_bit (components, regno);

        /* If there is a callee-save at an adjacent offset, add it too
           to increase the use of LDP/STP.  */
        poly_int64 offset = cfun->machine->frame.reg_offset[regno];
        unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;

        if (regno2 <= LAST_SAVED_REGNUM)
          {
            poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
            if (multiple_p (offset, 16)
                ? known_eq (offset + 8, offset2)
                : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
              bitmap_set_bit (components, regno2);
          }
      }

  return components;
}

/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
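
/* For example, if bits 2 and 5 are the only members of BMP, then
   aarch64_get_next_set_bit (bmp, 3) returns 5 and
   aarch64_get_next_set_bit (bmp, 6) returns SBITMAP_SIZE (bmp).  */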

/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
                             ? HARD_FRAME_POINTER_REGNUM
                             : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
      machine_mode mode = aarch64_reg_save_mode (regno);

      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (frame_pointer_needed)
        offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
      else
        offset += crtl->outgoing_args_size;

      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
         Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
        {
          insn = emit_insn (set);
          if (frame_related_p)
            {
              RTX_FRAME_RELATED_P (insn) = 1;
              if (prologue_p)
                add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
              else
                add_reg_note (insn, REG_CFA_RESTORE, reg);
            }
          break;
        }

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
         mergeable with the current one into a pair.  */
      if (aarch64_sve_mode_p (mode)
          || !satisfies_constraint_Ump (mem)
          || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
          || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
          || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
                       GET_MODE_SIZE (mode)))
        {
          insn = emit_insn (set);
          if (frame_related_p)
            {
              RTX_FRAME_RELATED_P (insn) = 1;
              if (prologue_p)
                add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
              else
                add_reg_note (insn, REG_CFA_RESTORE, reg);
            }

          regno = regno2;
          continue;
        }

      bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (frame_pointer_needed)
        offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
      else
        offset2 += crtl->outgoing_args_size;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
                            : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
        insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
        insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      if (frame_related_p || frame_related2_p)
        {
          RTX_FRAME_RELATED_P (insn) = 1;
          if (prologue_p)
            {
              if (frame_related_p)
                add_reg_note (insn, REG_CFA_OFFSET, set);
              if (frame_related2_p)
                add_reg_note (insn, REG_CFA_OFFSET, set2);
            }
          else
            {
              if (frame_related_p)
                add_reg_note (insn, REG_CFA_RESTORE, reg);
              if (frame_related2_p)
                add_reg_note (insn, REG_CFA_RESTORE, reg2);
            }
        }

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}

/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}

/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}
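
/* STACK_CLASH_CALLER_GUARD is the ABI-defined caller buffer referred to
   in the stack-clash comments in this file (the 1kB outgoing-argument
   buffer), so alloca allocations are probed at most that far beyond the
   known-probed part of the stack.  */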

/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
   registers.  If POLY_SIZE is not large enough to require a probe this
   function will only adjust the stack.  When allocating the stack space
   FRAME_RELATED_P is then used to indicate if the allocation is frame related.
   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
   arguments.  If we are then we ensure that any allocation larger than the ABI
   defined buffer needs a probe so that the invariant of having a 1KB buffer is
   maintained.

   We emit barriers after each stack adjustment to prevent optimizations from
   breaking the invariant that we never drop the stack more than a page.  This
   invariant is needed to make it easier to correctly handle asynchronous
   events, e.g. if we were to allow the stack to be dropped by more than a page
   and then have multiple probes up and we take a signal somewhere in between
   then the signal handler doesn't know the state of the stack and can make no
   assumptions about which pages have been probed.  */

static void
aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
                                        poly_int64 poly_size,
                                        bool frame_related_p,
                                        bool final_adjustment_p)
{
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  HOST_WIDE_INT min_probe_threshold
    = (final_adjustment_p
       ? guard_used_by_caller
       : guard_size - guard_used_by_caller);
  /* When doing the final adjustment for the outgoing arguments, take into
     account any unprobed space there is above the current SP.  There are
     two cases:

     - When saving SVE registers below the hard frame pointer, we force
       the lowest save to take place in the prologue before doing the final
       adjustment (i.e. we don't allow the save to be shrink-wrapped).
       This acts as a probe at SP, so there is no unprobed space.

     - When there are no SVE register saves, we use the store of the link
       register as a probe.  We can't assume that LR was saved at position 0
       though, so treat any space below it as unprobed.  */
  if (final_adjustment_p
      && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
    {
      poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
      if (known_ge (lr_offset, 0))
        min_probe_threshold -= lr_offset.to_constant ();
      else
        gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
    }

  poly_int64 frame_size = cfun->machine->frame.frame_size;

  /* We should always have a positive probe threshold.  */
  gcc_assert (min_probe_threshold > 0);

  if (flag_stack_clash_protection && !final_adjustment_p)
    {
      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
      poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
      poly_int64 final_adjust = cfun->machine->frame.final_adjust;

      if (known_eq (frame_size, 0))
        dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
      else if (known_lt (initial_adjust + sve_callee_adjust,
                         guard_size - guard_used_by_caller)
               && known_lt (final_adjust, guard_used_by_caller))
        dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
    }

  /* If SIZE is not large enough to require probing, just adjust the stack and
     return.  */
  if (known_lt (poly_size, min_probe_threshold)
      || !flag_stack_clash_protection)
    {
      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
      return;
    }

  HOST_WIDE_INT size;
  /* Handle the SVE non-constant case first.  */
  if (!poly_size.is_constant (&size))
    {
      if (dump_file)
        {
          fprintf (dump_file, "Stack clash SVE prologue: ");
          print_dec (poly_size, dump_file);
          fprintf (dump_file, " bytes, dynamic probing will be required.\n");
        }

      /* First calculate the amount of bytes we're actually spilling.  */
      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
                          poly_size, temp1, temp2, false, true);

      rtx_insn *insn = get_last_insn ();

      if (frame_related_p)
        {
          /* This is done to provide unwinding information for the stack
             adjustments we're about to do, however to prevent the optimizers
             from removing the R11 move and leaving the CFA note (which would be
             very wrong) we tie the old and new stack pointer together.
             The tie will expand to nothing but the optimizers will not touch
             the instruction.  */
          rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
          emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
          emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));

          /* We want the CFA independent of the stack pointer for the
             duration of the loop.  */
          add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
          RTX_FRAME_RELATED_P (insn) = 1;
        }

      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
      rtx guard_const = gen_int_mode (guard_size, Pmode);

      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
                                                   stack_pointer_rtx, temp1,
                                                   probe_const, guard_const));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
        {
          add_reg_note (insn, REG_CFA_DEF_CFA,
                        gen_rtx_PLUS (Pmode, stack_pointer_rtx,
                                      gen_int_mode (poly_size, Pmode)));
          RTX_FRAME_RELATED_P (insn) = 1;
        }

      return;
    }

  if (dump_file)
    fprintf (dump_file,
             "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
             " bytes, probing will be required.\n", size);

  /* Round size to the nearest multiple of guard_size, and calculate the
     residual as the difference between the original size and the rounded
     size.  */
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
     punt to a loop.  */
  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
    {
      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
        {
          aarch64_sub_sp (NULL, temp2, guard_size, true);
          emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
                                           guard_used_by_caller));
          emit_insn (gen_blockage ());
        }
      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
    }
  else
    {
      /* Compute the ending address.  */
      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
                          temp1, NULL, false, true);
      rtx_insn *insn = get_last_insn ();

      /* For the initial allocation, we don't have a frame pointer
         set up, so we always need CFI notes.  If we're doing the
         final allocation, then we may have a frame pointer, in which
         case it is the CFA, otherwise we need CFI notes.

         We can determine which allocation we are doing by looking at
         the value of FRAME_RELATED_P since the final allocations are not
         frame related.  */
      if (frame_related_p)
        {
          /* We want the CFA independent of the stack pointer for the
             duration of the loop.  */
          add_reg_note (insn, REG_CFA_DEF_CFA,
                        plus_constant (Pmode, temp1, rounded_size));
          RTX_FRAME_RELATED_P (insn) = 1;
        }

      /* This allocates and probes the stack.  Note that this re-uses some of
         the existing Ada stack protection code.  However we are guaranteed not
         to enter the non loop or residual branches of that code.

         The non-loop part won't be entered because if our allocation amount
         doesn't require a loop, the case above would handle it.

         The residual amount won't be entered because TEMP1 is a multiple of
         the allocation size.  The residual will always be 0.  As such, the only
         part we are actually using from that code is the loop setup.  The
         actual probing is done in aarch64_output_probe_stack_range.  */
      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
                                               stack_pointer_rtx, temp1));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
        {
          add_reg_note (insn, REG_CFA_DEF_CFA,
                        plus_constant (Pmode, stack_pointer_rtx, rounded_size));
          RTX_FRAME_RELATED_P (insn) = 1;
        }

      emit_insn (gen_blockage ());
      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
    }

  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This means that for any allocation that is large enough to
     trigger a probe here, we'll have at least one, and if they're not large
     enough for this code to emit anything for them, the page would have been
     probed by the saving of FP/LR either by this function or any callees.  If
     we don't have any callees then we won't have more stack adjustments and so
     are still safe.  */
  if (residual)
    {
      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
      /* If we're doing final adjustments, and we've done any full page
         allocations then any residual needs to be probed.  */
      if (final_adjustment_p && rounded_size != 0)
        min_probe_threshold = 0;
      /* If doing a small final adjustment, we always probe at offset 0.
         This is done to avoid issues when LR is not at position 0 or when
         the final adjustment is smaller than the probing offset.  */
      else if (final_adjustment_p && rounded_size == 0)
        residual_probe_offset = 0;

      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
      if (residual >= min_probe_threshold)
        {
          if (dump_file)
            fprintf (dump_file,
                     "Stack clash AArch64 prologue residuals: "
                     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
                     "\n", residual);

          emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
                                           residual_probe_offset));
          emit_insn (gen_blockage ());
        }
    }
}
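
/* Worked example, assuming the default 64kB guard and the 1kB caller
   buffer: a constant initial allocation of 200kB (small enough for the
   inline case above) becomes three 64kB drops, each followed by a probe
   1kB above the new SP and a scheduling blockage, plus an 8kB residual
   drop.  The residual is only probed when it reaches MIN_PROBE_THRESHOLD,
   which for the initial allocation is 64kB - 1kB.  */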

/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.

   For SIMD functions we need to return 1 for FP registers that are saved and
   restored by a function but are not zero in call_used_regs.  If we do not do
   this optimizations may remove the restore of the register.  */

int
aarch64_epilogue_uses (int regno)
{
  if (epilogue_completed)
    {
      if (regno == LR_REGNUM)
        return 1;
    }
  return 0;
}

/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding                      | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          |  |
	+-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
	|  SVE vector registers         |  | \
	+-------------------------------+  |  | below_hard_fp_saved_regs_size
	|  SVE predicate registers      | /  /
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size to
   be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We have to track how much space has been allocated and the only stores
   to the stack we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled, and also
	  as an anchor register when saving and restoring registers
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
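
/* With the layout above, a typical small non-leaf frame is established by
   something like:

	stp	x29, x30, [sp, #-32]!
	mov	x29, sp

   i.e. a single callee-save push with writeback that also performs the
   initial stack adjustment, followed by setting up the frame chain.  */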

/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
  poly_int64 below_hard_fp_saved_regs_size
    = cfun->machine->frame.below_hard_fp_saved_regs_size;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
    {
      /* Fold the SVE allocation into the initial allocation.
         We don't do this in aarch64_layout_frame to avoid pessimizing
         the epilogue code.  */
      initial_adjust += sve_callee_adjust;
      sve_callee_adjust = 0;
    }

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      switch (aarch64_ra_sign_key)
        {
          case AARCH64_KEY_A:
            insn = emit_insn (gen_paciasp ());
            break;
          case AARCH64_KEY_B:
            insn = emit_insn (gen_pacibsp ());
            break;
          default:
            gcc_unreachable ();
        }
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
        {
          if (maybe_gt (frame_size, PROBE_INTERVAL)
              && maybe_gt (frame_size, get_stack_check_protect ()))
            aarch64_emit_probe_stack_range (get_stack_check_protect (),
                                            (frame_size
                                             - get_stack_check_protect ()));
        }
      else if (maybe_gt (frame_size, 0))
        aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);

  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that is the case since the
     code below does not handle it for -fstack-clash-protection.  */
  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);

  /* Will only probe if the initial adjustment is larger than the guard
     less the amount of the guard reserved for use by the caller's
     outgoing args.  */
  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
                                          true, false);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  /* The offset of the frame chain record (if any) from the current SP.  */
  poly_int64 chain_offset = (initial_adjust + callee_adjust
                             - cfun->machine->frame.hard_fp_offset);
  gcc_assert (known_ge (chain_offset, 0));

  /* The offset of the bottom of the save area from the current SP.  */
  poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;

  if (emit_frame_chain)
    {
      if (callee_adjust == 0)
        {
          reg1 = R29_REGNUM;
          reg2 = R30_REGNUM;
          aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
                                     false, false);
        }
      else
        gcc_assert (known_eq (chain_offset, 0));
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
                          stack_pointer_rtx, chain_offset,
                          tmp1_rtx, tmp0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
        {
          /* Variable-sized frames need to describe the save slot
             address using DW_CFA_expression rather than DW_CFA_offset.
             This means that, without taking further action, the
             locations of the registers that we've already saved would
             remain based on the stack pointer even after we redefine
             the CFA based on the frame pointer.  We therefore need new
             DW_CFA_expressions to re-express the save slots with addresses
             based on the frame pointer.  */
          rtx_insn *insn = get_last_insn ();
          gcc_assert (RTX_FRAME_RELATED_P (insn));

          /* Add an explicit CFA definition if this was previously
             implicit.  */
          if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
            {
              rtx src = plus_constant (Pmode, stack_pointer_rtx,
                                       chain_offset);
              add_reg_note (insn, REG_CFA_ADJUST_CFA,
                            gen_rtx_SET (hard_frame_pointer_rtx, src));
            }

          /* Change the save slot expressions for the registers that
             we've already saved.  */
          aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
                                      hard_frame_pointer_rtx, UNITS_PER_WORD);
          aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
                                      hard_frame_pointer_rtx, 0);
        }
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
                             callee_adjust != 0 || emit_frame_chain,
                             emit_frame_chain);
  if (maybe_ne (sve_callee_adjust, 0))
    {
      gcc_assert (!flag_stack_clash_protection
                  || known_eq (initial_adjust, 0));
      aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
                                              sve_callee_adjust,
                                              !frame_pointer_needed, false);
      saved_regs_offset += sve_callee_adjust;
    }
  aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
                             false, emit_frame_chain);
  aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
                             callee_adjust != 0 || emit_frame_chain,
                             emit_frame_chain);

  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
                                          !frame_pointer_needed, true);
}

/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  return known_eq (cfun->machine->frame.frame_size, 0);
}

/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
  poly_int64 below_hard_fp_saved_regs_size
    = cfun->machine->frame.below_hard_fp_saved_regs_size;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;

  /* A stack clash protection prologue may not have left EP0_REGNUM or
     EP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  For stack clash we are in a usable state if
     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;

  /* We can re-use the registers when:

     (a) the deallocation amount is the same as the corresponding
         allocation amount (which is false if we combine the initial
         and SVE callee save allocations in the prologue); and

     (b) the allocation amount doesn't need a probe (which is false
         if the amount is guard_size - guard_used_by_caller or greater).

     In such situations the register should remain live with the correct
     value.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
                        && final_adjust.is_constant ()
                        && (!flag_stack_clash_protection
                            || (known_lt (initial_adjust,
                                          guard_size - guard_used_by_caller)
                                && known_eq (sve_callee_adjust, 0))));

  /* We need to add memory barrier to prevent read from deallocated stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
                + cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
                        hard_frame_pointer_rtx,
                        -callee_offset - below_hard_fp_saved_regs_size,
                        tmp1_rtx, tmp0_rtx, callee_adjust == 0);
  else
    /* The case where we need to re-use the register here is very rare, so
       avoid the complicated condition and just always emit a move if the
       immediate doesn't fit.  */
    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);

  /* Restore the vector registers before the predicate registers,
     so that we can use P4 as a temporary for big-endian SVE frames.  */
  aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
                                callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
                                false, &cfi_ops);
  if (maybe_ne (sve_callee_adjust, 0))
    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
  aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
                                R0_REGNUM, R30_REGNUM,
                                callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  /* If we have no register restore information, the CFA must have been
     defined in terms of the stack pointer since the end of the prologue.  */
  gcc_assert (cfi_ops || !frame_pointer_needed);

  if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
     add restriction on emit_move optimization to leaf functions.  */
  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
                  (!can_inherit_p || !crtl->is_leaf
                   || df_regs_ever_live_p (EP0_REGNUM)));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	  */
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      switch (aarch64_ra_sign_key)
        {
          case AARCH64_KEY_A:
            insn = emit_insn (gen_autiasp ());
            break;
          case AARCH64_KEY_B:
            insn = emit_insn (gen_autibsp ());
            break;
          default:
            gcc_unreachable ();
        }
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return && !for_sibcall)
    {
      /* We need to unwind the stack by the offset computed by
         EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
         to be SP; letting the CFA move during this adjustment
         is just as correct as retaining the CFA from the body
         of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
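
/* The common small-frame epilogue produced by the code above is the
   mirror image of the prologue, for example:

	ldp	x29, x30, [sp], #32
	ret

   with RETAA (or an explicit AUTIASP) taking the place of the plain RET
   when return address signing is enabled.  */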

/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}

/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
                         HOST_WIDE_INT delta,
                         HOST_WIDE_INT vcall_offset,
                         tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;
  const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));

  if (aarch64_bti_enabled ())
    emit_insn (gen_bti_c ());

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
        {
          if (delta >= -256 && delta < 256)
            addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
                                       plus_constant (Pmode, this_rtx, delta));
          else
            aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
                                temp1, temp0, false);
        }

      if (Pmode == ptr_mode)
        aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
        aarch64_emit_move (temp0,
                           gen_rtx_ZERO_EXTEND (Pmode,
                                                gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
        addr = plus_constant (Pmode, temp0, vcall_offset);
      else
        {
          aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
                                          Pmode);
          addr = gen_rtx_PLUS (Pmode, temp0, temp1);
        }

      if (Pmode == ptr_mode)
        aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
      else
        aarch64_emit_move (temp1,
                           gen_rtx_SIGN_EXTEND (Pmode,
                                                gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);

  assemble_start_function (thunk, fnname);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();
  assemble_end_function (thunk, fnname);

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}

static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
        return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
         TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
        iter.skip_subrtxes ();
    }
  return false;
}

/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}

/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
   that can be created with a left shift of 0 or 12.  */
static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
{
  /* Check to see if the value fits in 24 bits, as that is the maximum we can
     handle correctly.  */
  gcc_assert ((val & 0xffffff) == val);

  if (((val & 0xfff) << 0) == val)
    return val;

  return val & (0xfff << 12);
}

/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
        return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
          || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
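
/* Examples: 0x12340000 is accepted (MOVZ Wn, #0x1234, LSL #16), whereas
   0x00012345 is rejected because its set bits span two 16-bit chunks.  */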

/* Test whether:

     X = (X & AND_VAL) | IOR_VAL;

   can be implemented using:

     MOVK X, #(IOR_VAL >> shift), LSL #shift

   Return the shift if so, otherwise return -1.  */
int
aarch64_movk_shift (const wide_int_ref &and_val,
                    const wide_int_ref &ior_val)
{
  unsigned int precision = and_val.get_precision ();
  unsigned HOST_WIDE_INT mask = 0xffff;
  for (unsigned int shift = 0; shift < precision; shift += 16)
    {
      if (and_val == ~mask && (ior_val & mask) == ior_val)
        return shift;
      mask <<= 16;
    }
  return -1;
}
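
/* For example, with AND_VAL == 0xffffffff0000ffff and
   IOR_VAL == 0x00000000abcd0000, the loop matches at shift 16, since the
   operation is equivalent to MOVK Xn, #0xabcd, LSL #16.  */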

/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);

  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }

  return val;
}

/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };
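
/* E.g. replicating the QImode value 0xab yields 0xabababababababab; the
   multipliers above rebuild a repeated element of width 32, 16, 8, 4 or 2
   bits from its lowest copy with a single multiplication.  */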

/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
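
/* Examples: 0x00ff00ff00ff00ff is a valid bitmask immediate (a run of
   eight ones repeated in every 16-bit chunk), while 0x1234567812345678
   is not, because each repeating element is not a single rotated run of
   ones.  */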

/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
          (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}

/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}

/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}

/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  */

static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  poly_int64 offset;
  rtx base = strip_offset_and_salt (x, &offset);
  if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
    {
      /* We checked for POLY_INT_CST offsets above.  */
      if (aarch64_classify_symbol (base, offset.to_constant ())
          != SYMBOL_FORCE_TO_MEM)
        return true;
      else
        /* Avoid generating a 64-bit relocation in ILP32; leave
           to aarch64_expand_mov_immediate to handle it properly.  */
        return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}

/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}

/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
        return true;

      if (!reg_renumber)
        return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
          || regno == SP_REGNUM
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}

/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && SUBREG_P (x)
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
9437 /* Return true if address offset is a valid index. If it is, fill in INFO
9438 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
9441 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
9442 machine_mode mode
, bool strict_p
)
9444 enum aarch64_address_type type
;
9449 if ((REG_P (x
) || SUBREG_P (x
))
9450 && GET_MODE (x
) == Pmode
)
9452 type
= ADDRESS_REG_REG
;
9456 /* (sign_extend:DI (reg:SI)) */
9457 else if ((GET_CODE (x
) == SIGN_EXTEND
9458 || GET_CODE (x
) == ZERO_EXTEND
)
9459 && GET_MODE (x
) == DImode
9460 && GET_MODE (XEXP (x
, 0)) == SImode
)
9462 type
= (GET_CODE (x
) == SIGN_EXTEND
)
9463 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
9464 index
= XEXP (x
, 0);
9467 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
9468 else if (GET_CODE (x
) == MULT
9469 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
9470 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
9471 && GET_MODE (XEXP (x
, 0)) == DImode
9472 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
9473 && CONST_INT_P (XEXP (x
, 1)))
9475 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
9476 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
9477 index
= XEXP (XEXP (x
, 0), 0);
9478 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
9480 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
9481 else if (GET_CODE (x
) == ASHIFT
9482 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
9483 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
9484 && GET_MODE (XEXP (x
, 0)) == DImode
9485 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
9486 && CONST_INT_P (XEXP (x
, 1)))
9488 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
9489 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
9490 index
= XEXP (XEXP (x
, 0), 0);
9491 shift
= INTVAL (XEXP (x
, 1));
9493 /* (and:DI (mult:DI (reg:DI) (const_int scale))
9494 (const_int 0xffffffff<<shift)) */
9495 else if (GET_CODE (x
) == AND
9496 && GET_MODE (x
) == DImode
9497 && GET_CODE (XEXP (x
, 0)) == MULT
9498 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
9499 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
9500 && CONST_INT_P (XEXP (x
, 1)))
9502 type
= ADDRESS_REG_UXTW
;
9503 index
= XEXP (XEXP (x
, 0), 0);
9504 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
9505 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
9508 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
9509 (const_int 0xffffffff<<shift)) */
9510 else if (GET_CODE (x
) == AND
9511 && GET_MODE (x
) == DImode
9512 && GET_CODE (XEXP (x
, 0)) == ASHIFT
9513 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
9514 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
9515 && CONST_INT_P (XEXP (x
, 1)))
9517 type
= ADDRESS_REG_UXTW
;
9518 index
= XEXP (XEXP (x
, 0), 0);
9519 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
9520 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
9523 /* (mult:P (reg:P) (const_int scale)) */
9524 else if (GET_CODE (x
) == MULT
9525 && GET_MODE (x
) == Pmode
9526 && GET_MODE (XEXP (x
, 0)) == Pmode
9527 && CONST_INT_P (XEXP (x
, 1)))
9529 type
= ADDRESS_REG_REG
;
9530 index
= XEXP (x
, 0);
9531 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
9533 /* (ashift:P (reg:P) (const_int shift)) */
9534 else if (GET_CODE (x
) == ASHIFT
9535 && GET_MODE (x
) == Pmode
9536 && GET_MODE (XEXP (x
, 0)) == Pmode
9537 && CONST_INT_P (XEXP (x
, 1)))
9539 type
= ADDRESS_REG_REG
;
9540 index
= XEXP (x
, 0);
9541 shift
= INTVAL (XEXP (x
, 1));
9548 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
9549 index
= SUBREG_REG (index
);
9551 if (aarch64_sve_data_mode_p (mode
))
9553 if (type
!= ADDRESS_REG_REG
9554 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
9560 && !(IN_RANGE (shift
, 1, 3)
9561 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
9566 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
9569 info
->offset
= index
;
9570 info
->shift
= shift
;
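/* Illustrative summary (not part of the original source): the index forms
   recognised by aarch64_classify_index above correspond to addresses such
   as
       [x0, x1]            (reg:DI)
       [x0, w1, sxtw]      (sign_extend:DI (reg:SI))
       [x0, w1, uxtw #2]   (mult:DI (zero_extend:DI (reg:SI)) (const_int 4))
       [x0, x1, lsl #3]    (ashift:DI (reg:DI) (const_int 3))
   and the shift amount must match the access size (or, for SVE data
   modes, the element size).  */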
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
         || mode == SFmode || mode == DFmode
         || (aarch64_vector_mode_supported_p (mode)
             && (known_eq (GET_MODE_SIZE (mode), 8)
                 || (known_eq (GET_MODE_SIZE (mode), 16)
                     && (aarch64_tune_params.extra_tuning_flags
                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}
/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
           && regno <= LAST_VIRTUAL_POINTER_REGISTER)
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
9604 /* Return true if X is a valid address of type TYPE for machine mode MODE.
9605 If it is, fill in INFO appropriately. STRICT_P is true if
9606 REG_OK_STRICT is in effect. */
9609 aarch64_classify_address (struct aarch64_address_info
*info
,
9610 rtx x
, machine_mode mode
, bool strict_p
,
9611 aarch64_addr_query_type type
)
9613 enum rtx_code code
= GET_CODE (x
);
9617 HOST_WIDE_INT const_size
;
9619 /* Whether a vector mode is partial doesn't affect address legitimacy.
9620 Partial vectors like VNx8QImode allow the same indexed addressing
9621 mode and MUL VL addressing mode as full vectors like VNx16QImode;
9622 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
9623 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
9624 vec_flags
&= ~VEC_PARTIAL
;
9626 /* On BE, we use load/store pair for all large int mode load/stores.
9627 TI/TFmode may also use a load/store pair. */
9628 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
9629 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
9630 || type
== ADDR_QUERY_LDP_STP_N
9633 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
9635 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
9636 corresponds to the actual size of the memory being loaded/stored and the
9637 mode of the corresponding addressing mode is half of that. */
9638 if (type
== ADDR_QUERY_LDP_STP_N
9639 && known_eq (GET_MODE_SIZE (mode
), 16))
9642 bool allow_reg_index_p
= (!load_store_pair_p
9643 && (known_lt (GET_MODE_SIZE (mode
), 16)
9644 || vec_flags
== VEC_ADVSIMD
9645 || vec_flags
& VEC_SVE_DATA
));
9647 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
9648 [Rn, #offset, MUL VL]. */
9649 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
9650 && (code
!= REG
&& code
!= PLUS
))
9653 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
9655 if (advsimd_struct_p
9656 && !BYTES_BIG_ENDIAN
9657 && (code
!= POST_INC
&& code
!= REG
))
9660 gcc_checking_assert (GET_MODE (x
) == VOIDmode
9661 || SCALAR_INT_MODE_P (GET_MODE (x
)));
9667 info
->type
= ADDRESS_REG_IMM
;
9669 info
->offset
= const0_rtx
;
9670 info
->const_offset
= 0;
9671 return aarch64_base_register_rtx_p (x
, strict_p
);
9679 && virt_or_elim_regno_p (REGNO (op0
))
9680 && poly_int_rtx_p (op1
, &offset
))
9682 info
->type
= ADDRESS_REG_IMM
;
9685 info
->const_offset
= offset
;
9690 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
9691 && aarch64_base_register_rtx_p (op0
, strict_p
)
9692 && poly_int_rtx_p (op1
, &offset
))
9694 info
->type
= ADDRESS_REG_IMM
;
9697 info
->const_offset
= offset
;
9699 /* TImode and TFmode values are allowed in both pairs of X
9700 registers and individual Q registers. The available
9702 X,X: 7-bit signed scaled offset
9703 Q: 9-bit signed offset
9704 We conservatively require an offset representable in either mode.
9705 When performing the check for pairs of X registers i.e. LDP/STP
9706 pass down DImode since that is the natural size of the LDP/STP
9707 instruction memory accesses. */
9708 if (mode
== TImode
|| mode
== TFmode
)
9709 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
9710 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
9711 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
9713 /* A 7bit offset check because OImode will emit a ldp/stp
9714 instruction (only big endian will get here).
9715 For ldp/stp instructions, the offset is scaled for the size of a
9716 single element of the pair. */
9718 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
9720 /* Three 9/12 bit offsets checks because CImode will emit three
9721 ldr/str instructions (only big endian will get here). */
9723 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
9724 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
9726 || offset_12bit_unsigned_scaled_p (V16QImode
,
9729 /* Two 7bit offsets checks because XImode will emit two ldp/stp
9730 instructions (only big endian will get here). */
9732 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
9733 && aarch64_offset_7bit_signed_scaled_p (TImode
,
9736 /* Make "m" use the LD1 offset range for SVE data modes, so
9737 that pre-RTL optimizers like ivopts will work to that
9738 instead of the wider LDR/STR range. */
9739 if (vec_flags
== VEC_SVE_DATA
)
9740 return (type
== ADDR_QUERY_M
9741 ? offset_4bit_signed_scaled_p (mode
, offset
)
9742 : offset_9bit_signed_scaled_p (mode
, offset
));
9744 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
9746 poly_int64 end_offset
= (offset
9747 + GET_MODE_SIZE (mode
)
9748 - BYTES_PER_SVE_VECTOR
);
9749 return (type
== ADDR_QUERY_M
9750 ? offset_4bit_signed_scaled_p (mode
, offset
)
9751 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
9752 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
9756 if (vec_flags
== VEC_SVE_PRED
)
9757 return offset_9bit_signed_scaled_p (mode
, offset
);
9759 if (load_store_pair_p
)
9760 return ((known_eq (GET_MODE_SIZE (mode
), 4)
9761 || known_eq (GET_MODE_SIZE (mode
), 8)
9762 || known_eq (GET_MODE_SIZE (mode
), 16))
9763 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
9765 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
9766 || offset_12bit_unsigned_scaled_p (mode
, offset
));
9769 if (allow_reg_index_p
)
9771 /* Look for base + (scaled/extended) index register. */
9772 if (aarch64_base_register_rtx_p (op0
, strict_p
)
9773 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
9778 if (aarch64_base_register_rtx_p (op1
, strict_p
)
9779 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
9792 info
->type
= ADDRESS_REG_WB
;
9793 info
->base
= XEXP (x
, 0);
9794 info
->offset
= NULL_RTX
;
9795 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
9799 info
->type
= ADDRESS_REG_WB
;
9800 info
->base
= XEXP (x
, 0);
9801 if (GET_CODE (XEXP (x
, 1)) == PLUS
9802 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
9803 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
9804 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
9806 info
->offset
= XEXP (XEXP (x
, 1), 1);
9807 info
->const_offset
= offset
;
9809 /* TImode and TFmode values are allowed in both pairs of X
9810 registers and individual Q registers. The available
9812 X,X: 7-bit signed scaled offset
9813 Q: 9-bit signed offset
9814 We conservatively require an offset representable in either mode.
9816 if (mode
== TImode
|| mode
== TFmode
)
9817 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
9818 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
9820 if (load_store_pair_p
)
9821 return ((known_eq (GET_MODE_SIZE (mode
), 4)
9822 || known_eq (GET_MODE_SIZE (mode
), 8)
9823 || known_eq (GET_MODE_SIZE (mode
), 16))
9824 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
9826 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
9833 /* load literal: pc-relative constant pool entry. Only supported
9834 for SI mode or larger. */
9835 info
->type
= ADDRESS_SYMBOLIC
;
9837 if (!load_store_pair_p
9838 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
9842 rtx sym
= strip_offset_and_salt (x
, &offset
);
9843 return ((LABEL_REF_P (sym
)
9844 || (SYMBOL_REF_P (sym
)
9845 && CONSTANT_POOL_ADDRESS_P (sym
)
9846 && aarch64_pcrelative_literal_loads
)));
9851 info
->type
= ADDRESS_LO_SUM
;
9852 info
->base
= XEXP (x
, 0);
9853 info
->offset
= XEXP (x
, 1);
9854 if (allow_reg_index_p
9855 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
9858 HOST_WIDE_INT const_offset
;
9859 rtx sym
= strip_offset_and_salt (info
->offset
, &offset
);
9860 if (SYMBOL_REF_P (sym
)
9861 && offset
.is_constant (&const_offset
)
9862 && (aarch64_classify_symbol (sym
, const_offset
)
9863 == SYMBOL_SMALL_ABSOLUTE
))
9865 /* The symbol and offset must be aligned to the access size. */
9868 if (CONSTANT_POOL_ADDRESS_P (sym
))
9869 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
9870 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
9872 tree exp
= SYMBOL_REF_DECL (sym
);
9873 align
= TYPE_ALIGN (TREE_TYPE (exp
));
9874 align
= aarch64_constant_alignment (exp
, align
);
9876 else if (SYMBOL_REF_DECL (sym
))
9877 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
9878 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
9879 && SYMBOL_REF_BLOCK (sym
) != NULL
)
9880 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
9882 align
= BITS_PER_UNIT
;
9884 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
9885 if (known_eq (ref_size
, 0))
9886 ref_size
= GET_MODE_SIZE (DImode
);
9888 return (multiple_p (const_offset
, ref_size
)
9889 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
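/* Worked example (illustrative, not from the original source) of the
   TImode/TFmode offset rule above: the offset must fit the LDP/STP form
   (7-bit signed, scaled by 8 when checked as DImode, i.e. -512..504 in
   steps of 8) and also either the 9-bit signed unscaled form (-256..255)
   or the 12-bit unsigned scaled form.  So for a TImode access:
     offset 256  -> 256/8 = 32 fits imm7, 256/16 = 16 fits imm12 -> valid
     offset -264 -> -264/8 = -33 fits imm7, but it is neither in -256..255
                    nor non-negative and 16-aligned for imm12    -> invalid
     offset 512  -> 512/8 = 64 does not fit imm7                 -> invalid.  */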
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}
bool
aarch64_symbolic_address_p (rtx x)
{
  poly_int64 offset;

  x = strip_offset_and_salt (x, &offset);
  return SYMBOL_REF_P (x) || LABEL_REF_P (x);
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
                              aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
9958 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9961 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
9962 poly_int64 orig_offset
,
9966 if (GET_MODE_SIZE (mode
).is_constant (&size
))
9968 HOST_WIDE_INT const_offset
, second_offset
;
9970 /* A general SVE offset is A * VQ + B. Remove the A component from
9971 coefficient 0 in order to get the constant B. */
9972 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
9974 /* Split an out-of-range address displacement into a base and
9975 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
9976 range otherwise to increase opportunities for sharing the base
9977 address of different sizes. Unaligned accesses use the signed
9978 9-bit range, TImode/TFmode use the intersection of signed
9979 scaled 7-bit and signed 9-bit offset. */
9980 if (mode
== TImode
|| mode
== TFmode
)
9981 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
9982 else if ((const_offset
& (size
- 1)) != 0)
9983 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
9985 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
9987 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
9990 /* Split the offset into second_offset and the rest. */
9991 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
9992 *offset2
= gen_int_mode (second_offset
, Pmode
);
9997 /* Get the mode we should use as the basis of the range. For structure
9998 modes this is the mode of one vector. */
9999 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
10000 machine_mode step_mode
10001 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
10003 /* Get the "mul vl" multiplier we'd like to use. */
10004 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
10005 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
10006 if (vec_flags
& VEC_SVE_DATA
)
10007 /* LDR supports a 9-bit range, but the move patterns for
10008 structure modes require all vectors to be in range of the
same base.  The simplest way of accommodating that while still
10010 promoting reuse of anchor points between different modes is
10011 to use an 8-bit range unconditionally. */
10012 vnum
= ((vnum
+ 128) & 255) - 128;
10014 /* Predicates are only handled singly, so we might as well use
10016 vnum
= ((vnum
+ 256) & 511) - 256;
10020 /* Convert the "mul vl" multiplier into a byte offset. */
10021 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
10022 if (known_eq (second_offset
, orig_offset
))
10025 /* Split the offset into second_offset and the rest. */
10026 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
10027 *offset2
= gen_int_mode (second_offset
, Pmode
);
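/* A minimal stand-alone sketch (illustrative only; plain integers instead
   of poly_int64/rtx, and omitting the TImode/TFmode special case) of how
   the constant-size split above picks the anchor and the residual
   immediate.  The helper name below is hypothetical.  */
#if 0
static void
split_scalar_offset (long offset, long size, long *anchor, long *residual)
{
  long second;
  if (offset & (size - 1))
    /* Unaligned accesses use the signed 9-bit range.  */
    second = ((offset + 0x100) & 0x1ff) - 0x100;
  else
    /* Aligned accesses use a 4KB range for 1- and 2-byte accesses
       and a 16KB range otherwise.  */
    second = offset & (size < 4 ? 0xfff : 0x3ffc);
  *anchor = offset - second;    /* added to the base register */
  *residual = second;           /* kept as the immediate offset */
}
/* e.g. offset 0x10007 with size 4 splits into anchor 0x10000 and
   residual 7; offset 0x10008 splits into 0x10000 and 8.  */
#endif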
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */
bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{
  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (!CONST_DOUBLE_P (value)
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
                  CONST_DOUBLE_REAL_VALUE (value),
                  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
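/* Worked examples (illustrative, not from the original source):
     +1.0 in DFmode  -> 0x3ff0000000000000
     -2.5 in SFmode  -> 0xc0200000
     +0.0 (any mode) -> 0, via the general exception at the top.  */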
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */
bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (CONST_DOUBLE_P (x)
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      scalar_int_mode imode = (mode == HFmode
                               ? SImode
                               : int_mode_for_mode (mode).require ());
      int num_instr = aarch64_internal_mov_immediate
                        (NULL_RTX, gen_int_mode (ival, imode), false, imode);
      return num_instr < 3;
    }

  return false;
}
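/* Worked example (illustrative): 1.0 in DFmode is 0x3ff0000000000000,
   which a single MOVZ (0x3ff0 << 48) materialises, so MOV + FMOV (two
   instructions) is preferred over ADRP + LDR; a bit pattern that needs a
   MOVZ plus two or more MOVKs fails the "< 3" test and stays in the
   literal pool.  */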
/* Return TRUE if rtx X is immediate constant 0.0.  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return TRUE if rtx X is immediate constant that fits in a single
   MOVI immediate operation.  */
bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (CONST_DOUBLE_P (x)
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
        return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
        return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (CONST_INT_P (x)
           && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

  /* Use a 64 bit mode for everything except for DI/DF mode, where we use
     a 128 bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
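/* Illustrative example (not from the original source): 2.0 in SFmode has
   bit pattern 0x40000000, which replicated across a 64-bit vector is a
   valid AdvSIMD immediate ("movi v0.2s, 0x40, lsl 24"), so the function
   returns true for it; most other FP bit patterns are not MOVI-encodable
   and fall back to the MOV/MOVK or literal-pool paths.  */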
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
   SIBCALL indicates whether this function call is a normal call or a
   sibling call; the two generate different patterns.  */

void
aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
         || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  gcc_assert (CONST_INT_P (callee_abi));
  callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
                               UNSPEC_CALLEE_ABI);

  vec = gen_rtvec (3, call, callee_abi, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
10227 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
10229 machine_mode mode_x
= GET_MODE (x
);
10230 rtx_code code_x
= GET_CODE (x
);
10232 /* All floating point compares return CCFP if it is an equality
10233 comparison, and CCFPE otherwise. */
10234 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
10257 gcc_unreachable ();
10261 /* Equality comparisons of short modes against zero can be performed
10262 using the TST instruction with the appropriate bitmask. */
10263 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
10264 && (code
== EQ
|| code
== NE
)
10265 && (mode_x
== HImode
|| mode_x
== QImode
))
10268 /* Similarly, comparisons of zero_extends from shorter modes can
10269 be performed using an ANDS with an immediate mask. */
10270 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
10271 && (mode_x
== SImode
|| mode_x
== DImode
)
10272 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
10273 && (code
== EQ
|| code
== NE
))
10276 if ((mode_x
== SImode
|| mode_x
== DImode
)
10278 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
10279 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
10281 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
10282 && CONST_INT_P (XEXP (x
, 2)))))
10285 /* A compare with a shifted operand. Because of canonicalization,
10286 the comparison will have to be swapped when we emit the assembly
10288 if ((mode_x
== SImode
|| mode_x
== DImode
)
10289 && (REG_P (y
) || SUBREG_P (y
) || y
== const0_rtx
)
10290 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
10291 || code_x
== LSHIFTRT
10292 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
10295 /* Similarly for a negated operand, but we can only do this for
10297 if ((mode_x
== SImode
|| mode_x
== DImode
)
10298 && (REG_P (y
) || SUBREG_P (y
))
10299 && (code
== EQ
|| code
== NE
)
10303 /* A test for unsigned overflow from an addition. */
10304 if ((mode_x
== DImode
|| mode_x
== TImode
)
10305 && (code
== LTU
|| code
== GEU
)
10307 && rtx_equal_p (XEXP (x
, 0), y
))
10310 /* A test for unsigned overflow from an add with carry. */
10311 if ((mode_x
== DImode
|| mode_x
== TImode
)
10312 && (code
== LTU
|| code
== GEU
)
10314 && CONST_SCALAR_INT_P (y
)
10315 && (rtx_mode_t (y
, mode_x
)
10316 == (wi::shwi (1, mode_x
)
10317 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
10320 /* A test for signed overflow. */
10321 if ((mode_x
== DImode
|| mode_x
== TImode
)
10324 && GET_CODE (y
) == SIGN_EXTEND
)
10327 /* For everything else, return CCmode. */
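/* Illustrative note (not from the original source) on the swapped case
   above: canonicalization puts a shifted or extended operand first, so
   (compare (ashift x 2) y) is emitted as "cmp y, x, lsl #2" with the
   operands swapped, and the chosen CC mode records that the condition
   must be swapped as well (GT becomes LT, GE becomes LE, and so on).  */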
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
10346 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
10354 case GE
: return AARCH64_GE
;
10355 case GT
: return AARCH64_GT
;
10356 case LE
: return AARCH64_LS
;
10357 case LT
: return AARCH64_MI
;
10358 case NE
: return AARCH64_NE
;
10359 case EQ
: return AARCH64_EQ
;
10360 case ORDERED
: return AARCH64_VC
;
10361 case UNORDERED
: return AARCH64_VS
;
10362 case UNLT
: return AARCH64_LT
;
10363 case UNLE
: return AARCH64_LE
;
10364 case UNGT
: return AARCH64_HI
;
10365 case UNGE
: return AARCH64_PL
;
10366 default: return -1;
10373 case NE
: return AARCH64_NE
;
10374 case EQ
: return AARCH64_EQ
;
10375 case GE
: return AARCH64_GE
;
10376 case GT
: return AARCH64_GT
;
10377 case LE
: return AARCH64_LE
;
10378 case LT
: return AARCH64_LT
;
10379 case GEU
: return AARCH64_CS
;
10380 case GTU
: return AARCH64_HI
;
10381 case LEU
: return AARCH64_LS
;
10382 case LTU
: return AARCH64_CC
;
10383 default: return -1;
10390 case NE
: return AARCH64_NE
;
10391 case EQ
: return AARCH64_EQ
;
10392 case GE
: return AARCH64_LE
;
10393 case GT
: return AARCH64_LT
;
10394 case LE
: return AARCH64_GE
;
10395 case LT
: return AARCH64_GT
;
10396 case GEU
: return AARCH64_LS
;
10397 case GTU
: return AARCH64_CC
;
10398 case LEU
: return AARCH64_CS
;
10399 case LTU
: return AARCH64_HI
;
10400 default: return -1;
10407 case NE
: return AARCH64_NE
; /* = any */
10408 case EQ
: return AARCH64_EQ
; /* = none */
10409 case GE
: return AARCH64_PL
; /* = nfrst */
10410 case LT
: return AARCH64_MI
; /* = first */
10411 case GEU
: return AARCH64_CS
; /* = nlast */
10412 case GTU
: return AARCH64_HI
; /* = pmore */
10413 case LEU
: return AARCH64_LS
; /* = plast */
10414 case LTU
: return AARCH64_CC
; /* = last */
10415 default: return -1;
10422 case NE
: return AARCH64_NE
;
10423 case EQ
: return AARCH64_EQ
;
10424 case GE
: return AARCH64_PL
;
10425 case LT
: return AARCH64_MI
;
10426 default: return -1;
10433 case NE
: return AARCH64_NE
;
10434 case EQ
: return AARCH64_EQ
;
10435 default: return -1;
10442 case LTU
: return AARCH64_CS
;
10443 case GEU
: return AARCH64_CC
;
10444 default: return -1;
10451 case GEU
: return AARCH64_CS
;
10452 case LTU
: return AARCH64_CC
;
10453 default: return -1;
10460 case NE
: return AARCH64_VS
;
10461 case EQ
: return AARCH64_VC
;
10462 default: return -1;
bool
aarch64_const_vec_all_same_in_range_p (rtx x,
                                       HOST_WIDE_INT minval,
                                       HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt)
          && CONST_INT_P (elt)
          && IN_RANGE (INTVAL (elt), minval, maxval));
}

bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}

/* Return true if VEC is a constant in which every element is in the range
   [MINVAL, MAXVAL].  The elements do not need to have the same value.  */

static bool
aarch64_const_vec_all_in_range_p (rtx vec,
                                  HOST_WIDE_INT minval,
                                  HOST_WIDE_INT maxval)
{
  if (GET_CODE (vec) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    return false;

  int nunits;
  if (!CONST_VECTOR_STEPPED_P (vec))
    nunits = const_vector_encoded_nelts (vec);
  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    return false;

  for (int i = 0; i < nunits; i++)
    {
      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
      if (!CONST_INT_P (vec_elem)
          || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
        return false;
    }
  return true;
}
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,             /* EQ, Z == 1.  */
  AARCH64_CC_Z,  /* NE, Z == 0.  */
  0,             /* CS, C == 1.  */
  AARCH64_CC_C,  /* CC, C == 0.  */
  0,             /* MI, N == 1.  */
  AARCH64_CC_N,  /* PL, N == 0.  */
  0,             /* VS, V == 1.  */
  AARCH64_CC_V,  /* VC, V == 0.  */
  0,             /* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,  /* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,  /* GE, N == V.  */
  0,             /* LT, N != V.  */
  AARCH64_CC_Z,  /* GT, Z == 0 && N == V.  */
  0,             /* LE, !(Z == 0 && N == V).  */
  0,             /* AL, Any.  */
  0              /* NV, Any.  */
};
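/* Illustrative example (not quoted from the original sources): for
   "a > 0 && b > 0" the backend can emit

       cmp   w0, 0
       ccmp  w1, 0, 4, gt
       cset  w0, gt

   where the NZCV immediate 4 is aarch64_nzcv_codes[AARCH64_GT]
   (AARCH64_CC_Z): if the first comparison fails, CCMP sets Z so that
   the final GT test also fails.  */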
/* Print floating-point vector immediate operand X to F, negating it
   first if NEGATE is true.  Return true on success, false if it isn't
   a constant we can handle.  */

static bool
aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt))
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
  if (negate)
    r = real_value_negate (&r);

  /* Handle the SVE single-bit immediates specially, since they have a
     fixed form in the assembly syntax.  */
  if (real_equal (&r, &dconst0))
    asm_fprintf (f, "0.0");
  else if (real_equal (&r, &dconst2))
    asm_fprintf (f, "2.0");
  else if (real_equal (&r, &dconst1))
    asm_fprintf (f, "1.0");
  else if (real_equal (&r, &dconsthalf))
    asm_fprintf (f, "0.5");
  else
    {
      const int buf_size = 20;
      char float_buf[buf_size] = {'\0'};
      real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
                                1, GET_MODE (elt));
      asm_fprintf (f, "%s", float_buf);
    }

  return true;
}

/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* Print operand X to file F in a target specific manner according to CODE.
   The acceptable formatting commands given by CODE are:
     'c':               An integer or symbol address without a preceding #
                        sign.
     'C':               Take the duplicated element in a vector constant
                        and print it in hex.
     'D':               Take the duplicated element in a vector constant
                        and print it as an unsigned integer, in decimal.
     'e':               Print the sign/zero-extend size as a character 8->b,
                        16->h, 32->w.  Can also be used for masks:
                        0xff->b, 0xffff->h, 0xffffffff->w.
     'I':               If the operand is a duplicated vector constant,
                        replace it with the duplicated scalar.  If the
                        operand is then a floating-point constant, replace
                        it with the integer bit representation.  Print the
                        transformed constant as a signed decimal number.
     'p':               Prints N such that 2^N == X (X must be power of 2 and
                        const int).
     'P':               Print the number of non-zero bits in X (a const_int).
     'H':               Print the higher numbered register of a pair (TImode)
                        of regs.
     'm':               Print a condition (eq, ne, etc).
     'M':               Same as 'm', but invert condition.
     'N':               Take the duplicated element in a vector constant
                        and print the negative of it in decimal.
     'b/h/s/d/q':       Print a scalar FP/SIMD register name.
     'S/T/U/V':         Print a FP/SIMD register name for a register list.
                        The register printed is the FP/SIMD register name
                        of X + 0/1/2/3 for S/T/U/V.
     'R':               Print a scalar Integer/FP/SIMD register name + 1.
     'X':               Print bottom 16 bits of integer constant in hex.
     'w/x':             Print a general register name or the zero register
                        (32-bit or 64-bit).
     '0':               Print a normal operand, if it's a general register,
                        then we assume DImode.
     'k':               Print NZCV for conditional compare instructions.
     'A':               Output address constant representing the first
                        argument of X, specifying a relocation offset
                        if appropriate.
     'L':               Output constant address specified by X
                        with a relocation offset if appropriate.
     'G':               Prints address of X, specifying a PC relative
                        relocation mode if appropriate.
     'y':               Output address of LDP or STP - this is used for
                        some LDP/STPs which don't use a PARALLEL in their
                        pattern (so the mode needs to be adjusted).
     'z':               Output address of a typical LDP or STP.  */
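/* Illustrative examples of how these codes appear in output templates
   (typical aarch64.md-style strings, not quoted from the md file):
   "add\t%w0, %w1, %w2" prints 32-bit register names via 'w';
   "csel\t%x0, %x1, %x2, %m3" prints a condition via 'm'; and the CCMP
   patterns print their flag immediate via 'k'.  */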
10646 aarch64_print_operand (FILE *f
, rtx x
, int code
)
10652 if (CONST_INT_P (x
))
10653 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
10657 rtx base
= strip_offset_and_salt (x
, &offset
);
10658 if (SYMBOL_REF_P (base
))
10659 output_addr_const (f
, x
);
10661 output_operand_lossage ("unsupported operand for code '%c'", code
);
10667 x
= unwrap_const_vec_duplicate (x
);
10668 if (!CONST_INT_P (x
))
10670 output_operand_lossage ("invalid operand for '%%%c'", code
);
10674 HOST_WIDE_INT val
= INTVAL (x
);
10675 if ((val
& ~7) == 8 || val
== 0xff)
10677 else if ((val
& ~7) == 16 || val
== 0xffff)
10679 else if ((val
& ~7) == 32 || val
== 0xffffffff)
10683 output_operand_lossage ("invalid operand for '%%%c'", code
);
10693 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
10695 output_operand_lossage ("invalid operand for '%%%c'", code
);
10699 asm_fprintf (f
, "%d", n
);
10704 if (!CONST_INT_P (x
))
10706 output_operand_lossage ("invalid operand for '%%%c'", code
);
10710 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
10714 if (x
== const0_rtx
)
10716 asm_fprintf (f
, "xzr");
10720 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
10722 output_operand_lossage ("invalid operand for '%%%c'", code
);
10726 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
10731 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
10732 if (CONST_INT_P (x
))
10733 asm_fprintf (f
, "%wd", INTVAL (x
));
10736 output_operand_lossage ("invalid operand for '%%%c'", code
);
10746 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
10747 if (x
== const_true_rtx
)
10754 if (!COMPARISON_P (x
))
10756 output_operand_lossage ("invalid operand for '%%%c'", code
);
10760 cond_code
= aarch64_get_condition_code (x
);
10761 gcc_assert (cond_code
>= 0);
10763 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
10764 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
10765 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
10767 fputs (aarch64_condition_codes
[cond_code
], f
);
10772 if (!const_vec_duplicate_p (x
, &elt
))
10774 output_operand_lossage ("invalid vector constant");
10778 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
10779 asm_fprintf (f
, "%wd", (HOST_WIDE_INT
) -UINTVAL (elt
));
10780 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
10781 && aarch64_print_vector_float_operand (f
, x
, true))
10785 output_operand_lossage ("invalid vector constant");
10795 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
10797 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
10800 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
10807 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
10809 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
10812 asm_fprintf (f
, "%c%d",
10813 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
10814 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
10818 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
10819 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
10820 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
10821 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
10823 output_operand_lossage ("incompatible register operand for '%%%c'",
10828 if (!CONST_INT_P (x
))
10830 output_operand_lossage ("invalid operand for '%%%c'", code
);
10833 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
10838 /* Print a replicated constant in hex. */
10839 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
10841 output_operand_lossage ("invalid operand for '%%%c'", code
);
10844 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
10845 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
10851 /* Print a replicated constant in decimal, treating it as
10853 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
10855 output_operand_lossage ("invalid operand for '%%%c'", code
);
10858 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
10859 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
10865 if (x
== const0_rtx
10866 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
10868 asm_fprintf (f
, "%czr", code
);
10872 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
10874 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
10878 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
10880 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
10889 output_operand_lossage ("missing operand");
10893 switch (GET_CODE (x
))
10896 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
10898 if (REG_NREGS (x
) == 1)
10899 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
10903 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
10904 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
10905 REGNO (x
) - V0_REGNUM
, suffix
,
10906 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
10910 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
10914 output_address (GET_MODE (x
), XEXP (x
, 0));
10919 output_addr_const (asm_out_file
, x
);
10923 asm_fprintf (f
, "%wd", INTVAL (x
));
10927 if (!VECTOR_MODE_P (GET_MODE (x
)))
10929 output_addr_const (asm_out_file
, x
);
10935 if (!const_vec_duplicate_p (x
, &elt
))
10937 output_operand_lossage ("invalid vector constant");
10941 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
10942 asm_fprintf (f
, "%wd", INTVAL (elt
));
10943 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
10944 && aarch64_print_vector_float_operand (f
, x
, false))
10948 output_operand_lossage ("invalid vector constant");
10954 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10955 be getting CONST_DOUBLEs holding integers. */
10956 gcc_assert (GET_MODE (x
) != VOIDmode
);
10957 if (aarch64_float_const_zero_rtx_p (x
))
10962 else if (aarch64_float_const_representable_p (x
))
10964 #define buf_size 20
10965 char float_buf
[buf_size
] = {'\0'};
10966 real_to_decimal_for_mode (float_buf
,
10967 CONST_DOUBLE_REAL_VALUE (x
),
10968 buf_size
, buf_size
,
10970 asm_fprintf (asm_out_file
, "%s", float_buf
);
10974 output_operand_lossage ("invalid constant");
10977 output_operand_lossage ("invalid operand");
10983 if (GET_CODE (x
) == HIGH
)
10986 switch (aarch64_classify_symbolic_expression (x
))
10988 case SYMBOL_SMALL_GOT_4G
:
10989 asm_fprintf (asm_out_file
, ":got:");
10992 case SYMBOL_SMALL_TLSGD
:
10993 asm_fprintf (asm_out_file
, ":tlsgd:");
10996 case SYMBOL_SMALL_TLSDESC
:
10997 asm_fprintf (asm_out_file
, ":tlsdesc:");
11000 case SYMBOL_SMALL_TLSIE
:
11001 asm_fprintf (asm_out_file
, ":gottprel:");
11004 case SYMBOL_TLSLE24
:
11005 asm_fprintf (asm_out_file
, ":tprel:");
11008 case SYMBOL_TINY_GOT
:
11009 gcc_unreachable ();
11015 output_addr_const (asm_out_file
, x
);
11019 switch (aarch64_classify_symbolic_expression (x
))
11021 case SYMBOL_SMALL_GOT_4G
:
11022 asm_fprintf (asm_out_file
, ":lo12:");
11025 case SYMBOL_SMALL_TLSGD
:
11026 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
11029 case SYMBOL_SMALL_TLSDESC
:
11030 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
11033 case SYMBOL_SMALL_TLSIE
:
11034 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
11037 case SYMBOL_TLSLE12
:
11038 asm_fprintf (asm_out_file
, ":tprel_lo12:");
11041 case SYMBOL_TLSLE24
:
11042 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
11045 case SYMBOL_TINY_GOT
:
11046 asm_fprintf (asm_out_file
, ":got:");
11049 case SYMBOL_TINY_TLSIE
:
11050 asm_fprintf (asm_out_file
, ":gottprel:");
11056 output_addr_const (asm_out_file
, x
);
11060 switch (aarch64_classify_symbolic_expression (x
))
11062 case SYMBOL_TLSLE24
:
11063 asm_fprintf (asm_out_file
, ":tprel_hi12:");
11068 output_addr_const (asm_out_file
, x
);
11073 HOST_WIDE_INT cond_code
;
11075 if (!CONST_INT_P (x
))
11077 output_operand_lossage ("invalid operand for '%%%c'", code
);
11081 cond_code
= INTVAL (x
);
11082 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
11083 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
11090 machine_mode mode
= GET_MODE (x
);
11093 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
11095 output_operand_lossage ("invalid operand for '%%%c'", code
);
11099 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
11101 ? ADDR_QUERY_LDP_STP_N
11102 : ADDR_QUERY_LDP_STP
))
11103 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
11108 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
11113 /* Print address 'x' of a memory access with mode 'mode'.
11114 'op' is the context required by aarch64_classify_address. It can either be
11115 MEM for a normal memory access or PARALLEL for LDP/STP. */
11117 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
11118 aarch64_addr_query_type type
)
11120 struct aarch64_address_info addr
;
11121 unsigned int size
, vec_flags
;
11123 /* Check all addresses are Pmode - including ILP32. */
11124 if (GET_MODE (x
) != Pmode
11125 && (!CONST_INT_P (x
)
11126 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
11128 output_operand_lossage ("invalid address mode");
11132 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
11135 case ADDRESS_REG_IMM
:
11136 if (known_eq (addr
.const_offset
, 0))
11138 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
11142 vec_flags
= aarch64_classify_vector_mode (mode
);
11143 if (vec_flags
& VEC_ANY_SVE
)
11146 = exact_div (addr
.const_offset
,
11147 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
11148 asm_fprintf (f
, "[%s, #%wd, mul vl]",
11149 reg_names
[REGNO (addr
.base
)], vnum
);
11153 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
11154 INTVAL (addr
.offset
));
11157 case ADDRESS_REG_REG
:
11158 if (addr
.shift
== 0)
11159 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
11160 reg_names
[REGNO (addr
.offset
)]);
11162 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
11163 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
11166 case ADDRESS_REG_UXTW
:
11167 if (addr
.shift
== 0)
11168 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
11169 REGNO (addr
.offset
) - R0_REGNUM
);
11171 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
11172 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
11175 case ADDRESS_REG_SXTW
:
11176 if (addr
.shift
== 0)
11177 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
11178 REGNO (addr
.offset
) - R0_REGNUM
);
11180 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
11181 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
11184 case ADDRESS_REG_WB
:
11185 /* Writeback is only supported for fixed-width modes. */
11186 size
= GET_MODE_SIZE (mode
).to_constant ();
11187 switch (GET_CODE (x
))
11190 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
11193 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
11196 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
11199 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
11202 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
11203 INTVAL (addr
.offset
));
11206 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
11207 INTVAL (addr
.offset
));
11214 case ADDRESS_LO_SUM
:
11215 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
11216 output_addr_const (f
, addr
.offset
);
11217 asm_fprintf (f
, "]");
11220 case ADDRESS_SYMBOLIC
:
11221 output_addr_const (f
, x
);
/* Print address 'x' of a memory access with mode 'mode'.  */
static void
aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
{
  if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
    output_addr_const (f, x);
}
/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */

static bool
aarch64_output_addr_const_extra (FILE *file, rtx x)
{
  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
    {
      output_addr_const (file, XVECEXP (x, 0, 0));
      return true;
    }
  return false;
}

bool
aarch64_label_mentioned_p (rtx x)
{
  const char *fmt;
  int i;

  if (LABEL_REF_P (x))
    return true;

  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
     referencing instruction, but they are constant offsets, not
     symbols.  */
  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
    return false;

  fmt = GET_RTX_FORMAT (GET_CODE (x));
  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
    {
      if (fmt[i] == 'E')
        {
          int j;

          for (j = XVECLEN (x, i) - 1; j >= 0; j--)
            if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
              return true;
        }
      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
        return true;
    }

  return false;
}
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (STUB_REGNUM_P (regno))
    return STUB_REGS;

  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
            : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);

  if (PR_REGNUM_P (regno))
    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;

  if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
    return FFR_REGS;

  return NO_REGS;
}
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
                       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
        return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
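/* Worked examples (illustrative): for a 4-byte access,
     aarch64_anchor_offset (0x10004, 4, SImode) == 0x10000, so the anchor
     becomes base + 0x10000 and the access uses the immediate 4;
     aarch64_anchor_offset (0x10007, 4, SImode) == 0x10000 as well, via
     the unaligned (signed 9-bit) branch.  */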
11347 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
11349 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
11350 where mask is selected by alignment and size of the offset.
11351 We try to pick as large a range for the offset as possible to
11352 maximize the chance of a CSE. However, for aligned addresses
11353 we limit the range to 4k so that structures with different sized
11354 elements are likely to use the same base. We need to be careful
11355 not to split a CONST for some forms of address expression, otherwise
11356 it will generate sub-optimal code. */
11358 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
11360 rtx base
= XEXP (x
, 0);
11361 rtx offset_rtx
= XEXP (x
, 1);
11362 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
11364 if (GET_CODE (base
) == PLUS
)
11366 rtx op0
= XEXP (base
, 0);
11367 rtx op1
= XEXP (base
, 1);
11369 /* Force any scaling into a temp for CSE. */
11370 op0
= force_reg (Pmode
, op0
);
11371 op1
= force_reg (Pmode
, op1
);
11373 /* Let the pointer register be in op0. */
11374 if (REG_POINTER (op1
))
11375 std::swap (op0
, op1
);
11377 /* If the pointer is virtual or frame related, then we know that
11378 virtual register instantiation or register elimination is going
11379 to apply a second constant. We want the two constants folded
11380 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
11381 if (virt_or_elim_regno_p (REGNO (op0
)))
11383 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
11384 NULL_RTX
, true, OPTAB_DIRECT
);
11385 return gen_rtx_PLUS (Pmode
, base
, op1
);
11388 /* Otherwise, in order to encourage CSE (and thence loop strength
11389 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
11390 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
11391 NULL_RTX
, true, OPTAB_DIRECT
);
11392 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
11395 HOST_WIDE_INT size
;
11396 if (GET_MODE_SIZE (mode
).is_constant (&size
))
11398 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
11400 if (base_offset
!= 0)
11402 base
= plus_constant (Pmode
, base
, base_offset
);
11403 base
= force_operand (base
, NULL_RTX
);
11404 return plus_constant (Pmode
, base
, offset
- base_offset
);
11413 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
11414 reg_class_t rclass
,
11416 secondary_reload_info
*sri
)
11418 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
11419 LDR and STR. See the comment at the head of aarch64-sve.md for
11420 more details about the big-endian handling. */
11421 if (reg_class_subset_p (rclass
, FP_REGS
)
11422 && !((REG_P (x
) && HARD_REGISTER_P (x
))
11423 || aarch64_simd_valid_immediate (x
, NULL
))
11424 && mode
!= VNx16QImode
)
11426 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
11427 if ((vec_flags
& VEC_SVE_DATA
)
11428 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
11430 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
11435 /* If we have to disable direct literal pool loads and stores because the
11436 function is too big, then we need a scratch register. */
11437 if (MEM_P (x
) && SYMBOL_REF_P (x
) && CONSTANT_POOL_ADDRESS_P (x
)
11438 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
11439 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
11440 && !aarch64_pcrelative_literal_loads
)
11442 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
11446 /* Without the TARGET_SIMD instructions we cannot move a Q register
11447 to a Q register directly. We need a scratch. */
11448 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
11449 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
11450 && reg_class_subset_p (rclass
, FP_REGS
))
11452 sri
->icode
= code_for_aarch64_reload_mov (mode
);
11456 /* A TFmode or TImode memory access should be handled via an FP_REGS
11457 because AArch64 has richer addressing modes for LDR/STR instructions
11458 than LDP/STP instructions. */
11459 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
11460 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
11463 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
11464 return GENERAL_REGS
;
bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;
  return true;
}

poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset
               - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.frame_size
               - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}
/* Get return address without mangling.  */

rtx
aarch64_return_addr_rtx (void)
{
  rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
  /* Note: aarch64_return_address_signing_enabled only
     works after cfun->machine->frame.laid_out is set,
     so here we don't know if the return address will
     be signed or not.  */
  rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
  emit_move_insn (lr, val);
  emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
  return lr;
}

/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return aarch64_return_addr_rtx ();
}
11534 aarch64_asm_trampoline_template (FILE *f
)
11536 /* Even if the current function doesn't have branch protection, some
11537 later function might, so since this template is only generated once
11538 we have to add a BTI just in case. */
11539 asm_fprintf (f
, "\thint\t34 // bti c\n");
11543 asm_fprintf (f
, "\tldr\tw%d, .+20\n", IP1_REGNUM
- R0_REGNUM
);
11544 asm_fprintf (f
, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
11548 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[IP1_REGNUM
]);
11549 asm_fprintf (f
, "\tldr\t%s, .+24\n", reg_names
[STATIC_CHAIN_REGNUM
]);
11551 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
11553 /* We always emit a speculation barrier.
11554 This is because the same trampoline template is used for every nested
11555 function. Since nested functions are not particularly common or
11556 performant we don't worry too much about the extra instructions to copy
11558 This is not yet a problem, since we have not yet implemented function
11559 specific attributes to choose between hardening against straight line
11560 speculation or not, but such function specific attributes are likely to
11561 happen in the future. */
11562 asm_fprintf (f
, "\tdsb\tsy\n\tisb\n");
11564 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
11565 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
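/* Layout note (illustrative, derived from the template above): the code
   part is six 4-byte instructions (BTI, two loads, BR, DSB, ISB), i.e.
   24 bytes, which matches tramp_code_sz in aarch64_trampoline_init below;
   it is followed by two pointer-sized data slots that the initializer
   fills with the target function address and the static chain value.  */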
11569 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
11571 rtx fnaddr
, mem
, a_tramp
;
11572 const int tramp_code_sz
= 24;
11574 /* Don't need to copy the trailing D-words, we fill those in below. */
11575 /* We create our own memory address in Pmode so that `emit_block_move` can
11576 use parts of the backend which expect Pmode addresses. */
11577 rtx temp
= convert_memory_address (Pmode
, XEXP (m_tramp
, 0));
11578 emit_block_move (gen_rtx_MEM (BLKmode
, temp
),
11579 assemble_trampoline_template (),
11580 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
11581 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
11582 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
11583 if (GET_MODE (fnaddr
) != ptr_mode
)
11584 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
11585 emit_move_insn (mem
, fnaddr
);
11587 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
11588 emit_move_insn (mem
, chain_value
);
11590 /* XXX We should really define a "clear_cache" pattern and use
11591 gen_clear_cache(). */
11592 a_tramp
= XEXP (m_tramp
, 0);
11593 maybe_emit_call_builtin___clear_cache (a_tramp
,
11594 plus_constant (ptr_mode
,
11599 static unsigned char
11600 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
11602 /* ??? Logically we should only need to provide a value when
11603 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
11604 can hold MODE, but at the moment we need to handle all modes.
11605 Just ignore any runtime parts for registers that can't store them. */
11606 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
11607 unsigned int nregs
, vec_flags
;
11611 case TAILCALL_ADDR_REGS
:
11615 case POINTER_AND_FP_REGS
:
11619 vec_flags
= aarch64_classify_vector_mode (mode
);
11620 if ((vec_flags
& VEC_SVE_DATA
)
11621 && constant_multiple_p (GET_MODE_SIZE (mode
),
11622 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
11624 return (vec_flags
& VEC_ADVSIMD
11625 ? CEIL (lowest_size
, UNITS_PER_VREG
)
11626 : CEIL (lowest_size
, UNITS_PER_WORD
));
11632 case PR_AND_FFR_REGS
:
11641 gcc_unreachable ();
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  if (regclass == STACK_REG)
    {
      if (REG_P (x)
          && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
        return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject such
     cases outright.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (SUBREG_P (lhs))
        lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
                                      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
11681 aarch64_asm_output_labelref (FILE* f
, const char *name
)
11683 asm_fprintf (f
, "%U%s", name
);
aarch64_elf_asm_constructor (rtx symbol, int priority)
  if (priority == DEFAULT_INIT_PRIORITY)
    default_ctor_section_asm_out_constructor (symbol, priority);
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
aarch64_elf_asm_destructor (rtx symbol, int priority)
  if (priority == DEFAULT_INIT_PRIORITY)
    default_dtor_section_asm_out_destructor (symbol, priority);
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
aarch64_output_casesi (rtx *operands)
  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
  static const char *const patterns[4][2] =
      "ldrb\t%w3, [%0,%w1,uxtw]",
      "add\t%3, %4, %w3, sxtb #2"
      "ldrh\t%w3, [%0,%w1,uxtw #1]",
      "add\t%3, %4, %w3, sxth #2"
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    /* We assume that DImode is only generated when not optimizing and
       that we don't really need 64-bit address offsets.  That would
       imply an object file with 8GB of code in a single function!  */
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"

  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);

  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
  index = exact_log2 (GET_MODE_SIZE (mode));

  gcc_assert (index >= 0 && index <= 3);

  /* Need to implement table size reduction, by changing the code below.  */
  output_asm_insn (patterns[index][0], operands);
  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
  snprintf (buf, sizeof (buf),
	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
  output_asm_insn (buf, operands);
  output_asm_insn (patterns[index][1], operands);
  output_asm_insn ("br\t%3", operands);
  output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
  assemble_label (asm_out_file, label);
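  /* Note (added for illustration; not in the original source): the sequence
     emitted above loads the table entry selected by operand 1 from the table
     at operand 0, materialises the table base with ADR, adds the
     sign-extended entry shifted left by 2 to that base, and branches to the
     result, followed by an SLS barrier after the BR when straight-line
     speculation hardening is enabled.  */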
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
  if (shift >= 0 && shift <= 3)
      for (size = 8; size <= 32; size *= 2)
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
	  if (mask == bits << shift)
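  /* Worked example (added for illustration; not in the original source):
     with SHIFT == 1 and MASK == 0x1fe (== 0xff << 1) the test above matches
     at SIZE == 8, so the operand is suitable for a UXTB-style extend.  */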
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
aarch64_can_use_per_function_literal_pools_p (void)
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);

aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
  /* We can't use blocks for constants when we're using a per-function
  return !aarch64_can_use_per_function_literal_pools_p ();

/* Select appropriate section for constants depending
   on where we place literal pools.  */
aarch64_select_rtx_section (machine_mode mode,
			    unsigned HOST_WIDE_INT align)
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
				  HOST_WIDE_INT offset)
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
11845 /* Helper function for rtx cost calculation. Strip a shift expression
11846 from X. Returns the inner operand if successful, or the original
11847 expression on failure. */
11849 aarch64_strip_shift (rtx x
)
11853 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11854 we can convert both to ROR during final output. */
11855 if ((GET_CODE (op
) == ASHIFT
11856 || GET_CODE (op
) == ASHIFTRT
11857 || GET_CODE (op
) == LSHIFTRT
11858 || GET_CODE (op
) == ROTATERT
11859 || GET_CODE (op
) == ROTATE
)
11860 && CONST_INT_P (XEXP (op
, 1)))
11861 return XEXP (op
, 0);
11863 if (GET_CODE (op
) == MULT
11864 && CONST_INT_P (XEXP (op
, 1))
11865 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
11866 return XEXP (op
, 0);
11871 /* Helper function for rtx cost calculation. Strip an extend
11872 expression from X. Returns the inner operand if successful, or the
11873 original expression on failure. We deal with a number of possible
11874 canonicalization variations here. If STRIP_SHIFT is true, then
11875 we can strip off a shift also. */
11877 aarch64_strip_extend (rtx x
, bool strip_shift
)
11879 scalar_int_mode mode
;
11882 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
11885 if (GET_CODE (op
) == AND
11886 && GET_CODE (XEXP (op
, 0)) == MULT
11887 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
11888 && CONST_INT_P (XEXP (op
, 1))
11889 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
11890 INTVAL (XEXP (op
, 1))) != 0)
11891 return XEXP (XEXP (op
, 0), 0);
11893 /* Now handle extended register, as this may also have an optional
11894 left shift by 1..4. */
11896 && GET_CODE (op
) == ASHIFT
11897 && CONST_INT_P (XEXP (op
, 1))
11898 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
11901 if (GET_CODE (op
) == ZERO_EXTEND
11902 || GET_CODE (op
) == SIGN_EXTEND
)
11911 /* Return true iff CODE is a shift supported in combination
11912 with arithmetic instructions. */
11915 aarch64_shift_p (enum rtx_code code
)
11917 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
11921 /* Return true iff X is a cheap shift without a sign extend. */
11924 aarch64_cheap_mult_shift_p (rtx x
)
11931 if (!(aarch64_tune_params
.extra_tuning_flags
11932 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
11935 if (GET_CODE (op0
) == SIGN_EXTEND
)
11938 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
11939 && UINTVAL (op1
) <= 4)
11942 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
11945 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
11947 if (l2
> 0 && l2
<= 4)
11953 /* Helper function for rtx cost calculation. Calculate the cost of
11954 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11955 Return the calculated cost of the expression, recursing manually in to
11956 operands where needed. */
11959 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
11962 const struct cpu_cost_table
*extra_cost
11963 = aarch64_tune_params
.insn_extra_cost
;
11965 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
11966 machine_mode mode
= GET_MODE (x
);
11968 gcc_checking_assert (code
== MULT
);
11973 if (VECTOR_MODE_P (mode
))
11975 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
11976 if (vec_flags
& VEC_ADVSIMD
)
11978 /* The by-element versions of the instruction have the same costs as
11979 the normal 3-vector version. So don't add the costs of the
11980 duplicate into the costs of the multiply. We make an assumption
11981 that the input to the VEC_DUPLICATE is already on the FP & SIMD
11982 side. This means costing of a MUL by element pre RA is a bit
11984 if (GET_CODE (op0
) == VEC_DUPLICATE
)
11985 op0
= XEXP (op0
, 0);
11986 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
11987 op1
= XEXP (op1
, 0);
11989 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
11990 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
11993 if (GET_CODE (x
) == MULT
)
11994 cost
+= extra_cost
->vect
.mult
;
11995 /* This is to catch the SSRA costing currently flowing here. */
11997 cost
+= extra_cost
->vect
.alu
;
12002 /* Integer multiply/fma. */
12003 if (GET_MODE_CLASS (mode
) == MODE_INT
)
12005 /* The multiply will be canonicalized as a shift, cost it as such. */
12006 if (aarch64_shift_p (GET_CODE (x
))
12007 || (CONST_INT_P (op1
)
12008 && exact_log2 (INTVAL (op1
)) > 0))
12010 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
12011 || GET_CODE (op0
) == SIGN_EXTEND
;
12016 /* If the shift is considered cheap,
12017 then don't add any cost. */
12018 if (aarch64_cheap_mult_shift_p (x
))
12020 else if (REG_P (op1
))
12021 /* ARITH + shift-by-register. */
12022 cost
+= extra_cost
->alu
.arith_shift_reg
;
12023 else if (is_extend
)
12024 /* ARITH + extended register. We don't have a cost field
12025 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
12026 cost
+= extra_cost
->alu
.extend_arith
;
12028 /* ARITH + shift-by-immediate. */
12029 cost
+= extra_cost
->alu
.arith_shift
;
12032 /* LSL (immediate). */
12033 cost
+= extra_cost
->alu
.shift
;
12036 /* Strip extends as we will have costed them in the case above. */
12038 op0
= aarch64_strip_extend (op0
, true);
12040 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
12045 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
12046 compound and let the below cases handle it. After all, MNEG is a
12047 special-case alias of MSUB. */
12048 if (GET_CODE (op0
) == NEG
)
12050 op0
= XEXP (op0
, 0);
12054 /* Integer multiplies or FMAs have zero/sign extending variants. */
12055 if ((GET_CODE (op0
) == ZERO_EXTEND
12056 && GET_CODE (op1
) == ZERO_EXTEND
)
12057 || (GET_CODE (op0
) == SIGN_EXTEND
12058 && GET_CODE (op1
) == SIGN_EXTEND
))
12060 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
12061 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
12066 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
12067 cost
+= extra_cost
->mult
[0].extend_add
;
12069 /* MUL/SMULL/UMULL. */
12070 cost
+= extra_cost
->mult
[0].extend
;
12076 /* This is either an integer multiply or a MADD. In both cases
12077 we want to recurse and cost the operands. */
12078 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
12079 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
12085 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
12088 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
12097 /* Floating-point FMA/FMUL can also support negations of the
12098 operands, unless the rounding mode is upward or downward in
12099 which case FNMUL is different than FMUL with operand negation. */
12100 bool neg0
= GET_CODE (op0
) == NEG
;
12101 bool neg1
= GET_CODE (op1
) == NEG
;
12102 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
12105 op0
= XEXP (op0
, 0);
12107 op1
= XEXP (op1
, 0);
12111 /* FMADD/FNMADD/FNMSUB/FMSUB. */
12112 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
12115 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
12118 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
12119 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
12125 aarch64_address_cost (rtx x
,
12127 addr_space_t as ATTRIBUTE_UNUSED
,
12130 enum rtx_code c
= GET_CODE (x
);
12131 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
12132 struct aarch64_address_info info
;
12136 if (!aarch64_classify_address (&info
, x
, mode
, false))
12138 if (GET_CODE (x
) == CONST
|| SYMBOL_REF_P (x
))
12140 /* This is a CONST or SYMBOL ref which will be split
12141 in a different way depending on the code model in use.
12142 Cost it through the generic infrastructure. */
12143 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
12144 /* Divide through by the cost of one instruction to
12145 bring it to the same units as the address costs. */
12146 cost_symbol_ref
/= COSTS_N_INSNS (1);
12147 /* The cost is then the cost of preparing the address,
12148 followed by an immediate (possibly 0) offset. */
12149 return cost_symbol_ref
+ addr_cost
->imm_offset
;
12153 /* This is most likely a jump table from a case
12155 return addr_cost
->register_offset
;
12161 case ADDRESS_LO_SUM
:
12162 case ADDRESS_SYMBOLIC
:
12163 case ADDRESS_REG_IMM
:
12164 cost
+= addr_cost
->imm_offset
;
12167 case ADDRESS_REG_WB
:
12168 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
12169 cost
+= addr_cost
->pre_modify
;
12170 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
12172 if (mode
== CImode
)
12173 cost
+= addr_cost
->post_modify_ld3_st3
;
12174 else if (mode
== XImode
)
12175 cost
+= addr_cost
->post_modify_ld4_st4
;
12177 cost
+= addr_cost
->post_modify
;
12180 gcc_unreachable ();
12184 case ADDRESS_REG_REG
:
12185 cost
+= addr_cost
->register_offset
;
12188 case ADDRESS_REG_SXTW
:
12189 cost
+= addr_cost
->register_sextend
;
12192 case ADDRESS_REG_UXTW
:
12193 cost
+= addr_cost
->register_zextend
;
12197 gcc_unreachable ();
12201 if (info
.shift
> 0)
12203 /* For the sake of calculating the cost of the shifted register
12204 component, we can treat same sized modes in the same way. */
12205 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
12206 cost
+= addr_cost
->addr_scale_costs
.hi
;
12207 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
12208 cost
+= addr_cost
->addr_scale_costs
.si
;
12209 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
12210 cost
+= addr_cost
->addr_scale_costs
.di
;
12212 /* We can't tell, or this is a 128-bit vector. */
12213 cost
+= addr_cost
->addr_scale_costs
.ti
;
12219 /* Return the cost of a branch. If SPEED_P is true then the compiler is
12220 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
12224 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
12226 /* When optimizing for speed, use the cost of unpredictable branches. */
12227 const struct cpu_branch_cost
*branch_costs
=
12228 aarch64_tune_params
.branch_costs
;
12230 if (!speed_p
|| predictable_p
)
12231 return branch_costs
->predictable
;
12233 return branch_costs
->unpredictable
;
12236 /* Return true if X is a zero or sign extract
12237 usable in an ADD or SUB (extended register) instruction. */
12239 aarch64_rtx_arith_op_extract_p (rtx x
)
12241 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
12243 if (GET_CODE (x
) == SIGN_EXTEND
12244 || GET_CODE (x
) == ZERO_EXTEND
)
12245 return REG_P (XEXP (x
, 0));
12251 aarch64_frint_unspec_p (unsigned int u
)
12255 case UNSPEC_FRINTZ
:
12256 case UNSPEC_FRINTP
:
12257 case UNSPEC_FRINTM
:
12258 case UNSPEC_FRINTA
:
12259 case UNSPEC_FRINTN
:
12260 case UNSPEC_FRINTX
:
12261 case UNSPEC_FRINTI
:
12269 /* Return true iff X is an rtx that will match an extr instruction
12270 i.e. as described in the *extr<mode>5_insn family of patterns.
12271 OP0 and OP1 will be set to the operands of the shifts involved
12272 on success and will be NULL_RTX otherwise. */
12275 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
12278 scalar_int_mode mode
;
12279 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
12282 *res_op0
= NULL_RTX
;
12283 *res_op1
= NULL_RTX
;
12285 if (GET_CODE (x
) != IOR
)
12291 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
12292 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
12294 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
12295 if (GET_CODE (op1
) == ASHIFT
)
12296 std::swap (op0
, op1
);
12298 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
12301 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
12302 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
12304 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
12305 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
12307 *res_op0
= XEXP (op0
, 0);
12308 *res_op1
= XEXP (op1
, 0);
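      /* Worked example (added for illustration; not in the original source):
	 in DImode, (ior (ashift x 16) (lshiftrt y 48)) passes the check
	 above (16 + 48 == 64), so X and Y are returned and the whole
	 expression can be implemented with a single EXTR.  */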
12316 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
12317 storing it in *COST. Result is true if the total cost of the operation
12318 has now been calculated. */
12320 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
12324 enum rtx_code cmpcode
;
12325 const struct cpu_cost_table
*extra_cost
12326 = aarch64_tune_params
.insn_extra_cost
;
12328 if (COMPARISON_P (op0
))
12330 inner
= XEXP (op0
, 0);
12331 comparator
= XEXP (op0
, 1);
12332 cmpcode
= GET_CODE (op0
);
12337 comparator
= const0_rtx
;
12341 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
12343 /* Conditional branch. */
12344 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
12348 if (cmpcode
== NE
|| cmpcode
== EQ
)
12350 if (comparator
== const0_rtx
)
12352 /* TBZ/TBNZ/CBZ/CBNZ. */
12353 if (GET_CODE (inner
) == ZERO_EXTRACT
)
12355 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
12356 ZERO_EXTRACT
, 0, speed
);
12359 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
12363 if (register_operand (inner
, VOIDmode
)
12364 && aarch64_imm24 (comparator
, VOIDmode
))
12366 /* SUB and SUBS. */
12367 *cost
+= COSTS_N_INSNS (2);
12369 *cost
+= extra_cost
->alu
.arith
* 2;
12373 else if (cmpcode
== LT
|| cmpcode
== GE
)
12376 if (comparator
== const0_rtx
)
12381 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
12384 if (GET_CODE (op1
) == COMPARE
)
12386 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
12387 if (XEXP (op1
, 1) == const0_rtx
)
12391 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
12393 if (GET_MODE_CLASS (mode
) == MODE_INT
)
12394 *cost
+= extra_cost
->alu
.arith
;
12396 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
12401 /* It's a conditional operation based on the status flags,
12402 so it must be some flavor of CSEL. */
12404 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
12405 if (GET_CODE (op1
) == NEG
12406 || GET_CODE (op1
) == NOT
12407 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
12408 op1
= XEXP (op1
, 0);
12409 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
12411 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
12412 op1
= XEXP (op1
, 0);
12413 op2
= XEXP (op2
, 0);
12415 else if (GET_CODE (op1
) == ZERO_EXTEND
&& op2
== const0_rtx
)
12417 inner
= XEXP (op1
, 0);
12418 if (GET_CODE (inner
) == NEG
|| GET_CODE (inner
) == NOT
)
12419 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
12420 op1
= XEXP (inner
, 0);
12423 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
12424 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
12428 /* We don't know what this is, cost all operands. */
12432 /* Check whether X is a bitfield operation of the form shift + extend that
12433 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
12434 operand to which the bitfield operation is applied. Otherwise return
12438 aarch64_extend_bitfield_pattern_p (rtx x
)
12440 rtx_code outer_code
= GET_CODE (x
);
12441 machine_mode outer_mode
= GET_MODE (x
);
12443 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
12444 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
12447 rtx inner
= XEXP (x
, 0);
12448 rtx_code inner_code
= GET_CODE (inner
);
12449 machine_mode inner_mode
= GET_MODE (inner
);
12452 switch (inner_code
)
12455 if (CONST_INT_P (XEXP (inner
, 1))
12456 && (inner_mode
== QImode
|| inner_mode
== HImode
))
12457 op
= XEXP (inner
, 0);
12460 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
12461 && (inner_mode
== QImode
|| inner_mode
== HImode
))
12462 op
= XEXP (inner
, 0);
12465 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
12466 && (inner_mode
== QImode
|| inner_mode
== HImode
))
12467 op
= XEXP (inner
, 0);
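  /* Illustrative example (added; not in the original source): an rtx such as
     (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3))) is accepted by
     the ZERO_EXTEND/LSHIFTRT case above and corresponds to a UBFX-style
     bitfield extraction.  */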
12476 /* Return true if the mask and a shift amount from an RTX of the form
12477 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
12478 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
12481 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
12484 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
12485 && INTVAL (mask
) > 0
12486 && UINTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
12487 && exact_log2 ((UINTVAL (mask
) >> UINTVAL (shft_amnt
)) + 1) >= 0
12489 & ((HOST_WIDE_INT_1U
<< UINTVAL (shft_amnt
)) - 1)) == 0;
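  /* Worked example (added for illustration; not in the original source): in
     SImode, MASK == 0x00ffff00 with SHFT_AMNT == 8 passes every test above
     ((0x00ffff00 >> 8) + 1 is a power of two and the low 8 bits of the mask
     are clear), so the shift-and-mask combination maps onto a UBFIZ.  */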
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
12497 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode
,
12498 unsigned HOST_WIDE_INT mask1
,
12499 unsigned HOST_WIDE_INT shft_amnt
,
12500 unsigned HOST_WIDE_INT mask2
)
12502 unsigned HOST_WIDE_INT t
;
12504 /* Verify that there is no overlap in what bits are set in the two masks. */
12505 if (mask1
!= ~mask2
)
12508 /* Verify that mask2 is not all zeros or ones. */
12509 if (mask2
== 0 || mask2
== HOST_WIDE_INT_M1U
)
12512 /* The shift amount should always be less than the mode size. */
12513 gcc_assert (shft_amnt
< GET_MODE_BITSIZE (mode
));
12515 /* Verify that the mask being shifted is contiguous and would be in the
12516 least significant bits after shifting by shft_amnt. */
12517 t
= mask2
+ (HOST_WIDE_INT_1U
<< shft_amnt
);
12518 return (t
== (t
& -t
));
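  /* Worked example (added for illustration; not in the original source):
     MASK2 == 0xff00 with SHFT_AMNT == 8 and MASK1 == ~0xff00 passes all the
     checks above: the masks do not overlap, and 0xff00 + (1 << 8) == 0x10000
     is a power of two, so the insertion maps onto a BFI of width 8 at
     bit position 8.  */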
12521 /* Calculate the cost of calculating X, storing it in *COST. Result
12522 is true if the total cost of the operation has now been calculated. */
12524 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
12525 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
12528 const struct cpu_cost_table
*extra_cost
12529 = aarch64_tune_params
.insn_extra_cost
;
12530 int code
= GET_CODE (x
);
12531 scalar_int_mode int_mode
;
12533 /* By default, assume that everything has equivalent cost to the
12534 cheapest instruction. Any additional costs are applied as a delta
12535 above this default. */
12536 *cost
= COSTS_N_INSNS (1);
12541 /* The cost depends entirely on the operands to SET. */
12543 op0
= SET_DEST (x
);
12546 switch (GET_CODE (op0
))
12551 rtx address
= XEXP (op0
, 0);
12552 if (VECTOR_MODE_P (mode
))
12553 *cost
+= extra_cost
->ldst
.storev
;
12554 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12555 *cost
+= extra_cost
->ldst
.store
;
12556 else if (mode
== SFmode
)
12557 *cost
+= extra_cost
->ldst
.storef
;
12558 else if (mode
== DFmode
)
12559 *cost
+= extra_cost
->ldst
.stored
;
12562 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
12566 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
12570 if (! REG_P (SUBREG_REG (op0
)))
12571 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
12573 /* Fall through. */
12575 /* The cost is one per vector-register copied. */
12576 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
12578 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
12579 *cost
= COSTS_N_INSNS (nregs
);
12581 /* const0_rtx is in general free, but we will use an
12582 instruction to set a register to 0. */
12583 else if (REG_P (op1
) || op1
== const0_rtx
)
12585 /* The cost is 1 per register copied. */
12586 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
12587 *cost
= COSTS_N_INSNS (nregs
);
12590 /* Cost is just the cost of the RHS of the set. */
12591 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
12596 /* Bit-field insertion. Strip any redundant widening of
12597 the RHS to meet the width of the target. */
12598 if (GET_CODE (op1
) == SUBREG
)
12599 op1
= SUBREG_REG (op1
);
12600 if ((GET_CODE (op1
) == ZERO_EXTEND
12601 || GET_CODE (op1
) == SIGN_EXTEND
)
12602 && CONST_INT_P (XEXP (op0
, 1))
12603 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
12604 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
12605 op1
= XEXP (op1
, 0);
12607 if (CONST_INT_P (op1
))
12609 /* MOV immediate is assumed to always be cheap. */
12610 *cost
= COSTS_N_INSNS (1);
12616 *cost
+= extra_cost
->alu
.bfi
;
12617 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
12623 /* We can't make sense of this, assume default cost. */
12624 *cost
= COSTS_N_INSNS (1);
12630 /* If an instruction can incorporate a constant within the
12631 instruction, the instruction's expression avoids calling
12632 rtx_cost() on the constant. If rtx_cost() is called on a
12633 constant, then it is usually because the constant must be
12634 moved into a register by one or more instructions.
12636 The exception is constant 0, which can be expressed
12637 as XZR/WZR and is therefore free. The exception to this is
12638 if we have (set (reg) (const0_rtx)) in which case we must cost
12639 the move. However, we can catch that when we cost the SET, so
12640 we don't need to consider that here. */
12641 if (x
== const0_rtx
)
12645 /* To an approximation, building any other constant is
12646 proportionally expensive to the number of instructions
12647 required to build that constant. This is true whether we
12648 are compiling for SPEED or otherwise. */
12649 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
12650 int_mode
= word_mode
;
12651 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
12652 (NULL_RTX
, x
, false, int_mode
));
12658 /* First determine number of instructions to do the move
12659 as an integer constant. */
12660 if (!aarch64_float_const_representable_p (x
)
12661 && !aarch64_can_const_movi_rtx_p (x
, mode
)
12662 && aarch64_float_const_rtx_p (x
))
12664 unsigned HOST_WIDE_INT ival
;
12665 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
12666 gcc_assert (succeed
);
12668 scalar_int_mode imode
= (mode
== HFmode
12670 : int_mode_for_mode (mode
).require ());
12671 int ncost
= aarch64_internal_mov_immediate
12672 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
12673 *cost
+= COSTS_N_INSNS (ncost
);
12679 /* mov[df,sf]_aarch64. */
12680 if (aarch64_float_const_representable_p (x
))
12681 /* FMOV (scalar immediate). */
12682 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
12683 else if (!aarch64_float_const_zero_rtx_p (x
))
12685 /* This will be a load from memory. */
12686 if (mode
== DFmode
)
12687 *cost
+= extra_cost
->ldst
.loadd
;
12689 *cost
+= extra_cost
->ldst
.loadf
;
12692 /* Otherwise this is +0.0. We get this using MOVI d0, #0
12693 or MOV v0.s[0], wzr - neither of which are modeled by the
12694 cost tables. Just use the default cost. */
12704 /* For loads we want the base cost of a load, plus an
12705 approximation for the additional cost of the addressing
12707 rtx address
= XEXP (x
, 0);
12708 if (VECTOR_MODE_P (mode
))
12709 *cost
+= extra_cost
->ldst
.loadv
;
12710 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12711 *cost
+= extra_cost
->ldst
.load
;
12712 else if (mode
== SFmode
)
12713 *cost
+= extra_cost
->ldst
.loadf
;
12714 else if (mode
== DFmode
)
12715 *cost
+= extra_cost
->ldst
.loadd
;
12718 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
12727 if (VECTOR_MODE_P (mode
))
12732 *cost
+= extra_cost
->vect
.alu
;
12737 if (GET_MODE_CLASS (mode
) == MODE_INT
)
12739 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
12740 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
12743 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
12747 /* Cost this as SUB wzr, X. */
12748 op0
= CONST0_RTX (mode
);
12753 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12755 /* Support (neg(fma...)) as a single instruction only if
12756 sign of zeros is unimportant. This matches the decision
12757 making in aarch64.md. */
12758 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
12761 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
12764 if (GET_CODE (op0
) == MULT
)
12767 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
12772 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
12782 if (VECTOR_MODE_P (mode
))
12783 *cost
+= extra_cost
->vect
.alu
;
12785 *cost
+= extra_cost
->alu
.clz
;
12791 *cost
= COSTS_N_INSNS (2);
12794 *cost
+= extra_cost
->alu
.clz
+ extra_cost
->alu
.rev
;
12801 if (op1
== const0_rtx
12802 && GET_CODE (op0
) == AND
)
12805 mode
= GET_MODE (op0
);
12809 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
12811 /* TODO: A write to the CC flags possibly costs extra, this
12812 needs encoding in the cost tables. */
12814 mode
= GET_MODE (op0
);
12816 if (GET_CODE (op0
) == AND
)
12822 if (GET_CODE (op0
) == PLUS
)
12824 /* ADDS (and CMN alias). */
12829 if (GET_CODE (op0
) == MINUS
)
12836 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
12837 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
12838 && CONST_INT_P (XEXP (op0
, 2)))
12840 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12841 Handle it here directly rather than going to cost_logic
12842 since we know the immediate generated for the TST is valid
12843 so we can avoid creating an intermediate rtx for it only
12844 for costing purposes. */
12846 *cost
+= extra_cost
->alu
.logical
;
12848 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
12849 ZERO_EXTRACT
, 0, speed
);
12853 if (GET_CODE (op1
) == NEG
)
12857 *cost
+= extra_cost
->alu
.arith
;
12859 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
12860 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
12866 Compare can freely swap the order of operands, and
12867 canonicalization puts the more complex operation first.
12868 But the integer MINUS logic expects the shift/extend
12869 operation in op1. */
12871 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
12879 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
12883 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
12885 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
12887 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
12888 /* FCMP supports constant 0.0 for no extra cost. */
12894 if (VECTOR_MODE_P (mode
))
12896 /* Vector compare. */
12898 *cost
+= extra_cost
->vect
.alu
;
12900 if (aarch64_float_const_zero_rtx_p (op1
))
12902 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12916 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
12918 /* Detect valid immediates. */
12919 if ((GET_MODE_CLASS (mode
) == MODE_INT
12920 || (GET_MODE_CLASS (mode
) == MODE_CC
12921 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
12922 && CONST_INT_P (op1
)
12923 && aarch64_uimm12_shift (INTVAL (op1
)))
12926 /* SUB(S) (immediate). */
12927 *cost
+= extra_cost
->alu
.arith
;
12931 /* Look for SUB (extended register). */
12932 if (is_a
<scalar_int_mode
> (mode
)
12933 && aarch64_rtx_arith_op_extract_p (op1
))
12936 *cost
+= extra_cost
->alu
.extend_arith
;
12938 op1
= aarch64_strip_extend (op1
, true);
12939 *cost
+= rtx_cost (op1
, VOIDmode
,
12940 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
12944 rtx new_op1
= aarch64_strip_extend (op1
, false);
12946 /* Cost this as an FMA-alike operation. */
12947 if ((GET_CODE (new_op1
) == MULT
12948 || aarch64_shift_p (GET_CODE (new_op1
)))
12949 && code
!= COMPARE
)
12951 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
12952 (enum rtx_code
) code
,
12957 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
12961 if (VECTOR_MODE_P (mode
))
12964 *cost
+= extra_cost
->vect
.alu
;
12966 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
12969 *cost
+= extra_cost
->alu
.arith
;
12971 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12974 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
12988 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
12989 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
12992 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
12993 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
12997 if (GET_MODE_CLASS (mode
) == MODE_INT
12998 && (aarch64_plus_immediate (op1
, mode
)
12999 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
13001 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
13005 /* ADD (immediate). */
13006 *cost
+= extra_cost
->alu
.arith
;
13008 /* Some tunings prefer to not use the VL-based scalar ops.
13009 Increase the cost of the poly immediate to prevent their
13011 if (GET_CODE (op1
) == CONST_POLY_INT
13012 && (aarch64_tune_params
.extra_tuning_flags
13013 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
))
13014 *cost
+= COSTS_N_INSNS (1);
13019 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
13021 /* Look for ADD (extended register). */
13022 if (is_a
<scalar_int_mode
> (mode
)
13023 && aarch64_rtx_arith_op_extract_p (op0
))
13026 *cost
+= extra_cost
->alu
.extend_arith
;
13028 op0
= aarch64_strip_extend (op0
, true);
13029 *cost
+= rtx_cost (op0
, VOIDmode
,
13030 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
13034 /* Strip any extend, leave shifts behind as we will
13035 cost them through mult_cost. */
13036 new_op0
= aarch64_strip_extend (op0
, false);
13038 if (GET_CODE (new_op0
) == MULT
13039 || aarch64_shift_p (GET_CODE (new_op0
)))
13041 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
13046 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
13050 if (VECTOR_MODE_P (mode
))
13053 *cost
+= extra_cost
->vect
.alu
;
13055 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
13058 *cost
+= extra_cost
->alu
.arith
;
13060 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
13063 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
13070 *cost
= COSTS_N_INSNS (1);
13074 if (VECTOR_MODE_P (mode
))
13075 *cost
+= extra_cost
->vect
.alu
;
13077 *cost
+= extra_cost
->alu
.rev
;
13082 if (aarch_rev16_p (x
))
13084 *cost
= COSTS_N_INSNS (1);
13088 if (VECTOR_MODE_P (mode
))
13089 *cost
+= extra_cost
->vect
.alu
;
13091 *cost
+= extra_cost
->alu
.rev
;
13096 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
13098 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
13099 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
13101 *cost
+= extra_cost
->alu
.shift
;
13105 /* Fall through. */
13112 if (VECTOR_MODE_P (mode
))
13115 *cost
+= extra_cost
->vect
.alu
;
13120 && GET_CODE (op0
) == MULT
13121 && CONST_INT_P (XEXP (op0
, 1))
13122 && CONST_INT_P (op1
)
13123 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
13124 INTVAL (op1
)) != 0)
13126 /* This is a UBFM/SBFM. */
13127 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
13129 *cost
+= extra_cost
->alu
.bfx
;
13133 if (is_int_mode (mode
, &int_mode
))
13135 if (CONST_INT_P (op1
))
13137 /* We have a mask + shift version of a UBFIZ
13138 i.e. the *andim_ashift<mode>_bfiz pattern. */
13139 if (GET_CODE (op0
) == ASHIFT
13140 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
13143 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
13144 (enum rtx_code
) code
, 0, speed
);
13146 *cost
+= extra_cost
->alu
.bfx
;
13150 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
13152 /* We possibly get the immediate for free, this is not
13154 *cost
+= rtx_cost (op0
, int_mode
,
13155 (enum rtx_code
) code
, 0, speed
);
13157 *cost
+= extra_cost
->alu
.logical
;
13166 /* Handle ORN, EON, or BIC. */
13167 if (GET_CODE (op0
) == NOT
)
13168 op0
= XEXP (op0
, 0);
13170 new_op0
= aarch64_strip_shift (op0
);
13172 /* If we had a shift on op0 then this is a logical-shift-
13173 by-register/immediate operation. Otherwise, this is just
13174 a logical operation. */
13177 if (new_op0
!= op0
)
13179 /* Shift by immediate. */
13180 if (CONST_INT_P (XEXP (op0
, 1)))
13181 *cost
+= extra_cost
->alu
.log_shift
;
13183 *cost
+= extra_cost
->alu
.log_shift_reg
;
13186 *cost
+= extra_cost
->alu
.logical
;
13189 /* In both cases we want to cost both operands. */
13190 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
13192 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
13202 op0
= aarch64_strip_shift (x
);
13204 if (VECTOR_MODE_P (mode
))
13207 *cost
+= extra_cost
->vect
.alu
;
13211 /* MVN-shifted-reg. */
13214 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
13217 *cost
+= extra_cost
->alu
.log_shift
;
13221 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
13222 Handle the second form here taking care that 'a' in the above can
13224 else if (GET_CODE (op0
) == XOR
)
13226 rtx newop0
= XEXP (op0
, 0);
13227 rtx newop1
= XEXP (op0
, 1);
13228 rtx op0_stripped
= aarch64_strip_shift (newop0
);
13230 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
13231 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
13235 if (op0_stripped
!= newop0
)
13236 *cost
+= extra_cost
->alu
.log_shift
;
13238 *cost
+= extra_cost
->alu
.logical
;
13245 *cost
+= extra_cost
->alu
.logical
;
13252 /* If a value is written in SI mode, then zero extended to DI
13253 mode, the operation will in general be free as a write to
13254 a 'w' register implicitly zeroes the upper bits of an 'x'
13255 register. However, if this is
13257 (set (reg) (zero_extend (reg)))
13259 we must cost the explicit register move. */
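	/* Example (added for illustration; not in the original source):
	   "add w0, w1, w2" already zeroes bits 63:32 of x0, so a
	   (zero_extend:DI (plus:SI ...)) needs no extra instruction, whereas
	   (set (reg:DI) (zero_extend:DI (reg:SI))) still costs a MOV.  */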
13261 && GET_MODE (op0
) == SImode
13264 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
13266 /* If OP_COST is non-zero, then the cost of the zero extend
13267 is effectively the cost of the inner operation. Otherwise
13268 we have a MOV instruction and we take the cost from the MOV
13269 itself. This is true independently of whether we are
13270 optimizing for space or time. */
13276 else if (MEM_P (op0
))
13278 /* All loads can zero extend to any size for free. */
13279 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
13283 op0
= aarch64_extend_bitfield_pattern_p (x
);
13286 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
13288 *cost
+= extra_cost
->alu
.bfx
;
13294 if (VECTOR_MODE_P (mode
))
13297 *cost
+= extra_cost
->vect
.alu
;
13301 /* We generate an AND instead of UXTB/UXTH. */
13302 *cost
+= extra_cost
->alu
.logical
;
13308 if (MEM_P (XEXP (x
, 0)))
13313 rtx address
= XEXP (XEXP (x
, 0), 0);
13314 *cost
+= extra_cost
->ldst
.load_sign_extend
;
13317 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
13323 op0
= aarch64_extend_bitfield_pattern_p (x
);
13326 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
13328 *cost
+= extra_cost
->alu
.bfx
;
13334 if (VECTOR_MODE_P (mode
))
13335 *cost
+= extra_cost
->vect
.alu
;
13337 *cost
+= extra_cost
->alu
.extend
;
13345 if (CONST_INT_P (op1
))
13349 if (VECTOR_MODE_P (mode
))
13351 /* Vector shift (immediate). */
13352 *cost
+= extra_cost
->vect
.alu
;
13356 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
13358 *cost
+= extra_cost
->alu
.shift
;
13362 /* We can incorporate zero/sign extend for free. */
13363 if (GET_CODE (op0
) == ZERO_EXTEND
13364 || GET_CODE (op0
) == SIGN_EXTEND
)
13365 op0
= XEXP (op0
, 0);
13367 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
13372 if (VECTOR_MODE_P (mode
))
13375 /* Vector shift (register). */
13376 *cost
+= extra_cost
->vect
.alu
;
13382 *cost
+= extra_cost
->alu
.shift_reg
;
13384 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
13385 && CONST_INT_P (XEXP (op1
, 1))
13386 && known_eq (INTVAL (XEXP (op1
, 1)),
13387 GET_MODE_BITSIZE (mode
) - 1))
13389 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
13390 /* We already demanded XEXP (op1, 0) to be REG_P, so
13391 don't recurse into it. */
13395 return false; /* All arguments need to be in registers. */
13405 if (CONST_INT_P (op1
))
13407 /* ASR (immediate) and friends. */
13410 if (VECTOR_MODE_P (mode
))
13411 *cost
+= extra_cost
->vect
.alu
;
13413 *cost
+= extra_cost
->alu
.shift
;
13416 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
13421 if (VECTOR_MODE_P (mode
))
13424 /* Vector shift (register). */
13425 *cost
+= extra_cost
->vect
.alu
;
13430 /* ASR (register) and friends. */
13431 *cost
+= extra_cost
->alu
.shift_reg
;
13433 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
13434 && CONST_INT_P (XEXP (op1
, 1))
13435 && known_eq (INTVAL (XEXP (op1
, 1)),
13436 GET_MODE_BITSIZE (mode
) - 1))
13438 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
13439 /* We already demanded XEXP (op1, 0) to be REG_P, so
13440 don't recurse into it. */
13444 return false; /* All arguments need to be in registers. */
13449 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
13450 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
13454 *cost
+= extra_cost
->ldst
.load
;
13456 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
13457 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
13459 /* ADRP, followed by ADD. */
13460 *cost
+= COSTS_N_INSNS (1);
13462 *cost
+= 2 * extra_cost
->alu
.arith
;
13464 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
13465 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
13469 *cost
+= extra_cost
->alu
.arith
;
13474 /* One extra load instruction, after accessing the GOT. */
13475 *cost
+= COSTS_N_INSNS (1);
13477 *cost
+= extra_cost
->ldst
.load
;
13483 /* ADRP/ADD (immediate). */
13485 *cost
+= extra_cost
->alu
.arith
;
13493 if (VECTOR_MODE_P (mode
))
13494 *cost
+= extra_cost
->vect
.alu
;
13496 *cost
+= extra_cost
->alu
.bfx
;
13499 /* We can trust that the immediates used will be correct (there
13500 are no by-register forms), so we need only cost op0. */
13501 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
13505 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
13506 /* aarch64_rtx_mult_cost always handles recursion to its
13511 /* We can expand signed mod by power of 2 using a NEGS, two parallel
13512 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
13513 an unconditional negate. This case should only ever be reached through
13514 the set_smod_pow2_cheap check in expmed.c. */
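      /* Illustrative example (added; not in the original source): for
	 x % 4 in SImode the expansion is roughly
	   negs  w1, w0
	   and   w0, w0, 3
	   and   w1, w1, 3
	   csneg w0, w0, w1, mi
	 i.e. four instructions, matching the COSTS_N_INSNS (4) baseline
	 below.  */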
13515 if (CONST_INT_P (XEXP (x
, 1))
13516 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
13517 && (mode
== SImode
|| mode
== DImode
))
13519 /* We expand to 4 instructions. Reset the baseline. */
13520 *cost
= COSTS_N_INSNS (4);
13523 *cost
+= 2 * extra_cost
->alu
.logical
13524 + 2 * extra_cost
->alu
.arith
;
13529 /* Fall-through. */
      /* Slightly prefer UMOD over SMOD.  */
13534 if (VECTOR_MODE_P (mode
))
13535 *cost
+= extra_cost
->vect
.alu
;
13536 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
13537 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
13538 + extra_cost
->mult
[mode
== DImode
].idiv
13539 + (code
== MOD
? 1 : 0));
13541 return false; /* All arguments need to be in registers. */
13548 if (VECTOR_MODE_P (mode
))
13549 *cost
+= extra_cost
->vect
.alu
;
13550 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
13551 /* There is no integer SQRT, so only DIV and UDIV can get
13553 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
		 /* Slightly prefer UDIV over SDIV.  */
13555 + (code
== DIV
? 1 : 0));
13557 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
13559 return false; /* All arguments need to be in registers. */
13562 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
13563 XEXP (x
, 2), cost
, speed
);
13576 return false; /* All arguments must be in registers. */
13585 if (VECTOR_MODE_P (mode
))
13586 *cost
+= extra_cost
->vect
.alu
;
13588 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
13591 /* FMSUB, FNMADD, and FNMSUB are free. */
13592 if (GET_CODE (op0
) == NEG
)
13593 op0
= XEXP (op0
, 0);
13595 if (GET_CODE (op2
) == NEG
)
13596 op2
= XEXP (op2
, 0);
13598 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
13599 and the by-element operand as operand 0. */
13600 if (GET_CODE (op1
) == NEG
)
13601 op1
= XEXP (op1
, 0);
13603 /* Catch vector-by-element operations. The by-element operand can
13604 either be (vec_duplicate (vec_select (x))) or just
13605 (vec_select (x)), depending on whether we are multiplying by
13606 a vector or a scalar.
13608 Canonicalization is not very good in these cases, FMA4 will put the
13609 by-element operand as operand 0, FNMA4 will have it as operand 1. */
13610 if (GET_CODE (op0
) == VEC_DUPLICATE
)
13611 op0
= XEXP (op0
, 0);
13612 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
13613 op1
= XEXP (op1
, 0);
13615 if (GET_CODE (op0
) == VEC_SELECT
)
13616 op0
= XEXP (op0
, 0);
13617 else if (GET_CODE (op1
) == VEC_SELECT
)
13618 op1
= XEXP (op1
, 0);
13620 /* If the remaining parameters are not registers,
13621 get the cost to put them into registers. */
13622 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
13623 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
13624 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
13628 case UNSIGNED_FLOAT
:
13630 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
13636 if (VECTOR_MODE_P (mode
))
13638 /*Vector truncate. */
13639 *cost
+= extra_cost
->vect
.alu
;
13642 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
13646 case FLOAT_TRUNCATE
:
13649 if (VECTOR_MODE_P (mode
))
13651 /*Vector conversion. */
13652 *cost
+= extra_cost
->vect
.alu
;
13655 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
13662 /* Strip the rounding part. They will all be implemented
13663 by the fcvt* family of instructions anyway. */
13664 if (GET_CODE (x
) == UNSPEC
)
13666 unsigned int uns_code
= XINT (x
, 1);
13668 if (uns_code
== UNSPEC_FRINTA
13669 || uns_code
== UNSPEC_FRINTM
13670 || uns_code
== UNSPEC_FRINTN
13671 || uns_code
== UNSPEC_FRINTP
13672 || uns_code
== UNSPEC_FRINTZ
)
13673 x
= XVECEXP (x
, 0, 0);
13678 if (VECTOR_MODE_P (mode
))
13679 *cost
+= extra_cost
->vect
.alu
;
13681 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
13684 /* We can combine fmul by a power of 2 followed by a fcvt into a single
13685 fixed-point fcvt. */
13686 if (GET_CODE (x
) == MULT
13687 && ((VECTOR_MODE_P (mode
)
13688 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
13689 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
13691 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
13696 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
13700 if (VECTOR_MODE_P (mode
))
13702 /* ABS (vector). */
13704 *cost
+= extra_cost
->vect
.alu
;
13706 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
13710 /* FABD, which is analogous to FADD. */
13711 if (GET_CODE (op0
) == MINUS
)
13713 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
13714 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
13716 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
13720 /* Simple FABS is analogous to FNEG. */
13722 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
13726 /* Integer ABS will either be split to
13727 two arithmetic instructions, or will be an ABS
13728 (scalar), which we don't model. */
13729 *cost
= COSTS_N_INSNS (2);
13731 *cost
+= 2 * extra_cost
->alu
.arith
;
13739 if (VECTOR_MODE_P (mode
))
13740 *cost
+= extra_cost
->vect
.alu
;
13743 /* FMAXNM/FMINNM/FMAX/FMIN.
13744 TODO: This may not be accurate for all implementations, but
13745 we do not model this in the cost tables. */
13746 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
13752 /* The floating point round to integer frint* instructions. */
13753 if (aarch64_frint_unspec_p (XINT (x
, 1)))
13756 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
13761 if (XINT (x
, 1) == UNSPEC_RBIT
)
13764 *cost
+= extra_cost
->alu
.rev
;
13772 /* Decompose <su>muldi3_highpart. */
13773 if (/* (truncate:DI */
13776 && GET_MODE (XEXP (x
, 0)) == TImode
13777 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
13779 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
13780 /* (ANY_EXTEND:TI (reg:DI))
13781 (ANY_EXTEND:TI (reg:DI))) */
13782 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
13783 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
13784 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
13785 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
13786 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
13787 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
13788 /* (const_int 64) */
13789 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
13790 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
13794 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
13795 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
13796 mode
, MULT
, 0, speed
);
13797 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
13798 mode
, MULT
, 1, speed
);
13802 /* Fall through. */
13808 && flag_aarch64_verbose_cost
)
13809 fprintf (dump_file
,
13810 "\nFailed to cost RTX. Assuming default cost.\n");
13815 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
13816 calculated for X. This cost is stored in *COST. Returns true
13817 if the total cost of X was calculated. */
13819 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
13820 int param
, int *cost
, bool speed
)
13822 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
13825 && flag_aarch64_verbose_cost
)
13827 print_rtl_single (dump_file
, x
);
13828 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
13829 speed
? "Hot" : "Cold",
13830 *cost
, result
? "final" : "partial");
13837 aarch64_register_move_cost (machine_mode mode
,
13838 reg_class_t from_i
, reg_class_t to_i
)
13840 enum reg_class from
= (enum reg_class
) from_i
;
13841 enum reg_class to
= (enum reg_class
) to_i
;
13842 const struct cpu_regmove_cost
*regmove_cost
13843 = aarch64_tune_params
.regmove_cost
;
13845 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
13846 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
13847 || to
== STUB_REGS
)
13850 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
13851 || from
== STUB_REGS
)
13852 from
= GENERAL_REGS
;
13854 /* Make RDFFR very expensive. In particular, if we know that the FFR
13855 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13856 as a way of obtaining a PTRUE. */
13857 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
13858 && hard_reg_set_subset_p (reg_class_contents
[from_i
],
13859 reg_class_contents
[FFR_REGS
]))
13862 /* Moving between GPR and stack cost is the same as GP2GP. */
13863 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
13864 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
13865 return regmove_cost
->GP2GP
;
13867 /* To/From the stack register, we move via the gprs. */
13868 if (to
== STACK_REG
|| from
== STACK_REG
)
13869 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
13870 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
13872 if (known_eq (GET_MODE_SIZE (mode
), 16))
13874 /* 128-bit operations on general registers require 2 instructions. */
13875 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
13876 return regmove_cost
->GP2GP
* 2;
13877 else if (from
== GENERAL_REGS
)
13878 return regmove_cost
->GP2FP
* 2;
13879 else if (to
== GENERAL_REGS
)
13880 return regmove_cost
->FP2GP
* 2;
13882 /* When AdvSIMD instructions are disabled it is not possible to move
13883 a 128-bit value directly between Q registers. This is handled in
13884 secondary reload. A general register is used as a scratch to move
13885 the upper DI value and the lower DI value is moved directly,
13886 hence the cost is the sum of three moves. */
13888 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
13890 return regmove_cost
->FP2FP
;
13893 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
13894 return regmove_cost
->GP2GP
;
13895 else if (from
== GENERAL_REGS
)
13896 return regmove_cost
->GP2FP
;
13897 else if (to
== GENERAL_REGS
)
13898 return regmove_cost
->FP2GP
;
13900 return regmove_cost
->FP2FP
;
13904 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
13905 reg_class_t rclass ATTRIBUTE_UNUSED
,
13906 bool in ATTRIBUTE_UNUSED
)
13908 return aarch64_tune_params
.memmov_cost
;
13911 /* Implement TARGET_INIT_BUILTINS. */
13913 aarch64_init_builtins ()
13915 aarch64_general_init_builtins ();
13916 aarch64_sve::init_builtins ();
13917 #ifdef SUBTARGET_INIT_BUILTINS
13918 SUBTARGET_INIT_BUILTINS
;
13922 /* Implement TARGET_FOLD_BUILTIN. */
13924 aarch64_fold_builtin (tree fndecl
, int nargs
, tree
*args
, bool)
13926 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
13927 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
13928 tree type
= TREE_TYPE (TREE_TYPE (fndecl
));
13929 switch (code
& AARCH64_BUILTIN_CLASS
)
13931 case AARCH64_BUILTIN_GENERAL
:
13932 return aarch64_general_fold_builtin (subcode
, type
, nargs
, args
);
13934 case AARCH64_BUILTIN_SVE
:
13937 gcc_unreachable ();
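/* Note (added for illustration; not in the original source):
   DECL_MD_FUNCTION_CODE packs the builtin class in the bits selected by
   AARCH64_BUILTIN_CLASS and the per-class subcode in the bits above
   AARCH64_BUILTIN_SHIFT, which is why each hook here splits CODE the same
   way before dispatching to the general or SVE implementation.  */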
13940 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13942 aarch64_gimple_fold_builtin (gimple_stmt_iterator
*gsi
)
13944 gcall
*stmt
= as_a
<gcall
*> (gsi_stmt (*gsi
));
13945 tree fndecl
= gimple_call_fndecl (stmt
);
13946 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
13947 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
13948 gimple
*new_stmt
= NULL
;
13949 switch (code
& AARCH64_BUILTIN_CLASS
)
13951 case AARCH64_BUILTIN_GENERAL
:
13952 new_stmt
= aarch64_general_gimple_fold_builtin (subcode
, stmt
);
13955 case AARCH64_BUILTIN_SVE
:
13956 new_stmt
= aarch64_sve::gimple_fold_builtin (subcode
, gsi
, stmt
);
13963 gsi_replace (gsi
, new_stmt
, true);
13967 /* Implement TARGET_EXPAND_BUILTIN. */
13969 aarch64_expand_builtin (tree exp
, rtx target
, rtx
, machine_mode
, int ignore
)
13971 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
13972 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
13973 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
13974 switch (code
& AARCH64_BUILTIN_CLASS
)
13976 case AARCH64_BUILTIN_GENERAL
:
13977 return aarch64_general_expand_builtin (subcode
, exp
, target
, ignore
);
13979 case AARCH64_BUILTIN_SVE
:
13980 return aarch64_sve::expand_builtin (subcode
, exp
, target
);
13982 gcc_unreachable ();
13985 /* Implement TARGET_BUILTIN_DECL. */
13987 aarch64_builtin_decl (unsigned int code
, bool initialize_p
)
13989 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
13990 switch (code
& AARCH64_BUILTIN_CLASS
)
13992 case AARCH64_BUILTIN_GENERAL
:
13993 return aarch64_general_builtin_decl (subcode
, initialize_p
);
13995 case AARCH64_BUILTIN_SVE
:
13996 return aarch64_sve::builtin_decl (subcode
, initialize_p
);
13998 gcc_unreachable ();
14001 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
14002 to optimize 1.0/sqrt. */
14005 use_rsqrt_p (machine_mode mode
)
14007 return (!flag_trapping_math
14008 && flag_unsafe_math_optimizations
14009 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
14010 & AARCH64_APPROX_MODE (mode
))
14011 || flag_mrecip_low_precision_sqrt
));
14014 /* Function to decide when to use the approximate reciprocal square root
14018 aarch64_builtin_reciprocal (tree fndecl
)
14020 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
14022 if (!use_rsqrt_p (mode
))
14024 unsigned int code
= DECL_MD_FUNCTION_CODE (fndecl
);
14025 unsigned int subcode
= code
>> AARCH64_BUILTIN_SHIFT
;
14026 switch (code
& AARCH64_BUILTIN_CLASS
)
14028 case AARCH64_BUILTIN_GENERAL
:
14029 return aarch64_general_builtin_rsqrt (subcode
);
14031 case AARCH64_BUILTIN_SVE
:
14034 gcc_unreachable ();
14037 /* Emit code to perform the floating-point operation:
14041 where all three operands are already known to be registers.
14042 If the operation is an SVE one, PTRUE is a suitable all-true
14046 aarch64_emit_mult (rtx dst
, rtx ptrue
, rtx src1
, rtx src2
)
14049 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL
, GET_MODE (dst
),
14050 dst
, ptrue
, src1
, src2
,
14051 gen_int_mode (SVE_RELAXED_GP
, SImode
)));
14053 emit_set_insn (dst
, gen_rtx_MULT (GET_MODE (dst
), src1
, src2
));
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (!flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
  machine_mode mmsk = (VECTOR_MODE_P (mode)
		       ? related_int_vector_mode (mode).require ()
		       : int_mode_for_mode (mode).require ());
  rtx xmsk = NULL_RTX;
  if (!recp)
    {
      /* When calculating the approximate square root, compare the
	 argument with 0.0 and create a mask.  */
      rtx zero = CONST0_RTX (mode);
      if (pg)
	{
	  xmsk = gen_reg_rtx (GET_MODE (pg));
	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
					   xmsk, pg, hint, src, zero));
	}
      else
	{
	  xmsk = gen_reg_rtx (mmsk);
	  emit_insn (gen_rtx_SET (xmsk,
				  gen_rtx_NEG (mmsk,
					       gen_rtx_EQ (mmsk, src, zero))));
	}
    }

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      aarch64_emit_mult (x2, pg, xdst, xdst);

      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));

      if (iterations > 0)
	aarch64_emit_mult (xdst, pg, xdst, x1);
    }

  if (!recp)
    {
      if (pg)
	/* Multiply nonzero source values by the corresponding intermediate
	   result elements, so that the final calculation is the approximate
	   square root rather than its reciprocal.  Select a zero result for
	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
	   otherwise.  */
	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
      else
	{
	  /* Qualify the approximate reciprocal square root when the
	     argument is 0.0 by squashing the intermediary result to 0.0.  */
	  rtx xtmp = gen_reg_rtx (mmsk);
	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					    gen_rtx_SUBREG (mmsk, xdst, 0)));
	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

	  /* Calculate the approximate square root.  */
	  aarch64_emit_mult (xdst, pg, xdst, src);
	}
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (dst, pg, xdst, x1);

  return true;
}
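
/* Sketch of the refinement performed by the loop above, for exposition only.
   Writing d for the input and x for the current estimate, FRSQRTS computes
   (3 - d * x * x) / 2, so each pass is

       x2 = x * x
       x1 = (3 - d * x2) / 2		(FRSQRTS)
       x  = x * x1

   i.e. the Newton-Raphson step x <- x * (3 - d * x^2) / 2, which converges
   quadratically towards 1/sqrt(d).  Two passes give roughly single-precision
   accuracy and three roughly double-precision accuracy; the exact bounds
   depend on the FRSQRTE estimate tables.  */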
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series less for faster performance,
     while sacrificing the accuracy.  The default is 2 for DF and 1 for SF.  */
  if (flag_mlow_precision_div)
    iterations = (GET_MODE_INNER (mode) == DFmode
		  ? aarch64_double_recp_precision
		  : aarch64_float_recp_precision);

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));

      if (iterations > 0)
	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      aarch64_emit_mult (xrcp, pg, xrcp, xnum);
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (quo, pg, xrcp, xtmp);
  return true;
}
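
/* Sketch of the iteration above, for exposition only.  FRECPS computes
   (2 - d * x), so each pass performs the Newton-Raphson step

       x <- x * (2 - d * x)

   which converges quadratically towards 1/d.  The final multiplications
   turn the reciprocal into NUM * (1/DEN), i.e. the approximate quotient.  */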
/* Return the number of instructions that can be issued per cycle.  */

static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */

static int
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
{
  if (DEBUG_INSN_P (insn))
    return more;

  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)
    return more;

  if (get_attr_type (insn) == TYPE_NO_INSN)
    return more;

  return more - 1;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}
/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}


/* Vectorizer cost model target hooks.  */
/* Information about how the CPU would issue the scalar, Advanced SIMD
   or SVE version of a vector loop, using the scheme defined by the
   aarch64_base_vec_issue_info hierarchy of structures.  */
struct aarch64_vec_op_count
{
  void dump () const;

  /* The number of individual "general" operations.  See the comments
     in aarch64_base_vec_issue_info for details.  */
  unsigned int general_ops = 0;

  /* The number of load and store operations, under the same scheme
     as above.  */
  unsigned int loads = 0;
  unsigned int stores = 0;

  /* The minimum number of cycles needed to execute all loop-carried
     operations, which in the vector code become associated with
     reductions.  */
  unsigned int reduction_latency = 0;
};

/* Extends aarch64_vec_op_count with SVE-specific information.  */
struct aarch64_sve_op_count : aarch64_vec_op_count
{
  void dump () const;

  /* The number of individual predicate operations.  See the comments
     in aarch64_sve_vec_issue_info for details.  */
  unsigned int pred_ops = 0;
};

/* Information about vector code that we're in the process of costing.  */
struct aarch64_vector_costs
{
  /* The normal latency-based costs for each region (prologue, body and
     epilogue), indexed by vect_cost_model_location.  */
  unsigned int region[3] = {};

  /* True if we have performed one-time initialization based on the vec_info.

     This variable exists because the vec_info is not passed to the
     init_cost hook.  We therefore have to defer initialization based on
     it till later.  */
  bool analyzed_vinfo = false;

  /* True if we're costing a vector loop, false if we're costing block-level
     vectorization.  */
  bool is_loop = false;

  /* True if we've seen an SVE operation that we cannot currently vectorize
     using Advanced SIMD.  */
  bool saw_sve_only_op = false;

  /* - If VEC_FLAGS is zero then we're costing the original scalar code.
     - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
       SIMD code.
     - If VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
  unsigned int vec_flags = 0;

  /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
     throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE.  In those
     situations, we try to predict whether an Advanced SIMD implementation
     of the loop could be completely unrolled and become straight-line code.
     If so, it is generally better to use the Advanced SIMD version rather
     than length-agnostic SVE, since the SVE loop would execute an unknown
     number of times and so could not be completely unrolled in the same way.

     If we're applying this heuristic, UNROLLED_ADVSIMD_NITERS is the
     number of Advanced SIMD loop iterations that would be unrolled and
     UNROLLED_ADVSIMD_STMTS estimates the total number of statements
     in the unrolled loop.  Both values are zero if we're not applying
     the heuristic.  */
  unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0;
  unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0;

  /* If we're vectorizing a loop that executes a constant number of times,
     this variable gives the number of times that the vector loop would
     iterate, otherwise it is zero.  */
  uint64_t num_vector_iterations = 0;

  /* Used only when vectorizing loops.  Estimates the number and kind of scalar
     operations that would be needed to perform the same work as one iteration
     of the vector loop.  */
  aarch64_vec_op_count scalar_ops;

  /* Used only when vectorizing loops.  If VEC_FLAGS & VEC_ADVSIMD,
     this structure estimates the number and kind of operations that the
     vector loop would contain.  If VEC_FLAGS & VEC_SVE, the structure
     estimates what the equivalent Advanced SIMD-only code would need in
     order to perform the same work as one iteration of the SVE loop.  */
  aarch64_vec_op_count advsimd_ops;

  /* Used only when vectorizing loops with SVE.  It estimates the number and
     kind of operations that the SVE loop would contain.  */
  aarch64_sve_op_count sve_ops;

  /* Used to detect cases in which we end up costing the same load twice,
     once to account for results that are actually used and once to account
     for unused results.  */
  hash_map<nofree_ptr_hash<_stmt_vec_info>, unsigned int> seen_loads;
};
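
/* Illustrative (made-up) contents for a simple SVE reduction loop on a core
   with 2x256-bit SVE and 4x128-bit Advanced SIMD pipes: scalar_ops might end
   up as { loads = 2, stores = 0, general_ops = 2, reduction_latency = 4 }
   and sve_ops as { loads = 2, stores = 0, general_ops = 3,
   reduction_latency = 8, pred_ops = 1 }.  The real values are accumulated by
   aarch64_count_ops below; the numbers here are purely for exposition.  */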
/* Implement TARGET_VECTORIZE_INIT_COST.  */
static void *
aarch64_init_cost (class loop *, bool)
{
  return new aarch64_vector_costs;
}

/* Return true if the current CPU should use the new costs defined
   in GCC 11.  This should be removed for GCC 12 and above, with the
   costs applying to all CPUs instead.  */
static bool
aarch64_use_new_vector_costs_p ()
{
  return (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
}
/* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if (vectype != NULL
      && aarch64_sve_mode_p (TYPE_MODE (vectype))
      && costs->sve != NULL)
    return costs->sve;
  return costs->advsimd;
}

/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS.  */
static const simd_vec_cost *
aarch64_simd_vec_costs_for_flags (unsigned int flags)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if ((flags & VEC_ANY_SVE) && costs->sve)
    return costs->sve;
  return costs->advsimd;
}
/* Decide whether to use the unrolling heuristic described above
   aarch64_vector_costs::unrolled_advsimd_niters, updating that
   field if so.  LOOP_VINFO describes the loop that we're vectorizing
   and COSTS are the costs that we're calculating for it.  */
static void
aarch64_record_potential_advsimd_unrolling (loop_vec_info loop_vinfo,
					    aarch64_vector_costs *costs)
{
  /* The heuristic only makes sense on targets that have the same
     vector throughput for SVE and Advanced SIMD.  */
  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
    return;

  /* We only want to apply the heuristic if LOOP_VINFO is being
     vectorized for SVE.  */
  if (!(costs->vec_flags & VEC_ANY_SVE))
    return;

  /* Check whether it is possible in principle to use Advanced SIMD
     instead.  */
  if (aarch64_autovec_preference == 2)
    return;

  /* We don't want to apply the heuristic to outer loops, since it's
     harder to track two levels of unrolling.  */
  if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
    return;

  /* Only handle cases in which the number of Advanced SIMD iterations
     would be known at compile time but the number of SVE iterations
     would not.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || aarch64_sve_vg.is_constant ())
    return;

  /* Guess how many times the Advanced SIMD loop would iterate and make
     sure that it is within the complete unrolling limit.  Even if the
     number of iterations is small enough, the number of statements might
     not be, which is why we need to estimate the number of statements too.  */
  unsigned int estimated_vq = aarch64_estimated_sve_vq ();
  unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
  unsigned HOST_WIDE_INT unrolled_advsimd_niters
    = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
  if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
    return;

  /* Record that we're applying the heuristic and should try to estimate
     the number of statements in the Advanced SIMD loop.  */
  costs->unrolled_advsimd_niters = unrolled_advsimd_niters;
}
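
/* Worked example with made-up numbers: on a core whose estimated SVE vector
   size is two quadwords (estimated_vq == 2), an int32 loop with an SVE
   VF-for-cost of 8 gives advsimd_vf = CEIL (8, 2) = 4.  If the loop is known
   to run exactly 64 times, unrolled_advsimd_niters = 64 / 4 = 16, which does
   not exceed the usual --param max-completely-peel-times default of 16, so
   the heuristic records 16 and aarch64_adjust_body_cost later checks the
   estimated statement count against max-completely-peeled-insns.  */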
/* Do one-time initialization of COSTS given that we're costing the loop
   vectorization described by LOOP_VINFO.  */
static void
aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo,
			    aarch64_vector_costs *costs)
{
  costs->is_loop = true;

  /* Record the number of times that the vector loop would execute,
     if known.  */
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  auto scalar_niters = max_stmt_executions_int (loop);
  if (scalar_niters >= 0)
    {
      unsigned int vf = vect_vf_for_cost (loop_vinfo);
      if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
	costs->num_vector_iterations = scalar_niters / vf;
      else
	costs->num_vector_iterations = CEIL (scalar_niters, vf);
    }

  /* Detect whether we're costing the scalar code or the vector code.
     This is a bit hacky: it would be better if the vectorizer told
     us directly.

     If we're costing the vector code, record whether we're vectorizing
     for Advanced SIMD or SVE.  */
  if (costs == LOOP_VINFO_TARGET_COST_DATA (loop_vinfo))
    costs->vec_flags = aarch64_classify_vector_mode (loop_vinfo->vector_mode);
  else
    costs->vec_flags = 0;

  /* Detect whether we're vectorizing for SVE and should
     apply the unrolling heuristic described above
     aarch64_vector_costs::unrolled_advsimd_niters.  */
  aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs);

  /* Record the issue information for any SVE WHILE instructions that the
     loop needs.  */
  auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
  if (issue_info
      && issue_info->sve
      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    {
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
	if (rgm->type)
	  num_masks += num_vectors_m1 + 1;
      costs->sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
    }
}
/* Do one-time initialization of COSTS given that we're costing the block
   vectorization described by BB_VINFO.  */
static void
aarch64_analyze_bb_vinfo (bb_vec_info bb_vinfo, aarch64_vector_costs *costs)
{
  /* Unfortunately, there's no easy way of telling whether we're costing
     the vector code or the scalar code, so just assume that we're costing
     the vector code.  */
  costs->vec_flags = aarch64_classify_vector_mode (bb_vinfo->vector_mode);
}
/* Implement targetm.vectorize.builtin_vectorization_cost.  */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    tree vectype,
				    int misalign ATTRIBUTE_UNUSED)
{
  unsigned elements;
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  bool fp = false;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);

  switch (type_of_cost)
    {
    case scalar_stmt:
      return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

    case scalar_load:
      return costs->scalar_load_cost;

    case scalar_store:
      return costs->scalar_store_cost;

    case vector_stmt:
      return fp ? simd_costs->fp_stmt_cost
		: simd_costs->int_stmt_cost;

    case vector_load:
      return simd_costs->align_load_cost;

    case vector_store:
      return simd_costs->store_cost;

    case vec_to_scalar:
      return simd_costs->vec_to_scalar_cost;

    case scalar_to_vec:
      return simd_costs->scalar_to_vec_cost;

    case unaligned_load:
    case vector_gather_load:
      return simd_costs->unalign_load_cost;

    case unaligned_store:
    case vector_scatter_store:
      return simd_costs->unalign_store_cost;

    case cond_branch_taken:
      return costs->cond_taken_branch_cost;

    case cond_branch_not_taken:
      return costs->cond_not_taken_branch_cost;

    case vec_perm:
      return simd_costs->permute_cost;

    case vec_promote_demote:
      return fp ? simd_costs->fp_stmt_cost
		: simd_costs->int_stmt_cost;

    case vec_construct:
      elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
      return elements / 2 + 1;

    default:
      gcc_unreachable ();
    }
}
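
/* Example of the vec_construct formula above: building a V4SI from four
   scalars has an estimated element count of 4, giving a cost of
   4 / 2 + 1 = 3, i.e. roughly one operation per pair of elements plus one
   for the final combine.  (Illustrative reading of the formula, not a
   statement about any particular CPU.)  */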
/* Return true if STMT_INFO represents part of a reduction.  */
static bool
aarch64_is_reduction (stmt_vec_info stmt_info)
{
  return (STMT_VINFO_REDUC_DEF (stmt_info)
	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
}

/* If STMT_INFO describes a reduction, return the type of reduction
   it describes, otherwise return -1.  */
static int
aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
{
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    if (STMT_VINFO_REDUC_DEF (stmt_info))
      {
	stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
	return int (STMT_VINFO_REDUC_TYPE (reduc_info));
      }
  return -1;
}
/* Check whether an access of kind KIND for STMT_INFO represents one
   vector of an LD[234] or ST[234] operation.  Return the total number of
   vectors (2, 3 or 4) if so, otherwise return a value outside that range.  */
static int
aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
{
  if ((kind == vector_load
       || kind == unaligned_load
       || kind == vector_store
       || kind == unaligned_store)
      && STMT_VINFO_DATA_REF (stmt_info))
    {
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      if (stmt_info
	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
	return DR_GROUP_SIZE (stmt_info);
    }
  return 0;
}
/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the
   scalar type of the values being compared.  Return null otherwise.  */
static tree
aarch64_embedded_comparison_type (stmt_vec_info stmt_info)
{
  if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
    if (gimple_assign_rhs_code (assign) == COND_EXPR)
      {
	tree cond = gimple_assign_rhs1 (assign);
	if (COMPARISON_CLASS_P (cond))
	  return TREE_TYPE (TREE_OPERAND (cond, 0));
      }
  return NULL_TREE;
}

/* If STMT_INFO is a comparison or contains an embedded comparison, return the
   scalar type of the values being compared.  Return null otherwise.  */
static tree
aarch64_comparison_type (stmt_vec_info stmt_info)
{
  if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
    if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison)
      return TREE_TYPE (gimple_assign_rhs1 (assign));
  return aarch64_embedded_comparison_type (stmt_info);
}
/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
   vectors would produce a series of LDP or STP operations.  KIND is the
   kind of statement that STMT_INFO represents.  */
static bool
aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
			   stmt_vec_info stmt_info)
{
  switch (kind)
    {
    case vector_load:
    case vector_store:
    case unaligned_load:
    case unaligned_store:
      break;

    default:
      return false;
    }

  if (aarch64_tune_params.extra_tuning_flags
      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
    return false;

  return is_gimple_assign (stmt_info->stmt);
}
/* Return true if STMT_INFO extends the result of a load.  */
static bool
aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
{
  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
    return false;

  tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
  tree rhs_type = TREE_TYPE (rhs);
  if (!INTEGRAL_TYPE_P (lhs_type)
      || !INTEGRAL_TYPE_P (rhs_type)
      || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
    return false;

  stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
  return (def_stmt_info
	  && STMT_VINFO_DATA_REF (def_stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
}
/* Return true if STMT_INFO is an integer truncation.  */
static bool
aarch64_integer_truncation_p (stmt_vec_info stmt_info)
{
  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
    return false;

  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
  return (INTEGRAL_TYPE_P (lhs_type)
	  && INTEGRAL_TYPE_P (rhs_type)
	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
}
/* Return true if STMT_INFO is the second part of a two-statement multiply-add
   or multiply-subtract sequence that might be suitable for fusing into a
   single instruction.  */
static bool
aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info)
{
  gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
  if (!assign)
    return false;
  tree_code code = gimple_assign_rhs_code (assign);
  if (code != PLUS_EXPR && code != MINUS_EXPR)
    return false;

  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
      || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
    return false;

  for (int i = 1; i < 3; ++i)
    {
      tree rhs = gimple_op (assign, i);
      /* ??? Should we try to check for a single use as well?  */
      if (TREE_CODE (rhs) != SSA_NAME)
	continue;

      stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
      if (!def_stmt_info
	  || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
	continue;

      gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
      if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
	continue;

      return true;
    }
  return false;
}
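
/* Illustrative GIMPLE shape that this function accepts (names are
   hypothetical):

       t_1 = a_2 * b_3;
       c_4 = t_1 + d_5;

   Here STMT_INFO is the PLUS_EXPR and the MULT_EXPR found through
   lookup_def is an internal definition, so the pair is assumed to fuse
   into a single multiply-add (e.g. FMLA/MLA) when counting operations.  */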
/* Return true if the vectorized form of STMT_INFO is something that is only
   possible when using SVE instead of Advanced SIMD.  VECTYPE is the type of
   the vector that STMT_INFO is operating on.  */
static bool
aarch64_sve_only_stmt_p (stmt_vec_info stmt_info, tree vectype)
{
  if (!aarch64_sve_mode_p (TYPE_MODE (vectype)))
    return false;

  if (STMT_VINFO_DATA_REF (stmt_info))
    {
      /* Check for true gathers and scatters (rather than just strided accesses
	 that we've chosen to implement using gathers and scatters).  Although
	 in principle we could use elementwise accesses for Advanced SIMD,
	 the vectorizer doesn't yet support that.  */
      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
	return true;

      /* Check for masked loads and stores.  */
      if (auto *call = dyn_cast<gcall *> (stmt_info->stmt))
	if (gimple_call_internal_p (call)
	    && internal_fn_mask_index (gimple_call_internal_fn (call)) >= 0)
	  return true;
    }

  /* Check for 64-bit integer multiplications.  */
  auto *assign = dyn_cast<gassign *> (stmt_info->stmt);
  if (assign
      && gimple_assign_rhs_code (assign) == MULT_EXPR
      && GET_MODE_INNER (TYPE_MODE (vectype)) == DImode
      && !integer_pow2p (gimple_assign_rhs2 (assign)))
    return true;

  return false;
}
/* We are considering implementing STMT_INFO using SVE vector type VECTYPE.
   If STMT_INFO is an in-loop reduction that SVE supports directly, return
   its latency in cycles, otherwise return zero.  SVE_COSTS specifies the
   latencies of the relevant instructions.  */
static unsigned int
aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
				       stmt_vec_info stmt_info,
				       tree vectype,
				       const sve_vec_cost *sve_costs)
{
  switch (aarch64_reduc_type (vinfo, stmt_info))
    {
    case EXTRACT_LAST_REDUCTION:
      return sve_costs->clast_cost;

    case FOLD_LEFT_REDUCTION:
      switch (GET_MODE_INNER (TYPE_MODE (vectype)))
	{
	case E_HFmode:
	case E_BFmode:
	  return sve_costs->fadda_f16_cost;

	case E_SFmode:
	  return sve_costs->fadda_f32_cost;

	case E_DFmode:
	  return sve_costs->fadda_f64_cost;

	default:
	  break;
	}
      break;
    }

  return 0;
}
/* STMT_INFO describes a loop-carried operation in the original scalar code
   that we are considering implementing as a reduction.  Return one of the
   following values, depending on VEC_FLAGS:

   - If VEC_FLAGS is zero, return the loop carry latency of the original
     scalar operation.

   - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
     Advanced SIMD implementation.

   - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
     SVE implementation.

   VECTYPE is the type of vector that the vectorizer is considering using
   for STMT_INFO, which might be different from the type of vector described
   by VEC_FLAGS.  */
static unsigned int
aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
				   tree vectype, unsigned int vec_flags)
{
  const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
  const sve_vec_cost *sve_costs = nullptr;
  if (vec_flags & VEC_ANY_SVE)
    sve_costs = aarch64_tune_params.vec_costs->sve;

  /* If the caller is asking for the SVE latency, check for forms of reduction
     that only SVE can handle directly.  */
  if (sve_costs)
    {
      unsigned int latency
	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
						 sve_costs);
      if (latency)
	return latency;
    }

  /* Handle scalar costs.  */
  if (vec_flags == 0)
    {
      if (FLOAT_TYPE_P (vectype))
	return vec_costs->scalar_fp_stmt_cost;
      return vec_costs->scalar_int_stmt_cost;
    }

  /* Otherwise, the loop body just contains normal integer or FP operations,
     with a vector reduction outside the loop.  */
  const simd_vec_cost *simd_costs
    = aarch64_simd_vec_costs_for_flags (vec_flags);
  if (FLOAT_TYPE_P (vectype))
    return simd_costs->fp_stmt_cost;
  return simd_costs->int_stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
   try to subdivide the target-independent categorization provided by KIND
   to get a more accurate cost.  */
static unsigned int
aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
				    stmt_vec_info stmt_info,
				    unsigned int stmt_cost)
{
  /* Detect an extension of a loaded value.  In general, we'll be able to fuse
     the extension with the load.  */
  if (kind == scalar_stmt && aarch64_extending_load_p (vinfo, stmt_info))
    return 0;

  return stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for the vectorized form of STMT_INFO, which has cost kind KIND and which
   when vectorized would operate on vector type VECTYPE.  Try to subdivide
   the target-independent categorization provided by KIND to get a more
   accurate cost.  WHERE specifies where the cost associated with KIND
   occurs.  */
static unsigned int
aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
				    stmt_vec_info stmt_info, tree vectype,
				    enum vect_cost_model_location where,
				    unsigned int stmt_cost)
{
  const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
  const sve_vec_cost *sve_costs = nullptr;
  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
    sve_costs = aarch64_tune_params.vec_costs->sve;

  /* It's generally better to avoid costing inductions, since the induction
     will usually be hidden by other operations.  This is particularly true
     for things like COND_REDUCTIONS.  */
  if (is_a<gphi *> (stmt_info->stmt))
    return 0;

  /* Detect cases in which vec_to_scalar is describing the extraction of a
     vector element in preparation for a scalar store.  The store itself is
     costed separately.  */
  if (kind == vec_to_scalar
      && STMT_VINFO_DATA_REF (stmt_info)
      && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
    return simd_costs->store_elt_extra_cost;

  /* Detect cases in which a scalar_store is really storing one element
     in a scatter operation.  */
  if (kind == scalar_store
      && sve_costs
      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
    return sve_costs->scatter_store_elt_cost;

  /* Detect cases in which vec_to_scalar represents an in-loop reduction.  */
  if (kind == vec_to_scalar
      && where == vect_body
      && sve_costs)
    {
      unsigned int latency
	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
						 sve_costs);
      if (latency)
	return latency;
    }

  /* Detect cases in which vec_to_scalar represents a single reduction
     instruction like FADDP or MAXV.  */
  if (kind == vec_to_scalar
      && where == vect_epilogue
      && aarch64_is_reduction (stmt_info))
    switch (GET_MODE_INNER (TYPE_MODE (vectype)))
      {
      case E_QImode:
	return simd_costs->reduc_i8_cost;

      case E_HImode:
	return simd_costs->reduc_i16_cost;

      case E_SImode:
	return simd_costs->reduc_i32_cost;

      case E_DImode:
	return simd_costs->reduc_i64_cost;

      case E_HFmode:
      case E_BFmode:
	return simd_costs->reduc_f16_cost;

      case E_SFmode:
	return simd_costs->reduc_f32_cost;

      case E_DFmode:
	return simd_costs->reduc_f64_cost;

      default:
	break;
      }

  /* Otherwise stick with the original categorization.  */
  return stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND and which when vectorized would
   operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
   targets.  */
static unsigned int
aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
			      stmt_vec_info stmt_info, tree vectype,
			      unsigned int stmt_cost)
{
  /* Unlike vec_promote_demote, vector_stmt conversions do not change the
     vector register size or number of units.  Integer promotions of this
     type therefore map to SXT[BHW] or UXT[BHW].

     Most loads have extending forms that can do the sign or zero extension
     on the fly.  Optimistically assume that a load followed by an extension
     will fold to this form during combine, and that the extension therefore
     comes for free.  */
  if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
    stmt_cost = 0;

  /* For similar reasons, vector_stmt integer truncations are a no-op,
     because we can just ignore the unused upper bits of the source.  */
  if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
    stmt_cost = 0;

  /* Advanced SIMD can load and store pairs of registers using LDP and STP,
     but there are no equivalent instructions for SVE.  This means that
     (all other things being equal) 128-bit SVE needs twice as many load
     and store instructions as Advanced SIMD in order to process vector pairs.

     Also, scalar code can often use LDP and STP to access pairs of values,
     so it is too simplistic to say that one SVE load or store replaces
     VF scalar loads and stores.

     Ideally we would account for this in the scalar and Advanced SIMD
     costs by making suitable load/store pairs as cheap as a single
     load/store.  However, that would be a very invasive change and in
     practice it tends to stress other parts of the cost model too much.
     E.g. stores of scalar constants currently count just a store,
     whereas stores of vector constants count a store and a vec_init.
     This is an artificial distinction for AArch64, where stores of
     nonzero scalar constants need the same kind of register invariant
     as vector stores.

     An alternative would be to double the cost of any SVE loads and stores
     that could be paired in Advanced SIMD (and possibly also paired in
     scalar code).  But this tends to stress other parts of the cost model
     in the same way.  It also means that we can fall back to Advanced SIMD
     even if full-loop predication would have been useful.

     Here we go for a more conservative version: double the costs of SVE
     loads and stores if one iteration of the scalar loop processes enough
     elements for it to use a whole number of Advanced SIMD LDP or STP
     instructions.  This makes it very likely that the VF would be 1 for
     Advanced SIMD, and so no epilogue should be needed.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
      unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
      unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
      if (multiple_p (count * elt_bits, 256)
	  && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
	stmt_cost *= 2;
    }

  return stmt_cost;
}
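
/* Illustrative case for the doubling above (numbers chosen for exposition):
   a grouped access of 8 32-bit elements per scalar iteration covers
   8 * 32 = 256 bits, i.e. exactly two 128-bit Advanced SIMD registers,
   which Advanced SIMD or scalar code could cover with LDP/STP.  SVE has no
   LDP/STP equivalent, so the SVE load/store cost is doubled to keep the
   comparison fair.  */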
/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
   and which when vectorized would operate on vector type VECTYPE.  Add the
   cost of any embedded operations.  */
static unsigned int
aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
			  tree vectype, unsigned int stmt_cost)
{
  if (vectype)
    {
      const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);

      /* Detect cases in which a vector load or store represents an
	 LD[234] or ST[234] instruction.  */
      switch (aarch64_ld234_st234_vectors (kind, stmt_info))
	{
	case 2:
	  stmt_cost += simd_costs->ld2_st2_permute_cost;
	  break;

	case 3:
	  stmt_cost += simd_costs->ld3_st3_permute_cost;
	  break;

	case 4:
	  stmt_cost += simd_costs->ld4_st4_permute_cost;
	  break;
	}

      if (kind == vector_stmt || kind == vec_to_scalar)
	if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info))
	  {
	    if (FLOAT_TYPE_P (cmp_type))
	      stmt_cost += simd_costs->fp_stmt_cost;
	    else
	      stmt_cost += simd_costs->int_stmt_cost;
	  }
    }

  if (kind == scalar_stmt)
    if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info))
      {
	if (FLOAT_TYPE_P (cmp_type))
	  stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
	else
	  stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
      }

  return stmt_cost;
}
/* VINFO, COSTS, COUNT, KIND, STMT_INFO and VECTYPE are the same as for
   TARGET_VECTORIZE_ADD_STMT_COST and they describe an operation in the
   body of a vector loop.  Record issue information relating to the vector
   operation in OPS, where OPS is one of COSTS->scalar_ops, COSTS->advsimd_ops
   or COSTS->sve_ops; see the comments above those variables for details.
   In addition:

   - VEC_FLAGS is zero if OPS is COSTS->scalar_ops.

   - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is COSTS->advsimd_ops.

   - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is COSTS->sve_ops.

   ISSUE_INFO provides the scalar, Advanced SIMD or SVE issue information
   associated with OPS and VEC_FLAGS.  FACTOR says how many iterations of
   the loop described by VEC_FLAGS would be needed to match one iteration
   of the vector loop in VINFO.  */
static void
aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
		   unsigned int count, enum vect_cost_for_stmt kind,
		   _stmt_vec_info *stmt_info, tree vectype,
		   unsigned int vec_flags, aarch64_vec_op_count *ops,
		   const aarch64_base_vec_issue_info *issue_info,
		   unsigned int factor)
{
  if (!issue_info)
    return;

  const aarch64_simd_vec_issue_info *simd_issue = nullptr;
  if (vec_flags)
    simd_issue = static_cast<const aarch64_simd_vec_issue_info *> (issue_info);

  const aarch64_sve_vec_issue_info *sve_issue = nullptr;
  if (vec_flags & VEC_ANY_SVE)
    sve_issue = static_cast<const aarch64_sve_vec_issue_info *> (issue_info);

  /* Calculate the minimum cycles per iteration imposed by a reduction
     operation.  */
  if ((kind == vector_stmt || kind == vec_to_scalar)
      && aarch64_is_reduction (stmt_info))
    {
      unsigned int base
	= aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype,
					     vec_flags);
      if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
	{
	  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
	    {
	      /* When costing an SVE FADDA, the vectorizer treats vec_to_scalar
		 as a single operation, whereas for Advanced SIMD it is a
		 per-element one.  Increase the factor accordingly, both for
		 the reduction_latency calculation and for the op counting.  */
	      if (vec_flags & VEC_ADVSIMD)
		factor = vect_nunits_for_cost (vectype);
	    }
	  else
	    /* An Advanced SIMD fold-left reduction is the same as a
	       scalar one and the vectorizer therefore treats vec_to_scalar
	       as a per-element cost.  There is no extra factor to apply for
	       scalar code, either for reduction_latency or for the op
	       counts.  */
	    factor = 1;
	}

      /* ??? Ideally for vector code we'd do COUNT * FACTOR reductions in
	 parallel, but unfortunately that's not yet the case.  */
      ops->reduction_latency = MAX (ops->reduction_latency,
				    base * count * factor);
    }

  /* Assume that multiply-adds will become a single operation.  */
  if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info))
    return;

  /* When costing scalar statements in vector code, the count already
     includes the number of scalar elements in the vector, so we don't
     need to apply the factor as well.  */
  if (kind == scalar_load || kind == scalar_store || kind == scalar_stmt)
    factor = 1;

  /* This can go negative with the load handling below.  */
  int num_copies = count * factor;

  /* Count the basic operation cost associated with KIND.  */
  switch (kind)
    {
    case cond_branch_taken:
    case cond_branch_not_taken:
    case vector_gather_load:
    case vector_scatter_store:
      /* We currently don't expect these to be used in a loop body.  */
      break;

    case vec_perm:
    case vec_promote_demote:
    case vec_construct:
    case vec_to_scalar:
    case scalar_to_vec:
      /* Assume that these operations have no overhead in the original
	 scalar code.  */
      if (!vec_flags)
	break;
      /* Fallthrough.  */
    case vector_stmt:
    case scalar_stmt:
      ops->general_ops += num_copies;
      break;

    case scalar_load:
    case vector_load:
    case unaligned_load:
      /* When costing scalars, detect cases in which we are called twice for
	 the same load.  This happens for LD[234] operations if only some of
	 the results are used.  The first time represents the cost of loading
	 the unused vectors, while the second time represents the cost of
	 loading the useful parts.  Only the latter should count towards the
	 scalar costs.  */
      if (stmt_info && !vec_flags)
	{
	  bool existed = false;
	  unsigned int &prev_count
	    = costs->seen_loads.get_or_insert (stmt_info, &existed);
	  if (existed)
	    num_copies -= prev_count;
	  else
	    prev_count = num_copies;
	}
      ops->loads += num_copies;
      if (vec_flags || FLOAT_TYPE_P (vectype))
	ops->general_ops += issue_info->fp_simd_load_general_ops * num_copies;
      break;

    case vector_store:
    case unaligned_store:
    case scalar_store:
      ops->stores += num_copies;
      if (vec_flags || FLOAT_TYPE_P (vectype))
	ops->general_ops += issue_info->fp_simd_store_general_ops * num_copies;
      break;
    }

  /* Add any embedded comparison operations.  */
  if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
      && aarch64_embedded_comparison_type (stmt_info))
    ops->general_ops += num_copies;

  /* Detect COND_REDUCTIONs and things that would need to become
     COND_REDUCTIONs if they were implemented using Advanced SIMD.
     There are then two sets of VEC_COND_EXPRs, whereas so far we
     have only accounted for one.  */
  if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar))
    {
      int reduc_type = aarch64_reduc_type (vinfo, stmt_info);
      if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD))
	  || reduc_type == COND_REDUCTION)
	ops->general_ops += num_copies;
    }

  /* Count the predicate operations needed by an SVE comparison.  */
  if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
    if (tree type = aarch64_comparison_type (stmt_info))
      {
	unsigned int base = (FLOAT_TYPE_P (type)
			     ? sve_issue->fp_cmp_pred_ops
			     : sve_issue->int_cmp_pred_ops);
	costs->sve_ops.pred_ops += base * num_copies;
      }

  /* Add any extra overhead associated with LD[234] and ST[234] operations.  */
  if (simd_issue)
    switch (aarch64_ld234_st234_vectors (kind, stmt_info))
      {
      case 2:
	ops->general_ops += simd_issue->ld2_st2_general_ops * num_copies;
	break;

      case 3:
	ops->general_ops += simd_issue->ld3_st3_general_ops * num_copies;
	break;

      case 4:
	ops->general_ops += simd_issue->ld4_st4_general_ops * num_copies;
	break;
      }

  /* Add any overhead associated with gather loads and scatter stores.  */
  if (sve_issue
      && (kind == scalar_load || kind == scalar_store)
      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
    {
      unsigned int pairs = CEIL (count, 2);
      costs->sve_ops.pred_ops
	+= sve_issue->gather_scatter_pair_pred_ops * pairs;
      ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
    }
}
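
/* Rough example of how the counts accumulate (hypothetical values): for a
   vector_stmt that is an SVE integer comparison with COUNT == 2 and
   FACTOR == 1, the switch above adds 2 to ops->general_ops and the
   comparison handling adds 2 * int_cmp_pred_ops to costs->sve_ops.pred_ops.
   aarch64_estimate_min_cycles_per_iter below then divides these totals by
   the corresponding per-cycle issue rates.  */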
/* Implement targetm.vectorize.add_stmt_cost.  */
static unsigned
aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
		       enum vect_cost_for_stmt kind,
		       struct _stmt_vec_info *stmt_info, tree vectype,
		       int misalign, enum vect_cost_model_location where)
{
  auto *costs = static_cast<aarch64_vector_costs *> (data);
  unsigned retval = 0;

  if (flag_vect_cost_model)
    {
      int stmt_cost
	= aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      /* Do one-time initialization based on the vinfo.  */
      loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
      bb_vec_info bb_vinfo = dyn_cast<bb_vec_info> (vinfo);
      if (!costs->analyzed_vinfo && aarch64_use_new_vector_costs_p ())
	{
	  if (loop_vinfo)
	    aarch64_analyze_loop_vinfo (loop_vinfo, costs);
	  else
	    aarch64_analyze_bb_vinfo (bb_vinfo, costs);
	  costs->analyzed_vinfo = true;
	}

      /* Try to get a more accurate cost by looking at STMT_INFO instead
	 of just looking at KIND.  */
      if (stmt_info && aarch64_use_new_vector_costs_p ())
	{
	  if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype))
	    costs->saw_sve_only_op = true;

	  stmt_cost = aarch64_detect_scalar_stmt_subtype
	    (vinfo, kind, stmt_info, stmt_cost);

	  if (vectype && costs->vec_flags)
	    stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind,
							    stmt_info, vectype,
							    where, stmt_cost);
	}

      /* Do any SVE-specific adjustments to the cost.  */
      if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
	stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
						  vectype, stmt_cost);

      if (stmt_info && aarch64_use_new_vector_costs_p ())
	{
	  /* Account for any extra "embedded" costs that apply additively
	     to the base cost calculated above.  */
	  stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
						stmt_cost);

	  /* If we're recording a nonzero vector loop body cost, also estimate
	     the operations that would need to be issued by all relevant
	     implementations of the loop.  */
	  auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
	  if (loop_vinfo
	      && issue_info
	      && costs->vec_flags
	      && where == vect_body
	      && vectype
	      && stmt_cost != 0)
	    {
	      /* Record estimates for the scalar code.  */
	      aarch64_count_ops (vinfo, costs, count, kind, stmt_info, vectype,
				 0, &costs->scalar_ops, issue_info->scalar,
				 vect_nunits_for_cost (vectype));

	      if (aarch64_sve_mode_p (vinfo->vector_mode) && issue_info->sve)
		{
		  /* Record estimates for a possible Advanced SIMD version
		     of the SVE code.  */
		  aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
				     vectype, VEC_ADVSIMD, &costs->advsimd_ops,
				     issue_info->advsimd,
				     aarch64_estimated_sve_vq ());

		  /* Record estimates for the SVE code itself.  */
		  aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
				     vectype, VEC_ANY_SVE, &costs->sve_ops,
				     issue_info->sve, 1);
		}
	      else
		/* Record estimates for the Advanced SIMD code.  Treat SVE like
		   Advanced SIMD if the CPU has no specific SVE costs.  */
		aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
				   vectype, VEC_ADVSIMD, &costs->advsimd_ops,
				   issue_info->advsimd, 1);
	    }

	  /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
	     estimate the number of statements in the unrolled Advanced SIMD
	     loop.  For simplicity, we assume that one iteration of the
	     Advanced SIMD loop would need the same number of statements
	     as one iteration of the SVE loop.  */
	  if (where == vect_body && costs->unrolled_advsimd_niters)
	    costs->unrolled_advsimd_stmts
	      += count * costs->unrolled_advsimd_niters;
	}

      /* Statements in an inner loop relative to the loop being
	 vectorized are weighted more heavily.  The value here is
	 arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info
	  && stmt_in_inner_loop_p (vinfo, stmt_info))
	{
	  gcc_assert (loop_vinfo);
	  count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME */
	}

      retval = (unsigned) (count * stmt_cost);
      costs->region[where] += retval;
    }

  return retval;
}
/* Dump information about the structure.  */
void
aarch64_vec_op_count::dump () const
{
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  load operations = %d\n", loads);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  store operations = %d\n", stores);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  general operations = %d\n", general_ops);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  reduction latency = %d\n", reduction_latency);
}

/* Dump information about the structure.  */
void
aarch64_sve_op_count::dump () const
{
  aarch64_vec_op_count::dump ();
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  predicate operations = %d\n", pred_ops);
}
/* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue
   the operations described by OPS.  This is a very simplistic model!  */
static unsigned int
aarch64_estimate_min_cycles_per_iter
  (const aarch64_vec_op_count *ops,
   const aarch64_base_vec_issue_info *issue_info)
{
  unsigned int cycles = MAX (ops->reduction_latency, 1);
  cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle));
  cycles = MAX (cycles, CEIL (ops->loads + ops->stores,
			      issue_info->loads_stores_per_cycle));
  cycles = MAX (cycles, CEIL (ops->general_ops,
			      issue_info->general_ops_per_cycle));
  return cycles;
}
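
/* Worked example with made-up numbers: for ops = { loads = 4, stores = 2,
   general_ops = 6, reduction_latency = 2 } and an issue_info advertising
   2 stores, 3 loads+stores and 4 general ops per cycle, the estimate is
   MAX (2, CEIL (2, 2), CEIL (6, 3), CEIL (6, 4)) = 2 cycles per iteration.
   The figures are illustrative and not taken from any tuning table.  */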
/* BODY_COST is the cost of a vector loop body recorded in COSTS.
   Adjust the cost as necessary and return the new cost.  */
static unsigned int
aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
{
  unsigned int orig_body_cost = body_cost;
  bool should_disparage = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Original vector body cost = %d\n", body_cost);

  if (costs->unrolled_advsimd_stmts)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
			 " unrolled Advanced SIMD loop = %d\n",
			 costs->unrolled_advsimd_stmts);

      /* Apply the Advanced SIMD vs. SVE unrolling heuristic described above
	 aarch64_vector_costs::unrolled_advsimd_niters.

	 The balance here is tricky.  On the one hand, we can't be sure whether
	 the code is vectorizable with Advanced SIMD or not.  However, even if
	 it isn't vectorizable with Advanced SIMD, there's a possibility that
	 the scalar code could also be unrolled.  Some of the code might then
	 benefit from SLP, or from using LDP and STP.  We therefore apply
	 the heuristic regardless of can_use_advsimd_p.  */
      if (costs->unrolled_advsimd_stmts
	  && (costs->unrolled_advsimd_stmts
	      <= (unsigned int) param_max_completely_peeled_insns))
	{
	  unsigned int estimated_vq = aarch64_estimated_sve_vq ();
	  unsigned int min_cost = (orig_body_cost * estimated_vq) + 1;
	  if (body_cost < min_cost)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Increasing body cost to %d to account for"
				 " unrolling\n", min_cost);
	      body_cost = min_cost;
	      should_disparage = true;
	    }
	}
    }

  auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
  if (!issue_info)
    return body_cost;

  unsigned int scalar_cycles_per_iter
    = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops,
					    issue_info->scalar);
  unsigned int advsimd_cycles_per_iter
    = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops,
					    issue_info->advsimd);
  bool could_use_advsimd
    = ((costs->vec_flags & VEC_ADVSIMD)
       || (aarch64_autovec_preference != 2
	   && (aarch64_tune_params.extra_tuning_flags
	       & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
	   && !costs->saw_sve_only_op));

  if (dump_enabled_p ())
    {
      if (IN_RANGE (costs->num_vector_iterations, 0, 65536))
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Vector loop iterates at most %wd times\n",
			 costs->num_vector_iterations);
      dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
      costs->scalar_ops.dump ();
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated cycles per iteration = %d\n",
		       scalar_cycles_per_iter);
      if (could_use_advsimd)
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Advanced SIMD issue estimate:\n");
	  costs->advsimd_ops.dump ();
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  estimated cycles per iteration = %d\n",
			   advsimd_cycles_per_iter);
	}
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop could not use Advanced SIMD\n");
    }

  uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter;
  unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency;
  if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve)
    {
      /* Estimate the minimum number of cycles per iteration needed to issue
	 non-predicate operations.  */
      unsigned int sve_cycles_per_iter
	= aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
						issue_info->sve);

      /* Separately estimate the minimum number of cycles per iteration needed
	 to issue the predicate operations.  */
      unsigned int pred_cycles_per_iter
	= CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle);

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
	  costs->sve_ops.dump ();
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  estimated cycles per iteration for non-predicate"
			   " operations = %d\n", sve_cycles_per_iter);
	  if (costs->sve_ops.pred_ops)
	    dump_printf_loc (MSG_NOTE, vect_location, "  estimated cycles per"
			     " iteration for predicate operations = %d\n",
			     pred_cycles_per_iter);
	}

      vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter);
      vector_reduction_latency = costs->sve_ops.reduction_latency;

      /* If the scalar version of the loop could issue at least as
	 quickly as the predicate parts of the SVE loop, make the SVE loop
	 prohibitively expensive.  In this case vectorization is adding an
	 overhead that the original scalar code didn't have.

	 This is mostly intended to detect cases in which WHILELOs dominate
	 for very tight loops, which is something that normal latency-based
	 costs would not model.  Adding this kind of cliffedge would be
	 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
	 code later in the function handles that case in a more
	 conservative way.  */
      uint64_t sve_estimate = pred_cycles_per_iter + 1;
      if (scalar_cycles_per_iter < sve_estimate)
	{
	  unsigned int min_cost
	    = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
	  if (body_cost < min_cost)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Increasing body cost to %d because the"
				 " scalar code could issue within the limit"
				 " imposed by predicate operations\n",
				 min_cost);
	      body_cost = min_cost;
	      should_disparage = true;
	    }
	}

      /* If it appears that the Advanced SIMD version of a loop could issue
	 more quickly than the SVE one, increase the SVE cost in proportion
	 to the difference.  The intention is to make Advanced SIMD preferable
	 in cases where an Advanced SIMD version exists, without increasing
	 the costs so much that SVE won't be used at all.

	 The reasoning is similar to the scalar vs. predicate comparison above:
	 if the issue rate of the SVE code is limited by predicate operations
	 (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the
	 Advanced SIMD code could issue within the limit imposed by the
	 predicate operations, the predicate operations are adding an
	 overhead that the original code didn't have and so we should prefer
	 the Advanced SIMD version.  However, if the predicate operations
	 do not dominate in this way, we should only increase the cost of
	 the SVE code if sve_cycles_per_iter is strictly greater than
	 advsimd_cycles_per_iter.  Given rounding effects, this should mean
	 that Advanced SIMD is either better or at least no worse.  */
      if (sve_cycles_per_iter >= pred_cycles_per_iter)
	sve_estimate = sve_cycles_per_iter;
      if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
	{
	  /* This ensures that min_cost > orig_body_cost * 2.  */
	  unsigned int min_cost
	    = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1;
	  if (body_cost < min_cost)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Increasing body cost to %d because Advanced"
				 " SIMD code could issue as quickly\n",
				 min_cost);
	      body_cost = min_cost;
	      should_disparage = true;
	    }
	}
    }

  /* Decide whether to stick to latency-based costs or whether to try to
     take issue rates into account.  */
  unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
  if (costs->vec_flags & VEC_ANY_SVE)
    threshold = CEIL (threshold, aarch64_estimated_sve_vq ());

  if (costs->num_vector_iterations >= 1
      && costs->num_vector_iterations < threshold)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Low iteration count, so using pure latency"
			 " costs\n");
    }
  /* Increase the cost of the vector code if it looks like the scalar code
     could issue more quickly.  These values are only rough estimates,
     so minor differences should only result in minor changes.  */
  else if (scalar_cycles_per_iter < vector_cycles_per_iter)
    {
      body_cost = CEIL (body_cost * vector_cycles_per_iter,
			scalar_cycles_per_iter);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Increasing body cost to %d because scalar code"
			 " would issue more quickly\n", body_cost);
    }
  /* In general, it's expected that the proposed vector code would be able
     to issue more quickly than the original scalar code.  This should
     already be reflected to some extent in the latency-based costs.

     However, the latency-based costs effectively assume that the scalar
     code and the vector code execute serially, which tends to underplay
     one important case: if the real (non-serialized) execution time of
     a scalar iteration is dominated by loop-carried dependencies,
     and if the vector code is able to reduce both the length of
     the loop-carried dependencies *and* the number of cycles needed
     to issue the code in general, we can be more confident that the
     vector code is an improvement, even if adding the other (non-loop-carried)
     latencies tends to hide this saving.  We therefore reduce the cost of the
     vector loop body in proportion to the saving.  */
  else if (costs->scalar_ops.reduction_latency > vector_reduction_latency
	   && costs->scalar_ops.reduction_latency == scalar_cycles_per_iter
	   && scalar_cycles_per_iter > vector_cycles_per_iter
	   && !should_disparage)
    {
      body_cost = CEIL (body_cost * vector_cycles_per_iter,
			scalar_cycles_per_iter);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Decreasing body cost to %d to account for smaller"
			 " reduction latency\n", body_cost);
    }

  return body_cost;
}
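
/* Illustrative scaling with made-up numbers: if the latency-based body cost
   is 20, scalar_cycles_per_iter is 4 and vector_cycles_per_iter is 6, the
   "scalar code would issue more quickly" branch raises the cost to
   CEIL (20 * 6, 4) = 30.  Conversely, if the scalar iteration is bound by a
   reduction latency of 8 (scalar_cycles_per_iter == 8) while the vector
   estimate is 4, the final branch halves the body cost.  */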
/* Implement TARGET_VECTORIZE_FINISH_COST.  */
static void
aarch64_finish_cost (void *data, unsigned *prologue_cost,
		     unsigned *body_cost, unsigned *epilogue_cost)
{
  auto *costs = static_cast<aarch64_vector_costs *> (data);
  *prologue_cost = costs->region[vect_prologue];
  *body_cost = costs->region[vect_body];
  *epilogue_cost = costs->region[vect_epilogue];

  if (costs->is_loop
      && costs->vec_flags
      && aarch64_use_new_vector_costs_p ())
    *body_cost = aarch64_adjust_body_cost (costs, *body_cost);
}

/* Implement TARGET_VECTORIZE_DESTROY_COST_DATA.  */
static void
aarch64_destroy_cost_data (void *data)
{
  delete static_cast<aarch64_vector_costs *> (data);
}
15753 static void initialize_aarch64_code_model (struct gcc_options
*);
15755 /* Parse the TO_PARSE string and put the architecture struct that it
15756 selects into RES and the architectural features into ISA_FLAGS.
15757 Return an aarch64_parse_opt_result describing the parse result.
15758 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
15759 When the TO_PARSE string contains an invalid extension,
15760 a copy of the string is created and stored to INVALID_EXTENSION. */
static enum aarch64_parse_opt_result
aarch64_parse_arch (const char *to_parse, const struct processor **res,
		    uint64_t *isa_flags, std::string *invalid_extension)
{
  const char *ext;
  const struct processor *arch;
  size_t len;

  ext = strchr (to_parse, '+');

  if (ext != NULL)
    len = ext - to_parse;
  else
    len = strlen (to_parse);

  if (len == 0)
    return AARCH64_PARSE_MISSING_ARG;
  /* Loop through the list of supported ARCHes to find a match.  */
  for (arch = all_architectures; arch->name != NULL; arch++)
    {
      if (strlen (arch->name) == len
	  && strncmp (arch->name, to_parse, len) == 0)
	{
	  uint64_t isa_temp = arch->flags;

	  if (ext != NULL)
	    {
	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);

	      if (ext_res != AARCH64_PARSE_OK)
		return ext_res;
	    }
	  /* Extension parsing was successful.  Confirm the result
	     arch and ISA flags.  */
	  *res = arch;
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;
	}
    }

  /* ARCH name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
15810 /* Parse the TO_PARSE string and put the result tuning in RES and the
15811 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
15812 describing the parse result. If there is an error parsing, RES and
15813 ISA_FLAGS are left unchanged.
15814 When the TO_PARSE string contains an invalid extension,
15815 a copy of the string is created and stored to INVALID_EXTENSION. */
15817 static enum aarch64_parse_opt_result
15818 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
15819 uint64_t *isa_flags
, std::string
*invalid_extension
)
15822 const struct processor
*cpu
;
15825 ext
= strchr (to_parse
, '+');
15828 len
= ext
- to_parse
;
15830 len
= strlen (to_parse
);
15833 return AARCH64_PARSE_MISSING_ARG
;
15836 /* Loop through the list of supported CPUs to find a match. */
15837 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
15839 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
15841 uint64_t isa_temp
= cpu
->flags
;
15846 /* TO_PARSE string contains at least one extension. */
15847 enum aarch64_parse_opt_result ext_res
15848 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
15850 if (ext_res
!= AARCH64_PARSE_OK
)
	  /* Extension parsing was successful.  Confirm the result
	     cpu and ISA flags.  */
15856 *isa_flags
= isa_temp
;
15857 return AARCH64_PARSE_OK
;
15861 /* CPU name not found in list. */
15862 return AARCH64_PARSE_INVALID_ARG
;
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails, RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
  const struct processor *cpu;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strcmp (cpu->name, to_parse) == 0)
	{
	  *res = cpu;
	  return AARCH64_PARSE_OK;
	}
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
/* Parse TOKEN, which has length LENGTH, to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
				size_t length,
				const struct aarch64_flag_desc *flag,
				const char *option_name)
{
  for (; flag->name != NULL; flag++)
    {
      if (length == strlen (flag->name)
	  && !strncmp (flag->name, token, length))
	return flag->flag;
    }

  error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
  return 0;
}
15909 /* Parse OPTION which is a comma-separated list of flags to enable.
15910 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
15911 default state we inherit from the CPU tuning structures. OPTION_NAME
15912 gives the top-level option we are parsing in the -moverride string,
15913 for use in error messages. */
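/* For example, an OPTION string such as "flagA.flagB" (flag names purely
   illustrative) is split on the '.' separator below and the bit for each
   recognised flag is ORed into INITIAL_STATE; the special token "none"
   instead resets the accumulated flags.  */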
15915 static unsigned int
15916 aarch64_parse_boolean_options (const char *option
,
15917 const struct aarch64_flag_desc
*flags
,
15918 unsigned int initial_state
,
15919 const char *option_name
)
15921 const char separator
= '.';
15922 const char* specs
= option
;
15923 const char* ntoken
= option
;
15924 unsigned int found_flags
= initial_state
;
15926 while ((ntoken
= strchr (specs
, separator
)))
15928 size_t token_length
= ntoken
- specs
;
15929 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
15933 /* If we find "none" (or, for simplicity's sake, an error) anywhere
15934 in the token stream, reset the supported operations. So:
15936 adrp+add.cmp+branch.none.adrp+add
15938 would have the result of turning on only adrp+add fusion. */
15942 found_flags
|= token_ops
;
15946 /* We ended with a comma, print something. */
15949 error ("%s string ill-formed\n", option_name
);
15953 /* We still have one more token to parse. */
15954 size_t token_length
= strlen (specs
);
15955 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
15962 found_flags
|= token_ops
;
15963 return found_flags
;
15966 /* Support for overriding instruction fusion. */
15969 aarch64_parse_fuse_string (const char *fuse_string
,
15970 struct tune_params
*tune
)
15972 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
15973 aarch64_fusible_pairs
,
15978 /* Support for overriding other tuning flags. */
15981 aarch64_parse_tune_string (const char *tune_string
,
15982 struct tune_params
*tune
)
15984 tune
->extra_tuning_flags
15985 = aarch64_parse_boolean_options (tune_string
,
15986 aarch64_tuning_flags
,
15987 tune
->extra_tuning_flags
,
15991 /* Parse the sve_width tuning moverride string in TUNE_STRING.
15992 Accept the valid SVE vector widths allowed by
15993 aarch64_sve_vector_bits_enum and use it to override sve_width
15997 aarch64_parse_sve_width_string (const char *tune_string
,
15998 struct tune_params
*tune
)
16002 int n
= sscanf (tune_string
, "%d", &width
);
16005 error ("invalid format for sve_width");
16017 error ("invalid sve_width value: %d", width
);
16019 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
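  /* For example, assuming the override token is spelled "sve_width" as the
     comment above suggests, -moverride=sve_width=256 would make this tuning
     assume a 256-bit SVE vector length, as set just above.  */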
/* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
   we understand.  If it is, extract the option string and hand it off to
   the appropriate function.  */
16027 aarch64_parse_one_override_token (const char* token
,
16029 struct tune_params
*tune
)
16031 const struct aarch64_tuning_override_function
*fn
16032 = aarch64_tuning_override_functions
;
16034 const char *option_part
= strchr (token
, '=');
16037 error ("tuning string missing in option (%s)", token
);
16041 /* Get the length of the option name. */
16042 length
= option_part
- token
;
16043 /* Skip the '=' to get to the option string. */
16046 for (; fn
->name
!= NULL
; fn
++)
16048 if (!strncmp (fn
->name
, token
, length
))
16050 fn
->parse_override (option_part
, tune
);
16055 error ("unknown tuning option (%s)",token
);
/* A checking mechanism for the implementation of the TLS size.  */

static void
initialize_aarch64_tls_size (struct gcc_options *opts)
{
  if (aarch64_tls_size == 0)
    aarch64_tls_size = 24;

  switch (opts->x_aarch64_cmodel_var)
    {
    case AARCH64_CMODEL_TINY:
      /* Both the default and maximum TLS size allowed under tiny are 1M,
	 which needs two instructions to address, so we clamp the size
	 to 24.  */
      if (aarch64_tls_size > 24)
	aarch64_tls_size = 24;
      break;
    case AARCH64_CMODEL_SMALL:
      /* The maximum TLS size allowed under small is 4G.  */
      if (aarch64_tls_size > 32)
	aarch64_tls_size = 32;
      break;
    case AARCH64_CMODEL_LARGE:
      /* The maximum TLS size allowed under large is 16E.
	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
      if (aarch64_tls_size > 48)
	aarch64_tls_size = 48;
      break;
    default:
      gcc_unreachable ();
    }
}
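/* For reference, the clamping above yields these effective maximum TLS
   offset sizes: 24 bits (1M) for the tiny model, 32 bits (4G) for small
   and 48 bits for large.  */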
/* Parse STRING looking for options in the format:
     string	:: option:string
     option	:: name=substring
     substring	:: defined by option.  */
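/* For example, an -moverride string of the form "name1=value1:name2=value2"
   (names purely illustrative) is split on ':' below and each "name=value"
   token is handed to aarch64_parse_one_override_token.  */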
static void
aarch64_parse_override_string (const char* input_string,
			       struct tune_params* tune)
{
  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))
    {
      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      *ntoken = '\0';
      aarch64_parse_one_override_token (string, token_length, tune);
      string = ++ntoken;
    }

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
}
/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
   are best for a generic target with the currently-enabled architecture
   extensions.  */
static void
aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
{
  /* Neoverse V1 is the only core that is known to benefit from
     AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.  There is therefore no
     point enabling it for SVE2 and above.  */
  if (TARGET_SVE2)
    current_tune.extra_tuning_flags
      &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
}
16141 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
16143 if (accepted_branch_protection_string
)
16145 opts
->x_aarch64_branch_protection_string
16146 = xstrdup (accepted_branch_protection_string
);
16149 /* PR 70044: We have to be careful about being called multiple times for the
16150 same function. This means all changes should be repeatable. */
16152 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
16153 Disable the frame pointer flag so the mid-end will not use a frame
16154 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
16155 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
16156 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
16157 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
16158 if (opts
->x_flag_omit_frame_pointer
== 0)
16159 opts
->x_flag_omit_frame_pointer
= 2;
16161 /* If not optimizing for size, set the default
16162 alignment to what the target wants. */
16163 if (!opts
->x_optimize_size
)
16165 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
16166 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
16167 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
16168 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
16169 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
16170 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
16173 /* We default to no pc-relative literal loads. */
16175 aarch64_pcrelative_literal_loads
= false;
16177 /* If -mpc-relative-literal-loads is set on the command line, this
16178 implies that the user asked for PC relative literal loads. */
16179 if (opts
->x_pcrelative_literal_loads
== 1)
16180 aarch64_pcrelative_literal_loads
= true;
16182 /* In the tiny memory model it makes no sense to disallow PC relative
16183 literal pool loads. */
16184 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
16185 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
16186 aarch64_pcrelative_literal_loads
= true;
16188 /* When enabling the lower precision Newton series for the square root, also
16189 enable it for the reciprocal square root, since the latter is an
16190 intermediary step for the former. */
16191 if (flag_mlow_precision_sqrt
)
16192 flag_mrecip_low_precision_sqrt
= true;
16195 /* 'Unpack' up the internal tuning structs and update the options
16196 in OPTS. The caller must have set up selected_tune and selected_arch
16197 as all the other target-specific codegen decisions are
16198 derived from them. */
16201 aarch64_override_options_internal (struct gcc_options
*opts
)
16203 aarch64_tune_flags
= selected_tune
->flags
;
16204 aarch64_tune
= selected_tune
->sched_core
;
16205 /* Make a copy of the tuning parameters attached to the core, which
16206 we may later overwrite. */
16207 aarch64_tune_params
= *(selected_tune
->tune
);
16208 aarch64_architecture_version
= selected_arch
->architecture_version
;
16209 if (selected_tune
->tune
== &generic_tunings
)
16210 aarch64_adjust_generic_arch_tuning (aarch64_tune_params
);
16212 if (opts
->x_aarch64_override_tune_string
)
16213 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
16214 &aarch64_tune_params
);
16216 /* This target defaults to strict volatile bitfields. */
16217 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
16218 opts
->x_flag_strict_volatile_bitfields
= 1;
16220 if (aarch64_stack_protector_guard
== SSP_GLOBAL
16221 && opts
->x_aarch64_stack_protector_guard_offset_str
)
16223 error ("incompatible options %<-mstack-protector-guard=global%> and "
16224 "%<-mstack-protector-guard-offset=%s%>",
16225 aarch64_stack_protector_guard_offset_str
);
16228 if (aarch64_stack_protector_guard
== SSP_SYSREG
16229 && !(opts
->x_aarch64_stack_protector_guard_offset_str
16230 && opts
->x_aarch64_stack_protector_guard_reg_str
))
16232 error ("both %<-mstack-protector-guard-offset%> and "
16233 "%<-mstack-protector-guard-reg%> must be used "
16234 "with %<-mstack-protector-guard=sysreg%>");
16237 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
16239 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
16240 error ("specify a system register with a small string length.");
16243 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
16246 const char *str
= aarch64_stack_protector_guard_offset_str
;
16248 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
16249 if (!*str
|| *end
|| errno
)
16250 error ("%qs is not a valid offset in %qs", str
,
16251 "-mstack-protector-guard-offset=");
16252 aarch64_stack_protector_guard_offset
= offs
;
16255 initialize_aarch64_code_model (opts
);
16256 initialize_aarch64_tls_size (opts
);
16258 int queue_depth
= 0;
16259 switch (aarch64_tune_params
.autoprefetcher_model
)
16261 case tune_params::AUTOPREFETCHER_OFF
:
16264 case tune_params::AUTOPREFETCHER_WEAK
:
16267 case tune_params::AUTOPREFETCHER_STRONG
:
16268 queue_depth
= max_insn_queue_index
+ 1;
16271 gcc_unreachable ();
16274 /* We don't mind passing in global_options_set here as we don't use
16275 the *options_set structs anyway. */
16276 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16277 param_sched_autopref_queue_depth
, queue_depth
);
16279 /* If using Advanced SIMD only for autovectorization disable SVE vector costs
16281 if (aarch64_autovec_preference
== 1)
16282 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16283 aarch64_sve_compare_costs
, 0);
16285 /* Set up parameters to be used in prefetching algorithm. Do not
16286 override the defaults unless we are tuning for a core we have
16287 researched values for. */
16288 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
16289 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16290 param_simultaneous_prefetches
,
16291 aarch64_tune_params
.prefetch
->num_slots
);
16292 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
16293 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16294 param_l1_cache_size
,
16295 aarch64_tune_params
.prefetch
->l1_cache_size
);
16296 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
16297 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16298 param_l1_cache_line_size
,
16299 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
16300 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
16301 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16302 param_l2_cache_size
,
16303 aarch64_tune_params
.prefetch
->l2_cache_size
);
16304 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
16305 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16306 param_prefetch_dynamic_strides
, 0);
16307 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
16308 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16309 param_prefetch_minimum_stride
,
16310 aarch64_tune_params
.prefetch
->minimum_stride
);
16312 /* Use the alternative scheduling-pressure algorithm by default. */
16313 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16314 param_sched_pressure_algorithm
,
16315 SCHED_PRESSURE_MODEL
);
  /* Validate the guard size.  */
  int guard_size = param_stack_clash_protection_guard_size;

  if (guard_size != 12 && guard_size != 16)
    error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
	   "size.  Given value %d (%llu KB) is out of range",
	   guard_size, (1ULL << guard_size) / 1024ULL);
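  /* For example, a guard size of 12 corresponds to a 4 KB guard region
     ((1ULL << 12) / 1024 == 4) and 16 to a 64 KB one, matching the only
     two values accepted above.  */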
16325 /* Enforce that interval is the same size as size so the mid-end does the
16327 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
16328 param_stack_clash_protection_probe_interval
,
16331 /* The maybe_set calls won't update the value if the user has explicitly set
16332 one. Which means we need to validate that probing interval and guard size
16335 = param_stack_clash_protection_probe_interval
;
16336 if (guard_size
!= probe_interval
)
16337 error ("stack clash guard size %<%d%> must be equal to probing interval "
16338 "%<%d%>", guard_size
, probe_interval
);
16340 /* Enable sw prefetching at specified optimization level for
16341 CPUS that have prefetch. Lower optimization level threshold by 1
16342 when profiling is enabled. */
16343 if (opts
->x_flag_prefetch_loop_arrays
< 0
16344 && !opts
->x_optimize_size
16345 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
16346 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
16347 opts
->x_flag_prefetch_loop_arrays
= 1;
16349 if (opts
->x_aarch64_arch_string
== NULL
)
16350 opts
->x_aarch64_arch_string
= selected_arch
->name
;
16351 if (opts
->x_aarch64_cpu_string
== NULL
)
16352 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
16353 if (opts
->x_aarch64_tune_string
== NULL
)
16354 opts
->x_aarch64_tune_string
= selected_tune
->name
;
16356 aarch64_override_options_after_change_1 (opts
);
16359 /* Print a hint with a suggestion for a core or architecture name that
16360 most closely resembles what the user passed in STR. ARCH is true if
16361 the user is asking for an architecture name. ARCH is false if the user
16362 is asking for a core name. */
16365 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
16367 auto_vec
<const char *> candidates
;
16368 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
16369 for (; entry
->name
!= NULL
; entry
++)
16370 candidates
.safe_push (entry
->name
);
16372 #ifdef HAVE_LOCAL_CPU_DETECT
16373 /* Add also "native" as possible value. */
16375 candidates
.safe_push ("native");
16379 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
16381 inform (input_location
, "valid arguments are: %s;"
16382 " did you mean %qs?", s
, hint
);
16384 inform (input_location
, "valid arguments are: %s", s
);
16389 /* Print a hint with a suggestion for a core name that most closely resembles
16390 what the user passed in STR. */
16393 aarch64_print_hint_for_core (const char *str
)
16395 aarch64_print_hint_for_core_or_arch (str
, false);
16398 /* Print a hint with a suggestion for an architecture name that most closely
16399 resembles what the user passed in STR. */
16402 aarch64_print_hint_for_arch (const char *str
)
16404 aarch64_print_hint_for_core_or_arch (str
, true);
16408 /* Print a hint with a suggestion for an extension name
16409 that most closely resembles what the user passed in STR. */
16412 aarch64_print_hint_for_extensions (const std::string
&str
)
16414 auto_vec
<const char *> candidates
;
16415 aarch64_get_all_extension_candidates (&candidates
);
16417 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
16419 inform (input_location
, "valid arguments are: %s;"
16420 " did you mean %qs?", s
, hint
);
16422 inform (input_location
, "valid arguments are: %s;", s
);
16427 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
16428 specified in STR and throw errors if appropriate. Put the results if
16429 they are valid in RES and ISA_FLAGS. Return whether the option is
16433 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
16434 uint64_t *isa_flags
)
16436 std::string invalid_extension
;
16437 enum aarch64_parse_opt_result parse_res
16438 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
16440 if (parse_res
== AARCH64_PARSE_OK
)
16445 case AARCH64_PARSE_MISSING_ARG
:
16446 error ("missing cpu name in %<-mcpu=%s%>", str
);
16448 case AARCH64_PARSE_INVALID_ARG
:
16449 error ("unknown value %qs for %<-mcpu%>", str
);
16450 aarch64_print_hint_for_core (str
);
16452 case AARCH64_PARSE_INVALID_FEATURE
:
16453 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
16454 invalid_extension
.c_str (), str
);
16455 aarch64_print_hint_for_extensions (invalid_extension
);
16458 gcc_unreachable ();
16464 /* Straight line speculation indicators. */
16465 enum aarch64_sls_hardening_type
16472 static enum aarch64_sls_hardening_type aarch64_sls_hardening
;
/* Return whether we should mitigate Straight Line Speculation for the RET
   and BR instructions.  */
16477 aarch64_harden_sls_retbr_p (void)
16479 return aarch64_sls_hardening
& SLS_RETBR
;
/* Return whether we should mitigate Straight Line Speculation for the BLR
   instruction.  */
16485 aarch64_harden_sls_blr_p (void)
16487 return aarch64_sls_hardening
& SLS_BLR
;
16490 /* As of yet we only allow setting these options globally, in the future we may
16491 allow setting them per function. */
16493 aarch64_validate_sls_mitigation (const char *const_str
)
16495 char *token_save
= NULL
;
16498 if (strcmp (const_str
, "none") == 0)
16500 aarch64_sls_hardening
= SLS_NONE
;
16503 if (strcmp (const_str
, "all") == 0)
16505 aarch64_sls_hardening
= SLS_ALL
;
16509 char *str_root
= xstrdup (const_str
);
16510 str
= strtok_r (str_root
, ",", &token_save
);
16512 error ("invalid argument given to %<-mharden-sls=%>");
16514 int temp
= SLS_NONE
;
16517 if (strcmp (str
, "blr") == 0)
16519 else if (strcmp (str
, "retbr") == 0)
16521 else if (strcmp (str
, "none") == 0 || strcmp (str
, "all") == 0)
16523 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str
);
16528 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str
);
16531 str
= strtok_r (NULL
, ",", &token_save
);
16533 aarch64_sls_hardening
= (aarch64_sls_hardening_type
) temp
;
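  /* For example, -mharden-sls=retbr,blr enables both mitigations, whereas
     "none" and "all" must appear on their own, as enforced above.  */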
16537 /* Parses CONST_STR for branch protection features specified in
16538 aarch64_branch_protect_types, and set any global variables required. Returns
16539 the parsing result and assigns LAST_STR to the last processed token from
16540 CONST_STR so that it can be used for error reporting. */
16543 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
16546 char *str_root
= xstrdup (const_str
);
16547 char* token_save
= NULL
;
16548 char *str
= strtok_r (str_root
, "+", &token_save
);
16549 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
16551 res
= AARCH64_PARSE_MISSING_ARG
;
16554 char *next_str
= strtok_r (NULL
, "+", &token_save
);
16555 /* Reset the branch protection features to their defaults. */
16556 aarch64_handle_no_branch_protection (NULL
, NULL
);
16558 while (str
&& res
== AARCH64_PARSE_OK
)
16560 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
16561 bool found
= false;
16562 /* Search for this type. */
16563 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
16565 if (strcmp (str
, type
->name
) == 0)
16568 res
= type
->handler (str
, next_str
);
16570 next_str
= strtok_r (NULL
, "+", &token_save
);
16575 if (found
&& res
== AARCH64_PARSE_OK
)
16577 bool found_subtype
= true;
16578 /* Loop through each token until we find one that isn't a
16580 while (found_subtype
)
16582 found_subtype
= false;
16583 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
16584 /* Search for the subtype. */
16585 while (str
&& subtype
&& subtype
->name
&& !found_subtype
16586 && res
== AARCH64_PARSE_OK
)
16588 if (strcmp (str
, subtype
->name
) == 0)
16590 found_subtype
= true;
16591 res
= subtype
->handler (str
, next_str
);
16593 next_str
= strtok_r (NULL
, "+", &token_save
);
16601 res
= AARCH64_PARSE_INVALID_ARG
;
16604 /* Copy the last processed token into the argument to pass it back.
16605 Used by option and attribute validation to print the offending token. */
16608 if (str
) strcpy (*last_str
, str
);
16609 else *last_str
= NULL
;
16611 if (res
== AARCH64_PARSE_OK
)
16613 /* If needed, alloc the accepted string then copy in const_str.
16614 Used by override_option_after_change_1. */
16615 if (!accepted_branch_protection_string
)
16616 accepted_branch_protection_string
= (char *) xmalloc (
16617 BRANCH_PROTECT_STR_MAX
16619 strncpy (accepted_branch_protection_string
, const_str
,
16620 BRANCH_PROTECT_STR_MAX
+ 1);
16621 /* Forcibly null-terminate. */
16622 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
16628 aarch64_validate_mbranch_protection (const char *const_str
)
16630 char *str
= (char *) xmalloc (strlen (const_str
));
16631 enum aarch64_parse_opt_result res
=
16632 aarch64_parse_branch_protection (const_str
, &str
);
16633 if (res
== AARCH64_PARSE_INVALID_ARG
)
16634 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
16635 else if (res
== AARCH64_PARSE_MISSING_ARG
)
16636 error ("missing argument for %<-mbranch-protection=%>");
16638 return res
== AARCH64_PARSE_OK
;
16641 /* Validate a command-line -march option. Parse the arch and extensions
16642 (if any) specified in STR and throw errors if appropriate. Put the
16643 results, if they are valid, in RES and ISA_FLAGS. Return whether the
16644 option is valid. */
16647 aarch64_validate_march (const char *str
, const struct processor
**res
,
16648 uint64_t *isa_flags
)
16650 std::string invalid_extension
;
16651 enum aarch64_parse_opt_result parse_res
16652 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
16654 if (parse_res
== AARCH64_PARSE_OK
)
16659 case AARCH64_PARSE_MISSING_ARG
:
16660 error ("missing arch name in %<-march=%s%>", str
);
16662 case AARCH64_PARSE_INVALID_ARG
:
16663 error ("unknown value %qs for %<-march%>", str
);
16664 aarch64_print_hint_for_arch (str
);
16666 case AARCH64_PARSE_INVALID_FEATURE
:
16667 error ("invalid feature modifier %qs in %<-march=%s%>",
16668 invalid_extension
.c_str (), str
);
16669 aarch64_print_hint_for_extensions (invalid_extension
);
16672 gcc_unreachable ();
16678 /* Validate a command-line -mtune option. Parse the cpu
16679 specified in STR and throw errors if appropriate. Put the
16680 result, if it is valid, in RES. Return whether the option is
16684 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
16686 enum aarch64_parse_opt_result parse_res
16687 = aarch64_parse_tune (str
, res
);
16689 if (parse_res
== AARCH64_PARSE_OK
)
16694 case AARCH64_PARSE_MISSING_ARG
:
16695 error ("missing cpu name in %<-mtune=%s%>", str
);
16697 case AARCH64_PARSE_INVALID_ARG
:
16698 error ("unknown value %qs for %<-mtune%>", str
);
16699 aarch64_print_hint_for_core (str
);
16702 gcc_unreachable ();
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option.
     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* 128-bit SVE and Advanced SIMD modes use different register layouts
     on big-endian targets, so we would need to forbid subregs that convert
     from one to the other.  By default a reinterpret sequence would then
     involve a store to memory in one mode and a load back in the other.
     Even if we optimize that sequence using reverse instructions,
     it would still be a significant potential overhead.

     For now, it seems better to generate length-agnostic code for that
     case instead.  */
  if (value == SVE_SCALABLE
      || (value == SVE_128 && BYTES_BIG_ENDIAN))
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
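/* For example, -msve-vector-bits=256 gives a VG of 4 (256 / 64), whereas
   -msve-vector-bits=scalable (and, on big-endian targets, 128) yields the
   length-agnostic poly_uint16 (2, 2).  */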
16759 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
16760 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
16761 tuning structs. In particular it must set selected_tune and
16762 aarch64_isa_flags that define the available ISA features and tuning
16763 decisions. It must also set selected_arch as this will be used to
16764 output the .arch asm tags for each function. */
16767 aarch64_override_options (void)
16769 uint64_t cpu_isa
= 0;
16770 uint64_t arch_isa
= 0;
16771 aarch64_isa_flags
= 0;
16773 bool valid_cpu
= true;
16774 bool valid_tune
= true;
16775 bool valid_arch
= true;
16777 selected_cpu
= NULL
;
16778 selected_arch
= NULL
;
16779 selected_tune
= NULL
;
16781 if (aarch64_harden_sls_string
)
16782 aarch64_validate_sls_mitigation (aarch64_harden_sls_string
);
16784 if (aarch64_branch_protection_string
)
16785 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
16787 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
16788 If either of -march or -mtune is given, they override their
16789 respective component of -mcpu. */
16790 if (aarch64_cpu_string
)
16791 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
16794 if (aarch64_arch_string
)
16795 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
16798 if (aarch64_tune_string
)
16799 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
16801 #ifdef SUBTARGET_OVERRIDE_OPTIONS
16802 SUBTARGET_OVERRIDE_OPTIONS
;
16805 /* If the user did not specify a processor, choose the default
16806 one for them. This will be the CPU set during configuration using
16807 --with-cpu, otherwise it is "generic". */
16812 selected_cpu
= &all_cores
[selected_arch
->ident
];
16813 aarch64_isa_flags
= arch_isa
;
16814 explicit_arch
= selected_arch
->arch
;
16818 /* Get default configure-time CPU. */
16819 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
16820 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
16824 explicit_tune_core
= selected_tune
->ident
;
16826 /* If both -mcpu and -march are specified check that they are architecturally
16827 compatible, warn if they're not and prefer the -march ISA flags. */
16828 else if (selected_arch
)
16830 if (selected_arch
->arch
!= selected_cpu
->arch
)
16832 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
16833 aarch64_cpu_string
,
16834 aarch64_arch_string
);
16836 aarch64_isa_flags
= arch_isa
;
16837 explicit_arch
= selected_arch
->arch
;
16838 explicit_tune_core
= selected_tune
? selected_tune
->ident
16839 : selected_cpu
->ident
;
16843 /* -mcpu but no -march. */
16844 aarch64_isa_flags
= cpu_isa
;
16845 explicit_tune_core
= selected_tune
? selected_tune
->ident
16846 : selected_cpu
->ident
;
16847 gcc_assert (selected_cpu
);
16848 selected_arch
= &all_architectures
[selected_cpu
->arch
];
16849 explicit_arch
= selected_arch
->arch
;
  /* Set the arch as well, as we will need it when outputting
     the .arch directive in assembly.  */
16854 if (!selected_arch
)
16856 gcc_assert (selected_cpu
);
16857 selected_arch
= &all_architectures
[selected_cpu
->arch
];
16860 if (!selected_tune
)
16861 selected_tune
= selected_cpu
;
16863 if (aarch64_enable_bti
== 2)
16865 #ifdef TARGET_ENABLE_BTI
16866 aarch64_enable_bti
= 1;
16868 aarch64_enable_bti
= 0;
16872 /* Return address signing is currently not supported for ILP32 targets. For
16873 LP64 targets use the configured option in the absence of a command-line
16874 option for -mbranch-protection. */
16875 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
16877 #ifdef TARGET_ENABLE_PAC_RET
16878 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
16880 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
16884 #ifndef HAVE_AS_MABI_OPTION
16885 /* The compiler may have been configured with 2.23.* binutils, which does
16886 not have support for ILP32. */
16888 error ("assembler does not support %<-mabi=ilp32%>");
16891 /* Convert -msve-vector-bits to a VG count. */
16892 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
16894 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
16895 sorry ("return address signing is only supported for %<-mabi=lp64%>");
16897 /* Make sure we properly set up the explicit options. */
16898 if ((aarch64_cpu_string
&& valid_cpu
)
16899 || (aarch64_tune_string
&& valid_tune
))
16900 gcc_assert (explicit_tune_core
!= aarch64_none
);
16902 if ((aarch64_cpu_string
&& valid_cpu
)
16903 || (aarch64_arch_string
&& valid_arch
))
16904 gcc_assert (explicit_arch
!= aarch64_no_arch
);
16906 /* The pass to insert speculation tracking runs before
16907 shrink-wrapping and the latter does not know how to update the
16908 tracking status. So disable it in this case. */
16909 if (aarch64_track_speculation
)
16910 flag_shrink_wrap
= 0;
16912 aarch64_override_options_internal (&global_options
);
16914 /* Save these options as the default ones in case we push and pop them later
16915 while processing functions with potential target attributes. */
16916 target_option_default_node
= target_option_current_node
16917 = build_target_option_node (&global_options
, &global_options_set
);
16920 /* Implement targetm.override_options_after_change. */
16923 aarch64_override_options_after_change (void)
16925 aarch64_override_options_after_change_1 (&global_options
);
16928 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
16930 aarch64_offload_options (void)
16933 return xstrdup ("-foffload-abi=ilp32");
16935 return xstrdup ("-foffload-abi=lp64");
static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}
16952 /* A checking mechanism for the implementation of the various code models. */
16954 initialize_aarch64_code_model (struct gcc_options
*opts
)
16956 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
16957 switch (opts
->x_aarch64_cmodel_var
)
16959 case AARCH64_CMODEL_TINY
:
16960 if (opts
->x_flag_pic
)
16961 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
16963 case AARCH64_CMODEL_SMALL
:
16964 if (opts
->x_flag_pic
)
16966 #ifdef HAVE_AS_SMALL_PIC_RELOCS
16967 aarch64_cmodel
= (flag_pic
== 2
16968 ? AARCH64_CMODEL_SMALL_PIC
16969 : AARCH64_CMODEL_SMALL_SPIC
);
16971 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
16975 case AARCH64_CMODEL_LARGE
:
16976 if (opts
->x_flag_pic
)
16977 sorry ("code model %qs with %<-f%s%>", "large",
16978 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
16979 if (opts
->x_aarch64_abi
== AARCH64_ABI_ILP32
)
16980 sorry ("code model %qs not supported in ilp32 mode", "large");
16982 case AARCH64_CMODEL_TINY_PIC
:
16983 case AARCH64_CMODEL_SMALL_PIC
:
16984 case AARCH64_CMODEL_SMALL_SPIC
:
16985 gcc_unreachable ();
16989 /* Implement TARGET_OPTION_SAVE. */
16992 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
,
16993 struct gcc_options */
* opts_set */
)
16995 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
16996 ptr
->x_aarch64_branch_protection_string
16997 = opts
->x_aarch64_branch_protection_string
;
17000 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
17001 using the information saved in PTR. */
17004 aarch64_option_restore (struct gcc_options
*opts
,
17005 struct gcc_options */
* opts_set */
,
17006 struct cl_target_option
*ptr
)
17008 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
17009 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
17010 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
17011 if (opts
->x_explicit_tune_core
== aarch64_none
17012 && opts
->x_explicit_arch
!= aarch64_no_arch
)
17013 selected_tune
= &all_cores
[selected_arch
->ident
];
17015 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
17016 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
17017 opts
->x_aarch64_branch_protection_string
17018 = ptr
->x_aarch64_branch_protection_string
;
17019 if (opts
->x_aarch64_branch_protection_string
)
17021 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
17025 aarch64_override_options_internal (opts
);
17028 /* Implement TARGET_OPTION_PRINT. */
17031 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
17033 const struct processor
*cpu
17034 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
17035 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
17036 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
17037 std::string extension
17038 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
17040 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
17041 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
17042 arch
->name
, extension
.c_str ());
17045 static GTY(()) tree aarch64_previous_fndecl
;
17048 aarch64_reset_previous_fndecl (void)
17050 aarch64_previous_fndecl
= NULL
;
17053 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
17054 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
17055 make sure optab availability predicates are recomputed when necessary. */
17058 aarch64_save_restore_target_globals (tree new_tree
)
17060 if (TREE_TARGET_GLOBALS (new_tree
))
17061 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
17062 else if (new_tree
== target_option_default_node
)
17063 restore_target_globals (&default_target_globals
);
17065 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
17068 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
17069 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
17070 of the function, if such exists. This function may be called multiple
17071 times on a single function so use aarch64_previous_fndecl to avoid
17072 setting up identical state. */
17075 aarch64_set_current_function (tree fndecl
)
17077 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
17080 tree old_tree
= (aarch64_previous_fndecl
17081 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
17084 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
17086 /* If current function has no attributes but the previous one did,
17087 use the default node. */
17088 if (!new_tree
&& old_tree
)
17089 new_tree
= target_option_default_node
;
17091 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
17092 the default have been handled by aarch64_save_restore_target_globals from
17093 aarch64_pragma_target_parse. */
17094 if (old_tree
== new_tree
)
17097 aarch64_previous_fndecl
= fndecl
;
17099 /* First set the target options. */
17100 cl_target_option_restore (&global_options
, &global_options_set
,
17101 TREE_TARGET_OPTION (new_tree
));
17103 aarch64_save_restore_target_globals (new_tree
);
17106 /* Enum describing the various ways we can handle attributes.
17107 In many cases we can reuse the generic option handling machinery. */
17109 enum aarch64_attr_opt_type
17111 aarch64_attr_mask
, /* Attribute should set a bit in target_flags. */
17112 aarch64_attr_bool
, /* Attribute sets or unsets a boolean variable. */
17113 aarch64_attr_enum
, /* Attribute sets an enum variable. */
17114 aarch64_attr_custom
/* Attribute requires a custom handling function. */
17117 /* All the information needed to handle a target attribute.
17118 NAME is the name of the attribute.
17119 ATTR_TYPE specifies the type of behavior of the attribute as described
17120 in the definition of enum aarch64_attr_opt_type.
17121 ALLOW_NEG is true if the attribute supports a "no-" form.
17122 HANDLER is the function that takes the attribute string as an argument
17123 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
17124 OPT_NUM is the enum specifying the option that the attribute modifies.
17125 This is needed for attributes that mirror the behavior of a command-line
17126 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
17127 aarch64_attr_enum. */
17129 struct aarch64_attribute_info
17132 enum aarch64_attr_opt_type attr_type
;
17134 bool (*handler
) (const char *);
17135 enum opt_code opt_num
;
17138 /* Handle the ARCH_STR argument to the arch= target attribute. */
17141 aarch64_handle_attr_arch (const char *str
)
17143 const struct processor
*tmp_arch
= NULL
;
17144 std::string invalid_extension
;
17145 enum aarch64_parse_opt_result parse_res
17146 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
17148 if (parse_res
== AARCH64_PARSE_OK
)
17150 gcc_assert (tmp_arch
);
17151 selected_arch
= tmp_arch
;
17152 explicit_arch
= selected_arch
->arch
;
17158 case AARCH64_PARSE_MISSING_ARG
:
17159 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
17161 case AARCH64_PARSE_INVALID_ARG
:
17162 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
17163 aarch64_print_hint_for_arch (str
);
17165 case AARCH64_PARSE_INVALID_FEATURE
:
17166 error ("invalid feature modifier %s of value (\"%s\") in "
17167 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
17168 aarch64_print_hint_for_extensions (invalid_extension
);
17171 gcc_unreachable ();
17177 /* Handle the argument CPU_STR to the cpu= target attribute. */
17180 aarch64_handle_attr_cpu (const char *str
)
17182 const struct processor
*tmp_cpu
= NULL
;
17183 std::string invalid_extension
;
17184 enum aarch64_parse_opt_result parse_res
17185 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
17187 if (parse_res
== AARCH64_PARSE_OK
)
17189 gcc_assert (tmp_cpu
);
17190 selected_tune
= tmp_cpu
;
17191 explicit_tune_core
= selected_tune
->ident
;
17193 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
17194 explicit_arch
= selected_arch
->arch
;
17200 case AARCH64_PARSE_MISSING_ARG
:
17201 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
17203 case AARCH64_PARSE_INVALID_ARG
:
17204 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
17205 aarch64_print_hint_for_core (str
);
17207 case AARCH64_PARSE_INVALID_FEATURE
:
17208 error ("invalid feature modifier %s of value (\"%s\") in "
17209 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
17210 aarch64_print_hint_for_extensions (invalid_extension
);
17213 gcc_unreachable ();
17219 /* Handle the argument STR to the branch-protection= attribute. */
17222 aarch64_handle_attr_branch_protection (const char* str
)
17224 char *err_str
= (char *) xmalloc (strlen (str
) + 1);
17225 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
17227 bool success
= false;
17230 case AARCH64_PARSE_MISSING_ARG
:
17231 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
17234 case AARCH64_PARSE_INVALID_ARG
:
17235 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
17236 "=\")%> pragma or attribute", err_str
);
17238 case AARCH64_PARSE_OK
:
17240 /* Fall through. */
17241 case AARCH64_PARSE_INVALID_FEATURE
:
17244 gcc_unreachable ();
17250 /* Handle the argument STR to the tune= target attribute. */
17253 aarch64_handle_attr_tune (const char *str
)
17255 const struct processor
*tmp_tune
= NULL
;
17256 enum aarch64_parse_opt_result parse_res
17257 = aarch64_parse_tune (str
, &tmp_tune
);
17259 if (parse_res
== AARCH64_PARSE_OK
)
17261 gcc_assert (tmp_tune
);
17262 selected_tune
= tmp_tune
;
17263 explicit_tune_core
= selected_tune
->ident
;
17269 case AARCH64_PARSE_INVALID_ARG
:
17270 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
17271 aarch64_print_hint_for_core (str
);
17274 gcc_unreachable ();
17280 /* Parse an architecture extensions target attribute string specified in STR.
17281 For example "+fp+nosimd". Show any errors if needed. Return TRUE
17282 if successful. Update aarch64_isa_flags to reflect the ISA features
17286 aarch64_handle_attr_isa_flags (char *str
)
17288 enum aarch64_parse_opt_result parse_res
;
17289 uint64_t isa_flags
= aarch64_isa_flags
;
17291 /* We allow "+nothing" in the beginning to clear out all architectural
17292 features if the user wants to handpick specific features. */
17293 if (strncmp ("+nothing", str
, 8) == 0)
17299 std::string invalid_extension
;
17300 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
17302 if (parse_res
== AARCH64_PARSE_OK
)
17304 aarch64_isa_flags
= isa_flags
;
17310 case AARCH64_PARSE_MISSING_ARG
:
17311 error ("missing value in %<target()%> pragma or attribute");
17314 case AARCH64_PARSE_INVALID_FEATURE
:
17315 error ("invalid feature modifier %s of value (\"%s\") in "
17316 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
17320 gcc_unreachable ();
17326 /* The target attributes that we support. On top of these we also support just
17327 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
17328 handled explicitly in aarch64_process_one_target_attr. */
17330 static const struct aarch64_attribute_info aarch64_attributes
[] =
17332 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
17333 OPT_mgeneral_regs_only
},
17334 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
17335 OPT_mfix_cortex_a53_835769
},
17336 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
17337 OPT_mfix_cortex_a53_843419
},
17338 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
17339 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
17340 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
17341 OPT_momit_leaf_frame_pointer
},
17342 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
17343 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
17345 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
17346 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
17348 { "branch-protection", aarch64_attr_custom
, false,
17349 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
17350 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
17351 OPT_msign_return_address_
},
17352 { "outline-atomics", aarch64_attr_bool
, true, NULL
,
17353 OPT_moutline_atomics
},
17354 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
17357 /* Parse ARG_STR which contains the definition of one target attribute.
17358 Show appropriate errors if any or return true if the attribute is valid. */
17361 aarch64_process_one_target_attr (char *arg_str
)
17363 bool invert
= false;
17365 size_t len
= strlen (arg_str
);
17369 error ("malformed %<target()%> pragma or attribute");
17373 char *str_to_check
= (char *) alloca (len
+ 1);
17374 strcpy (str_to_check
, arg_str
);
17376 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
17377 It is easier to detect and handle it explicitly here rather than going
17378 through the machinery for the rest of the target attributes in this
17380 if (*str_to_check
== '+')
17381 return aarch64_handle_attr_isa_flags (str_to_check
);
17383 if (len
> 3 && startswith (str_to_check
, "no-"))
17388 char *arg
= strchr (str_to_check
, '=');
17390 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
17391 and point ARG to "foo". */
17397 const struct aarch64_attribute_info
*p_attr
;
17398 bool found
= false;
17399 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
17401 /* If the names don't match up, or the user has given an argument
17402 to an attribute that doesn't accept one, or didn't give an argument
17403 to an attribute that expects one, fail to match. */
17404 if (strcmp (str_to_check
, p_attr
->name
) != 0)
17408 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
17409 || p_attr
->attr_type
== aarch64_attr_enum
;
17411 if (attr_need_arg_p
^ (arg
!= NULL
))
17413 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
17417 /* If the name matches but the attribute does not allow "no-" versions
17418 then we can't match. */
17419 if (invert
&& !p_attr
->allow_neg
)
17421 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
17425 switch (p_attr
->attr_type
)
17427 /* Has a custom handler registered.
17428 For example, cpu=, arch=, tune=. */
17429 case aarch64_attr_custom
:
17430 gcc_assert (p_attr
->handler
);
17431 if (!p_attr
->handler (arg
))
17435 /* Either set or unset a boolean option. */
17436 case aarch64_attr_bool
:
17438 struct cl_decoded_option decoded
;
17440 generate_option (p_attr
->opt_num
, NULL
, !invert
,
17441 CL_TARGET
, &decoded
);
17442 aarch64_handle_option (&global_options
, &global_options_set
,
17443 &decoded
, input_location
);
17446 /* Set or unset a bit in the target_flags. aarch64_handle_option
17447 should know what mask to apply given the option number. */
17448 case aarch64_attr_mask
:
17450 struct cl_decoded_option decoded
;
17451 /* We only need to specify the option number.
17452 aarch64_handle_option will know which mask to apply. */
17453 decoded
.opt_index
= p_attr
->opt_num
;
17454 decoded
.value
= !invert
;
17455 aarch64_handle_option (&global_options
, &global_options_set
,
17456 &decoded
, input_location
);
17459 /* Use the option setting machinery to set an option to an enum. */
17460 case aarch64_attr_enum
:
17465 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
17466 &value
, CL_TARGET
);
17469 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
17470 NULL
, DK_UNSPECIFIED
, input_location
,
17475 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
17480 gcc_unreachable ();
17484 /* If we reached here we either have found an attribute and validated
17485 it or didn't match any. If we matched an attribute but its arguments
17486 were malformed we will have returned false already. */
17490 /* Count how many times the character C appears in
17491 NULL-terminated string STR. */
17493 static unsigned int
17494 num_occurences_in_str (char c
, char *str
)
17496 unsigned int res
= 0;
17497 while (*str
!= '\0')
17508 /* Parse the tree in ARGS that contains the target attribute information
17509 and update the global target options space. */
17512 aarch64_process_target_attr (tree args
)
17514 if (TREE_CODE (args
) == TREE_LIST
)
17518 tree head
= TREE_VALUE (args
);
17521 if (!aarch64_process_target_attr (head
))
17524 args
= TREE_CHAIN (args
);
17530 if (TREE_CODE (args
) != STRING_CST
)
17532 error ("attribute %<target%> argument not a string");
17536 size_t len
= strlen (TREE_STRING_POINTER (args
));
17537 char *str_to_check
= (char *) alloca (len
+ 1);
17538 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
17542 error ("malformed %<target()%> pragma or attribute");
17546 /* Used to catch empty spaces between commas i.e.
17547 attribute ((target ("attr1,,attr2"))). */
17548 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
17550 /* Handle multiple target attributes separated by ','. */
17551 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
17553 unsigned int num_attrs
= 0;
17557 if (!aarch64_process_one_target_attr (token
))
17559 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
17563 token
= strtok_r (NULL
, ",", &str_to_check
);
17566 if (num_attrs
!= num_commas
+ 1)
17568 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
17575 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
17576 process attribute ((target ("..."))). */
17579 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
17581 struct cl_target_option cur_target
;
17584 tree new_target
, new_optimize
;
17585 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
17587 /* If what we're processing is the current pragma string then the
17588 target option node is already stored in target_option_current_node
17589 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
17590 having to re-parse the string. This is especially useful to keep
17591 arm_neon.h compile times down since that header contains a lot
17592 of intrinsics enclosed in pragmas. */
17593 if (!existing_target
&& args
== current_target_pragma
)
17595 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
17598 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
17601 = build_optimization_node (&global_options
, &global_options_set
);
17602 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
17604 /* If the function changed the optimization levels as well as setting
17605 target options, start with the optimizations specified. */
17606 if (func_optimize
&& func_optimize
!= old_optimize
)
17607 cl_optimization_restore (&global_options
, &global_options_set
,
17608 TREE_OPTIMIZATION (func_optimize
));
17610 /* Save the current target options to restore at the end. */
17611 cl_target_option_save (&cur_target
, &global_options
, &global_options_set
);
17613 /* If fndecl already has some target attributes applied to it, unpack
17614 them so that we add this attribute on top of them, rather than
17615 overwriting them. */
17616 if (existing_target
)
17618 struct cl_target_option
*existing_options
17619 = TREE_TARGET_OPTION (existing_target
);
17621 if (existing_options
)
17622 cl_target_option_restore (&global_options
, &global_options_set
,
17626 cl_target_option_restore (&global_options
, &global_options_set
,
17627 TREE_TARGET_OPTION (target_option_current_node
));
17629 ret
= aarch64_process_target_attr (args
);
17631 /* Set up any additional state. */
17634 aarch64_override_options_internal (&global_options
);
17635 /* Initialize SIMD builtins if we haven't already.
17636 Set current_target_pragma to NULL for the duration so that
17637 the builtin initialization code doesn't try to tag the functions
17638 being built with the attributes specified by any current pragma, thus
17639 going into an infinite recursion. */
17642 tree saved_current_target_pragma
= current_target_pragma
;
17643 current_target_pragma
= NULL
;
17644 aarch64_init_simd_builtins ();
17645 current_target_pragma
= saved_current_target_pragma
;
17647 new_target
= build_target_option_node (&global_options
,
17648 &global_options_set
);
17653 new_optimize
= build_optimization_node (&global_options
,
17654 &global_options_set
);
17658 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
17660 if (old_optimize
!= new_optimize
)
17661 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
17664 cl_target_option_restore (&global_options
, &global_options_set
, &cur_target
);
17666 if (old_optimize
!= new_optimize
)
17667 cl_optimization_restore (&global_options
, &global_options_set
,
17668 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
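/* Illustrative reading of the helper above: with DONT_CARE == 2, as used by
   the errata callers below, inlining is rejected only when both caller and
   callee state an explicit preference, the two disagree, and the callee's
   choice is not the default DEF.  */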
17693 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
17694 to inline CALLEE into CALLER based on target-specific info.
17695 Make sure that the caller and callee have compatible architectural
17696 features. Then go through the other possible target attributes
17697 and see if they can block inlining. Try not to reject always_inline
17698 callees unless they are incompatible architecturally. */
17701 aarch64_can_inline_p (tree caller
, tree callee
)
17703 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
17704 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
17706 struct cl_target_option
*caller_opts
17707 = TREE_TARGET_OPTION (caller_tree
? caller_tree
17708 : target_option_default_node
);
17710 struct cl_target_option
*callee_opts
17711 = TREE_TARGET_OPTION (callee_tree
? callee_tree
17712 : target_option_default_node
);
17714 /* Callee's ISA flags should be a subset of the caller's. */
17715 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
17716 != callee_opts
->x_aarch64_isa_flags
)
  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
17721 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
17722 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
17723 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
17724 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
17727 bool always_inline
= lookup_attribute ("always_inline",
17728 DECL_ATTRIBUTES (callee
));
17730 /* If the architectural features match up and the callee is always_inline
17731 then the other attributes don't matter. */
17735 if (caller_opts
->x_aarch64_cmodel_var
17736 != callee_opts
->x_aarch64_cmodel_var
)
17739 if (caller_opts
->x_aarch64_tls_dialect
17740 != callee_opts
->x_aarch64_tls_dialect
)
  /* Honour explicit requests to work around errata.  */
17744 if (!aarch64_tribools_ok_for_inlining_p (
17745 caller_opts
->x_aarch64_fix_a53_err835769
,
17746 callee_opts
->x_aarch64_fix_a53_err835769
,
17747 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
17750 if (!aarch64_tribools_ok_for_inlining_p (
17751 caller_opts
->x_aarch64_fix_a53_err843419
,
17752 callee_opts
->x_aarch64_fix_a53_err843419
,
17753 2, TARGET_FIX_ERR_A53_843419
))
  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
17758 if (!aarch64_tribools_ok_for_inlining_p (
17759 caller_opts
->x_flag_omit_leaf_frame_pointer
,
17760 callee_opts
->x_flag_omit_leaf_frame_pointer
,
17764 /* If the callee has specific tuning overrides, respect them. */
17765 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
17766 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
17769 /* If the user specified tuning override strings for the
17770 caller and callee and they don't match up, reject inlining.
17771 We just do a string compare here, we don't analyze the meaning
17772 of the string, as it would be too costly for little gain. */
17773 if (callee_opts
->x_aarch64_override_tune_string
17774 && caller_opts
->x_aarch64_override_tune_string
17775 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
17776 caller_opts
->x_aarch64_override_tune_string
) != 0))
/* Return the ID of the TLSDESC ABI, initializing the descriptor
   if it hasn't been initialized yet.  */

unsigned int
aarch64_tlsdesc_abi_id ()
{
  predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
  if (!tlsdesc_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers;
      CLEAR_HARD_REG_SET (full_reg_clobbers);
      SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
      SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
      for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
	SET_HARD_REG_BIT (full_reg_clobbers, regno);
      tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
    }
  return tlsdesc_abi.id ();
}
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;

  x = strip_salt (x);
  if (!SYMBOL_REF_P (x))
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
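/* Editor's addition (illustrative only, kept under "#if 0"): the kind of
   source-level TLS access that the classification above applies to.
   The option names -mtls-dialect and -mtls-size are the documented
   AArch64 switches; the summary in the comment is hand-written, and the
   relocations actually emitted depend on the code model.  */
#if 0
__thread int counter;

int
bump (void)
{
  /* Assuming the usual Linux default of -mtls-dialect=desc, a
     global-dynamic access is classified as SYMBOL_SMALL_TLSDESC; with
     -mtls-dialect=trad it becomes SYMBOL_SMALL_TLSGD.  For local-exec
     accesses, -mtls-size=12/24/32/48 selects SYMBOL_TLSLE12/24/32/48.  */
  return ++counter;
}
#endif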
17869 /* Return the correct method for accessing X + OFFSET, where X is either
17870 a SYMBOL_REF or LABEL_REF. */
17872 enum aarch64_symbol_type
17873 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
17875 x
= strip_salt (x
);
17877 if (LABEL_REF_P (x
))
17879 switch (aarch64_cmodel
)
17881 case AARCH64_CMODEL_LARGE
:
17882 return SYMBOL_FORCE_TO_MEM
;
17884 case AARCH64_CMODEL_TINY_PIC
:
17885 case AARCH64_CMODEL_TINY
:
17886 return SYMBOL_TINY_ABSOLUTE
;
17888 case AARCH64_CMODEL_SMALL_SPIC
:
17889 case AARCH64_CMODEL_SMALL_PIC
:
17890 case AARCH64_CMODEL_SMALL
:
17891 return SYMBOL_SMALL_ABSOLUTE
;
17894 gcc_unreachable ();
17898 if (SYMBOL_REF_P (x
))
17900 if (aarch64_tls_symbol_p (x
))
17901 return aarch64_classify_tls_symbol (x
);
17903 switch (aarch64_cmodel
)
17905 case AARCH64_CMODEL_TINY_PIC
:
17906 case AARCH64_CMODEL_TINY
:
17907 /* With -fPIC non-local symbols use the GOT. For orthogonality
17908 always use the GOT for extern weak symbols. */
17909 if ((flag_pic
|| SYMBOL_REF_WEAK (x
))
17910 && !aarch64_symbol_binds_local_p (x
))
17911 return SYMBOL_TINY_GOT
;
      /* When we retrieve symbol + offset address, we have to make sure
	 the offset does not cause overflow of the final address.  But
	 we have no way of knowing the address of symbol at compile time
	 so we can't accurately say if the distance between the PC and
	 symbol + offset is outside the addressable range of +/-1MB in the
	 TINY code model.  So we limit the maximum offset to +/-64KB and
	 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
	 If offset_within_block_p is true we allow larger offsets.  */
17921 if (!(IN_RANGE (offset
, -0x10000, 0x10000)
17922 || offset_within_block_p (x
, offset
)))
17923 return SYMBOL_FORCE_TO_MEM
;
17925 return SYMBOL_TINY_ABSOLUTE
;
17928 case AARCH64_CMODEL_SMALL_SPIC
:
17929 case AARCH64_CMODEL_SMALL_PIC
:
17930 case AARCH64_CMODEL_SMALL
:
17931 if ((flag_pic
|| SYMBOL_REF_WEAK (x
))
17932 && !aarch64_symbol_binds_local_p (x
))
17933 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
17934 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
;
17936 /* Same reasoning as the tiny code model, but the offset cap here is
17937 1MB, allowing +/-3.9GB for the offset to the symbol. */
17938 if (!(IN_RANGE (offset
, -0x100000, 0x100000)
17939 || offset_within_block_p (x
, offset
)))
17940 return SYMBOL_FORCE_TO_MEM
;
17942 return SYMBOL_SMALL_ABSOLUTE
;
17944 case AARCH64_CMODEL_LARGE
:
17945 /* This is alright even in PIC code as the constant
17946 pool reference is always PC relative and within
17947 the same translation unit. */
17948 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
17949 return SYMBOL_SMALL_ABSOLUTE
;
17951 return SYMBOL_FORCE_TO_MEM
;
17954 gcc_unreachable ();
  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
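/* Editor's addition (illustrative only, kept under "#if 0"): a standalone
   re-derivation of the offset caps used above.  The numbers follow from
   the ADR (+/-1MB) and ADRP (+/-4GB) reach; nothing here is used by the
   compiler itself.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long long tiny_reach = 1LL << 20;	  /* ADR: +/-1MB */
  long long tiny_offset_cap = 1LL << 16;  /* cap constant offsets at 64KB */
  printf ("tiny: the symbol itself must lie within +/-%lld bytes\n",
	  tiny_reach - tiny_offset_cap);

  long long small_reach = 1LL << 32;	  /* ADRP: +/-4GB */
  long long small_offset_cap = 1LL << 20; /* cap constant offsets at 1MB */
  printf ("small: the symbol itself must lie within +/-%lld bytes\n",
	  small_reach - small_offset_cap); /* roughly the 3.9GB above */
  return 0;
}
#endif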
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  poly_int64 offset;
  x = strip_offset_and_salt (x, &offset);
  if (SYMBOL_REF_P (x))
    return false;

  return true;
}
17979 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
17980 that should be rematerialized rather than spilled. */
17983 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
17985 /* Support CSE and rematerialization of common constants. */
17986 if (CONST_INT_P (x
)
17987 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
))
17990 /* Only accept variable-length vector constants if they can be
17993 ??? It would be possible (but complex) to handle rematerialization
17994 of other constants via secondary reloads. */
17995 if (!GET_MODE_SIZE (mode
).is_constant ())
17996 return aarch64_simd_valid_immediate (x
, NULL
);
17998 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
17999 least be forced to memory and loaded from there. */
18000 if (GET_CODE (x
) == CONST_VECTOR
)
18001 return !targetm
.cannot_force_const_mem (mode
, x
);
18003 /* Do not allow vector struct mode constants for Advanced SIMD.
18004 We could support 0 and -1 easily, but they need support in
18005 aarch64-simd.md. */
18006 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
18007 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
18010 if (GET_CODE (x
) == HIGH
)
18013 /* Accept polynomial constants that can be calculated by using the
18014 destination of a move as the sole temporary. Constants that
18015 require a second temporary cannot be rematerialized (they can't be
18016 forced to memory and also aren't legitimate constants). */
18018 if (poly_int_rtx_p (x
, &offset
))
18019 return aarch64_offset_temporaries (false, offset
) <= 1;
18021 /* If an offset is being added to something else, we need to allow the
18022 base to be moved into the destination register, meaning that there
18023 are no free temporaries for the offset. */
18024 x
= strip_offset_and_salt (x
, &offset
);
18025 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
18028 /* Do not allow const (plus (anchor_symbol, const_int)). */
18029 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
18032 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
18033 so spilling them is better than rematerialization. */
18034 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
18037 /* Label references are always constant. */
18038 if (LABEL_REF_P (x
))
18045 aarch64_load_tp (rtx target
)
18048 || GET_MODE (target
) != Pmode
18049 || !register_operand (target
, Pmode
))
18050 target
= gen_reg_rtx (Pmode
);
18052 /* Can return in any reg. */
18053 emit_insn (gen_aarch64_load_tp_hard (target
));
18057 /* On AAPCS systems, this is the "struct __va_list". */
18058 static GTY(()) tree va_list_type
;
/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

     struct  __va_list
     {
       void *__stack;
       void *__gr_top;
       void *__vr_top;
       int   __gr_offs;
       int   __vr_offs;
     };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18080 /* Create the type. */
18081 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
18082 /* Give it the required name. */
18083 va_list_name
= build_decl (BUILTINS_LOCATION
,
18085 get_identifier ("__va_list"),
18087 DECL_ARTIFICIAL (va_list_name
) = 1;
18088 TYPE_NAME (va_list_type
) = va_list_name
;
18089 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
18091 /* Create the fields. */
18092 f_stack
= build_decl (BUILTINS_LOCATION
,
18093 FIELD_DECL
, get_identifier ("__stack"),
18095 f_grtop
= build_decl (BUILTINS_LOCATION
,
18096 FIELD_DECL
, get_identifier ("__gr_top"),
18098 f_vrtop
= build_decl (BUILTINS_LOCATION
,
18099 FIELD_DECL
, get_identifier ("__vr_top"),
18101 f_groff
= build_decl (BUILTINS_LOCATION
,
18102 FIELD_DECL
, get_identifier ("__gr_offs"),
18103 integer_type_node
);
18104 f_vroff
= build_decl (BUILTINS_LOCATION
,
18105 FIELD_DECL
, get_identifier ("__vr_offs"),
18106 integer_type_node
);
  /* Tell the tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating the va_list internal
     offset fields in an irregular way.  */
18112 va_list_gpr_counter_field
= f_groff
;
18113 va_list_fpr_counter_field
= f_vroff
;
18115 DECL_ARTIFICIAL (f_stack
) = 1;
18116 DECL_ARTIFICIAL (f_grtop
) = 1;
18117 DECL_ARTIFICIAL (f_vrtop
) = 1;
18118 DECL_ARTIFICIAL (f_groff
) = 1;
18119 DECL_ARTIFICIAL (f_vroff
) = 1;
18121 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
18122 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
18123 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
18124 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
18125 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
18127 TYPE_FIELDS (va_list_type
) = f_stack
;
18128 DECL_CHAIN (f_stack
) = f_grtop
;
18129 DECL_CHAIN (f_grtop
) = f_vrtop
;
18130 DECL_CHAIN (f_vrtop
) = f_groff
;
18131 DECL_CHAIN (f_groff
) = f_vroff
;
18133 /* Compute its layout. */
18134 layout_type (va_list_type
);
18136 return va_list_type
;
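/* Editor's addition (illustrative only, kept under "#if 0"): a plain
   variadic function whose va_list is an instance of the __va_list record
   built above; va_arg consumes __gr_offs/__vr_offs first and only falls
   back to __stack once the register save areas are exhausted.  */
#if 0
#include <stdarg.h>
#include <stdio.h>

static int
sum (int count, ...)
{
  va_list ap;
  int total = 0;
  va_start (ap, count);
  for (int i = 0; i < count; i++)
    total += va_arg (ap, int);
  va_end (ap);
  return total;
}

int
main (void)
{
  printf ("%d\n", sum (3, 1, 2, 3));	/* prints 6 */
  return 0;
}
#endif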
18139 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
18141 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
18143 const CUMULATIVE_ARGS
*cum
;
18144 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
18145 tree stack
, grtop
, vrtop
, groff
, vroff
;
18147 int gr_save_area_size
= cfun
->va_list_gpr_size
;
18148 int vr_save_area_size
= cfun
->va_list_fpr_size
;
18151 cum
= &crtl
->args
.info
;
18152 if (cfun
->va_list_gpr_size
)
18153 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
18154 cfun
->va_list_gpr_size
);
18155 if (cfun
->va_list_fpr_size
)
18156 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
18157 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
18161 gcc_assert (cum
->aapcs_nvrn
== 0);
18162 vr_save_area_size
= 0;
18165 f_stack
= TYPE_FIELDS (va_list_type_node
);
18166 f_grtop
= DECL_CHAIN (f_stack
);
18167 f_vrtop
= DECL_CHAIN (f_grtop
);
18168 f_groff
= DECL_CHAIN (f_vrtop
);
18169 f_vroff
= DECL_CHAIN (f_groff
);
18171 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
18173 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
18175 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
18177 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
18179 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
18182 /* Emit code to initialize STACK, which points to the next varargs stack
18183 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
18184 by named arguments. STACK is 8-byte aligned. */
18185 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
18186 if (cum
->aapcs_stack_size
> 0)
18187 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
18188 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
18189 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
18191 /* Emit code to initialize GRTOP, the top of the GR save area.
18192 virtual_incoming_args_rtx should have been 16 byte aligned. */
18193 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
18194 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
18195 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
18197 /* Emit code to initialize VRTOP, the top of the VR save area.
18198 This address is gr_save_area_bytes below GRTOP, rounded
18199 down to the next 16-byte boundary. */
18200 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
18201 vr_offset
= ROUND_UP (gr_save_area_size
,
18202 STACK_BOUNDARY
/ BITS_PER_UNIT
);
18205 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
18206 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
18207 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
18209 /* Emit code to initialize GROFF, the offset from GRTOP of the
18210 next GPR argument. */
18211 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
18212 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
18213 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
18215 /* Likewise emit code to initialize VROFF, the offset from FTOP
18216 of the next VR argument. */
18217 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
18218 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
18219 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
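/* Editor's addition (illustrative only, kept under "#if 0"): pseudo-C for
   the assignments the expander above emits.  incoming_args,
   named_stack_bytes, gr_size and vr_size are invented stand-ins for
   virtual_incoming_args_rtx, CUM->aapcs_stack_size in bytes and the
   clamped GR/VR save-area sizes computed earlier.  */
#if 0
struct __va_list
{
  void *__stack;
  void *__gr_top;
  void *__vr_top;
  int __gr_offs;
  int __vr_offs;
};

static void
va_start_sketch (struct __va_list *ap, char *incoming_args,
		 long named_stack_bytes, int gr_size, int vr_size)
{
  ap->__stack = incoming_args + named_stack_bytes;
  ap->__gr_top = incoming_args;
  /* The VR save area sits below the GR one, 16-byte aligned.  */
  ap->__vr_top = incoming_args - ((gr_size + 15) & -16);
  ap->__gr_offs = -gr_size;
  ap->__vr_offs = -vr_size;
}
#endif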
18222 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
18225 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
18226 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
18230 bool is_ha
; /* is HFA or HVA. */
18231 bool dw_align
; /* double-word align. */
18232 machine_mode ag_mode
= VOIDmode
;
18236 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
18237 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
18238 HOST_WIDE_INT size
, rsize
, adjust
, align
;
18239 tree t
, u
, cond1
, cond2
;
18241 indirect_p
= pass_va_arg_by_reference (type
);
18243 type
= build_pointer_type (type
);
18245 mode
= TYPE_MODE (type
);
18247 f_stack
= TYPE_FIELDS (va_list_type_node
);
18248 f_grtop
= DECL_CHAIN (f_stack
);
18249 f_vrtop
= DECL_CHAIN (f_grtop
);
18250 f_groff
= DECL_CHAIN (f_vrtop
);
18251 f_vroff
= DECL_CHAIN (f_groff
);
18253 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
18254 f_stack
, NULL_TREE
);
18255 size
= int_size_in_bytes (type
);
18257 unsigned int abi_break
;
18259 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
18263 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &nregs
,
18266 /* No frontends can create types with variable-sized modes, so we
18267 shouldn't be asked to pass or return them. */
18268 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
18270 /* TYPE passed in fp/simd registers. */
18272 aarch64_err_no_fpadvsimd (mode
);
18274 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
18275 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
18276 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
18277 unshare_expr (valist
), f_vroff
, NULL_TREE
);
18279 rsize
= nregs
* UNITS_PER_VREG
;
18283 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
18284 adjust
= UNITS_PER_VREG
- ag_size
;
18286 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
18287 && size
< UNITS_PER_VREG
)
18289 adjust
= UNITS_PER_VREG
- size
;
18294 /* TYPE passed in general registers. */
18295 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
18296 unshare_expr (valist
), f_grtop
, NULL_TREE
);
18297 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
18298 unshare_expr (valist
), f_groff
, NULL_TREE
);
18299 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
18300 nregs
= rsize
/ UNITS_PER_WORD
;
18304 if (abi_break
&& warn_psabi
)
18305 inform (input_location
, "parameter passing for argument of type "
18306 "%qT changed in GCC 9.1", type
);
18310 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
18311 && size
< UNITS_PER_WORD
)
18313 adjust
= UNITS_PER_WORD
- size
;
18317 /* Get a local temporary for the field value. */
18318 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
18320 /* Emit code to branch if off >= 0. */
18321 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
18322 build_int_cst (TREE_TYPE (off
), 0));
18323 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
18327 /* Emit: offs = (offs + 15) & -16. */
18328 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
18329 build_int_cst (TREE_TYPE (off
), 15));
18330 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
18331 build_int_cst (TREE_TYPE (off
), -16));
18332 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
18337 /* Update ap.__[g|v]r_offs */
18338 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
18339 build_int_cst (TREE_TYPE (off
), rsize
));
18340 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
18344 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
18346 /* [cond2] if (ap.__[g|v]r_offs > 0) */
18347 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
18348 build_int_cst (TREE_TYPE (f_off
), 0));
18349 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
18351 /* String up: make sure the assignment happens before the use. */
18352 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
18353 COND_EXPR_ELSE (cond1
) = t
;
18355 /* Prepare the trees handling the argument that is passed on the stack;
18356 the top level node will store in ON_STACK. */
18357 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
18360 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
18361 t
= fold_build_pointer_plus_hwi (arg
, 15);
18362 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
18363 build_int_cst (TREE_TYPE (t
), -16));
18364 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
18368 /* Advance ap.__stack */
18369 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
18370 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
18371 build_int_cst (TREE_TYPE (t
), -8));
18372 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
18373 /* String up roundup and advance. */
18375 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
18376 /* String up with arg */
18377 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
18378 /* Big-endianness related address adjustment. */
18379 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
18380 && size
< UNITS_PER_WORD
)
18382 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
18383 size_int (UNITS_PER_WORD
- size
));
18384 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
18387 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
18388 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
18390 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
18393 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
18394 build_int_cst (TREE_TYPE (off
), adjust
));
18396 t
= fold_convert (sizetype
, t
);
18397 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
18401 /* type ha; // treat as "struct {ftype field[n];}"
18402 ... [computing offs]
18403 for (i = 0; i <nregs; ++i, offs += 16)
18404 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
18407 tree tmp_ha
, field_t
, field_ptr_t
;
18409 /* Declare a local variable. */
18410 tmp_ha
= create_tmp_var_raw (type
, "ha");
18411 gimple_add_tmp_var (tmp_ha
);
18413 /* Establish the base type. */
18417 field_t
= float_type_node
;
18418 field_ptr_t
= float_ptr_type_node
;
18421 field_t
= double_type_node
;
18422 field_ptr_t
= double_ptr_type_node
;
18425 field_t
= long_double_type_node
;
18426 field_ptr_t
= long_double_ptr_type_node
;
18429 field_t
= aarch64_fp16_type_node
;
18430 field_ptr_t
= aarch64_fp16_ptr_type_node
;
18433 field_t
= aarch64_bf16_type_node
;
18434 field_ptr_t
= aarch64_bf16_ptr_type_node
;
18439 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
18440 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
18441 field_ptr_t
= build_pointer_type (field_t
);
18448 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
18449 TREE_ADDRESSABLE (tmp_ha
) = 1;
18450 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
18452 t
= fold_convert (field_ptr_t
, addr
);
18453 t
= build2 (MODIFY_EXPR
, field_t
,
18454 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
18455 build1 (INDIRECT_REF
, field_t
, t
));
18457 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
18458 for (i
= 1; i
< nregs
; ++i
)
18460 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
18461 u
= fold_convert (field_ptr_t
, addr
);
18462 u
= build2 (MODIFY_EXPR
, field_t
,
18463 build2 (MEM_REF
, field_t
, tmp_ha
,
18464 build_int_cst (field_ptr_t
,
18466 int_size_in_bytes (field_t
)))),
18467 build1 (INDIRECT_REF
, field_t
, u
));
18468 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
18471 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
18472 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
18475 COND_EXPR_ELSE (cond2
) = t
;
18476 addr
= fold_convert (build_pointer_type (type
), cond1
);
18477 addr
= build_va_arg_indirect_ref (addr
);
18480 addr
= build_va_arg_indirect_ref (addr
);
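/* Editor's addition (illustrative only, kept under "#if 0"): pseudo-C for
   the tree built above, shown for a general-register argument of rounded
   size RSIZE; the FP/SIMD path is analogous with __vr_top, __vr_offs and
   16-byte units.  The struct repeats the __va_list sketch given earlier
   so the snippet stands alone.  */
#if 0
struct __va_list
{
  void *__stack;
  void *__gr_top;
  void *__vr_top;
  int __gr_offs;
  int __vr_offs;
};

static void *
va_arg_sketch (struct __va_list *ap, int rsize)
{
  int off = ap->__gr_offs;
  if (off >= 0)
    goto on_stack;		/* register save area already exhausted */
  ap->__gr_offs = off + rsize;
  if (ap->__gr_offs > 0)
    goto on_stack;		/* argument would straddle; use the stack */
  return (char *) ap->__gr_top + off;

 on_stack:
  {
    void *addr = ap->__stack;
    ap->__stack = (char *) ap->__stack + rsize;
    return addr;
  }
}
#endif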
18485 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
18488 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
18489 const function_arg_info
&arg
,
18490 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
18492 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
18493 CUMULATIVE_ARGS local_cum
;
18494 int gr_saved
= cfun
->va_list_gpr_size
;
18495 int vr_saved
= cfun
->va_list_fpr_size
;
18497 /* The caller has advanced CUM up to, but not beyond, the last named
18498 argument. Advance a local copy of CUM past the last "real" named
18499 argument, to find out how many registers are left over. */
18501 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
  /* Find out how many registers we need to save.
     Honor the tree-stdarg analysis results.  */
18505 if (cfun
->va_list_gpr_size
)
18506 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
18507 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
18508 if (cfun
->va_list_fpr_size
)
18509 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
18510 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
18514 gcc_assert (local_cum
.aapcs_nvrn
== 0);
18524 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
18525 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
18526 - gr_saved
* UNITS_PER_WORD
);
18527 mem
= gen_frame_mem (BLKmode
, ptr
);
18528 set_mem_alias_set (mem
, get_varargs_alias_set ());
18530 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
18535 /* We can't use move_block_from_reg, because it will use
18536 the wrong mode, storing D regs only. */
18537 machine_mode mode
= TImode
;
18538 int off
, i
, vr_start
;
18540 /* Set OFF to the offset from virtual_incoming_args_rtx of
18541 the first vector register. The VR save area lies below
18542 the GR one, and is aligned to 16 bytes. */
18543 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
18544 STACK_BOUNDARY
/ BITS_PER_UNIT
);
18545 off
-= vr_saved
* UNITS_PER_VREG
;
18547 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
18548 for (i
= 0; i
< vr_saved
; ++i
)
18552 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
18553 mem
= gen_frame_mem (mode
, ptr
);
18554 set_mem_alias_set (mem
, get_varargs_alias_set ());
18555 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
18556 off
+= UNITS_PER_VREG
;
18561 /* We don't save the size into *PRETEND_SIZE because we want to avoid
18562 any complication of having crtl->args.pretend_args_size changed. */
18563 cfun
->machine
->frame
.saved_varargs_size
18564 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
18565 STACK_BOUNDARY
/ BITS_PER_UNIT
)
18566 + vr_saved
* UNITS_PER_VREG
);
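/* Editor's addition (illustrative only, kept under "#if 0"): the frame
   bookkeeping above reduces to one expression.  gr_saved and vr_saved
   are register counts; UNITS_PER_WORD is 8 and UNITS_PER_VREG is 16 on
   this target.  */
#if 0
static long
saved_varargs_size_sketch (int gr_saved, int vr_saved)
{
  long gr_bytes = (gr_saved * 8 + 15) & -16;	/* GR area, rounded to 16 */
  long vr_bytes = vr_saved * 16;		/* VR area, already aligned */
  return gr_bytes + vr_bytes;
}
#endif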
18570 aarch64_conditional_register_usage (void)
18575 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
18578 call_used_regs
[i
] = 1;
18582 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
18585 call_used_regs
[i
] = 1;
18588 /* Only allow the FFR and FFRT to be accessed via special patterns. */
18589 CLEAR_HARD_REG_BIT (operand_reg_set
, FFR_REGNUM
);
18590 CLEAR_HARD_REG_BIT (operand_reg_set
, FFRT_REGNUM
);
18592 /* When tracking speculation, we need a couple of call-clobbered registers
18593 to track the speculation state. It would be nice to just use
18594 IP0 and IP1, but currently there are numerous places that just
18595 assume these registers are free for other uses (eg pointer
18596 authentication). */
18597 if (aarch64_track_speculation
)
18599 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
18600 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
18601 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
18602 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
18606 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
18609 aarch64_member_type_forces_blk (const_tree field_or_array
, machine_mode mode
)
18611 /* For records we're passed a FIELD_DECL, for arrays we're passed
18612 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
18613 const_tree type
= TREE_TYPE (field_or_array
);
18615 /* Assign BLKmode to anything that contains multiple SVE predicates.
18616 For structures, the "multiple" case is indicated by MODE being
18618 unsigned int num_zr
, num_pr
;
18619 if (aarch64_sve::builtin_type_p (type
, &num_zr
, &num_pr
) && num_pr
!= 0)
18621 if (TREE_CODE (field_or_array
) == ARRAY_TYPE
)
18622 return !simple_cst_equal (TYPE_SIZE (field_or_array
),
18624 return mode
== VOIDmode
;
18627 return default_member_type_forces_blk (field_or_array
, mode
);
18630 /* Bitmasks that indicate whether earlier versions of GCC would have
18631 taken a different path through the ABI logic. This should result in
18632 a -Wpsabi warning if the earlier path led to a different ABI decision.
18634 WARN_PSABI_EMPTY_CXX17_BASE
18635 Indicates that the type includes an artificial empty C++17 base field
18636 that, prior to GCC 10.1, would prevent the type from being treated as
18637 a HFA or HVA. See PR94383 for details.
18639 WARN_PSABI_NO_UNIQUE_ADDRESS
18640 Indicates that the type includes an empty [[no_unique_address]] field
18641 that, prior to GCC 10.1, would prevent the type from being treated as
18643 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE
= 1U << 0;
18644 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS
= 1U << 1;
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.

   The WARN_PSABI_FLAGS argument allows the caller to check whether this
   function has changed its behavior relative to earlier versions of GCC.
   Normally the argument should be nonnull and point to a zero-initialized
   variable.  The function then records whether the ABI decision might
   be affected by a known fix to the ABI logic, setting the associated
   WARN_PSABI_* bits if so.

   When the argument is instead a null pointer, the function tries to
   simulate the behavior of GCC before all such ABI fixes were made.
   This is useful to check whether the function returns something
   different after the ABI fixes.  */
18664 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
,
18665 unsigned int *warn_psabi_flags
)
18668 HOST_WIDE_INT size
;
18670 if (aarch64_sve::builtin_type_p (type
))
18673 switch (TREE_CODE (type
))
18676 mode
= TYPE_MODE (type
);
18677 if (mode
!= DFmode
&& mode
!= SFmode
18678 && mode
!= TFmode
&& mode
!= HFmode
)
18681 if (*modep
== VOIDmode
)
18684 if (*modep
== mode
)
18690 mode
= TYPE_MODE (TREE_TYPE (type
));
18691 if (mode
!= DFmode
&& mode
!= SFmode
18692 && mode
!= TFmode
&& mode
!= HFmode
)
18695 if (*modep
== VOIDmode
)
18698 if (*modep
== mode
)
18704 /* Use V2SImode and V4SImode as representatives of all 64-bit
18705 and 128-bit vector types. */
18706 size
= int_size_in_bytes (type
);
18719 if (*modep
== VOIDmode
)
18722 /* Vector modes are considered to be opaque: two vectors are
18723 equivalent for the purposes of being homogeneous aggregates
18724 if they are the same size. */
18725 if (*modep
== mode
)
18733 tree index
= TYPE_DOMAIN (type
);
18735 /* Can't handle incomplete types nor sizes that are not
18737 if (!COMPLETE_TYPE_P (type
)
18738 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
18741 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
,
18745 || !TYPE_MAX_VALUE (index
)
18746 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
18747 || !TYPE_MIN_VALUE (index
)
18748 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
18752 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
18753 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
18755 /* There must be no padding. */
18756 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
18757 count
* GET_MODE_BITSIZE (*modep
)))
18769 /* Can't handle incomplete types nor sizes that are not
18771 if (!COMPLETE_TYPE_P (type
)
18772 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
18775 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
18777 if (TREE_CODE (field
) != FIELD_DECL
)
18780 if (DECL_FIELD_ABI_IGNORED (field
))
18782 /* See whether this is something that earlier versions of
18783 GCC failed to ignore. */
18785 if (lookup_attribute ("no_unique_address",
18786 DECL_ATTRIBUTES (field
)))
18787 flag
= WARN_PSABI_NO_UNIQUE_ADDRESS
;
18788 else if (cxx17_empty_base_field_p (field
))
18789 flag
= WARN_PSABI_EMPTY_CXX17_BASE
;
18791 /* No compatibility problem. */
18794 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
18795 if (warn_psabi_flags
)
18797 *warn_psabi_flags
|= flag
;
18802 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
18806 count
+= sub_count
;
18809 /* There must be no padding. */
18810 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
18811 count
* GET_MODE_BITSIZE (*modep
)))
18818 case QUAL_UNION_TYPE
:
18820 /* These aren't very interesting except in a degenerate case. */
18825 /* Can't handle incomplete types nor sizes that are not
18827 if (!COMPLETE_TYPE_P (type
)
18828 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
18831 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
18833 if (TREE_CODE (field
) != FIELD_DECL
)
18836 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
18840 count
= count
> sub_count
? count
: sub_count
;
18843 /* There must be no padding. */
18844 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
18845 count
* GET_MODE_BITSIZE (*modep
)))
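/* Editor's addition (illustrative only, kept under "#if 0"): example
   types of the kind walked above, annotated with the outcomes the
   AAPCS64 HFA/HVA rules imply.  The annotations are hand-derived, not
   compiler output.  */
#if 0
typedef float v4sf __attribute__ ((vector_size (16)));

struct hfa3    { float x, y, z; };	/* 3 x SFmode: an HFA */
struct hva2    { v4sf a, b; };		/* 2 x V4SFmode: an HVA */
struct not_hfa { double d; float f; };	/* mixed element modes: count -1 */
struct too_big { float f[5]; };		/* count 5 > HA_MAX_NUM_FLDS (4),
					   so the caller rejects it */
#endif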
18858 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
18859 type as described in AAPCS64 \S 4.1.2.
18861 See the comment above aarch64_composite_type_p for the notes on MODE. */
18864 aarch64_short_vector_p (const_tree type
,
18867 poly_int64 size
= -1;
18869 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
18871 if (aarch64_sve::builtin_type_p (type
))
18873 size
= int_size_in_bytes (type
);
18875 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
18876 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
18878 /* Rely only on the type, not the mode, when processing SVE types. */
18879 if (type
&& aarch64_some_values_include_pst_objects_p (type
))
18880 /* Leave later code to report an error if SVE is disabled. */
18881 gcc_assert (!TARGET_SVE
|| aarch64_sve_mode_p (mode
));
18883 size
= GET_MODE_SIZE (mode
);
18885 if (known_eq (size
, 8) || known_eq (size
, 16))
18887 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
18888 they are being treated as scalable AAPCS64 types. */
18889 gcc_assert (!aarch64_sve_mode_p (mode
));
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */
18913 aarch64_composite_type_p (const_tree type
,
18916 if (aarch64_short_vector_p (type
, mode
))
18919 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
18922 if (mode
== BLKmode
18923 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
18924 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a
   homogeneous floating-point aggregate or a homogeneous short-vector
   aggregate.

   SILENT_P is true if the function should refrain from reporting any
   diagnostics.  This should only be used if the caller is certain that
   any ABI decisions would eventually come through this function with
   SILENT_P set to false.  */
18945 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
18947 machine_mode
*base_mode
,
18952 if (is_ha
!= NULL
) *is_ha
= false;
18954 machine_mode new_mode
= VOIDmode
;
18955 bool composite_p
= aarch64_composite_type_p (type
, mode
);
18957 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
18958 || aarch64_short_vector_p (type
, mode
))
18963 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
18965 if (is_ha
!= NULL
) *is_ha
= true;
18967 new_mode
= GET_MODE_INNER (mode
);
18969 else if (type
&& composite_p
)
18971 unsigned int warn_psabi_flags
= 0;
18972 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
,
18973 &warn_psabi_flags
);
18974 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
18976 static unsigned last_reported_type_uid
;
18977 unsigned uid
= TYPE_UID (TYPE_MAIN_VARIANT (type
));
18981 && warn_psabi_flags
18982 && uid
!= last_reported_type_uid
18983 && ((alt
= aapcs_vfp_sub_candidate (type
, &new_mode
, NULL
))
18987 = CHANGES_ROOT_URL
"gcc-10/changes.html#empty_base";
18988 gcc_assert (alt
== -1);
18989 last_reported_type_uid
= uid
;
18990 /* Use TYPE_MAIN_VARIANT to strip any redundant const
18992 if (warn_psabi_flags
& WARN_PSABI_NO_UNIQUE_ADDRESS
)
18993 inform (input_location
, "parameter passing for argument of "
18994 "type %qT with %<[[no_unique_address]]%> members "
18995 "changed %{in GCC 10.1%}",
18996 TYPE_MAIN_VARIANT (type
), url
);
18997 else if (warn_psabi_flags
& WARN_PSABI_EMPTY_CXX17_BASE
)
18998 inform (input_location
, "parameter passing for argument of "
18999 "type %qT when C++17 is enabled changed to match "
19000 "C++14 %{in GCC 10.1%}",
19001 TYPE_MAIN_VARIANT (type
), url
);
19004 if (is_ha
!= NULL
) *is_ha
= true;
19013 gcc_assert (!aarch64_sve_mode_p (new_mode
));
19014 *base_mode
= new_mode
;
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}

/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
19035 /* Return the full-width SVE vector mode for element mode MODE, if one
19038 aarch64_full_sve_mode (scalar_mode mode
)
19057 return VNx16QImode
;
19059 return opt_machine_mode ();
19063 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
19066 aarch64_vq_mode (scalar_mode mode
)
19087 return opt_machine_mode ();
19091 /* Return appropriate SIMD container
19092 for MODE within a vector of WIDTH bits. */
19093 static machine_mode
19094 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
19097 && maybe_ne (width
, 128)
19098 && known_eq (width
, BITS_PER_SVE_VECTOR
))
19099 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
19101 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
19104 if (known_eq (width
, 128))
19105 return aarch64_vq_mode (mode
).else_mode (word_mode
);
19128 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
19129 and return whether the SVE mode should be preferred over the
19130 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
19132 aarch64_cmp_autovec_modes (machine_mode sve_m
, machine_mode asimd_m
)
19134 /* Take into account the aarch64-autovec-preference param if non-zero. */
19135 bool only_asimd_p
= aarch64_autovec_preference
== 1;
19136 bool only_sve_p
= aarch64_autovec_preference
== 2;
19143 /* The preference in case of a tie in costs. */
19144 bool prefer_asimd
= aarch64_autovec_preference
== 3;
19145 bool prefer_sve
= aarch64_autovec_preference
== 4;
19147 aarch64_sve_vector_bits_enum tune_width
= aarch64_tune_params
.sve_width
;
19149 poly_int64 nunits_sve
= GET_MODE_NUNITS (sve_m
);
19150 poly_int64 nunits_asimd
= GET_MODE_NUNITS (asimd_m
);
19151 /* If the CPU information does not have an SVE width registered use the
19152 generic poly_int comparison that prefers SVE. If a preference is
19153 explicitly requested avoid this path. */
19154 if (tune_width
== SVE_SCALABLE
19157 return maybe_gt (nunits_sve
, nunits_asimd
);
19159 /* Otherwise estimate the runtime width of the modes involved. */
19160 HOST_WIDE_INT est_sve
= estimated_poly_value (nunits_sve
);
19161 HOST_WIDE_INT est_asimd
= estimated_poly_value (nunits_asimd
);
19163 /* Preferring SVE means picking it first unless the Advanced SIMD mode
19164 is clearly wider. */
19166 return est_sve
>= est_asimd
;
19167 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
19168 is clearly wider. */
19170 return est_sve
> est_asimd
;
19172 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
19173 return est_sve
> est_asimd
;
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  /* Take into account explicit auto-vectorization ISA preferences through
     aarch64_cmp_autovec_modes.  */
  if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
    return aarch64_full_sve_mode (mode).else_mode (word_mode);
  if (TARGET_SIMD)
    return aarch64_vq_mode (mode).else_mode (word_mode);
  return word_mode;
}
19189 /* Return a list of possible vector sizes for the vectorizer
19190 to iterate over. */
19191 static unsigned int
19192 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
19194 static const machine_mode sve_modes
[] = {
19195 /* Try using full vectors for all element types. */
19198 /* Try using 16-bit containers for 8-bit elements and full vectors
19199 for wider elements. */
19202 /* Try using 32-bit containers for 8-bit and 16-bit elements and
19203 full vectors for wider elements. */
19206 /* Try using 64-bit containers for all element types. */
19210 static const machine_mode advsimd_modes
[] = {
19211 /* Try using 128-bit vectors for all element types. */
19214 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
19215 for wider elements. */
19218 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
19219 for wider elements.
19221 TODO: We could support a limited form of V4QImode too, so that
19222 we use 32-bit vectors for 8-bit elements. */
19225 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
19226 for 64-bit elements.
19228 TODO: We could similarly support limited forms of V2QImode and V2HImode
19233 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
19236 - If we can't use N-byte Advanced SIMD vectors then the placement
19237 doesn't matter; we'll just continue as though the Advanced SIMD
19238 entry didn't exist.
19240 - If an SVE main loop with N bytes ends up being cheaper than an
19241 Advanced SIMD main loop with N bytes then by default we'll replace
19242 the Advanced SIMD version with the SVE one.
19244 - If an Advanced SIMD main loop with N bytes ends up being cheaper
19245 than an SVE main loop with N bytes then by default we'll try to
19246 use the SVE loop to vectorize the epilogue instead. */
19248 bool only_asimd_p
= aarch64_autovec_preference
== 1;
19249 bool only_sve_p
= aarch64_autovec_preference
== 2;
19251 unsigned int sve_i
= (TARGET_SVE
&& !only_asimd_p
) ? 0 : ARRAY_SIZE (sve_modes
);
19252 unsigned int advsimd_i
= 0;
19254 while (!only_sve_p
&& advsimd_i
< ARRAY_SIZE (advsimd_modes
))
19256 if (sve_i
< ARRAY_SIZE (sve_modes
)
19257 && aarch64_cmp_autovec_modes (sve_modes
[sve_i
],
19258 advsimd_modes
[advsimd_i
]))
19259 modes
->safe_push (sve_modes
[sve_i
++]);
19261 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
19263 while (sve_i
< ARRAY_SIZE (sve_modes
))
19264 modes
->safe_push (sve_modes
[sve_i
++]);
19266 unsigned int flags
= 0;
19267 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
19268 can compare SVE against Advanced SIMD and so that we can compare
19269 multiple SVE vectorization approaches against each other. There's
19270 not really any point doing this for Advanced SIMD only, since the
19271 first mode that works should always be the best. */
19272 if (TARGET_SVE
&& aarch64_sve_compare_costs
)
19273 flags
|= VECT_COMPARE_COSTS
;
19277 /* Implement TARGET_MANGLE_TYPE. */
19279 static const char *
19280 aarch64_mangle_type (const_tree type
)
19282 /* The AArch64 ABI documents say that "__va_list" has to be
19283 mangled as if it is in the "std" namespace. */
19284 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
19285 return "St9__va_list";
19287 /* Half-precision floating point types. */
19288 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
19290 if (TYPE_MODE (type
) == BFmode
)
19296 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
19298 if (TYPE_NAME (type
) != NULL
)
19301 if ((res
= aarch64_general_mangle_builtin_type (type
))
19302 || (res
= aarch64_sve::mangle_builtin_type (type
)))
19306 /* Use the default mangling. */
19310 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
19313 aarch64_verify_type_context (location_t loc
, type_context_kind context
,
19314 const_tree type
, bool silent_p
)
19316 return aarch64_sve::verify_type_context (loc
, context
, type
, silent_p
);
19319 /* Find the first rtx_insn before insn that will generate an assembly
19323 aarch64_prev_real_insn (rtx_insn
*insn
)
19330 insn
= prev_real_insn (insn
);
19332 while (insn
&& recog_memoized (insn
) < 0);
19338 is_madd_op (enum attr_type t1
)
19341 /* A number of these may be AArch32 only. */
19342 enum attr_type mlatypes
[] = {
19343 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
19344 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
19345 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
19348 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
19350 if (t1
== mlatypes
[i
])
19357 /* Check if there is a register dependency between a load and the insn
19358 for which we hold recog_data. */
19361 dep_between_memop_and_curr (rtx memop
)
19366 gcc_assert (GET_CODE (memop
) == SET
);
19368 if (!REG_P (SET_DEST (memop
)))
19371 load_reg
= SET_DEST (memop
);
19372 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
19374 rtx operand
= recog_data
.operand
[opno
];
19375 if (REG_P (operand
)
19376 && reg_overlap_mentioned_p (load_reg
, operand
))
19384 /* When working around the Cortex-A53 erratum 835769,
19385 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
19386 instruction and has a preceding memory instruction such that a NOP
19387 should be inserted between them. */
19390 aarch64_madd_needs_nop (rtx_insn
* insn
)
19392 enum attr_type attr_type
;
19396 if (!TARGET_FIX_ERR_A53_835769
)
19399 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
19402 attr_type
= get_attr_type (insn
);
19403 if (!is_madd_op (attr_type
))
19406 prev
= aarch64_prev_real_insn (insn
);
19407 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
19408 Restore recog state to INSN to avoid state corruption. */
19409 extract_constrain_insn_cached (insn
);
19411 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
19414 body
= single_set (prev
);
19416 /* If the previous insn is a memory op and there is no dependency between
19417 it and the DImode madd, emit a NOP between them. If body is NULL then we
19418 have a complex memory operation, probably a load/store pair.
19419 Be conservative for now and emit a NOP. */
19420 if (GET_MODE (recog_data
.operand
[0]) == DImode
19421 && (!body
|| !dep_between_memop_and_curr (body
)))
/* Implement FINAL_PRESCAN_INSN.  */
void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
}
19449 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
19450 when applied to mode MODE. Negate X first if NEGATE_P is true. */
19453 aarch64_sve_arith_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
19455 rtx elt
= unwrap_const_vec_duplicate (x
);
19456 if (!CONST_INT_P (elt
))
19459 HOST_WIDE_INT val
= INTVAL (elt
);
19462 val
&= GET_MODE_MASK (GET_MODE_INNER (mode
));
19465 return IN_RANGE (val
, 0, 0xff);
19466 return IN_RANGE (val
, 0, 0xff00);
19469 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
19470 instructions when applied to mode MODE. Negate X first if NEGATE_P
19474 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
19476 if (!aarch64_sve_arith_immediate_p (mode
, x
, negate_p
))
19479 /* After the optional negation, the immediate must be nonnegative.
19480 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
19481 instead of SQADD Zn.B, Zn.B, #129. */
19482 rtx elt
= unwrap_const_vec_duplicate (x
);
19483 return negate_p
== (INTVAL (elt
) < 0);
19486 /* Return true if X is a valid immediate operand for an SVE logical
19487 instruction such as AND. */
19490 aarch64_sve_bitmask_immediate_p (rtx x
)
19494 return (const_vec_duplicate_p (x
, &elt
)
19495 && CONST_INT_P (elt
)
19496 && aarch64_bitmask_imm (INTVAL (elt
),
19497 GET_MODE_INNER (GET_MODE (x
))));
19500 /* Return true if X is a valid immediate for the SVE DUP and CPY
19504 aarch64_sve_dup_immediate_p (rtx x
)
19506 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
19507 if (!CONST_INT_P (x
))
19510 HOST_WIDE_INT val
= INTVAL (x
);
19512 return IN_RANGE (val
, -0x80, 0x7f);
19513 return IN_RANGE (val
, -0x8000, 0x7f00);
19516 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
19517 SIGNED_P says whether the operand is signed rather than unsigned. */
19520 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
19522 x
= unwrap_const_vec_duplicate (x
);
19523 return (CONST_INT_P (x
)
19525 ? IN_RANGE (INTVAL (x
), -16, 15)
19526 : IN_RANGE (INTVAL (x
), 0, 127)));
19529 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
19530 instruction. Negate X first if NEGATE_P is true. */
19533 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
19538 if (!const_vec_duplicate_p (x
, &elt
)
19539 || !CONST_DOUBLE_P (elt
))
19542 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
19545 r
= real_value_negate (&r
);
19547 if (real_equal (&r
, &dconst1
))
19549 if (real_equal (&r
, &dconsthalf
))
19554 /* Return true if X is a valid immediate operand for an SVE FMUL
19558 aarch64_sve_float_mul_immediate_p (rtx x
)
19562 return (const_vec_duplicate_p (x
, &elt
)
19563 && CONST_DOUBLE_P (elt
)
19564 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
19565 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
19568 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
19569 for the Advanced SIMD operation described by WHICH and INSN. If INFO
19570 is nonnull, use it to describe valid immediates. */
19572 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
19573 simd_immediate_info
*info
,
19574 enum simd_immediate_check which
,
19575 simd_immediate_info::insn_type insn
)
19577 /* Try a 4-byte immediate with LSL. */
19578 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
19579 if ((val32
& (0xff << shift
)) == val32
)
19582 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
19583 simd_immediate_info::LSL
, shift
);
19587 /* Try a 2-byte immediate with LSL. */
19588 unsigned int imm16
= val32
& 0xffff;
19589 if (imm16
== (val32
>> 16))
19590 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
19591 if ((imm16
& (0xff << shift
)) == imm16
)
19594 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
19595 simd_immediate_info::LSL
, shift
);
  /* Try a 4-byte immediate with MSL, except for cases that MVN
     can handle.  */
19601 if (which
== AARCH64_CHECK_MOV
)
19602 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
19604 unsigned int low
= (1 << shift
) - 1;
19605 if (((val32
& (0xff << shift
)) | low
) == val32
)
19608 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
19609 simd_immediate_info::MSL
, shift
);
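/* Editor's addition (illustrative only, kept under "#if 0"): a standalone
   re-implementation of the 32-bit LSL/MSL searches above, with worked
   bit patterns.  0x00ab0000 matches LSL #16; 0x0000abff matches MSL #8
   (the "shift ones" form used by MOVI/MVNI).  */
#if 0
#include <stdio.h>
#include <stdint.h>

static void
classify (uint32_t val32)
{
  /* 4-byte immediate with LSL: a single byte shifted by 0/8/16/24.  */
  for (unsigned shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xffu << shift)) == val32)
      {
	printf ("0x%08x: MOVI #0x%x, LSL #%u\n", val32, val32 >> shift, shift);
	return;
      }
  /* 4-byte immediate with MSL: a single byte shifted in over ones.  */
  for (unsigned shift = 8; shift < 24; shift += 8)
    {
      uint32_t low = (1u << shift) - 1;
      if (((val32 & (0xffu << shift)) | low) == val32)
	{
	  printf ("0x%08x: MOVI #0x%x, MSL #%u\n", val32, val32 >> shift,
		  shift);
	  return;
	}
    }
  printf ("0x%08x: not a 32-bit LSL/MSL immediate\n", val32);
}

int
main (void)
{
  classify (0x00ab0000);	/* LSL #16 */
  classify (0x0000abff);	/* MSL #8  */
  classify (0x00abcdef);	/* neither */
  return 0;
}
#endif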
19617 /* Return true if replicating VAL64 is a valid immediate for the
19618 Advanced SIMD operation described by WHICH. If INFO is nonnull,
19619 use it to describe valid immediates. */
19621 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
19622 simd_immediate_info
*info
,
19623 enum simd_immediate_check which
)
19625 unsigned int val32
= val64
& 0xffffffff;
19626 unsigned int val16
= val64
& 0xffff;
19627 unsigned int val8
= val64
& 0xff;
19629 if (val32
== (val64
>> 32))
19631 if ((which
& AARCH64_CHECK_ORR
) != 0
19632 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
19633 simd_immediate_info::MOV
))
19636 if ((which
& AARCH64_CHECK_BIC
) != 0
19637 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
19638 simd_immediate_info::MVN
))
19641 /* Try using a replicated byte. */
19642 if (which
== AARCH64_CHECK_MOV
19643 && val16
== (val32
>> 16)
19644 && val8
== (val16
>> 8))
19647 *info
= simd_immediate_info (QImode
, val8
);
19652 /* Try using a bit-to-bytemask. */
19653 if (which
== AARCH64_CHECK_MOV
)
19656 for (i
= 0; i
< 64; i
+= 8)
19658 unsigned char byte
= (val64
>> i
) & 0xff;
19659 if (byte
!= 0 && byte
!= 0xff)
19665 *info
= simd_immediate_info (DImode
, val64
);
19672 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
19673 instruction. If INFO is nonnull, use it to describe valid immediates. */
19676 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
19677 simd_immediate_info
*info
)
19679 scalar_int_mode mode
= DImode
;
19680 unsigned int val32
= val64
& 0xffffffff;
19681 if (val32
== (val64
>> 32))
19684 unsigned int val16
= val32
& 0xffff;
19685 if (val16
== (val32
>> 16))
19688 unsigned int val8
= val16
& 0xff;
19689 if (val8
== (val16
>> 8))
19693 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
19694 if (IN_RANGE (val
, -0x80, 0x7f))
19696 /* DUP with no shift. */
19698 *info
= simd_immediate_info (mode
, val
);
19701 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
19703 /* DUP with LSL #8. */
19705 *info
= simd_immediate_info (mode
, val
);
19708 if (aarch64_bitmask_imm (val64
, mode
))
19712 *info
= simd_immediate_info (mode
, val
);
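/* Editor's addition (illustrative only, kept under "#if 0"): the two DUP
   forms checked above, restated as a standalone classifier with worked
   values.  The DUPM bitmask case is omitted here; it reuses
   aarch64_bitmask_imm.  */
#if 0
#include <stdio.h>
#include <stdint.h>

static void
classify_sve_dup (int64_t val)
{
  if (val >= -0x80 && val <= 0x7f)
    printf ("%lld: DUP with no shift\n", (long long) val);
  else if ((val & 0xff) == 0 && val >= -0x8000 && val <= 0x7f00)
    printf ("%lld: DUP with LSL #8 (immediate %lld)\n",
	    (long long) val, (long long) (val >> 8));
  else
    printf ("%lld: not a DUP immediate\n", (long long) val);
}

int
main (void)
{
  classify_sve_dup (37);	/* DUP, no shift */
  classify_sve_dup (-4864);	/* -19 << 8: DUP with LSL #8 */
  classify_sve_dup (300);	/* neither */
  return 0;
}
#endif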
19718 /* Return true if X is an UNSPEC_PTRUE constant of the form:
19720 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
19722 where PATTERN is the svpattern as a CONST_INT and where ZERO
19723 is a zero constant of the required PTRUE mode (which can have
19724 fewer elements than X's mode, if zero bits are significant).
19726 If so, and if INFO is nonnull, describe the immediate in INFO. */
19728 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
19730 if (GET_CODE (x
) != CONST
)
19734 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
19739 aarch64_svpattern pattern
19740 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
19741 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
19742 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
19743 *info
= simd_immediate_info (int_mode
, pattern
);
/* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
   it to describe valid immediates.  */
static bool
aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
{
  if (aarch64_sve_ptrue_svpattern_p (x, info))
    return true;

  if (x == CONST0_RTX (GET_MODE (x)))
    {
      if (info)
	*info = simd_immediate_info (DImode, 0);
      return true;
    }

  /* Analyze the value as a VNx16BImode.  This should be relatively
     efficient, since rtx_vector_builder has enough built-in capacity
     to store all VLA predicate constants without needing the heap.  */
  rtx_vector_builder builder;
  if (!aarch64_get_sve_pred_bits (builder, x))
    return false;

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
      if (pattern != AARCH64_NUM_SVPATTERNS)
	{
	  if (info)
	    {
	      scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
	      *info = simd_immediate_info (int_mode, pattern);
	    }
	  return true;
	}
    }
  return false;
}
/* Return true if OP is a valid SIMD immediate for the operation
   described by WHICH.  If INFO is nonnull, use it to describe valid
   immediates.  */
bool
aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
			      enum simd_immediate_check which)
{
  machine_mode mode = GET_MODE (op);
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  if (vec_flags & VEC_SVE_PRED)
    return aarch64_sve_pred_valid_immediate (op, info);

  scalar_mode elt_mode = GET_MODE_INNER (mode);
  rtx base, step;
  unsigned int n_elts;
  if (GET_CODE (op) == CONST_VECTOR
      && CONST_VECTOR_DUPLICATE_P (op))
    n_elts = CONST_VECTOR_NPATTERNS (op);
  else if ((vec_flags & VEC_SVE_DATA)
	   && const_vec_series_p (op, &base, &step))
    {
      gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
      if (!aarch64_sve_index_immediate_p (base)
	  || !aarch64_sve_index_immediate_p (step))
	return false;

      if (info)
	{
	  /* Get the corresponding container mode.  E.g. an INDEX on V2SI
	     should yield two integer values per 128-bit block, meaning
	     that we need to treat it in the same way as V2DI and then
	     ignore the upper 32 bits of each element.  */
	  elt_mode = aarch64_sve_container_int_mode (mode);
	  *info = simd_immediate_info (elt_mode, base, step);
	}
      return true;
    }
  else if (GET_CODE (op) == CONST_VECTOR
	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
    /* N_ELTS set above.  */;
  else
    return false;

  scalar_float_mode elt_float_mode;
  if (n_elts == 1
      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
      if (aarch64_float_const_zero_rtx_p (elt)
	  || aarch64_float_const_representable_p (elt))
	{
	  if (info)
	    *info = simd_immediate_info (elt_float_mode, elt);
	  return true;
	}
    }

  /* If all elements in an SVE vector have the same value, we have a free
     choice between using the element mode and using the container mode.
     Using the element mode means that unused parts of the vector are
     duplicates of the used elements, while using the container mode means
     that the unused parts are an extension of the used elements.  Using the
     element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
     for its container mode VNx4SI while 0x00000101 isn't.

     If not all elements in an SVE vector have the same value, we need the
     transition from one element to the next to occur at container boundaries.
     E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
     in the same way as a VNx4SI containing { 1, 2, 3, 4 }.  */
  scalar_int_mode elt_int_mode;
  if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
    elt_int_mode = aarch64_sve_container_int_mode (mode);
  else
    elt_int_mode = int_mode_for_mode (elt_mode).require ();

  unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
  if (elt_size > 8)
    return false;

  /* Expand the vector constant out into a byte vector, with the least
     significant byte of the register first.  */
  auto_vec<unsigned char, 16> bytes;
  bytes.reserve (n_elts * elt_size);
  for (unsigned int i = 0; i < n_elts; i++)
    {
      /* The vector is provided in gcc endian-neutral fashion.
	 For aarch64_be Advanced SIMD, it must be laid out in the vector
	 register in reverse order.  */
      bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
      rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);

      if (elt_mode != elt_int_mode)
	elt = gen_lowpart (elt_int_mode, elt);

      if (!CONST_INT_P (elt))
	return false;

      unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
      for (unsigned int byte = 0; byte < elt_size; byte++)
	{
	  bytes.quick_push (elt_val & 0xff);
	  elt_val >>= BITS_PER_UNIT;
	}
    }

  /* The immediate must repeat every eight bytes.  */
  unsigned int nbytes = bytes.length ();
  for (unsigned i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Get the repeating 8-byte value as an integer.  No endian correction
     is needed here because bytes is already in lsb-first order.  */
  unsigned HOST_WIDE_INT val64 = 0;
  for (unsigned int i = 0; i < 8; i++)
    val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
	      << (i * BITS_PER_UNIT));

  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_valid_immediate (val64, info);
  else
    return aarch64_advsimd_valid_immediate (val64, info, which);
}
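
/* Illustrative, self-contained sketch (not part of the original sources;
   the helper name below is hypothetical and unused elsewhere).  It restates
   the "repeat every eight bytes" reduction above with plain host types
   instead of auto_vec/HOST_WIDE_INT: given the lsb-first byte image of a
   vector constant, it checks the 8-byte periodicity and folds the bytes
   into the single 64-bit value that the Advanced SIMD/SVE checks consume.  */
static bool
example_repeating_val64 (const unsigned char *bytes, unsigned int nbytes,
			 unsigned long long *val64)
{
  /* Every byte must equal the byte eight positions earlier.  */
  for (unsigned int i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Assemble the repeating 8-byte value, least significant byte first.  */
  unsigned long long val = 0;
  for (unsigned int i = 0; i < 8; i++)
    val |= (unsigned long long) bytes[i % nbytes] << (i * 8);
  *val64 = val;
  return true;
}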
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
   has a step in the range of INDEX.  Return the index expression if so,
   otherwise return null.  */
rtx
aarch64_check_zero_based_sve_index_immediate (rtx x)
{
  rtx base, step;
  if (const_vec_series_p (x, &base, &step)
      && base == const0_rtx
      && aarch64_sve_index_immediate_p (step))
    return step;
  return NULL_RTX;
}
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  x = unwrap_const_vec_duplicate (x);
  if (!CONST_INT_P (x))
    return false;
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return IN_RANGE (INTVAL (x), 0, bit_width - 1);
  else
    return IN_RANGE (INTVAL (x), 1, bit_width);
}
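
/* Illustrative sketch (hypothetical helper, not used elsewhere): the same
   range check written against plain integers, to make the asymmetry
   explicit -- a left-shift immediate may be 0..bits-1, while a right-shift
   immediate may be 1..bits, matching the SHL and SSHR/USHR encodings.  */
static bool
example_shift_imm_in_range (long long imm, int element_bits, bool left)
{
  return left ? (imm >= 0 && imm <= element_bits - 1)
	      : (imm >= 1 && imm <= element_bits);
}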
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */
rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
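
/* Illustrative sketch (hypothetical, plain integers, assumes width < 64):
   the mask built above for a zero_extract of WIDTH bits at position POS.
   For example, width 8 at position 16 yields 0x0000000000ff0000.  */
static unsigned long long
example_zextract_mask (unsigned int width, unsigned int pos)
{
  unsigned long long mask = (1ULL << width) - 1;
  return mask << pos;
}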
19959 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
19961 if (GET_CODE (x
) == HIGH
19962 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
19965 if (CONST_INT_P (x
))
19968 if (VECTOR_MODE_P (GET_MODE (x
)))
19970 /* Require predicate constants to be VNx16BI before RA, so that we
19971 force everything to have a canonical form. */
19972 if (!lra_in_progress
19973 && !reload_completed
19974 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
19975 && GET_MODE (x
) != VNx16BImode
)
19978 return aarch64_simd_valid_immediate (x
, NULL
);
19981 x
= strip_salt (x
);
19982 if (SYMBOL_REF_P (x
) && mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
19985 if (TARGET_SVE
&& aarch64_sve_cnt_immediate_p (x
))
19988 return aarch64_classify_symbolic_expression (x
)
19989 == SYMBOL_TINY_ABSOLUTE
;
19992 /* Return a const_int vector of VAL. */
19994 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
19996 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
19997 return gen_const_vec_duplicate (mode
, c
);
20000 /* Check OP is a legal scalar immediate for the MOVI instruction. */
20003 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
20005 machine_mode vmode
;
20007 vmode
= aarch64_simd_container_mode (mode
, 64);
20008 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
20009 return aarch64_simd_valid_immediate (op_v
, NULL
);
20012 /* Construct and return a PARALLEL RTX vector with elements numbering the
20013 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
20014 the vector - from the perspective of the architecture. This does not
20015 line up with GCC's perspective on lane numbers, so we end up with
20016 different masks depending on our target endian-ness. The diagram
20017 below may help. We must draw the distinction when building masks
20018 which select one half of the vector. An instruction selecting
20019 architectural low-lanes for a big-endian target, must be described using
20020 a mask selecting GCC high-lanes.
20022 Big-Endian Little-Endian
20024 GCC 0 1 2 3 3 2 1 0
20025 | x | x | x | x | | x | x | x | x |
20026 Architecture 3 2 1 0 3 2 1 0
20028 Low Mask: { 2, 3 } { 0, 1 }
20029 High Mask: { 0, 1 } { 2, 3 }
20031 MODE Is the mode of the vector and NUNITS is the number of units in it. */
20034 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
20036 rtvec v
= rtvec_alloc (nunits
/ 2);
20037 int high_base
= nunits
/ 2;
20043 if (BYTES_BIG_ENDIAN
)
20044 base
= high
? low_base
: high_base
;
20046 base
= high
? high_base
: low_base
;
20048 for (i
= 0; i
< nunits
/ 2; i
++)
20049 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
20051 t1
= gen_rtx_PARALLEL (mode
, v
);
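
/* Illustrative sketch (hypothetical helper, plain integers): the lane
   numbering rule from the diagram above.  For NUNITS == 4 it returns base 2
   for the architectural low half on big-endian and base 0 on little-endian,
   matching the Low Mask row of the diagram; the mask is then
   { base, base + 1, ... }.  */
static int
example_par_cnst_half_base (int nunits, bool high, bool big_endian)
{
  int low_base = 0;
  int high_base = nunits / 2;
  if (big_endian)
    return high ? low_base : high_base;
  return high ? high_base : low_base;
}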
20055 /* Check OP for validity as a PARALLEL RTX vector with elements
20056 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
20057 from the perspective of the architecture. See the diagram above
20058 aarch64_simd_vect_par_cnst_half for more details. */
20061 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
20065 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
20068 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
20069 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
20070 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
20073 if (count_op
!= count_ideal
)
20076 for (i
= 0; i
< count_ideal
; i
++)
20078 rtx elt_op
= XVECEXP (op
, 0, i
);
20079 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
20081 if (!CONST_INT_P (elt_op
)
20082 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
20088 /* Return a PARALLEL containing NELTS elements, with element I equal
20089 to BASE + I * STEP. */
20092 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
20094 rtvec vec
= rtvec_alloc (nelts
);
20095 for (unsigned int i
= 0; i
< nelts
; ++i
)
20096 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
20097 return gen_rtx_PARALLEL (VOIDmode
, vec
);
20100 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
20101 series with step STEP. */
20104 aarch64_stepped_int_parallel_p (rtx op
, int step
)
20106 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
20109 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
20110 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
20111 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
20112 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
20118 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
20119 HIGH (exclusive). */
20121 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
20124 HOST_WIDE_INT lane
;
20125 gcc_assert (CONST_INT_P (operand
));
20126 lane
= INTVAL (operand
);
20128 if (lane
< low
|| lane
>= high
)
20131 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
20133 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */
rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
20146 /* Return TRUE if OP is a valid vector addressing mode. */
20149 aarch64_simd_mem_operand_p (rtx op
)
20151 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
20152 || REG_P (XEXP (op
, 0)));
20155 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
20158 aarch64_sve_ld1r_operand_p (rtx op
)
20160 struct aarch64_address_info addr
;
20164 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
20165 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
20166 && addr
.type
== ADDRESS_REG_IMM
20167 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
20170 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
20171 where the size of the read data is specified by `mode` and the size of the
20172 vector elements are specified by `elem_mode`. */
20174 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op
, machine_mode mode
,
20175 scalar_mode elem_mode
)
20177 struct aarch64_address_info addr
;
20179 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
20182 if (addr
.type
== ADDRESS_REG_IMM
)
20183 return offset_4bit_signed_scaled_p (mode
, addr
.const_offset
);
20185 if (addr
.type
== ADDRESS_REG_REG
)
20186 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
20191 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
20193 aarch64_sve_ld1rq_operand_p (rtx op
)
20195 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, TImode
,
20196 GET_MODE_INNER (GET_MODE (op
)));
20199 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
20200 accessing a vector where the element size is specified by `elem_mode`. */
20202 aarch64_sve_ld1ro_operand_p (rtx op
, scalar_mode elem_mode
)
20204 return aarch64_sve_ld1rq_ld1ro_operand_p (op
, OImode
, elem_mode
);
20207 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
20209 aarch64_sve_ldff1_operand_p (rtx op
)
20214 struct aarch64_address_info addr
;
20215 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
), false))
20218 if (addr
.type
== ADDRESS_REG_IMM
)
20219 return known_eq (addr
.const_offset
, 0);
20221 return addr
.type
== ADDRESS_REG_REG
;
20224 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
20226 aarch64_sve_ldnf1_operand_p (rtx op
)
20228 struct aarch64_address_info addr
;
20231 && aarch64_classify_address (&addr
, XEXP (op
, 0),
20232 GET_MODE (op
), false)
20233 && addr
.type
== ADDRESS_REG_IMM
);
20236 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
20237 The conditions for STR are the same. */
20239 aarch64_sve_ldr_operand_p (rtx op
)
20241 struct aarch64_address_info addr
;
20244 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
20245 false, ADDR_QUERY_ANY
)
20246 && addr
.type
== ADDRESS_REG_IMM
);
20249 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
20250 addressing memory of mode MODE. */
20252 aarch64_sve_prefetch_operand_p (rtx op
, machine_mode mode
)
20254 struct aarch64_address_info addr
;
20255 if (!aarch64_classify_address (&addr
, op
, mode
, false, ADDR_QUERY_ANY
))
20258 if (addr
.type
== ADDRESS_REG_IMM
)
20259 return offset_6bit_signed_scaled_p (mode
, addr
.const_offset
);
20261 return addr
.type
== ADDRESS_REG_REG
;
20264 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
20265 We need to be able to access the individual pieces, so the range
20266 is different from LD[234] and ST[234]. */
20268 aarch64_sve_struct_memory_operand_p (rtx op
)
20273 machine_mode mode
= GET_MODE (op
);
20274 struct aarch64_address_info addr
;
20275 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
20277 || addr
.type
!= ADDRESS_REG_IMM
)
20280 poly_int64 first
= addr
.const_offset
;
20281 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
20282 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
20283 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
20286 /* Emit a register copy from operand to operand, taking care not to
20287 early-clobber source registers in the process.
20289 COUNT is the number of components into which the copy needs to be
20292 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
20293 unsigned int count
)
20296 int rdest
= REGNO (operands
[0]);
20297 int rsrc
= REGNO (operands
[1]);
20299 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
20301 for (i
= 0; i
< count
; i
++)
20302 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
20303 gen_rtx_REG (mode
, rsrc
+ i
));
20305 for (i
= 0; i
< count
; i
++)
20306 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
20307 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
20310 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
20311 one of VSTRUCT modes: OI, CI, or XI. */
20313 aarch64_simd_attr_length_rglist (machine_mode mode
)
20315 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
20316 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
20319 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
20320 alignment of a vector to 128 bits. SVE predicates have an alignment of
20322 static HOST_WIDE_INT
20323 aarch64_simd_vector_alignment (const_tree type
)
20325 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
20326 be set for non-predicate vectors of booleans. Modes are the most
20327 direct way we have of identifying real SVE predicate types. */
20328 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
20330 widest_int min_size
20331 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type
)));
20332 return wi::umin (min_size
, 128).to_uhwi ();
20335 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
20337 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
20339 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
20341 /* If the length of the vector is a fixed power of 2, try to align
20342 to that length, otherwise don't try to align at all. */
20343 HOST_WIDE_INT result
;
20344 if (!GET_MODE_BITSIZE (TYPE_MODE (type
)).is_constant (&result
)
20345 || !pow2p_hwi (result
))
20346 result
= TYPE_ALIGN (TREE_TYPE (type
));
20349 return TYPE_ALIGN (type
);
20352 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
20354 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
20359 /* For fixed-length vectors, check that the vectorizer will aim for
20360 full-vector alignment. This isn't true for generic GCC vectors
20361 that are wider than the ABI maximum of 128 bits. */
20362 poly_uint64 preferred_alignment
=
20363 aarch64_vectorize_preferred_vector_alignment (type
);
20364 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
20365 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
20366 preferred_alignment
))
20369 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
20373 /* Return true if the vector misalignment factor is supported by the
20376 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
20377 const_tree type
, int misalignment
,
20380 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
20382 /* Return if movmisalign pattern is not supported for this mode. */
20383 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
20386 /* Misalignment factor is unknown at compile time. */
20387 if (misalignment
== -1)
20390 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
20394 /* If VALS is a vector constant that can be loaded into a register
20395 using DUP, generate instructions to do so and return an RTX to
20396 assign to the register. Otherwise return NULL_RTX. */
20398 aarch64_simd_dup_constant (rtx vals
)
20400 machine_mode mode
= GET_MODE (vals
);
20401 machine_mode inner_mode
= GET_MODE_INNER (mode
);
20404 if (!const_vec_duplicate_p (vals
, &x
))
20407 /* We can load this constant by using DUP and a constant in a
20408 single ARM register. This will be cheaper than a vector
20410 x
= copy_to_mode_reg (inner_mode
, x
);
20411 return gen_vec_duplicate (mode
, x
);
20415 /* Generate code to load VALS, which is a PARALLEL containing only
20416 constants (for vec_init) or CONST_VECTOR, efficiently into a
20417 register. Returns an RTX to copy into the register, or NULL_RTX
20418 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
20420 aarch64_simd_make_constant (rtx vals
)
20422 machine_mode mode
= GET_MODE (vals
);
20424 rtx const_vec
= NULL_RTX
;
20428 if (GET_CODE (vals
) == CONST_VECTOR
)
20430 else if (GET_CODE (vals
) == PARALLEL
)
20432 /* A CONST_VECTOR must contain only CONST_INTs and
20433 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
20434 Only store valid constants in a CONST_VECTOR. */
20435 int n_elts
= XVECLEN (vals
, 0);
20436 for (i
= 0; i
< n_elts
; ++i
)
20438 rtx x
= XVECEXP (vals
, 0, i
);
20439 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
20442 if (n_const
== n_elts
)
20443 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
20446 gcc_unreachable ();
20448 if (const_vec
!= NULL_RTX
20449 && aarch64_simd_valid_immediate (const_vec
, NULL
))
20450 /* Load using MOVI/MVNI. */
20452 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
20453 /* Loaded using DUP. */
20455 else if (const_vec
!= NULL_RTX
)
20456 /* Load from constant pool. We cannot take advantage of single-cycle
20457 LD1 because we need a PC-relative addressing mode. */
20460 /* A PARALLEL containing something not valid inside CONST_VECTOR.
20461 We cannot construct an initializer. */
20465 /* Expand a vector initialisation sequence, such that TARGET is
20466 initialised to contain VALS. */
20469 aarch64_expand_vector_init (rtx target
, rtx vals
)
20471 machine_mode mode
= GET_MODE (target
);
20472 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
20473 /* The number of vector elements. */
20474 int n_elts
= XVECLEN (vals
, 0);
20475 /* The number of vector elements which are not constant. */
20477 rtx any_const
= NULL_RTX
;
20478 /* The first element of vals. */
20479 rtx v0
= XVECEXP (vals
, 0, 0);
20480 bool all_same
= true;
  /* This is a special vec_init<M><N> where N is not an element mode but a
     vector mode with half the elements of M.  We expect to find two entries
     of mode N in VALS and we must put their concatenation into TARGET.  */
20485 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
20487 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
20488 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
20489 rtx lo
= XVECEXP (vals
, 0, 0);
20490 rtx hi
= XVECEXP (vals
, 0, 1);
20491 machine_mode narrow_mode
= GET_MODE (lo
);
20492 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
20493 gcc_assert (narrow_mode
== GET_MODE (hi
));
20495 /* When we want to concatenate a half-width vector with zeroes we can
20496 use the aarch64_combinez[_be] patterns. Just make sure that the
20497 zeroes are in the right half. */
20498 if (BYTES_BIG_ENDIAN
20499 && aarch64_simd_imm_zero (lo
, narrow_mode
)
20500 && general_operand (hi
, narrow_mode
))
20501 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
20502 else if (!BYTES_BIG_ENDIAN
20503 && aarch64_simd_imm_zero (hi
, narrow_mode
)
20504 && general_operand (lo
, narrow_mode
))
20505 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
20508 /* Else create the two half-width registers and combine them. */
20510 lo
= force_reg (GET_MODE (lo
), lo
);
20512 hi
= force_reg (GET_MODE (hi
), hi
);
20514 if (BYTES_BIG_ENDIAN
)
20515 std::swap (lo
, hi
);
20516 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
20521 /* Count the number of variable elements to initialise. */
20522 for (int i
= 0; i
< n_elts
; ++i
)
20524 rtx x
= XVECEXP (vals
, 0, i
);
20525 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
20530 all_same
&= rtx_equal_p (x
, v0
);
20533 /* No variable elements, hand off to aarch64_simd_make_constant which knows
20534 how best to handle this. */
20537 rtx constant
= aarch64_simd_make_constant (vals
);
20538 if (constant
!= NULL_RTX
)
20540 emit_move_insn (target
, constant
);
20545 /* Splat a single non-constant element if we can. */
20548 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
20549 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
20553 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
20554 gcc_assert (icode
!= CODE_FOR_nothing
);
20556 /* If there are only variable elements, try to optimize
20557 the insertion using dup for the most common element
20558 followed by insertions. */
20560 /* The algorithm will fill matches[*][0] with the earliest matching element,
20561 and matches[X][1] with the count of duplicate elements (if X is the
20562 earliest element which has duplicates). */
20564 if (n_var
== n_elts
&& n_elts
<= 16)
20566 int matches
[16][2] = {0};
20567 for (int i
= 0; i
< n_elts
; i
++)
20569 for (int j
= 0; j
<= i
; j
++)
20571 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
20579 int maxelement
= 0;
20581 for (int i
= 0; i
< n_elts
; i
++)
20582 if (matches
[i
][1] > maxv
)
20585 maxv
= matches
[i
][1];
20588 /* Create a duplicate of the most common element, unless all elements
20589 are equally useless to us, in which case just immediately set the
20590 vector register using the first element. */
20594 /* For vectors of two 64-bit elements, we can do even better. */
20596 && (inner_mode
== E_DImode
20597 || inner_mode
== E_DFmode
))
20600 rtx x0
= XVECEXP (vals
, 0, 0);
20601 rtx x1
= XVECEXP (vals
, 0, 1);
20602 /* Combine can pick up this case, but handling it directly
20603 here leaves clearer RTL.
20605 This is load_pair_lanes<mode>, and also gives us a clean-up
20606 for store_pair_lanes<mode>. */
20607 if (memory_operand (x0
, inner_mode
)
20608 && memory_operand (x1
, inner_mode
)
20609 && !STRICT_ALIGNMENT
20610 && rtx_equal_p (XEXP (x1
, 0),
20611 plus_constant (Pmode
,
20613 GET_MODE_SIZE (inner_mode
))))
20616 if (inner_mode
== DFmode
)
20617 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
20619 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
20624 /* The subreg-move sequence below will move into lane zero of the
20625 vector register. For big-endian we want that position to hold
20626 the last element of VALS. */
20627 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
20628 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
20629 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
20633 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
20634 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
20637 /* Insert the rest. */
20638 for (int i
= 0; i
< n_elts
; i
++)
20640 rtx x
= XVECEXP (vals
, 0, i
);
20641 if (matches
[i
][0] == maxelement
)
20643 x
= copy_to_mode_reg (inner_mode
, x
);
20644 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
20649 /* Initialise a vector which is part-variable. We want to first try
20650 to build those lanes which are constant in the most efficient way we
20652 if (n_var
!= n_elts
)
20654 rtx copy
= copy_rtx (vals
);
20656 /* Load constant part of vector. We really don't care what goes into the
20657 parts we will overwrite, but we're more likely to be able to load the
20658 constant efficiently if it has fewer, larger, repeating parts
20659 (see aarch64_simd_valid_immediate). */
20660 for (int i
= 0; i
< n_elts
; i
++)
20662 rtx x
= XVECEXP (vals
, 0, i
);
20663 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
20665 rtx subst
= any_const
;
20666 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
20668 /* Look in the copied vector, as more elements are const. */
20669 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
20670 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
20676 XVECEXP (copy
, 0, i
) = subst
;
20678 aarch64_expand_vector_init (target
, copy
);
20681 /* Insert the variable lanes directly. */
20682 for (int i
= 0; i
< n_elts
; i
++)
20684 rtx x
= XVECEXP (vals
, 0, i
);
20685 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
20687 x
= copy_to_mode_reg (inner_mode
, x
);
20688 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
20692 /* Emit RTL corresponding to:
20693 insr TARGET, ELEM. */
20696 emit_insr (rtx target
, rtx elem
)
20698 machine_mode mode
= GET_MODE (target
);
20699 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
20700 elem
= force_reg (elem_mode
, elem
);
20702 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
20703 gcc_assert (icode
!= CODE_FOR_nothing
);
20704 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
20707 /* Subroutine of aarch64_sve_expand_vector_init for handling
20708 trailing constants.
20709 This function works as follows:
20710 (a) Create a new vector consisting of trailing constants.
20711 (b) Initialize TARGET with the constant vector using emit_move_insn.
20712 (c) Insert remaining elements in TARGET using insr.
   NELTS is the total number of elements in original vector while
   NELTS_REQD is the number of elements that are actually significant.

   ??? The heuristic used is to do above only if number of constants
   is at least half the total number of elements.  May need fine tuning.  */
20721 aarch64_sve_expand_vector_init_handle_trailing_constants
20722 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
20724 machine_mode mode
= GET_MODE (target
);
20725 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
20726 int n_trailing_constants
= 0;
20728 for (int i
= nelts_reqd
- 1;
20729 i
>= 0 && valid_for_const_vector_p (elem_mode
, builder
.elt (i
));
20731 n_trailing_constants
++;
20733 if (n_trailing_constants
>= nelts_reqd
/ 2)
20735 /* Try to use the natural pattern of BUILDER to extend the trailing
20736 constant elements to a full vector. Replace any variables in the
20737 extra elements with zeros.
20739 ??? It would be better if the builders supported "don't care"
20740 elements, with the builder filling in whichever elements
20741 give the most compact encoding. */
20742 rtx_vector_builder
v (mode
, nelts
, 1);
20743 for (int i
= 0; i
< nelts
; i
++)
20745 rtx x
= builder
.elt (i
+ nelts_reqd
- n_trailing_constants
);
20746 if (!valid_for_const_vector_p (elem_mode
, x
))
20750 rtx const_vec
= v
.build ();
20751 emit_move_insn (target
, const_vec
);
20753 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
20754 emit_insr (target
, builder
.elt (i
));
20762 /* Subroutine of aarch64_sve_expand_vector_init.
20764 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
20765 (b) Skip trailing elements from BUILDER, which are the same as
20766 element NELTS_REQD - 1.
20767 (c) Insert earlier elements in reverse order in TARGET using insr. */
20770 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
20771 const rtx_vector_builder
&builder
,
20774 machine_mode mode
= GET_MODE (target
);
20775 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
20777 struct expand_operand ops
[2];
20778 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
20779 gcc_assert (icode
!= CODE_FOR_nothing
);
20781 create_output_operand (&ops
[0], target
, mode
);
20782 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
20783 expand_insn (icode
, 2, ops
);
20785 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
20786 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
20787 emit_insr (target
, builder
.elt (i
));
20790 /* Subroutine of aarch64_sve_expand_vector_init to handle case
20791 when all trailing elements of builder are same.
20792 This works as follows:
20793 (a) Use expand_insn interface to broadcast last vector element in TARGET.
20794 (b) Insert remaining elements in TARGET using insr.
20796 ??? The heuristic used is to do above if number of same trailing elements
20797 is at least 3/4 of total number of elements, loosely based on
20798 heuristic from mostly_zeros_p. May need fine-tuning. */
20801 aarch64_sve_expand_vector_init_handle_trailing_same_elem
20802 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
20804 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
20805 if (ndups
>= (3 * nelts_reqd
) / 4)
20807 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
20808 nelts_reqd
- ndups
+ 1);
20815 /* Initialize register TARGET from BUILDER. NELTS is the constant number
20816 of elements in BUILDER.
20818 The function tries to initialize TARGET from BUILDER if it fits one
20819 of the special cases outlined below.
20821 Failing that, the function divides BUILDER into two sub-vectors:
20822 v_even = even elements of BUILDER;
20823 v_odd = odd elements of BUILDER;
20825 and recursively calls itself with v_even and v_odd.
20827 if (recursive call succeeded for v_even or v_odd)
20828 TARGET = zip (v_even, v_odd)
20830 The function returns true if it managed to build TARGET from BUILDER
20831 with one of the special cases, false otherwise.
20833 Example: {a, 1, b, 2, c, 3, d, 4}
20835 The vector gets divided into:
20836 v_even = {a, b, c, d}
20837 v_odd = {1, 2, 3, 4}
20839 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
20840 initialize tmp2 from constant vector v_odd using emit_move_insn.
20842 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
20843 4 elements, so we construct tmp1 from v_even using insr:
20850 TARGET = zip (tmp1, tmp2)
20851 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
20854 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
20855 int nelts
, int nelts_reqd
)
20857 machine_mode mode
= GET_MODE (target
);
20859 /* Case 1: Vector contains trailing constants. */
20861 if (aarch64_sve_expand_vector_init_handle_trailing_constants
20862 (target
, builder
, nelts
, nelts_reqd
))
20865 /* Case 2: Vector contains leading constants. */
20867 rtx_vector_builder
rev_builder (mode
, nelts_reqd
, 1);
20868 for (int i
= 0; i
< nelts_reqd
; i
++)
20869 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
20870 rev_builder
.finalize ();
20872 if (aarch64_sve_expand_vector_init_handle_trailing_constants
20873 (target
, rev_builder
, nelts
, nelts_reqd
))
20875 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
20879 /* Case 3: Vector contains trailing same element. */
20881 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
20882 (target
, builder
, nelts_reqd
))
20885 /* Case 4: Vector contains leading same element. */
20887 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
20888 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
20890 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
  /* Avoid recursing below 4 elements.
     ??? The threshold 4 may need fine-tuning.  */
20897 if (nelts_reqd
<= 4)
20900 rtx_vector_builder
v_even (mode
, nelts
, 1);
20901 rtx_vector_builder
v_odd (mode
, nelts
, 1);
20903 for (int i
= 0; i
< nelts
* 2; i
+= 2)
20905 v_even
.quick_push (builder
.elt (i
));
20906 v_odd
.quick_push (builder
.elt (i
+ 1));
20909 v_even
.finalize ();
20912 rtx tmp1
= gen_reg_rtx (mode
);
20913 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
20914 nelts
, nelts_reqd
/ 2);
20916 rtx tmp2
= gen_reg_rtx (mode
);
20917 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
20918 nelts
, nelts_reqd
/ 2);
20920 if (!did_even_p
&& !did_odd_p
)
20923 /* Initialize v_even and v_odd using INSR if it didn't match any of the
20924 special cases and zip v_even, v_odd. */
20927 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
20930 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
20932 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
20933 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
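
/* Illustrative sketch (hypothetical, plain arrays): the even/odd split used
   by the recursion above.  For { a, 1, b, 2, c, 3, d, 4 } it fills V_EVEN
   with { a, b, c, d } and V_ODD with { 1, 2, 3, 4 }; ZIP1 then re-interleaves
   the two sub-vectors back into the original order.  */
static void
example_split_even_odd (const int *vals, int nelts, int *v_even, int *v_odd)
{
  for (int i = 0; i < nelts; i += 2)
    {
      v_even[i / 2] = vals[i];
      v_odd[i / 2] = vals[i + 1];
    }
}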
20937 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
20940 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
20942 machine_mode mode
= GET_MODE (target
);
20943 int nelts
= XVECLEN (vals
, 0);
20945 rtx_vector_builder
v (mode
, nelts
, 1);
20946 for (int i
= 0; i
< nelts
; i
++)
20947 v
.quick_push (XVECEXP (vals
, 0, i
));
20950 /* If neither sub-vectors of v could be initialized specially,
20951 then use INSR to insert all elements from v into TARGET.
20952 ??? This might not be optimal for vectors with large
20953 initializers like 16-element or above.
20954 For nelts < 4, it probably isn't useful to handle specially. */
20957 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
20958 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
20961 /* Check whether VALUE is a vector constant in which every element
20962 is either a power of 2 or a negated power of 2. If so, return
20963 a constant vector of log2s, and flip CODE between PLUS and MINUS
20964 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
20967 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
20969 if (GET_CODE (value
) != CONST_VECTOR
)
20972 rtx_vector_builder builder
;
20973 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
20976 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
20977 /* 1 if the result of the multiplication must be negated,
20978 0 if it mustn't, or -1 if we don't yet care. */
20980 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
20981 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
20983 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
20984 if (!CONST_SCALAR_INT_P (elt
))
20986 rtx_mode_t
val (elt
, int_mode
);
20987 wide_int pow2
= wi::neg (val
);
20990 /* It matters whether we negate or not. Make that choice,
20991 and make sure that it's consistent with previous elements. */
20992 if (negate
== !wi::neg_p (val
))
20994 negate
= wi::neg_p (val
);
20998 /* POW2 is now the value that we want to be a power of 2. */
20999 int shift
= wi::exact_log2 (pow2
);
21002 builder
.quick_push (gen_int_mode (shift
, int_mode
));
21005 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
21007 else if (negate
== 1)
21008 code
= code
== PLUS
? MINUS
: PLUS
;
21009 return builder
.build ();
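
/* Illustrative sketch (hypothetical, scalar-only simplification of the
   per-element test above, without the cross-element consistency tracking):
   a multiplier that is a power of 2 becomes a left-shift amount, and a
   multiplier that is a negated power of 2 also becomes a shift with the
   surrounding PLUS/MINUS flipped.  */
static bool
example_mult_to_shift (long long mult, int *shift, bool *negate)
{
  long long pow2 = mult < 0 ? -mult : mult;
  /* Power-of-2 test on the absolute value.  */
  if (pow2 <= 0 || (pow2 & (pow2 - 1)) != 0)
    return false;
  *negate = mult < 0;
  int s = 0;
  while ((1LL << s) != pow2)
    s++;
  *shift = s;
  return true;
}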
21012 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
21013 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
21014 operands array, in the same order as for fma_optab. Return true if
21015 the function emitted all the necessary instructions, false if the caller
21016 should generate the pattern normally with the new OPERANDS array. */
21019 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
21021 machine_mode mode
= GET_MODE (operands
[0]);
21022 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
21024 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
21025 NULL_RTX
, true, OPTAB_DIRECT
);
21026 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
21027 operands
[3], product
, operands
[0], true,
21031 operands
[2] = force_reg (mode
, operands
[2]);
21035 /* Likewise, but for a conditional pattern. */
21038 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
21040 machine_mode mode
= GET_MODE (operands
[0]);
21041 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
21043 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
21044 NULL_RTX
, true, OPTAB_DIRECT
);
21045 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
21046 operands
[4], product
, operands
[5]));
21049 operands
[3] = force_reg (mode
, operands
[3]);
21053 static unsigned HOST_WIDE_INT
21054 aarch64_shift_truncation_mask (machine_mode mode
)
21056 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
21058 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
21061 /* Select a format to encode pointers in exception handling data. */
21063 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
21066 switch (aarch64_cmodel
)
21068 case AARCH64_CMODEL_TINY
:
21069 case AARCH64_CMODEL_TINY_PIC
:
21070 case AARCH64_CMODEL_SMALL
:
21071 case AARCH64_CMODEL_SMALL_PIC
:
21072 case AARCH64_CMODEL_SMALL_SPIC
:
      /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
	 for everything.  */
21075 type
= DW_EH_PE_sdata4
;
21078 /* No assumptions here. 8-byte relocs required. */
21079 type
= DW_EH_PE_sdata8
;
21082 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
21085 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
21088 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
21090 if (TREE_CODE (decl
) == FUNCTION_DECL
)
21092 arm_pcs pcs
= (arm_pcs
) fndecl_abi (decl
).id ();
21093 if (pcs
== ARM_PCS_SIMD
|| pcs
== ARM_PCS_SVE
)
21095 fprintf (stream
, "\t.variant_pcs\t");
21096 assemble_name (stream
, name
);
21097 fprintf (stream
, "\n");
21102 /* The last .arch and .tune assembly strings that we printed. */
21103 static std::string aarch64_last_printed_arch_string
;
21104 static std::string aarch64_last_printed_tune_string
;
21106 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
21107 by the function fndecl. */
21110 aarch64_declare_function_name (FILE *stream
, const char* name
,
21113 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
21115 struct cl_target_option
*targ_options
;
21117 targ_options
= TREE_TARGET_OPTION (target_parts
);
21119 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
21120 gcc_assert (targ_options
);
21122 const struct processor
*this_arch
21123 = aarch64_get_arch (targ_options
->x_explicit_arch
);
21125 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
21126 std::string extension
21127 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
21129 /* Only update the assembler .arch string if it is distinct from the last
21130 such string we printed. */
21131 std::string to_print
= this_arch
->name
+ extension
;
21132 if (to_print
!= aarch64_last_printed_arch_string
)
21134 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
21135 aarch64_last_printed_arch_string
= to_print
;
21138 /* Print the cpu name we're tuning for in the comments, might be
21139 useful to readers of the generated asm. Do it only when it changes
21140 from function to function and verbose assembly is requested. */
21141 const struct processor
*this_tune
21142 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
21144 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
21146 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
21148 aarch64_last_printed_tune_string
= this_tune
->name
;
21151 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
21153 /* Don't forget the type directive for ELF. */
21154 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
21155 ASM_OUTPUT_LABEL (stream
, name
);
21157 cfun
->machine
->label_is_assembled
= true;
21160 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
21161 the function label and emit a BTI if necessary. */
21164 aarch64_print_patchable_function_entry (FILE *file
,
21165 unsigned HOST_WIDE_INT patch_area_size
,
21168 if (cfun
->machine
->label_is_assembled
21169 && aarch64_bti_enabled ()
21170 && !cgraph_node::get (cfun
->decl
)->only_called_directly_p ())
21172 /* Remove the BTI that follows the patch area and insert a new BTI
21173 before the patch area right after the function label. */
21174 rtx_insn
*insn
= next_real_nondebug_insn (get_insns ());
21177 && GET_CODE (PATTERN (insn
)) == UNSPEC_VOLATILE
21178 && XINT (PATTERN (insn
), 1) == UNSPECV_BTI_C
)
21179 delete_insn (insn
);
21180 asm_fprintf (file
, "\thint\t34 // bti c\n");
21183 default_print_patchable_function_entry (file
, patch_area_size
, record_p
);
21186 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
21189 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
21191 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
21192 const char *value
= IDENTIFIER_POINTER (target
);
21193 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
21194 ASM_OUTPUT_DEF (stream
, name
, value
);
21197 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
21198 function symbol references. */
21201 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
21203 default_elf_asm_output_external (stream
, decl
, name
);
21204 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
21207 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
21208 Used to output the .cfi_b_key_frame directive when signing the current
21209 function with the B key. */
21212 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
21214 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
21215 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
21216 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
21219 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
21222 aarch64_start_file (void)
21224 struct cl_target_option
*default_options
21225 = TREE_TARGET_OPTION (target_option_default_node
);
21227 const struct processor
*default_arch
21228 = aarch64_get_arch (default_options
->x_explicit_arch
);
21229 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
21230 std::string extension
21231 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
21232 default_arch
->flags
);
21234 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
21235 aarch64_last_printed_tune_string
= "";
21236 asm_fprintf (asm_out_file
, "\t.arch %s\n",
21237 aarch64_last_printed_arch_string
.c_str ());
21239 default_file_start ();
21242 /* Emit load exclusive. */
21245 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
21246 rtx mem
, rtx model_rtx
)
21248 if (mode
== TImode
)
21249 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
21250 gen_highpart (DImode
, rval
),
21253 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
21256 /* Emit store exclusive. */
21259 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
21260 rtx mem
, rtx rval
, rtx model_rtx
)
21262 if (mode
== TImode
)
21263 emit_insn (gen_aarch64_store_exclusive_pair
21264 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
21265 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
21267 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
21270 /* Mark the previous jump instruction as unlikely. */
21273 aarch64_emit_unlikely_jump (rtx insn
)
21275 rtx_insn
*jump
= emit_jump_insn (insn
);
21276 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
21279 /* We store the names of the various atomic helpers in a 5x4 array.
21280 Return the libcall function given MODE, MODEL and NAMES. */
21283 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
21284 const atomic_ool_names
*names
)
21286 memmodel model
= memmodel_base (INTVAL (model_rtx
));
21287 int mode_idx
, model_idx
;
21307 gcc_unreachable ();
21312 case MEMMODEL_RELAXED
:
21315 case MEMMODEL_CONSUME
:
21316 case MEMMODEL_ACQUIRE
:
21319 case MEMMODEL_RELEASE
:
21322 case MEMMODEL_ACQ_REL
:
21323 case MEMMODEL_SEQ_CST
:
21327 gcc_unreachable ();
21330 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
21331 VISIBILITY_HIDDEN
);
21334 #define DEF0(B, N) \
21335 { "__aarch64_" #B #N "_relax", \
21336 "__aarch64_" #B #N "_acq", \
21337 "__aarch64_" #B #N "_rel", \
21338 "__aarch64_" #B #N "_acq_rel" }
21340 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
21341 { NULL, NULL, NULL, NULL }
21342 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
21344 static const atomic_ool_names aarch64_ool_cas_names
= { { DEF5(cas
) } };
21345 const atomic_ool_names aarch64_ool_swp_names
= { { DEF4(swp
) } };
21346 const atomic_ool_names aarch64_ool_ldadd_names
= { { DEF4(ldadd
) } };
21347 const atomic_ool_names aarch64_ool_ldset_names
= { { DEF4(ldset
) } };
21348 const atomic_ool_names aarch64_ool_ldclr_names
= { { DEF4(ldclr
) } };
21349 const atomic_ool_names aarch64_ool_ldeor_names
= { { DEF4(ldeor
) } };
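
/* Illustrative sketch (hypothetical helper, not used elsewhere): how the
   5x4 tables above are indexed, assuming the row is chosen by the access
   size (1, 2, 4, 8, 16 bytes -> rows 0..4) and the column picks the _relax,
   _acq, _rel or _acq_rel variant.  For example, a 4-byte seq_cst CAS would
   resolve to "__aarch64_cas4_acq_rel".  */
static const char *
example_ool_name (const char *const names[5][4],
		  unsigned int size, unsigned int model_idx)
{
  /* SIZE is assumed to be a power of 2 between 1 and 16.  */
  unsigned int mode_idx = 0;
  while ((1u << mode_idx) != size)
    mode_idx++;
  return names[mode_idx][model_idx];
}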
21355 /* Expand a compare and swap pattern. */
21358 aarch64_expand_compare_and_swap (rtx operands
[])
21360 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
21361 machine_mode mode
, r_mode
;
21363 bval
= operands
[0];
21364 rval
= operands
[1];
21366 oldval
= operands
[3];
21367 newval
= operands
[4];
21368 is_weak
= operands
[5];
21369 mod_s
= operands
[6];
21370 mod_f
= operands
[7];
21371 mode
= GET_MODE (mem
);
21373 /* Normally the succ memory model must be stronger than fail, but in the
21374 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
21375 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
21376 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
21377 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
21378 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
21381 if (mode
== QImode
|| mode
== HImode
)
21384 rval
= gen_reg_rtx (r_mode
);
21389 /* The CAS insn requires oldval and rval overlap, but we need to
21390 have a copy of oldval saved across the operation to tell if
21391 the operation is successful. */
21392 if (reg_overlap_mentioned_p (rval
, oldval
))
21393 rval
= copy_to_mode_reg (r_mode
, oldval
);
21395 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
21397 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
21399 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
21401 else if (TARGET_OUTLINE_ATOMICS
)
21403 /* Oldval must satisfy compare afterward. */
21404 if (!aarch64_plus_operand (oldval
, mode
))
21405 oldval
= force_reg (mode
, oldval
);
21406 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
21407 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
21408 oldval
, mode
, newval
, mode
,
21409 XEXP (mem
, 0), Pmode
);
21410 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
21414 /* The oldval predicate varies by mode. Test it and force to reg. */
21415 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
21416 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
21417 oldval
= force_reg (mode
, oldval
);
21419 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
21420 is_weak
, mod_s
, mod_f
));
21421 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
21424 if (r_mode
!= mode
)
21425 rval
= gen_lowpart (mode
, rval
);
21426 emit_move_insn (operands
[1], rval
);
21428 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
21429 emit_insn (gen_rtx_SET (bval
, x
));
21432 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
21433 sequence implementing an atomic operation. */
21436 aarch64_emit_post_barrier (enum memmodel model
)
21438 const enum memmodel base_model
= memmodel_base (model
);
21440 if (is_mm_sync (model
)
21441 && (base_model
== MEMMODEL_ACQUIRE
21442 || base_model
== MEMMODEL_ACQ_REL
21443 || base_model
== MEMMODEL_SEQ_CST
))
21445 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
21449 /* Split a compare and swap pattern. */
21452 aarch64_split_compare_and_swap (rtx operands
[])
21454 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
21455 gcc_assert (epilogue_completed
);
21457 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
21460 rtx_code_label
*label1
, *label2
;
21461 enum memmodel model
;
21463 rval
= operands
[0];
21465 oldval
= operands
[2];
21466 newval
= operands
[3];
21467 is_weak
= (operands
[4] != const0_rtx
);
21468 model_rtx
= operands
[5];
21469 scratch
= operands
[7];
21470 mode
= GET_MODE (mem
);
21471 model
= memmodel_from_int (INTVAL (model_rtx
));
21473 /* When OLDVAL is zero and we want the strong version we can emit a tighter
21476 LD[A]XR rval, [mem]
21478 ST[L]XR scratch, newval, [mem]
21479 CBNZ scratch, .label1
21482 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
21483 oldval
== const0_rtx
&& mode
!= TImode
);
21488 label1
= gen_label_rtx ();
21489 emit_label (label1
);
21491 label2
= gen_label_rtx ();
21493 /* The initial load can be relaxed for a __sync operation since a final
21494 barrier will be emitted to stop code hoisting. */
21495 if (is_mm_sync (model
))
21496 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
21498 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
21501 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
21504 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
21505 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
21507 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
21508 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
21509 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
21511 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
21515 if (aarch64_track_speculation
)
21517 /* Emit an explicit compare instruction, so that we can correctly
21518 track the condition codes. */
21519 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
21520 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
21523 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
21525 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
21526 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
21527 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
21530 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
21532 emit_label (label2
);
21534 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
21535 to set the condition flags. If this is not used it will be removed by
21538 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
21540 /* Emit any final barrier needed for a __sync operation. */
21541 if (is_mm_sync (model
))
21542 aarch64_emit_post_barrier (model
);
21545 /* Split an atomic operation. */
21548 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
21549 rtx value
, rtx model_rtx
, rtx cond
)
21551 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
21552 gcc_assert (epilogue_completed
);
21554 machine_mode mode
= GET_MODE (mem
);
21555 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
21556 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
21557 const bool is_sync
= is_mm_sync (model
);
21558 rtx_code_label
*label
;
21561 /* Split the atomic operation into a sequence. */
21562 label
= gen_label_rtx ();
21563 emit_label (label
);
21566 new_out
= gen_lowpart (wmode
, new_out
);
21568 old_out
= gen_lowpart (wmode
, old_out
);
21571 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
21573 /* The initial load can be relaxed for a __sync operation since a final
21574 barrier will be emitted to stop code hoisting. */
21576 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
21577 GEN_INT (MEMMODEL_RELAXED
));
21579 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
21588 x
= gen_rtx_AND (wmode
, old_out
, value
);
21589 emit_insn (gen_rtx_SET (new_out
, x
));
21590 x
= gen_rtx_NOT (wmode
, new_out
);
21591 emit_insn (gen_rtx_SET (new_out
, x
));
21595 if (CONST_INT_P (value
))
21597 value
= GEN_INT (-UINTVAL (value
));
21600 /* Fall through. */
21603 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
21604 emit_insn (gen_rtx_SET (new_out
, x
));
21608 aarch64_emit_store_exclusive (mode
, cond
, mem
,
21609 gen_lowpart (mode
, new_out
), model_rtx
);
21611 if (aarch64_track_speculation
)
21613 /* Emit an explicit compare instruction, so that we can correctly
21614 track the condition codes. */
21615 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
21616 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
21619 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
21621 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
21622 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
21623 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
21625 /* Emit any final barrier needed for a __sync operation. */
21627 aarch64_emit_post_barrier (model
);
21631 aarch64_init_libfuncs (void)
21633 /* Half-precision float operations. The compiler handles all operations
21634 with NULL libfuncs by converting to SFmode. */
21637 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
21638 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
21641 set_optab_libfunc (add_optab
, HFmode
, NULL
);
21642 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
21643 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
21644 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
21645 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
21648 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
21649 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
21650 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
21651 set_optab_libfunc (le_optab
, HFmode
, NULL
);
21652 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
21653 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
21654 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
21657 /* Target hook for c_mode_for_suffix. */
21658 static machine_mode
21659 aarch64_c_mode_for_suffix (char suffix
)
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

     (-1)^s * (n/16) * 2^r

   where
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
/* Return true iff X can be represented as a quarter-precision
   floating point immediate operand.  Note, we cannot represent 0.0.  */
21682 aarch64_float_const_representable_p (rtx x
)
21684 /* This represents our current view of how many bits
21685 make up the mantissa. */
21686 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
21688 unsigned HOST_WIDE_INT mantissa
, mask
;
21689 REAL_VALUE_TYPE r
, m
;
21692 x
= unwrap_const_vec_duplicate (x
);
21693 if (!CONST_DOUBLE_P (x
))
21696 if (GET_MODE (x
) == VOIDmode
21697 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
21700 r
= *CONST_DOUBLE_REAL_VALUE (x
);
21702 /* We cannot represent infinities, NaNs or +/-zero. We won't
21703 know if we have +zero until we analyse the mantissa, but we
21704 can reject the other invalid values. */
21705 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
21706 || REAL_VALUE_MINUS_ZERO (r
))
21709 /* Extract exponent. */
21710 r
= real_value_abs (&r
);
21711 exponent
= REAL_EXP (&r
);
21713 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
21714 highest (sign) bit, with a fixed binary point at bit point_pos.
21715 m1 holds the low part of the mantissa, m2 the high part.
21716 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
21717 bits for the mantissa, this can fail (low bits will be lost). */
21718 real_ldexp (&m
, &r
, point_pos
- exponent
);
21719 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
/* If the low part of the mantissa has bits set we cannot represent
   the value.  */
21723 if (w
.ulow () != 0)
21725 /* We have rejected the lower HOST_WIDE_INT, so update our
21726 understanding of how many bits lie in the mantissa and
21727 look only at the high HOST_WIDE_INT. */
21728 mantissa
= w
.elt (1);
21729 point_pos
-= HOST_BITS_PER_WIDE_INT
;
21731 /* We can only represent values with a mantissa of the form 1.xxxx. */
21732 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
21733 if ((mantissa
& mask
) != 0)
21736 /* Having filtered unrepresentable values, we may now remove all
21737 but the highest 5 bits. */
21738 mantissa
>>= point_pos
- 5;
21740 /* We cannot represent the value 0.0, so reject it. This is handled
21745 /* Then, as bit 4 is always set, we can mask it off, leaving
21746 the mantissa in the range [0, 15]. */
21747 mantissa
&= ~(1 << 4);
21748 gcc_assert (mantissa
<= 15);
21750 /* GCC internally does not use IEEE754-like encoding (where normalized
21751 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
21752 Our mantissa values are shifted 4 places to the left relative to
21753 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
21754 by 5 places to correct for GCC's representation. */
21755 exponent
= 5 - exponent
;
21757 return (exponent
>= 0 && exponent
<= 7);
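/* A standalone sketch (not part of GCC) of the quarter-precision test above:
   a nonzero value is representable as an FMOV immediate iff it equals
   (-1)^s * (n/16) * 2^r with 16 <= n <= 31 and -3 <= r <= 4.  The function
   name is illustrative, and it simply brute-forces the 256 encodings rather
   than decomposing the mantissa the way the code above does.  */
static bool
sketch_fmov_immediate_p (double x)
{
  if (x == 0.0)                 /* +0.0 and -0.0 are not representable.  */
    return false;
  for (int s = 0; s <= 1; s++)
    for (int n = 16; n <= 31; n++)
      for (int r = -3; r <= 4; r++)
        {
          double candidate = (s ? -1.0 : 1.0) * ((double) n / 16.0);
          candidate = r >= 0 ? candidate * (double) (1 << r)
                             : candidate / (double) (1 << -r);
          if (candidate == x)
            return true;
        }
  return false;
}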
21760 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
21761 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
21762 output MOVI/MVNI, ORR or BIC immediate. */
21764 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
21765 enum simd_immediate_check which
)
21768 static char templ
[40];
21769 const char *mnemonic
;
21770 const char *shift_op
;
21771 unsigned int lane_count
= 0;
21774 struct simd_immediate_info info
;
/* This will return true to show const_vector is legal for use as either
   an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
   It will also update INFO to show how the immediate should be generated.
   WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
21780 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
21781 gcc_assert (is_valid
);
21783 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
21784 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
21786 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
21788 gcc_assert (info
.insn
== simd_immediate_info::MOV
21789 && info
.u
.mov
.shift
== 0);
21790 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
21791 move immediate path. */
21792 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
21793 info
.u
.mov
.value
= GEN_INT (0);
21796 const unsigned int buf_size
= 20;
21797 char float_buf
[buf_size
] = {'\0'};
21798 real_to_decimal_for_mode (float_buf
,
21799 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
21800 buf_size
, buf_size
, 1, info
.elt_mode
);
21802 if (lane_count
== 1)
21803 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
21805 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
21806 lane_count
, element_char
, float_buf
);
21811 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
21813 if (which
== AARCH64_CHECK_MOV
)
21815 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
21816 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
21818 if (lane_count
== 1)
21819 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
21820 mnemonic
, UINTVAL (info
.u
.mov
.value
));
21821 else if (info
.u
.mov
.shift
)
21822 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
21823 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
21824 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
21827 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
21828 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
21829 element_char
, UINTVAL (info
.u
.mov
.value
));
21833 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
21834 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
21835 if (info
.u
.mov
.shift
)
21836 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
21837 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
21838 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
21841 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
21842 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
21843 element_char
, UINTVAL (info
.u
.mov
.value
));
21849 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
21852 /* If a floating point number was passed and we desire to use it in an
21853 integer mode do the conversion to integer. */
21854 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
21856 unsigned HOST_WIDE_INT ival
;
21857 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
21858 gcc_unreachable ();
21859 immediate
= gen_int_mode (ival
, mode
);
21862 machine_mode vmode
;
/* Use a 64-bit mode for everything except for DI/DF mode, where we use
   a 128-bit vector mode.  */
21865 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
21867 vmode
= aarch64_simd_container_mode (mode
, width
);
21868 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
21869 return aarch64_output_simd_mov_immediate (v_op
, width
);
21872 /* Return the output string to use for moving immediate CONST_VECTOR
21873 into an SVE register. */
21876 aarch64_output_sve_mov_immediate (rtx const_vector
)
21878 static char templ
[40];
21879 struct simd_immediate_info info
;
21882 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
21883 gcc_assert (is_valid
);
21885 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
21887 machine_mode vec_mode
= GET_MODE (const_vector
);
21888 if (aarch64_sve_pred_mode_p (vec_mode
))
21890 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
21891 if (info
.insn
== simd_immediate_info::MOV
)
21893 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
21894 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
21898 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
21899 unsigned int total_bytes
;
21900 if (info
.u
.pattern
== AARCH64_SV_ALL
21901 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
21902 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
21903 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
21905 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
21906 svpattern_token (info
.u
.pattern
));
21911 if (info
.insn
== simd_immediate_info::INDEX
)
21913 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
21914 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
21915 element_char
, INTVAL (info
.u
.index
.base
),
21916 INTVAL (info
.u
.index
.step
));
21920 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
21922 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
21923 info
.u
.mov
.value
= GEN_INT (0);
21926 const int buf_size
= 20;
21927 char float_buf
[buf_size
] = {};
21928 real_to_decimal_for_mode (float_buf
,
21929 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
21930 buf_size
, buf_size
, 1, info
.elt_mode
);
21932 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
21933 element_char
, float_buf
);
21938 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
21939 element_char
, INTVAL (info
.u
.mov
.value
));
21943 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
21944 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
21948 aarch64_output_sve_ptrues (rtx const_unspec
)
21950 static char templ
[40];
21952 struct simd_immediate_info info
;
21953 bool is_valid
= aarch64_simd_valid_immediate (const_unspec
, &info
);
21954 gcc_assert (is_valid
&& info
.insn
== simd_immediate_info::PTRUE
);
21956 char element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
21957 snprintf (templ
, sizeof (templ
), "ptrues\t%%0.%c, %s", element_char
,
21958 svpattern_token (info
.u
.pattern
));
21962 /* Split operands into moves from op[1] + op[2] into op[0]. */
21965 aarch64_split_combinev16qi (rtx operands
[3])
21967 unsigned int dest
= REGNO (operands
[0]);
21968 unsigned int src1
= REGNO (operands
[1]);
21969 unsigned int src2
= REGNO (operands
[2]);
21970 machine_mode halfmode
= GET_MODE (operands
[1]);
21971 unsigned int halfregs
= REG_NREGS (operands
[1]);
21972 rtx destlo
, desthi
;
21974 gcc_assert (halfmode
== V16QImode
);
21976 if (src1
== dest
&& src2
== dest
+ halfregs
)
21978 /* No-op move. Can't split to nothing; emit something. */
21979 emit_note (NOTE_INSN_DELETED
);
21983 /* Preserve register attributes for variable tracking. */
21984 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
21985 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
21986 GET_MODE_SIZE (halfmode
));
21988 /* Special case of reversed high/low parts. */
21989 if (reg_overlap_mentioned_p (operands
[2], destlo
)
21990 && reg_overlap_mentioned_p (operands
[1], desthi
))
21992 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
21993 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
21994 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
21996 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
21998 /* Try to avoid unnecessary moves if part of the result
21999 is in the right place already. */
22001 emit_move_insn (destlo
, operands
[1]);
22002 if (src2
!= dest
+ halfregs
)
22003 emit_move_insn (desthi
, operands
[2]);
22007 if (src2
!= dest
+ halfregs
)
22008 emit_move_insn (desthi
, operands
[2]);
22010 emit_move_insn (destlo
, operands
[1]);
22014 /* vec_perm support. */
22016 struct expand_vec_perm_d
22018 rtx target
, op0
, op1
;
22019 vec_perm_indices perm
;
22020 machine_mode vmode
;
22021 unsigned int vec_flags
;
22026 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
);
22028 /* Generate a variable permutation. */
22031 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
22033 machine_mode vmode
= GET_MODE (target
);
22034 bool one_vector_p
= rtx_equal_p (op0
, op1
);
22036 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
22037 gcc_checking_assert (GET_MODE (op0
) == vmode
);
22038 gcc_checking_assert (GET_MODE (op1
) == vmode
);
22039 gcc_checking_assert (GET_MODE (sel
) == vmode
);
22040 gcc_checking_assert (TARGET_SIMD
);
22044 if (vmode
== V8QImode
)
22046 /* Expand the argument to a V16QI mode by duplicating it. */
22047 rtx pair
= gen_reg_rtx (V16QImode
);
22048 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
22049 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
22053 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
22060 if (vmode
== V8QImode
)
22062 pair
= gen_reg_rtx (V16QImode
);
22063 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
22064 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
22068 pair
= gen_reg_rtx (OImode
);
22069 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
22070 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
22075 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
22076 NELT is the number of elements in the vector. */
22079 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
22082 machine_mode vmode
= GET_MODE (target
);
22083 bool one_vector_p
= rtx_equal_p (op0
, op1
);
22086 /* The TBL instruction does not use a modulo index, so we must take care
22087 of that ourselves. */
22088 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
22089 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
22090 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
22092 /* For big-endian, we also need to reverse the index within the vector
22093 (but not which vector). */
22094 if (BYTES_BIG_ENDIAN
)
22096 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
22098 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
22099 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
22100 NULL
, 0, OPTAB_LIB_WIDEN
);
22102 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
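/* A standalone sketch (not part of GCC) of the selector adjustment done by
   aarch64_expand_vec_perm above: indices are ANDed with nelt-1 (or 2*nelt-1
   for a two-input permute) so TBL matches vec_perm's modulo wrapping, and on
   big-endian each index is XORed with nelt-1 to undo the reversed lane
   numbering.  NELT is assumed to be a power of two; the function name is
   illustrative.  */
static unsigned int
sketch_adjust_tbl_index (unsigned int idx, unsigned int nelt,
                         bool one_vector_p, bool big_endian_p)
{
  idx &= one_vector_p ? nelt - 1 : 2 * nelt - 1;   /* Modulo wrap.  */
  if (big_endian_p)
    idx ^= nelt - 1;                               /* Reverse within a vector.  */
  return idx;
}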
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
                          gen_rtx_UNSPEC (GET_MODE (target),
                                          gen_rtvec (2, op0, op1), code)));
22115 /* Expand an SVE vec_perm with the given operands. */
22118 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
22120 machine_mode data_mode
= GET_MODE (target
);
22121 machine_mode sel_mode
= GET_MODE (sel
);
22122 /* Enforced by the pattern condition. */
22123 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
22125 /* Note: vec_perm indices are supposed to wrap when they go beyond the
22126 size of the two value vectors, i.e. the upper bits of the indices
22127 are effectively ignored. SVE TBL instead produces 0 for any
22128 out-of-range indices, so we need to modulo all the vec_perm indices
22129 to ensure they are all in range. */
22130 rtx sel_reg
= force_reg (sel_mode
, sel
);
22132 /* Check if the sel only references the first values vector. */
22133 if (GET_CODE (sel
) == CONST_VECTOR
22134 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
22136 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
22140 /* Check if the two values vectors are the same. */
22141 if (rtx_equal_p (op0
, op1
))
22143 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
22144 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
22145 NULL
, 0, OPTAB_DIRECT
);
22146 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
/* Run TBL on each value vector and combine the results.  */
22152 rtx res0
= gen_reg_rtx (data_mode
);
22153 rtx res1
= gen_reg_rtx (data_mode
);
22154 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
22155 if (GET_CODE (sel
) != CONST_VECTOR
22156 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
22158 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
22160 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
22161 NULL
, 0, OPTAB_DIRECT
);
22163 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
22164 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
22165 NULL
, 0, OPTAB_DIRECT
);
22166 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
22167 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
22168 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
22170 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
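/* A standalone sketch (not part of GCC) of the general SVE permute expanded
   above: because SVE TBL yields 0 for out-of-range indices, a two-input
   permute is built from two TBLs whose results are ORed together.  The
   element type, index type and function name are illustrative assumptions.  */
static void
sketch_sve_two_input_permute (const unsigned int *op0, const unsigned int *op1,
                              const unsigned int *sel, unsigned int *out,
                              unsigned int nunits)
{
  for (unsigned int i = 0; i < nunits; i++)
    {
      unsigned int idx = sel[i] % (2 * nunits);     /* vec_perm modulo wrap.  */
      /* First TBL: selects from op0, produces 0 when idx >= nunits.  */
      unsigned int res0 = idx < nunits ? op0[idx] : 0;
      /* Second TBL: indices biased by -nunits so the upper half selects
         from op1; again 0 when out of range (wraps to a large value).  */
      unsigned int idx1 = idx - nunits;
      unsigned int res1 = idx1 < nunits ? op1[idx1] : 0;
      out[i] = res0 | res1;                         /* IOR of the two TBLs.  */
    }
}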
22173 /* Recognize patterns suitable for the TRN instructions. */
22175 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
22178 poly_uint64 nelt
= d
->perm
.length ();
22179 rtx out
, in0
, in1
, x
;
22180 machine_mode vmode
= d
->vmode
;
22182 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
22185 /* Note that these are little-endian tests.
22186 We correct for big-endian later. */
22187 if (!d
->perm
[0].is_constant (&odd
)
22188 || (odd
!= 0 && odd
!= 1)
22189 || !d
->perm
.series_p (0, 2, odd
, 2)
22190 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
22199 /* We don't need a big-endian lane correction for SVE; see the comment
22200 at the head of aarch64-sve.md for details. */
22201 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
22203 x
= in0
, in0
= in1
, in1
= x
;
22208 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
22209 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
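/* A standalone sketch (not part of GCC) of the index pattern recognised by
   aarch64_evpc_trn above: with nelt elements per input, TRN1 corresponds to
   the permute {0, nelt, 2, nelt+2, ...} and TRN2 to {1, nelt+1, 3, ...}.
   The function name is illustrative; it returns -1 when the pattern is not
   a TRN, otherwise 0 for TRN1 or 1 for TRN2.  */
static int
sketch_trn_variant (const unsigned int *perm, unsigned int nelt)
{
  unsigned int odd = perm[0];            /* Must be 0 (TRN1) or 1 (TRN2).  */
  if (odd != 0 && odd != 1)
    return -1;
  for (unsigned int i = 0; i < nelt; i += 2)
    if (perm[i] != i + odd || perm[i + 1] != nelt + i + odd)
      return -1;
  return (int) odd;
}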
22213 /* Try to re-encode the PERM constant so it combines odd and even elements.
22214 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
22215 We retry with this new constant with the full suite of patterns. */
22217 aarch64_evpc_reencode (struct expand_vec_perm_d
*d
)
22219 expand_vec_perm_d newd
;
22220 unsigned HOST_WIDE_INT nelt
;
22222 if (d
->vec_flags
!= VEC_ADVSIMD
)
22225 /* Get the new mode. Always twice the size of the inner
22226 and half the elements. */
22227 poly_uint64 vec_bits
= GET_MODE_BITSIZE (d
->vmode
);
22228 unsigned int new_elt_bits
= GET_MODE_UNIT_BITSIZE (d
->vmode
) * 2;
22229 auto new_elt_mode
= int_mode_for_size (new_elt_bits
, false).require ();
22230 machine_mode new_mode
= aarch64_simd_container_mode (new_elt_mode
, vec_bits
);
22232 if (new_mode
== word_mode
)
22235 /* to_constant is safe since this routine is specific to Advanced SIMD
22237 nelt
= d
->perm
.length ().to_constant ();
22239 vec_perm_builder newpermconst
;
22240 newpermconst
.new_vector (nelt
/ 2, nelt
/ 2, 1);
22242 /* Convert the perm constant if we can. Require even, odd as the pairs. */
22243 for (unsigned int i
= 0; i
< nelt
; i
+= 2)
22245 poly_int64 elt0
= d
->perm
[i
];
22246 poly_int64 elt1
= d
->perm
[i
+ 1];
22248 if (!multiple_p (elt0
, 2, &newelt
) || maybe_ne (elt0
+ 1, elt1
))
22250 newpermconst
.quick_push (newelt
.to_constant ());
22252 newpermconst
.finalize ();
22254 newd
.vmode
= new_mode
;
22255 newd
.vec_flags
= VEC_ADVSIMD
;
22256 newd
.target
= d
->target
? gen_lowpart (new_mode
, d
->target
) : NULL
;
22257 newd
.op0
= d
->op0
? gen_lowpart (new_mode
, d
->op0
) : NULL
;
22258 newd
.op1
= d
->op1
? gen_lowpart (new_mode
, d
->op1
) : NULL
;
22259 newd
.testing_p
= d
->testing_p
;
22260 newd
.one_vector_p
= d
->one_vector_p
;
22262 newd
.perm
.new_vector (newpermconst
, newd
.one_vector_p
? 1 : 2, nelt
/ 2);
22263 return aarch64_expand_vec_perm_const_1 (&newd
);
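/* A standalone sketch (not part of GCC) of the re-encoding check performed
   above: each adjacent index pair (2k, 2k+1) of the original permute becomes
   the single index k of a permute on elements twice as wide, e.g.
   {0, 1, 4, 5} on V4SF becomes {0, 2} on V2DI.  The function name is
   illustrative; it returns false when any pair cannot be merged.  */
static bool
sketch_reencode_perm (const unsigned int *perm, unsigned int nelt,
                      unsigned int *new_perm)
{
  for (unsigned int i = 0; i < nelt; i += 2)
    {
      unsigned int elt0 = perm[i], elt1 = perm[i + 1];
      /* The pair must start on an even element and be consecutive.  */
      if ((elt0 & 1) != 0 || elt1 != elt0 + 1)
        return false;
      new_perm[i / 2] = elt0 / 2;
    }
  return true;
}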
22266 /* Recognize patterns suitable for the UZP instructions. */
22268 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
22271 rtx out
, in0
, in1
, x
;
22272 machine_mode vmode
= d
->vmode
;
22274 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
22277 /* Note that these are little-endian tests.
22278 We correct for big-endian later. */
22279 if (!d
->perm
[0].is_constant (&odd
)
22280 || (odd
!= 0 && odd
!= 1)
22281 || !d
->perm
.series_p (0, 1, odd
, 2))
22290 /* We don't need a big-endian lane correction for SVE; see the comment
22291 at the head of aarch64-sve.md for details. */
22292 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
22294 x
= in0
, in0
= in1
, in1
= x
;
22299 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
22300 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
22304 /* Recognize patterns suitable for the ZIP instructions. */
22306 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
22309 poly_uint64 nelt
= d
->perm
.length ();
22310 rtx out
, in0
, in1
, x
;
22311 machine_mode vmode
= d
->vmode
;
22313 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
22316 /* Note that these are little-endian tests.
22317 We correct for big-endian later. */
22318 poly_uint64 first
= d
->perm
[0];
22319 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
22320 || !d
->perm
.series_p (0, 2, first
, 1)
22321 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
22323 high
= maybe_ne (first
, 0U);
22331 /* We don't need a big-endian lane correction for SVE; see the comment
22332 at the head of aarch64-sve.md for details. */
22333 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
22335 x
= in0
, in0
= in1
, in1
= x
;
22340 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
22341 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
22345 /* Recognize patterns for the EXT insn. */
22348 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
22350 HOST_WIDE_INT location
;
22353 /* The first element always refers to the first vector.
22354 Check if the extracted indices are increasing by one. */
22355 if (d
->vec_flags
== VEC_SVE_PRED
22356 || !d
->perm
[0].is_constant (&location
)
22357 || !d
->perm
.series_p (0, 1, location
, 1))
22364 /* The case where (location == 0) is a no-op for both big- and little-endian,
22365 and is removed by the mid-end at optimization levels -O1 and higher.
22367 We don't need a big-endian lane correction for SVE; see the comment
22368 at the head of aarch64-sve.md for details. */
22369 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
22371 /* After setup, we want the high elements of the first vector (stored
22372 at the LSB end of the register), and the low elements of the second
22373 vector (stored at the MSB end of the register). So swap. */
22374 std::swap (d
->op0
, d
->op1
);
22375 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
22376 to_constant () is safe since this is restricted to Advanced SIMD
22378 location
= d
->perm
.length ().to_constant () - location
;
22381 offset
= GEN_INT (location
);
22382 emit_set_insn (d
->target
,
22383 gen_rtx_UNSPEC (d
->vmode
,
22384 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
22389 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
22390 within each 64-bit, 32-bit or 16-bit granule. */
22393 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
22395 HOST_WIDE_INT diff
;
22396 unsigned int i
, size
, unspec
;
22397 machine_mode pred_mode
;
22399 if (d
->vec_flags
== VEC_SVE_PRED
22400 || !d
->one_vector_p
22401 || !d
->perm
[0].is_constant (&diff
)
22405 if (d
->vec_flags
& VEC_SVE_DATA
)
22406 size
= (diff
+ 1) * aarch64_sve_container_bits (d
->vmode
);
22408 size
= (diff
+ 1) * GET_MODE_UNIT_BITSIZE (d
->vmode
);
22411 unspec
= UNSPEC_REV64
;
22412 pred_mode
= VNx2BImode
;
22414 else if (size
== 32)
22416 unspec
= UNSPEC_REV32
;
22417 pred_mode
= VNx4BImode
;
22419 else if (size
== 16)
22421 unspec
= UNSPEC_REV16
;
22422 pred_mode
= VNx8BImode
;
22427 unsigned int step
= diff
+ 1;
22428 for (i
= 0; i
< step
; ++i
)
22429 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
22436 if (d
->vec_flags
& VEC_SVE_DATA
)
22438 rtx pred
= aarch64_ptrue_reg (pred_mode
);
22439 emit_insn (gen_aarch64_sve_revbhw (d
->vmode
, pred_mode
,
22440 d
->target
, pred
, d
->op0
));
22443 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
22444 emit_set_insn (d
->target
, src
);
22448 /* Recognize patterns for the REV insn, which reverses elements within
22452 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
22454 poly_uint64 nelt
= d
->perm
.length ();
22456 if (!d
->one_vector_p
|| d
->vec_flags
== VEC_ADVSIMD
)
22459 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
22466 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
22467 emit_set_insn (d
->target
, src
);
22472 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
22474 rtx out
= d
->target
;
22477 machine_mode vmode
= d
->vmode
;
22480 if (d
->vec_flags
== VEC_SVE_PRED
22481 || d
->perm
.encoding ().encoded_nelts () != 1
22482 || !d
->perm
[0].is_constant (&elt
))
22485 if ((d
->vec_flags
& VEC_SVE_DATA
)
22486 && elt
* (aarch64_sve_container_bits (vmode
) / 8) >= 64)
22493 /* The generic preparation in aarch64_expand_vec_perm_const_1
22494 swaps the operand order and the permute indices if it finds
22495 d->perm[0] to be in the second operand. Thus, we can always
22496 use d->op0 and need not do any extra arithmetic to get the
22497 correct lane number. */
22499 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
22501 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
22502 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
22503 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
22508 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
22510 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
22511 machine_mode vmode
= d
->vmode
;
22513 /* Make sure that the indices are constant. */
22514 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
22515 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
22516 if (!d
->perm
[i
].is_constant ())
22522 /* Generic code will try constant permutation twice. Once with the
22523 original mode and again with the elements lowered to QImode.
22524 So wait and don't do the selector expansion ourselves. */
22525 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
22528 /* to_constant is safe since this routine is specific to Advanced SIMD
22530 unsigned int nelt
= d
->perm
.length ().to_constant ();
22531 for (unsigned int i
= 0; i
< nelt
; ++i
)
22532 /* If big-endian and two vectors we end up with a weird mixed-endian
22533 mode on NEON. Reverse the index within each word but not the word
22534 itself. to_constant is safe because we checked is_constant above. */
22535 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
22536 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
22537 : d
->perm
[i
].to_constant ());
22539 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
22540 sel
= force_reg (vmode
, sel
);
22542 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
22546 /* Try to implement D using an SVE TBL instruction. */
22549 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
22551 unsigned HOST_WIDE_INT nelt
;
22553 /* Permuting two variable-length vectors could overflow the
22555 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
22561 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
22562 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
22563 if (d
->one_vector_p
)
22564 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
22566 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
22570 /* Try to implement D using SVE SEL instruction. */
22573 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
22575 machine_mode vmode
= d
->vmode
;
22576 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
22578 if (d
->vec_flags
!= VEC_SVE_DATA
22582 int n_patterns
= d
->perm
.encoding ().npatterns ();
22583 poly_int64 vec_len
= d
->perm
.length ();
22585 for (int i
= 0; i
< n_patterns
; ++i
)
22586 if (!known_eq (d
->perm
[i
], i
)
22587 && !known_eq (d
->perm
[i
], vec_len
+ i
))
22590 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
22591 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
22592 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
22598 machine_mode pred_mode
= aarch64_sve_pred_mode (vmode
);
22600 /* Build a predicate that is true when op0 elements should be used. */
22601 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
22602 for (int i
= 0; i
< n_patterns
* 2; i
++)
22604 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
22605 : CONST0_RTX (BImode
);
22606 builder
.quick_push (elem
);
22609 rtx const_vec
= builder
.build ();
22610 rtx pred
= force_reg (pred_mode
, const_vec
);
22611 /* TARGET = PRED ? OP0 : OP1. */
22612 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op0
, d
->op1
, pred
));
22616 /* Recognize patterns suitable for the INS instructions. */
22618 aarch64_evpc_ins (struct expand_vec_perm_d
*d
)
22620 machine_mode mode
= d
->vmode
;
22621 unsigned HOST_WIDE_INT nelt
;
22623 if (d
->vec_flags
!= VEC_ADVSIMD
)
22626 /* to_constant is safe since this routine is specific to Advanced SIMD
22628 nelt
= d
->perm
.length ().to_constant ();
22631 HOST_WIDE_INT idx
= -1;
22633 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
22636 if (!d
->perm
[i
].is_constant (&elt
))
22638 if (elt
== (HOST_WIDE_INT
) i
)
22651 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
22653 if (d
->perm
[i
].to_constant () == (HOST_WIDE_INT
) (i
+ nelt
))
22667 gcc_assert (idx
!= -1);
22669 unsigned extractindex
= d
->perm
[idx
].to_constant ();
22670 rtx extractv
= d
->op0
;
22671 if (extractindex
>= nelt
)
22674 extractindex
-= nelt
;
22676 gcc_assert (extractindex
< nelt
);
22678 emit_move_insn (d
->target
, insv
);
22679 insn_code icode
= code_for_aarch64_simd_vec_copy_lane (mode
);
22680 expand_operand ops
[5];
22681 create_output_operand (&ops
[0], d
->target
, mode
);
22682 create_input_operand (&ops
[1], d
->target
, mode
);
22683 create_integer_operand (&ops
[2], 1 << idx
);
22684 create_input_operand (&ops
[3], extractv
, mode
);
22685 create_integer_operand (&ops
[4], extractindex
);
22686 expand_insn (icode
, 5, ops
);
22692 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
22694 /* The pattern matching functions above are written to look for a small
22695 number to begin the sequence (0, 1, N/2). If we begin with an index
22696 from the second operand, we can swap the operands. */
22697 poly_int64 nelt
= d
->perm
.length ();
22698 if (known_ge (d
->perm
[0], nelt
))
22700 d
->perm
.rotate_inputs (1);
22701 std::swap (d
->op0
, d
->op1
);
22704 if ((d
->vec_flags
== VEC_ADVSIMD
22705 || d
->vec_flags
== VEC_SVE_DATA
22706 || d
->vec_flags
== (VEC_SVE_DATA
| VEC_PARTIAL
)
22707 || d
->vec_flags
== VEC_SVE_PRED
)
22708 && known_gt (nelt
, 1))
22710 if (aarch64_evpc_rev_local (d
))
22712 else if (aarch64_evpc_rev_global (d
))
22714 else if (aarch64_evpc_ext (d
))
22716 else if (aarch64_evpc_dup (d
))
22718 else if (aarch64_evpc_zip (d
))
22720 else if (aarch64_evpc_uzp (d
))
22722 else if (aarch64_evpc_trn (d
))
22724 else if (aarch64_evpc_sel (d
))
22726 else if (aarch64_evpc_ins (d
))
22728 else if (aarch64_evpc_reencode (d
))
22730 if (d
->vec_flags
== VEC_SVE_DATA
)
22731 return aarch64_evpc_sve_tbl (d
);
22732 else if (d
->vec_flags
== VEC_ADVSIMD
)
22733 return aarch64_evpc_tbl (d
);
22738 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22741 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
22742 rtx op1
, const vec_perm_indices
&sel
)
22744 struct expand_vec_perm_d d
;
22746 /* Check whether the mask can be applied to a single vector. */
22747 if (sel
.ninputs () == 1
22748 || (op0
&& rtx_equal_p (op0
, op1
)))
22749 d
.one_vector_p
= true;
22750 else if (sel
.all_from_input_p (0))
22752 d
.one_vector_p
= true;
22755 else if (sel
.all_from_input_p (1))
22757 d
.one_vector_p
= true;
22761 d
.one_vector_p
= false;
22763 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
22764 sel
.nelts_per_input ());
22766 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
22768 d
.op0
= op0
? force_reg (vmode
, op0
) : NULL_RTX
;
22772 d
.op1
= op1
? force_reg (vmode
, op1
) : NULL_RTX
;
22773 d
.testing_p
= !target
;
22776 return aarch64_expand_vec_perm_const_1 (&d
);
22778 rtx_insn
*last
= get_last_insn ();
22779 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
22780 gcc_assert (last
== get_last_insn ());
/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
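/* A standalone sketch (not part of GCC) of the constant built above, computed
   in plain C: for each of NUNITS elements of USIZE bytes, the selected source
   byte is the mirror image within that element, which byte-reverses every
   lane of a 16-byte register.  The function name is illustrative.  */
static void
sketch_reverse_mask_bytes (unsigned char mask[16], unsigned int nunits,
                           unsigned int usize)
{
  for (unsigned int i = 0; i < nunits; i++)
    for (unsigned int j = 0; j < usize; j++)
      mask[i * usize + j] = (unsigned char) ((i + 1) * usize - 1 - j);
}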
22808 /* Expand an SVE integer comparison using the SVE equivalent of:
22810 (set TARGET (CODE OP0 OP1)). */
22813 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
22815 machine_mode pred_mode
= GET_MODE (target
);
22816 machine_mode data_mode
= GET_MODE (op0
);
22817 rtx res
= aarch64_sve_emit_int_cmp (target
, pred_mode
, code
, data_mode
,
22819 if (!rtx_equal_p (target
, res
))
22820 emit_move_insn (target
, res
);
22823 /* Return the UNSPEC_COND_* code for comparison CODE. */
22825 static unsigned int
22826 aarch64_unspec_cond_code (rtx_code code
)
22831 return UNSPEC_COND_FCMNE
;
22833 return UNSPEC_COND_FCMEQ
;
22835 return UNSPEC_COND_FCMLT
;
22837 return UNSPEC_COND_FCMGT
;
22839 return UNSPEC_COND_FCMLE
;
22841 return UNSPEC_COND_FCMGE
;
22843 return UNSPEC_COND_FCMUO
;
22845 gcc_unreachable ();
22851 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
22853 where <X> is the operation associated with comparison CODE.
22854 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
22857 aarch64_emit_sve_fp_cond (rtx target
, rtx_code code
, rtx pred
,
22858 bool known_ptrue_p
, rtx op0
, rtx op1
)
22860 rtx flag
= gen_int_mode (known_ptrue_p
, SImode
);
22861 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
22862 gen_rtvec (4, pred
, flag
, op0
, op1
),
22863 aarch64_unspec_cond_code (code
));
22864 emit_set_insn (target
, unspec
);
22867 /* Emit the SVE equivalent of:
22869 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
22870 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
22871 (set TARGET (ior:PRED_MODE TMP1 TMP2))
22873 where <Xi> is the operation associated with comparison CODEi.
22874 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
22877 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
22878 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
22880 machine_mode pred_mode
= GET_MODE (pred
);
22881 rtx tmp1
= gen_reg_rtx (pred_mode
);
22882 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
22883 rtx tmp2
= gen_reg_rtx (pred_mode
);
22884 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
22885 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
22888 /* Emit the SVE equivalent of:
22890 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
22891 (set TARGET (not TMP))
22893 where <X> is the operation associated with comparison CODE.
22894 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
22897 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
22898 bool known_ptrue_p
, rtx op0
, rtx op1
)
22900 machine_mode pred_mode
= GET_MODE (pred
);
22901 rtx tmp
= gen_reg_rtx (pred_mode
);
22902 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
22903 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
22906 /* Expand an SVE floating-point comparison using the SVE equivalent of:
22908 (set TARGET (CODE OP0 OP1))
22910 If CAN_INVERT_P is true, the caller can also handle inverted results;
22911 return true if the result is in fact inverted. */
22914 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
22915 rtx op0
, rtx op1
, bool can_invert_p
)
22917 machine_mode pred_mode
= GET_MODE (target
);
22918 machine_mode data_mode
= GET_MODE (op0
);
22920 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
22924 /* UNORDERED has no immediate form. */
22925 op1
= force_reg (data_mode
, op1
);
22934 /* There is native support for the comparison. */
22935 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
22940 /* This is a trapping operation (LT or GT). */
22941 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
22945 if (!flag_trapping_math
)
22947 /* This would trap for signaling NaNs. */
22948 op1
= force_reg (data_mode
, op1
);
22949 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
22950 ptrue
, true, op0
, op1
);
22958 if (flag_trapping_math
)
22960 /* Work out which elements are ordered. */
22961 rtx ordered
= gen_reg_rtx (pred_mode
);
22962 op1
= force_reg (data_mode
, op1
);
22963 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
22964 ptrue
, true, op0
, op1
);
22966 /* Test the opposite condition for the ordered elements,
22967 then invert the result. */
22971 code
= reverse_condition_maybe_unordered (code
);
22974 aarch64_emit_sve_fp_cond (target
, code
,
22975 ordered
, false, op0
, op1
);
22978 aarch64_emit_sve_invert_fp_cond (target
, code
,
22979 ordered
, false, op0
, op1
);
22985 /* ORDERED has no immediate form. */
22986 op1
= force_reg (data_mode
, op1
);
22990 gcc_unreachable ();
22993 /* There is native support for the inverse comparison. */
22994 code
= reverse_condition_maybe_unordered (code
);
22997 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
23000 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
23004 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
23005 of the data being selected and CMP_MODE is the mode of the values being
23009 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
23012 machine_mode pred_mode
= aarch64_get_mask_mode (cmp_mode
).require ();
23013 rtx pred
= gen_reg_rtx (pred_mode
);
23014 if (FLOAT_MODE_P (cmp_mode
))
23016 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
23017 ops
[4], ops
[5], true))
23018 std::swap (ops
[1], ops
[2]);
23021 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
23023 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
23024 ops
[1] = force_reg (data_mode
, ops
[1]);
23025 /* The "false" value can only be zero if the "true" value is a constant. */
23026 if (register_operand (ops
[1], data_mode
)
23027 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
23028 ops
[2] = force_reg (data_mode
, ops
[2]);
23030 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
23031 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However, due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */
23043 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
23045 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
23048 /* We specifically want to allow elements of "structure" modes to
23049 be tieable to the structure. This more general condition allows
23050 other rarer situations too. The reason we don't extend this to
23051 predicate modes is that there are no predicate structure modes
23052 nor any specific instructions for extracting part of a predicate
23054 if (aarch64_vector_data_mode_p (mode1
)
23055 && aarch64_vector_data_mode_p (mode2
))
23058 /* Also allow any scalar modes with vectors. */
23059 if (aarch64_vector_mode_supported_p (mode1
)
23060 || aarch64_vector_mode_supported_p (mode2
))
/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
                                    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
23087 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
23091 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
23094 /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
23095 address copies using V4SImode so that we can use Q registers. */
23096 if (known_eq (GET_MODE_BITSIZE (mode
), 256))
23099 rtx reg1
= gen_reg_rtx (mode
);
23100 rtx reg2
= gen_reg_rtx (mode
);
23101 /* "Cast" the pointers to the correct mode. */
23102 *src
= adjust_address (*src
, mode
, 0);
23103 *dst
= adjust_address (*dst
, mode
, 0);
23104 /* Emit the memcpy. */
23105 emit_insn (aarch64_gen_load_pair (mode
, reg1
, *src
, reg2
,
23106 aarch64_progress_pointer (*src
)));
23107 emit_insn (aarch64_gen_store_pair (mode
, *dst
, reg1
,
23108 aarch64_progress_pointer (*dst
), reg2
));
23109 /* Move the pointers forward. */
23110 *src
= aarch64_move_pointer (*src
, 32);
23111 *dst
= aarch64_move_pointer (*dst
, 32);
23115 rtx reg
= gen_reg_rtx (mode
);
23117 /* "Cast" the pointers to the correct mode. */
23118 *src
= adjust_address (*src
, mode
, 0);
23119 *dst
= adjust_address (*dst
, mode
, 0);
23120 /* Emit the memcpy. */
23121 emit_move_insn (reg
, *src
);
23122 emit_move_insn (*dst
, reg
);
23123 /* Move the pointers forward. */
23124 *src
= aarch64_progress_pointer (*src
);
23125 *dst
= aarch64_progress_pointer (*dst
);
23128 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
23129 we succeed, otherwise return false. */
23132 aarch64_expand_cpymem (rtx
*operands
)
23135 rtx dst
= operands
[0];
23136 rtx src
= operands
[1];
23138 machine_mode cur_mode
= BLKmode
;
23140 /* Only expand fixed-size copies. */
23141 if (!CONST_INT_P (operands
[2]))
23144 unsigned HOST_WIDE_INT size
= INTVAL (operands
[2]);
23146 /* Inline up to 256 bytes when optimizing for speed. */
23147 unsigned HOST_WIDE_INT max_copy_size
= 256;
23149 if (optimize_function_for_size_p (cfun
))
23150 max_copy_size
= 128;
23152 int copy_bits
= 256;
23154 /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
23155 support or slow 256-bit LDP/STP fall back to 128-bit chunks. */
23158 || (aarch64_tune_params
.extra_tuning_flags
23159 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
))
23162 max_copy_size
= max_copy_size
/ 2;
23165 if (size
> max_copy_size
)
23168 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
23169 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
23171 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
23172 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
23174 /* Convert size to bits to make the rest of the code simpler. */
23175 int n
= size
* BITS_PER_UNIT
;
/* Find the largest mode in which to do the copy without over-reading
   or over-writing.  */
23181 opt_scalar_int_mode mode_iter
;
23182 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
23183 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_bits
))
23184 cur_mode
= mode_iter
.require ();
23186 gcc_assert (cur_mode
!= BLKmode
);
23188 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
23190 /* Prefer Q-register accesses for the last bytes. */
23191 if (mode_bits
== 128 && copy_bits
== 256)
23192 cur_mode
= V4SImode
;
23194 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
23198 /* Emit trailing copies using overlapping unaligned accesses - this is
23199 smaller and faster. */
23200 if (n
> 0 && n
< copy_bits
/ 2)
23202 machine_mode next_mode
= smallest_mode_for_size (n
, MODE_INT
);
23203 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
23204 gcc_assert (n_bits
<= mode_bits
);
23205 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
23206 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
23214 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
23215 SRC is a register we have created with the duplicated value to be set. */
23217 aarch64_set_one_block_and_progress_pointer (rtx src
, rtx
*dst
,
/* If we are copying 128 bits or 256 bits, we can do that straight from
   the SIMD register we prepared.  */
23222 if (known_eq (GET_MODE_BITSIZE (mode
), 256))
23224 mode
= GET_MODE (src
);
23225 /* "Cast" the *dst to the correct mode. */
23226 *dst
= adjust_address (*dst
, mode
, 0);
23227 /* Emit the memset. */
23228 emit_insn (aarch64_gen_store_pair (mode
, *dst
, src
,
23229 aarch64_progress_pointer (*dst
), src
));
23231 /* Move the pointers forward. */
23232 *dst
= aarch64_move_pointer (*dst
, 32);
23235 if (known_eq (GET_MODE_BITSIZE (mode
), 128))
23237 /* "Cast" the *dst to the correct mode. */
23238 *dst
= adjust_address (*dst
, GET_MODE (src
), 0);
23239 /* Emit the memset. */
23240 emit_move_insn (*dst
, src
);
23241 /* Move the pointers forward. */
23242 *dst
= aarch64_move_pointer (*dst
, 16);
23245 /* For copying less, we have to extract the right amount from src. */
23246 rtx reg
= lowpart_subreg (mode
, src
, GET_MODE (src
));
23248 /* "Cast" the *dst to the correct mode. */
23249 *dst
= adjust_address (*dst
, mode
, 0);
23250 /* Emit the memset. */
23251 emit_move_insn (*dst
, reg
);
23252 /* Move the pointer forward. */
23253 *dst
= aarch64_progress_pointer (*dst
);
23256 /* Expand setmem, as if from a __builtin_memset. Return true if
23257 we succeed, otherwise return false. */
23260 aarch64_expand_setmem (rtx
*operands
)
23263 unsigned HOST_WIDE_INT len
;
23264 rtx dst
= operands
[0];
23265 rtx val
= operands
[2], src
;
23267 machine_mode cur_mode
= BLKmode
, next_mode
;
23269 /* We can't do anything smart if the amount to copy is not constant. */
23270 if (!CONST_INT_P (operands
[1]))
23273 bool speed_p
= !optimize_function_for_size_p (cfun
);
23275 /* Default the maximum to 256-bytes. */
23276 unsigned max_set_size
= 256;
23278 /* In case we are optimizing for size or if the core does not
23279 want to use STP Q regs, lower the max_set_size. */
23280 max_set_size
= (!speed_p
23281 || (aarch64_tune_params
.extra_tuning_flags
23282 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
))
23283 ? max_set_size
/ 2 : max_set_size
;
23285 len
= INTVAL (operands
[1]);
23287 /* Upper bound check. */
23288 if (len
> max_set_size
)
23291 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
23292 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
23294 /* Prepare the val using a DUP/MOVI v0.16B, val. */
23295 src
= expand_vector_broadcast (V16QImode
, val
);
23296 src
= force_reg (V16QImode
, src
);
23298 /* Convert len to bits to make the rest of the code simpler. */
23299 n
= len
* BITS_PER_UNIT
;
23301 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
23302 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. setmem expand
23303 pattern is only turned on for TARGET_SIMD. */
23304 const int copy_limit
= (speed_p
23305 && (aarch64_tune_params
.extra_tuning_flags
23306 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
))
23307 ? GET_MODE_BITSIZE (TImode
) : 256;
/* Find the largest mode in which to do the copy without over-writing.  */
23313 opt_scalar_int_mode mode_iter
;
23314 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
23315 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
23316 cur_mode
= mode_iter
.require ();
23318 gcc_assert (cur_mode
!= BLKmode
);
23320 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
23321 aarch64_set_one_block_and_progress_pointer (src
, &dst
, cur_mode
);
/* Do certain trailing copies as overlapping if doing so is going to be
   cheaper, i.e. fewer instructions.  For instance, for a 15-byte copy it
   is more efficient to do two overlapping 8-byte copies than several
   smaller non-overlapping ones.  */
23329 if (n
> 0 && n
< copy_limit
/ 2)
23331 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
23332 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
23333 gcc_assert (n_bits
<= mode_bits
);
23334 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
23343 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
23344 SImode stores. Handle the case when the constant has identical
23345 bottom and top halves. This is beneficial when the two stores can be
23346 merged into an STP and we avoid synthesising potentially expensive
23347 immediates twice. Return true if such a split is possible. */
23350 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
23352 rtx lo
= gen_lowpart (SImode
, src
);
23353 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
23355 bool size_p
= optimize_function_for_size_p (cfun
);
23357 if (!rtx_equal_p (lo
, hi
))
23360 unsigned int orig_cost
23361 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
23362 unsigned int lo_cost
23363 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
23365 /* We want to transform:
23367 MOVK x1, 0x140, lsl 16
23368 MOVK x1, 0xc0da, lsl 32
23369 MOVK x1, 0x140, lsl 48
23373 MOVK w1, 0x140, lsl 16
23375 So we want to perform this only when we save two instructions
23376 or more. When optimizing for size, however, accept any code size
23378 if (size_p
&& orig_cost
<= lo_cost
)
23382 && (orig_cost
<= lo_cost
+ 1))
23385 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
23386 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
23389 rtx tmp_reg
= gen_reg_rtx (SImode
);
23390 aarch64_expand_mov_immediate (tmp_reg
, lo
);
23391 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
23392 /* Don't emit an explicit store pair as this may not be always profitable.
23393 Let the sched-fusion logic decide whether to merge them. */
23394 emit_move_insn (mem_lo
, tmp_reg
);
23395 emit_move_insn (mem_hi
, tmp_reg
);
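/* A standalone sketch (not part of GCC) of the transformation above at the C
   level: a 64-bit constant whose two 32-bit halves are identical can be
   stored as two 32-bit stores of one materialised half, which needs fewer
   MOV/MOVK instructions and lets sched-fusion merge the stores into an STP.
   The function name is illustrative and __builtin_memcpy stands in for the
   two word stores.  */
static void
sketch_store_dimode_const (unsigned long long *dst, unsigned long long val)
{
  unsigned int lo = (unsigned int) val;
  unsigned int hi = (unsigned int) (val >> 32);
  if (lo == hi)
    {
      /* Materialise the 32-bit half once and store it to both halves.  */
      __builtin_memcpy ((char *) dst, &lo, 4);
      __builtin_memcpy ((char *) dst + 4, &lo, 4);
    }
  else
    *dst = val;                 /* Keep the single 64-bit store.  */
}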
23400 /* Generate RTL for a conditional branch with rtx comparison CODE in
23401 mode CC_MODE. The destination of the unlikely conditional branch
23405 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
23409 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
23410 gen_rtx_REG (cc_mode
, CC_REGNUM
),
23413 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
23414 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
23416 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
23419 /* Generate DImode scratch registers for 128-bit (TImode) addition.
23421 OP1 represents the TImode destination operand 1
23422 OP2 represents the TImode destination operand 2
23423 LOW_DEST represents the low half (DImode) of TImode operand 0
23424 LOW_IN1 represents the low half (DImode) of TImode operand 1
23425 LOW_IN2 represents the low half (DImode) of TImode operand 2
23426 HIGH_DEST represents the high half (DImode) of TImode operand 0
23427 HIGH_IN1 represents the high half (DImode) of TImode operand 1
23428 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
23431 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
23432 rtx
*low_in1
, rtx
*low_in2
,
23433 rtx
*high_dest
, rtx
*high_in1
,
23436 *low_dest
= gen_reg_rtx (DImode
);
23437 *low_in1
= gen_lowpart (DImode
, op1
);
23438 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
23439 subreg_lowpart_offset (DImode
, TImode
));
23440 *high_dest
= gen_reg_rtx (DImode
);
23441 *high_in1
= gen_highpart (DImode
, op1
);
23442 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
23443 subreg_highpart_offset (DImode
, TImode
));
23446 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
This function differs from 'aarch64_addti_scratch_regs' in that
23449 OP1 can be an immediate constant (zero). We must call
23450 subreg_highpart_offset with DImode and TImode arguments, otherwise
23451 VOIDmode will be used for the const_int which generates an internal
23452 error from subreg_size_highpart_offset which does not expect a size of zero.
23454 OP1 represents the TImode destination operand 1
23455 OP2 represents the TImode destination operand 2
23456 LOW_DEST represents the low half (DImode) of TImode operand 0
23457 LOW_IN1 represents the low half (DImode) of TImode operand 1
23458 LOW_IN2 represents the low half (DImode) of TImode operand 2
23459 HIGH_DEST represents the high half (DImode) of TImode operand 0
23460 HIGH_IN1 represents the high half (DImode) of TImode operand 1
23461 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
23465 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
23466 rtx
*low_in1
, rtx
*low_in2
,
23467 rtx
*high_dest
, rtx
*high_in1
,
23470 *low_dest
= gen_reg_rtx (DImode
);
23471 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
23472 subreg_lowpart_offset (DImode
, TImode
));
23474 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
23475 subreg_lowpart_offset (DImode
, TImode
));
23476 *high_dest
= gen_reg_rtx (DImode
);
23478 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
23479 subreg_highpart_offset (DImode
, TImode
));
23480 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
23481 subreg_highpart_offset (DImode
, TImode
));
23484 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
23486 OP0 represents the TImode destination operand 0
23487 LOW_DEST represents the low half (DImode) of TImode operand 0
23488 LOW_IN1 represents the low half (DImode) of TImode operand 1
23489 LOW_IN2 represents the low half (DImode) of TImode operand 2
23490 HIGH_DEST represents the high half (DImode) of TImode operand 0
23491 HIGH_IN1 represents the high half (DImode) of TImode operand 1
23492 HIGH_IN2 represents the high half (DImode) of TImode operand 2
23493 UNSIGNED_P is true if the operation is being performed on unsigned
23496 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
23497 rtx low_in2
, rtx high_dest
, rtx high_in1
,
23498 rtx high_in2
, bool unsigned_p
)
23500 if (low_in2
== const0_rtx
)
23502 low_dest
= low_in1
;
23503 high_in2
= force_reg (DImode
, high_in2
);
23505 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
23507 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
23511 if (aarch64_plus_immediate (low_in2
, DImode
))
23512 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
23513 GEN_INT (-UINTVAL (low_in2
))));
23516 low_in2
= force_reg (DImode
, low_in2
);
23517 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
23519 high_in2
= force_reg (DImode
, high_in2
);
23522 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
23524 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
23527 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
23528 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
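/* A standalone sketch (not part of GCC) of how the offset above is used:
   AddressSanitizer maps every 8 bytes of application memory to one shadow
   byte at (addr >> 3) + offset, so the hook returns 1<<29 for ILP32 and
   1<<36 for LP64.  The shift of 3 is the usual ASan default rather than
   something defined in this file, and the function name is illustrative.  */
static unsigned long long
sketch_asan_shadow_address (unsigned long long addr, bool ilp32_p)
{
  unsigned long long offset = ilp32_p ? (1ULL << 29) : (1ULL << 36);
  return (addr >> 3) + offset;  /* One shadow byte per 8 application bytes.  */
}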
23544 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
23545 int code
, tree treeop0
, tree treeop1
)
23547 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
23549 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
23551 struct expand_operand ops
[4];
23554 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
23556 op_mode
= GET_MODE (op0
);
23557 if (op_mode
== VOIDmode
)
23558 op_mode
= GET_MODE (op1
);
23566 icode
= CODE_FOR_cmpsi
;
23571 icode
= CODE_FOR_cmpdi
;
23576 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
23577 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
23582 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
23583 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
23591 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
23592 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
23598 *prep_seq
= get_insns ();
23601 create_fixed_operand (&ops
[0], op0
);
23602 create_fixed_operand (&ops
[1], op1
);
23605 if (!maybe_expand_insn (icode
, 2, ops
))
23610 *gen_seq
= get_insns ();
23613 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
23614 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
23618 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
23619 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
23621 rtx op0
, op1
, target
;
23622 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
23623 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
23625 struct expand_operand ops
[6];
23628 push_to_sequence (*prep_seq
);
23629 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
23631 op_mode
= GET_MODE (op0
);
23632 if (op_mode
== VOIDmode
)
23633 op_mode
= GET_MODE (op1
);
23649 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
23654 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
23662 icode
= code_for_ccmp (cc_mode
, cmp_mode
);
23664 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
23665 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
23671 *prep_seq
= get_insns ();
23674 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
23675 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
23677 if (bit_code
!= AND
)
23679 /* Treat the ccmp patterns as canonical and use them where possible,
23680 but fall back to ccmp_rev patterns if there's no other option. */
23681 rtx_code prev_code
= GET_CODE (prev
);
23682 machine_mode prev_mode
= GET_MODE (XEXP (prev
, 0));
23683 if ((prev_mode
== CCFPmode
|| prev_mode
== CCFPEmode
)
23684 && !(prev_code
== EQ
23686 || prev_code
== ORDERED
23687 || prev_code
== UNORDERED
))
23688 icode
= code_for_ccmp_rev (cc_mode
, cmp_mode
);
23691 rtx_code code
= reverse_condition (prev_code
);
23692 prev
= gen_rtx_fmt_ee (code
, VOIDmode
, XEXP (prev
, 0), const0_rtx
);
23694 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
23697 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
23698 create_fixed_operand (&ops
[1], target
);
23699 create_fixed_operand (&ops
[2], op0
);
23700 create_fixed_operand (&ops
[3], op1
);
23701 create_fixed_operand (&ops
[4], prev
);
23702 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
23704 push_to_sequence (*gen_seq
);
23705 if (!maybe_expand_insn (icode
, 6, ops
))
23711 *gen_seq
= get_insns ();
23714 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
23717 #undef TARGET_GEN_CCMP_FIRST
23718 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
23720 #undef TARGET_GEN_CCMP_NEXT
23721 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
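/* For orientation, the kind of code these two hooks enable: a condition
   such as (a == 0 && b > 42) can typically be evaluated without a branch
   between the comparisons, roughly as

       cmp   x0, #0
       ccmp  x1, #42, #4, eq     // only really compares when the CMP was EQ
       b.gt  .Ltaken

   where the conditional compare forces its NZCV result to a value that
   makes the following condition false (here #4, i.e. Z set, defeating GT)
   whenever the first test already failed.  The exact NZCV immediate
   depends on the condition being tested.  */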
/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
23733 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
23734 should be kept together during scheduling. */
23737 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
23740 rtx prev_set
= single_set (prev
);
23741 rtx curr_set
= single_set (curr
);
23742 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
23743 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
23745 if (!aarch64_macro_fusion_p ())
23748 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
23750 /* We are trying to match:
23751 prev (mov) == (set (reg r0) (const_int imm16))
23752 curr (movk) == (set (zero_extract (reg r0)
23755 (const_int imm16_1)) */
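      /* For instance, the pair being matched here corresponds to assembly
	 such as

	     mov   w0, #0x1234
	     movk  w0, #0x5678, lsl #16

	 which several cores can treat as a single macro-op when the two
	 instructions are adjacent; keeping them together in the schedule
	 is what this hook arranges.  */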
23757 set_dest
= SET_DEST (curr_set
);
23759 if (GET_CODE (set_dest
) == ZERO_EXTRACT
23760 && CONST_INT_P (SET_SRC (curr_set
))
23761 && CONST_INT_P (SET_SRC (prev_set
))
23762 && CONST_INT_P (XEXP (set_dest
, 2))
23763 && INTVAL (XEXP (set_dest
, 2)) == 16
23764 && REG_P (XEXP (set_dest
, 0))
23765 && REG_P (SET_DEST (prev_set
))
23766 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
23772 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
23775 /* We're trying to match:
23776 prev (adrp) == (set (reg r1)
23777 (high (symbol_ref ("SYM"))))
23778 curr (add) == (set (reg r0)
23780 (symbol_ref ("SYM"))))
23781 Note that r0 need not necessarily be the same as r1, especially
23782 during pre-regalloc scheduling. */
23784 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
23785 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
23787 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
23788 && REG_P (XEXP (SET_SRC (curr_set
), 0))
23789 && REGNO (XEXP (SET_SRC (curr_set
), 0))
23790 == REGNO (SET_DEST (prev_set
))
23791 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
23792 XEXP (SET_SRC (curr_set
), 1)))
23797 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
23800 /* We're trying to match:
23801 prev (movk) == (set (zero_extract (reg r0)
23804 (const_int imm16_1))
23805 curr (movk) == (set (zero_extract (reg r0)
23808 (const_int imm16_2)) */
23810 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
23811 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
23812 && REG_P (XEXP (SET_DEST (prev_set
), 0))
23813 && REG_P (XEXP (SET_DEST (curr_set
), 0))
23814 && REGNO (XEXP (SET_DEST (prev_set
), 0))
23815 == REGNO (XEXP (SET_DEST (curr_set
), 0))
23816 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
23817 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
23818 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
23819 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
23820 && CONST_INT_P (SET_SRC (prev_set
))
23821 && CONST_INT_P (SET_SRC (curr_set
)))
23825 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
23827 /* We're trying to match:
23828 prev (adrp) == (set (reg r0)
23829 (high (symbol_ref ("SYM"))))
23830 curr (ldr) == (set (reg r1)
23831 (mem (lo_sum (reg r0)
23832 (symbol_ref ("SYM")))))
23834 curr (ldr) == (set (reg r1)
23837 (symbol_ref ("SYM")))))) */
23838 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
23839 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
23841 rtx curr_src
= SET_SRC (curr_set
);
23843 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
23844 curr_src
= XEXP (curr_src
, 0);
23846 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
23847 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
23848 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
23849 == REGNO (SET_DEST (prev_set
))
23850 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
23851 XEXP (SET_SRC (prev_set
), 0)))
23856 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
23857 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
23858 && prev_set
&& curr_set
&& any_condjump_p (curr
)
23859 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
23860 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
23861 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
23864 /* Fuse flag-setting ALU instructions and conditional branch. */
23865 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
23866 && any_condjump_p (curr
))
23868 unsigned int condreg1
, condreg2
;
23870 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
23871 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
23873 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
23875 && modified_in_p (cc_reg_1
, prev
))
23877 enum attr_type prev_type
= get_attr_type (prev
);
	  /* FIXME: this misses some cases that are considered simple
	     arithmetic instructions for ThunderX.  Simple shifts are
	     missed here.  */
23881 if (prev_type
== TYPE_ALUS_SREG
23882 || prev_type
== TYPE_ALUS_IMM
23883 || prev_type
== TYPE_LOGICS_REG
23884 || prev_type
== TYPE_LOGICS_IMM
)
23889 /* Fuse ALU instructions and CBZ/CBNZ. */
23892 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ
)
23893 && any_condjump_p (curr
))
23895 /* We're trying to match:
23896 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
23897 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
23899 (label_ref ("SYM"))
23901 if (SET_DEST (curr_set
) == (pc_rtx
)
23902 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
23903 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
23904 && REG_P (SET_DEST (prev_set
))
23905 && REGNO (SET_DEST (prev_set
))
23906 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
23908 /* Fuse ALU operations followed by conditional branch instruction. */
23909 switch (get_attr_type (prev
))
23912 case TYPE_ALU_SREG
:
23915 case TYPE_ADCS_REG
:
23916 case TYPE_ADCS_IMM
:
23917 case TYPE_LOGIC_REG
:
23918 case TYPE_LOGIC_IMM
:
23922 case TYPE_SHIFT_REG
:
23923 case TYPE_SHIFT_IMM
:
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};

/* If INSN is a load or store of address in the form of [base+offset],
   extract the two parts and set to BASE and OFFSET.  Return scheduling
   fusion type this INSN is.  */

static enum sched_fusion_type
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
{
  rtx x, dest, src;
  enum sched_fusion_type fusion = SCHED_FUSION_LD;

  gcc_assert (INSN_P (insn));
  x = PATTERN (insn);
  if (GET_CODE (x) != SET)
    return SCHED_FUSION_NONE;

  src = SET_SRC (x);
  dest = SET_DEST (x);

  machine_mode dest_mode = GET_MODE (dest);

  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
    return SCHED_FUSION_NONE;

  if (GET_CODE (src) == SIGN_EXTEND)
    {
      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
      src = XEXP (src, 0);
      if (!MEM_P (src) || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }
  else if (GET_CODE (src) == ZERO_EXTEND)
    {
      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
      src = XEXP (src, 0);
      if (!MEM_P (src) || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }

  if (MEM_P (src) && REG_P (dest))
    extract_base_offset_in_addr (src, base, offset);
  else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
    {
      fusion = SCHED_FUSION_ST;
      extract_base_offset_in_addr (dest, base, offset);
    }
  else
    return SCHED_FUSION_NONE;

  if (*base == NULL_RTX || *offset == NULL_RTX)
    fusion = SCHED_FUSION_NONE;

  return fusion;
}
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr and str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */
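/* A small illustration of the intent: two loads such as

       ldr  w1, [x2, #4]
       ldr  w3, [x2, #8]

   receive the same FUSION_PRI (same fusion type and base register x2) and
   PRI values that differ only through their offsets, so the scheduler is
   encouraged to place them back to back where an LDP can later be formed.  */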
24055 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
24056 int *fusion_pri
, int *pri
)
24060 enum sched_fusion_type fusion
;
24062 gcc_assert (INSN_P (insn
));
24065 fusion
= fusion_load_store (insn
, &base
, &offset
);
24066 if (fusion
== SCHED_FUSION_NONE
)
24073 /* Set FUSION_PRI according to fusion type and base register. */
24074 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
24076 /* Calculate PRI. */
24079 /* INSN with smaller offset goes first. */
24080 off_val
= (int)(INTVAL (offset
));
24082 tmp
-= (off_val
& 0xfffff);
24084 tmp
+= ((- off_val
) & 0xfffff);
24090 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
24091 Adjust priority of sha1h instructions so they are scheduled before
24092 other SHA1 instructions. */
24095 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
24097 rtx x
= PATTERN (insn
);
24099 if (GET_CODE (x
) == SET
)
24103 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
24104 return priority
+ 10;
24110 /* Given OPERANDS of consecutive load/store, check if we can merge
24111 them into ldp/stp. LOAD is true if they are load instructions.
24112 MODE is the mode of memory operands. */
24115 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
24118 HOST_WIDE_INT offval_1
, offval_2
, msize
;
24119 enum reg_class rclass_1
, rclass_2
;
24120 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
24124 mem_1
= operands
[1];
24125 mem_2
= operands
[3];
24126 reg_1
= operands
[0];
24127 reg_2
= operands
[2];
24128 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
24129 if (REGNO (reg_1
) == REGNO (reg_2
))
24134 mem_1
= operands
[0];
24135 mem_2
= operands
[2];
24136 reg_1
= operands
[1];
24137 reg_2
= operands
[3];
24140 /* The mems cannot be volatile. */
24141 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
  /* If we have SImode and slow unaligned ldp,
     check that the alignment is at least 8 bytes.  */
24147 && (aarch64_tune_params
.extra_tuning_flags
24148 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
24150 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
24153 /* Check if the addresses are in the form of [base+offset]. */
24154 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
24155 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
24157 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
24158 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
24161 /* Check if the bases are same. */
24162 if (!rtx_equal_p (base_1
, base_2
))
24165 /* The operands must be of the same size. */
24166 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
24167 GET_MODE_SIZE (GET_MODE (mem_2
))));
24169 offval_1
= INTVAL (offset_1
);
24170 offval_2
= INTVAL (offset_2
);
24171 /* We should only be trying this for fixed-sized modes. There is no
24172 SVE LDP/STP instruction. */
24173 msize
= GET_MODE_SIZE (mode
).to_constant ();
24174 /* Check if the offsets are consecutive. */
24175 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
24178 /* Check if the addresses are clobbered by load. */
24181 if (reg_mentioned_p (reg_1
, mem_1
))
24184 /* In increasing order, the last load can clobber the address. */
24185 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
24189 /* One of the memory accesses must be a mempair operand.
24190 If it is not the first one, they need to be swapped by the
24192 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
24193 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
24196 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
24197 rclass_1
= FP_REGS
;
24199 rclass_1
= GENERAL_REGS
;
24201 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
24202 rclass_2
= FP_REGS
;
24204 rclass_2
= GENERAL_REGS
;
24206 /* Check if the registers are of same class. */
24207 if (rclass_1
!= rclass_2
)
24213 /* Given OPERANDS of consecutive load/store that can be merged,
24214 swap them if they are not in ascending order. */
24216 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
24218 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
24219 HOST_WIDE_INT offval_1
, offval_2
;
24223 mem_1
= operands
[1];
24224 mem_2
= operands
[3];
24228 mem_1
= operands
[0];
24229 mem_2
= operands
[2];
24232 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
24233 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
24235 offval_1
= INTVAL (offset_1
);
24236 offval_2
= INTVAL (offset_2
);
24238 if (offval_1
> offval_2
)
24240 /* Irrespective of whether this is a load or a store,
24241 we do the same swap. */
24242 std::swap (operands
[0], operands
[2]);
24243 std::swap (operands
[1], operands
[3]);
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
		   * ((const HOST_WIDE_INT *) y));
}
/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
   other pointing to a REG rtx containing an offset, compare the offsets
   of the two pairs.

   Return:

	1 iff offset (X) > offset (Y)
	0 iff offset (X) == offset (Y)
	-1 iff offset (X) < offset (Y)  */

static int
aarch64_ldrstr_offset_compare (const void *x, const void *y)
{
  const rtx *operands_1 = (const rtx *) x;
  const rtx *operands_2 = (const rtx *) y;
  rtx mem_1, mem_2, base, offset_1, offset_2;

  if (MEM_P (operands_1[0]))
    mem_1 = operands_1[0];
  else
    mem_1 = operands_1[1];

  if (MEM_P (operands_2[0]))
    mem_2 = operands_2[0];
  else
    mem_2 = operands_2[1];

  /* Extract the offsets.  */
  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_2, &base, &offset_2);

  gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);

  return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
}
24291 /* Given OPERANDS of consecutive load/store, check if we can merge
24292 them into ldp/stp by adjusting the offset. LOAD is true if they
24293 are load instructions. MODE is the mode of memory operands.
24295 Given below consecutive stores:
24297 str w1, [xb, 0x100]
24298 str w1, [xb, 0x104]
24299 str w1, [xb, 0x108]
24300 str w1, [xb, 0x10c]
24302 Though the offsets are out of the range supported by stp, we can
24303 still pair them after adjusting the offset, like:
24305 add scratch, xb, 0x100
24306 stp w1, w1, [scratch]
24307 stp w1, w1, [scratch, 0x8]
   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
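/* Working through the example above for SImode (msize == 4): the STP
   immediate range is [-0x40 * 4, 0x3f * 4] = [-256, 252], so the original
   offsets 0x100..0x10c (256..268) are all out of range.  After emitting
   "add scratch, xb, 0x100" the four accesses become [scratch, #0],
   [scratch, #4], [scratch, #8] and [scratch, #12], which pair into the two
   STP instructions shown.  */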
24313 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
24316 const int num_insns
= 4;
24317 enum reg_class rclass
;
24318 HOST_WIDE_INT offvals
[num_insns
], msize
;
24319 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
24323 for (int i
= 0; i
< num_insns
; i
++)
24325 reg
[i
] = operands
[2 * i
];
24326 mem
[i
] = operands
[2 * i
+ 1];
24328 gcc_assert (REG_P (reg
[i
]));
24331 /* Do not attempt to merge the loads if the loads clobber each other. */
24332 for (int i
= 0; i
< 8; i
+= 2)
24333 for (int j
= i
+ 2; j
< 8; j
+= 2)
24334 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
24338 for (int i
= 0; i
< num_insns
; i
++)
24340 mem
[i
] = operands
[2 * i
];
24341 reg
[i
] = operands
[2 * i
+ 1];
24344 /* Skip if memory operand is by itself valid for ldp/stp. */
24345 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
24348 for (int i
= 0; i
< num_insns
; i
++)
24350 /* The mems cannot be volatile. */
24351 if (MEM_VOLATILE_P (mem
[i
]))
24354 /* Check if the addresses are in the form of [base+offset]. */
24355 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
24356 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
24360 /* Check if the registers are of same class. */
24361 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
24362 ? FP_REGS
: GENERAL_REGS
;
24364 for (int i
= 1; i
< num_insns
; i
++)
24365 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
24367 if (rclass
!= FP_REGS
)
24372 if (rclass
!= GENERAL_REGS
)
24376 /* Only the last register in the order in which they occur
24377 may be clobbered by the load. */
24378 if (rclass
== GENERAL_REGS
&& load
)
24379 for (int i
= 0; i
< num_insns
- 1; i
++)
24380 if (reg_mentioned_p (reg
[i
], mem
[i
]))
24383 /* Check if the bases are same. */
24384 for (int i
= 0; i
< num_insns
- 1; i
++)
24385 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
24388 for (int i
= 0; i
< num_insns
; i
++)
24389 offvals
[i
] = INTVAL (offset
[i
]);
24391 msize
= GET_MODE_SIZE (mode
).to_constant ();
24393 /* Check if the offsets can be put in the right order to do a ldp/stp. */
24394 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
24395 aarch64_host_wide_int_compare
);
24397 if (!(offvals
[1] == offvals
[0] + msize
24398 && offvals
[3] == offvals
[2] + msize
))
24401 /* Check that offsets are within range of each other. The ldp/stp
24402 instructions have 7 bit immediate offsets, so use 0x80. */
24403 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
24406 /* The offsets must be aligned with respect to each other. */
24407 if (offvals
[0] % msize
!= offvals
[2] % msize
)
  /* If we have SImode and slow unaligned ldp,
     check that the alignment is at least 8 bytes.  */
24413 && (aarch64_tune_params
.extra_tuning_flags
24414 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
24416 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
24422 /* Given OPERANDS of consecutive load/store, this function pairs them
24423 into LDP/STP after adjusting the offset. It depends on the fact
24424 that the operands can be sorted so the offsets are correct for STP.
24425 MODE is the mode of memory operands. CODE is the rtl operator
24426 which should be applied to all memory operands, it's SIGN_EXTEND,
24427 ZERO_EXTEND or UNKNOWN. */
24430 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
24431 machine_mode mode
, RTX_CODE code
)
24433 rtx base
, offset_1
, offset_3
, t1
, t2
;
24434 rtx mem_1
, mem_2
, mem_3
, mem_4
;
24435 rtx temp_operands
[8];
24436 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
24437 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
24439 /* We make changes on a copy as we may still bail out. */
24440 for (int i
= 0; i
< 8; i
++)
24441 temp_operands
[i
] = operands
[i
];
24443 /* Sort the operands. */
24444 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
24446 /* Copy the memory operands so that if we have to bail for some
24447 reason the original addresses are unchanged. */
24450 mem_1
= copy_rtx (temp_operands
[1]);
24451 mem_2
= copy_rtx (temp_operands
[3]);
24452 mem_3
= copy_rtx (temp_operands
[5]);
24453 mem_4
= copy_rtx (temp_operands
[7]);
24457 mem_1
= copy_rtx (temp_operands
[0]);
24458 mem_2
= copy_rtx (temp_operands
[2]);
24459 mem_3
= copy_rtx (temp_operands
[4]);
24460 mem_4
= copy_rtx (temp_operands
[6]);
24461 gcc_assert (code
== UNKNOWN
);
24464 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
24465 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
24466 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
24467 && offset_3
!= NULL_RTX
);
24469 /* Adjust offset so it can fit in LDP/STP instruction. */
24470 msize
= GET_MODE_SIZE (mode
).to_constant();
24471 stp_off_upper_limit
= msize
* (0x40 - 1);
24472 stp_off_lower_limit
= - msize
* 0x40;
24474 off_val_1
= INTVAL (offset_1
);
24475 off_val_3
= INTVAL (offset_3
);
24477 /* The base offset is optimally half way between the two STP/LDP offsets. */
24479 base_off
= (off_val_1
+ off_val_3
) / 2;
  /* However, due to issues with negative LDP/STP offset generation for
     larger modes (DF, DI and vector modes), we must not use negative
     addresses smaller than 9 signed unadjusted bits can store.  This
     provides the most range in this case.  */
24485 base_off
= off_val_1
;
24487 /* Adjust the base so that it is aligned with the addresses but still
24489 if (base_off
% msize
!= off_val_1
% msize
)
24490 /* Fix the offset, bearing in mind we want to make it bigger not
24492 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
24493 else if (msize
<= 4)
24494 /* The negative range of LDP/STP is one larger than the positive range. */
24497 /* Check if base offset is too big or too small. We can attempt to resolve
24498 this issue by setting it to the maximum value and seeing if the offsets
24500 if (base_off
>= 0x1000)
24502 base_off
= 0x1000 - 1;
24503 /* We must still make sure that the base offset is aligned with respect
24504 to the address. But it may not be made any bigger. */
24505 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
24508 /* Likewise for the case where the base is too small. */
24509 if (base_off
<= -0x1000)
24511 base_off
= -0x1000 + 1;
24512 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
24515 /* Offset of the first STP/LDP. */
24516 new_off_1
= off_val_1
- base_off
;
24518 /* Offset of the second STP/LDP. */
24519 new_off_3
= off_val_3
- base_off
;
24521 /* The offsets must be within the range of the LDP/STP instructions. */
24522 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
24523 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
24526 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
24528 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
24529 new_off_1
+ msize
), true);
24530 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
24532 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
24533 new_off_3
+ msize
), true);
24535 if (!aarch64_mem_pair_operand (mem_1
, mode
)
24536 || !aarch64_mem_pair_operand (mem_3
, mode
))
24539 if (code
== ZERO_EXTEND
)
24541 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
24542 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
24543 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
24544 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
24546 else if (code
== SIGN_EXTEND
)
24548 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
24549 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
24550 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
24551 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
24556 operands
[0] = temp_operands
[0];
24557 operands
[1] = mem_1
;
24558 operands
[2] = temp_operands
[2];
24559 operands
[3] = mem_2
;
24560 operands
[4] = temp_operands
[4];
24561 operands
[5] = mem_3
;
24562 operands
[6] = temp_operands
[6];
24563 operands
[7] = mem_4
;
24567 operands
[0] = mem_1
;
24568 operands
[1] = temp_operands
[1];
24569 operands
[2] = mem_2
;
24570 operands
[3] = temp_operands
[3];
24571 operands
[4] = mem_3
;
24572 operands
[5] = temp_operands
[5];
24573 operands
[6] = mem_4
;
24574 operands
[7] = temp_operands
[7];
24577 /* Emit adjusting instruction. */
24578 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
24579 /* Emit ldp/stp instructions. */
24580 t1
= gen_rtx_SET (operands
[0], operands
[1]);
24581 t2
= gen_rtx_SET (operands
[2], operands
[3]);
24582 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
24583 t1
= gen_rtx_SET (operands
[4], operands
[5]);
24584 t2
= gen_rtx_SET (operands
[6], operands
[7]);
24585 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
24589 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
24590 it isn't worth branching around empty masked ops (including masked
24594 aarch64_empty_mask_is_expensive (unsigned)
/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
   power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
   x == 1/2^n return n.  Otherwise return -1.  */

int
aarch64_fpconst_pow2_recip (rtx x)
{
  REAL_VALUE_TYPE r0;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r0 = *CONST_DOUBLE_REAL_VALUE (x);
  if (exact_real_inverse (DFmode, &r0)
      && !REAL_VALUE_NEGATIVE (r0))
    {
      int ret = exact_log2 (real_to_integer (&r0));
      if (ret >= 1 && ret <= 32)
	return ret;
    }
  return -1;
}
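/* A couple of concrete cases, for orientation: aarch64_fpconst_pow_of_2
   returns 2 for a CONST_DOUBLE of 4.0 and -1 for 3.0 or 0.5, while
   aarch64_fpconst_pow2_recip returns 4 for 0.0625 (1/2^4) and -1 for 8.0.
   These helpers let patterns recognise multiplies and divides by powers of
   two, e.g. folding a float-to-integer conversion scaled by 2^n into a
   single conversion instruction with an fbits operand.  */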
/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}
/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}
/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */

static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
					int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}
/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}
/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type:
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
      case EXCESS_PRECISION_TYPE_FAST:
      case EXCESS_PRECISION_TYPE_STANDARD:
	/* We can calculate either in 16-bit range and precision or
	   32-bit range and precision.  Make that decision based on whether
	   we have native support for the ARMv8.2-A 16-bit floating-point
	   instructions or not.  */
	return (TARGET_FP_F16INST
		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
      case EXCESS_PRECISION_TYPE_IMPLICIT:
	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
      default:
	gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
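/* In practical terms (an informal example): given

       _Float16 a, b, c;
       _Float16 d = a * b + c;

   a compiler targeting the +fp16 extension evaluates the expression
   directly in half precision (FLT_EVAL_METHOD == 16), whereas without the
   ARMv8.2-A half-precision instructions the operands are promoted, the
   arithmetic is done in float, and the result is narrowed back to
   _Float16.  */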
24806 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
24807 scheduled for speculative execution. Reject the long-running division
24808 and square-root instructions. */
24811 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
24813 switch (get_attr_type (insn
))
24821 case TYPE_NEON_FP_SQRT_S
:
24822 case TYPE_NEON_FP_SQRT_D
:
24823 case TYPE_NEON_FP_SQRT_S_Q
:
24824 case TYPE_NEON_FP_SQRT_D_Q
:
24825 case TYPE_NEON_FP_DIV_S
:
24826 case TYPE_NEON_FP_DIV_D
:
24827 case TYPE_NEON_FP_DIV_S_Q
:
24828 case TYPE_NEON_FP_DIV_D_Q
:
/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}
24854 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
24857 aarch64_can_change_mode_class (machine_mode from
,
24858 machine_mode to
, reg_class_t
)
24860 unsigned int from_flags
= aarch64_classify_vector_mode (from
);
24861 unsigned int to_flags
= aarch64_classify_vector_mode (to
);
24863 bool from_sve_p
= (from_flags
& VEC_ANY_SVE
);
24864 bool to_sve_p
= (to_flags
& VEC_ANY_SVE
);
24866 bool from_partial_sve_p
= from_sve_p
&& (from_flags
& VEC_PARTIAL
);
24867 bool to_partial_sve_p
= to_sve_p
&& (to_flags
& VEC_PARTIAL
);
24869 bool from_pred_p
= (from_flags
& VEC_SVE_PRED
);
24870 bool to_pred_p
= (to_flags
& VEC_SVE_PRED
);
24872 /* Don't allow changes between predicate modes and other modes.
24873 Only predicate registers can hold predicate modes and only
24874 non-predicate registers can hold non-predicate modes, so any
24875 attempt to mix them would require a round trip through memory. */
24876 if (from_pred_p
!= to_pred_p
)
24879 /* Don't allow changes between partial SVE modes and other modes.
24880 The contents of partial SVE modes are distributed evenly across
24881 the register, whereas GCC expects them to be clustered together. */
24882 if (from_partial_sve_p
!= to_partial_sve_p
)
24885 /* Similarly reject changes between partial SVE modes that have
24886 different patterns of significant and insignificant bits. */
24887 if (from_partial_sve_p
24888 && (aarch64_sve_container_bits (from
) != aarch64_sve_container_bits (to
)
24889 || GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
)))
24892 if (maybe_ne (BITS_PER_SVE_VECTOR
, 128u))
24894 /* Don't allow changes between SVE modes and other modes that might
24895 be bigger than 128 bits. In particular, OImode, CImode and XImode
24896 divide into 128-bit quantities while SVE modes divide into
24897 BITS_PER_SVE_VECTOR quantities. */
24898 if (from_sve_p
&& !to_sve_p
&& maybe_gt (GET_MODE_BITSIZE (to
), 128))
24900 if (to_sve_p
&& !from_sve_p
&& maybe_gt (GET_MODE_BITSIZE (from
), 128))
24904 if (BYTES_BIG_ENDIAN
)
24906 /* Don't allow changes between SVE data modes and non-SVE modes.
24907 See the comment at the head of aarch64-sve.md for details. */
24908 if (from_sve_p
!= to_sve_p
)
24911 /* Don't allow changes in element size: lane 0 of the new vector
24912 would not then be lane 0 of the old vector. See the comment
24913 above aarch64_maybe_expand_sve_subreg_move for a more detailed
24916 In the worst case, this forces a register to be spilled in
24917 one mode and reloaded in the other, which handles the
24918 endianness correctly. */
24919 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
/* Implement TARGET_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    if (aarch64_sve_mode_p ((machine_mode) i))
      bitmap_set_bit (modes, i);
}
/* Override the default target speculation_safe_value.  */
static rtx
aarch64_speculation_safe_value (machine_mode mode,
				rtx result, rtx val, rtx failval)
{
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
  if (!aarch64_track_speculation)
    return default_speculation_safe_value (mode, result, val, failval);

  if (!register_operand (val, mode))
    val = copy_to_mode_reg (mode, val);

  if (!aarch64_reg_or_zero (failval, mode))
    failval = copy_to_mode_reg (mode, failval);

  emit_insn (gen_despeculate_copy (mode, result, val, failval));
  return result;
}
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   KIND specifies the type of requested estimate: min, max or likely.
   For cores with a known SVE width all three estimates are the same.
   For generic SVE tuning we want to distinguish the maximum estimate from
   the minimum and likely ones.
   The likely estimate is the same as the minimum in that case to give a
   conservative behavior of auto-vectorizing with SVE when it is a win
   even for 128-bit SVE.
   When SVE width information is available VAL.coeffs[1] is multiplied by
   the number of VQ chunks over the initial Advanced SIMD 128 bits.  */

static HOST_WIDE_INT
aarch64_estimated_poly_value (poly_int64 val,
			      poly_value_estimate_kind kind
				= POLY_VALUE_LIKELY)
{
  enum aarch64_sve_vector_bits_enum width_source
    = aarch64_tune_params.sve_width;

  /* If there is no core-specific information then the minimum and likely
     values are based on 128-bit vectors and the maximum is based on
     the architectural maximum of 2048 bits.  */
  if (width_source == SVE_SCALABLE)
    switch (kind)
      {
      case POLY_VALUE_MIN:
      case POLY_VALUE_LIKELY:
	return val.coeffs[0];
      case POLY_VALUE_MAX:
	return val.coeffs[0] + val.coeffs[1] * 15;
      }

  /* If the core provides width information, use that.  */
  HOST_WIDE_INT over_128 = width_source - 128;
  return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
}
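/* Worked example: the number of int32 elements in an SVE vector is the
   poly_int 4 + 4x, i.e. coeffs {4, 4}.  With generic tuning (SVE_SCALABLE)
   the minimum and likely estimates are 4 and the maximum is
   4 + 4 * 15 = 64, the 2048-bit architectural ceiling.  If the tuning
   structure instead says sve_width == 256, every kind of estimate becomes
   4 + 4 * (256 - 128) / 128 = 8.  */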
24996 /* Return true for types that could be supported as SIMD return or
25000 supported_simd_type (tree t
)
25002 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
25004 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
25005 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
25010 /* Return true for types that currently are supported as SIMD return
25011 or argument types. */
25014 currently_supported_simd_type (tree t
, tree b
)
25016 if (COMPLEX_FLOAT_TYPE_P (t
))
25019 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
25022 return supported_simd_type (t
);
25025 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
25028 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
25029 struct cgraph_simd_clone
*clonei
,
25030 tree base_type
, int num
)
25033 unsigned int elt_bits
, count
;
25034 unsigned HOST_WIDE_INT const_simdlen
;
25035 poly_uint64 vec_bits
;
  /* For now, SVE simdclones won't produce illegal simdlen, so only check
     constant simdlens here.  */
25042 if (maybe_ne (clonei
->simdlen
, 0U)
25043 && clonei
->simdlen
.is_constant (&const_simdlen
)
25044 && (const_simdlen
< 2
25045 || const_simdlen
> 1024
25046 || (const_simdlen
& (const_simdlen
- 1)) != 0))
25048 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
25049 "unsupported simdlen %wd", const_simdlen
);
25053 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
25054 if (TREE_CODE (ret_type
) != VOID_TYPE
25055 && !currently_supported_simd_type (ret_type
, base_type
))
25057 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
25058 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
25059 "GCC does not currently support mixed size types "
25060 "for %<simd%> functions");
25061 else if (supported_simd_type (ret_type
))
25062 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
25063 "GCC does not currently support return type %qT "
25064 "for %<simd%> functions", ret_type
);
25066 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
25067 "unsupported return type %qT for %<simd%> functions",
25073 tree type_arg_types
= TYPE_ARG_TYPES (TREE_TYPE (node
->decl
));
25074 bool decl_arg_p
= (node
->definition
|| type_arg_types
== NULL_TREE
);
25076 for (t
= (decl_arg_p
? DECL_ARGUMENTS (node
->decl
) : type_arg_types
), i
= 0;
25077 t
&& t
!= void_list_node
; t
= TREE_CHAIN (t
), i
++)
25079 tree arg_type
= decl_arg_p
? TREE_TYPE (t
) : TREE_VALUE (t
);
25081 if (clonei
->args
[i
].arg_type
!= SIMD_CLONE_ARG_TYPE_UNIFORM
25082 && !currently_supported_simd_type (arg_type
, base_type
))
25084 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
25085 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
25086 "GCC does not currently support mixed size types "
25087 "for %<simd%> functions");
25089 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
25090 "GCC does not currently support argument type %qT "
25091 "for %<simd%> functions", arg_type
);
25096 clonei
->vecsize_mangle
= 'n';
25097 clonei
->mask_mode
= VOIDmode
;
25098 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
25099 if (known_eq (clonei
->simdlen
, 0U))
25102 vec_bits
= (num
== 0 ? 64 : 128);
25103 clonei
->simdlen
= exact_div (vec_bits
, elt_bits
);
25108 vec_bits
= clonei
->simdlen
* elt_bits
;
      /* For now, SVE simdclones won't produce illegal simdlen, so only
	 check constant simdlens here.  */
25111 if (clonei
->simdlen
.is_constant (&const_simdlen
)
25112 && maybe_ne (vec_bits
, 64U) && maybe_ne (vec_bits
, 128U))
25114 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
25115 "GCC does not currently support simdlen %wd for type %qT",
25116 const_simdlen
, base_type
);
25120 clonei
->vecsize_int
= vec_bits
;
25121 clonei
->vecsize_float
= vec_bits
;
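/* By way of illustration: for a scalar function taking and returning
   float, elt_bits is 32, so the two Advanced SIMD clones produced here use
   vec_bits of 64 and 128 and therefore simdlen 2 and simdlen 4 (mangled
   with 'n').  For double the same two clones get simdlen 1 and 2.  An
   explicit simdlen whose total vector size is neither 64 nor 128 bits is
   warned about and rejected above.  */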
25125 /* Implement TARGET_SIMD_CLONE_ADJUST. */
25128 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
25130 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
25131 use the correct ABI. */
25133 tree t
= TREE_TYPE (node
->decl
);
25134 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
25135 TYPE_ATTRIBUTES (t
));
25138 /* Implement TARGET_SIMD_CLONE_USABLE. */
25141 aarch64_simd_clone_usable (struct cgraph_node
*node
)
25143 switch (node
->simdclone
->vecsize_mangle
)
25150 gcc_unreachable ();
/* Implement TARGET_COMP_TYPE_ATTRIBUTES */

static int
aarch64_comp_type_attributes (const_tree type1, const_tree type2)
{
  auto check_attr = [&](const char *name) {
    tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
    tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
    if (!attr1 && !attr2)
      return true;

    return attr1 && attr2 && attribute_value_equal (attr1, attr2);
  };

  if (!check_attr ("aarch64_vector_pcs"))
    return 0;
  if (!check_attr ("Advanced SIMD type"))
    return 0;
  if (!check_attr ("SVE type"))
    return 0;
  if (!check_attr ("SVE sizeless type"))
    return 0;
  return 1;
}
25181 static const char *
25182 aarch64_get_multilib_abi_name (void)
25184 if (TARGET_BIG_END
)
25185 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
25186 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
25189 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
25190 global variable based guard use the default else
25191 return a null tree. */
25193 aarch64_stack_protect_guard (void)
25195 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
25196 return default_stack_protect_guard ();
/* Return the diagnostic message string if conversion from FROMTYPE to
   TOTYPE is not allowed, NULL otherwise.  */

static const char *
aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
{
  if (element_mode (fromtype) != element_mode (totype))
    {
      /* Do not allow conversions to/from BFmode scalar types.  */
      if (TYPE_MODE (fromtype) == BFmode)
	return N_("invalid conversion from type %<bfloat16_t%>");
      if (TYPE_MODE (totype) == BFmode)
	return N_("invalid conversion to type %<bfloat16_t%>");
    }

  /* Conversion allowed.  */
  return NULL;
}

/* Return the diagnostic message string if the unary operation OP is
   not permitted on TYPE, NULL otherwise.  */

static const char *
aarch64_invalid_unary_op (int op, const_tree type)
{
  /* Reject all single-operand operations on BFmode except for &.  */
  if (element_mode (type) == BFmode && op != ADDR_EXPR)
    return N_("operation not permitted on type %<bfloat16_t%>");

  /* Operation allowed.  */
  return NULL;
}

/* Return the diagnostic message string if the binary operation OP is
   not permitted on TYPE1 and TYPE2, NULL otherwise.  */

static const char *
aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
			   const_tree type2)
{
  /* Reject all 2-operand operations on BFmode.  */
  if (element_mode (type1) == BFmode
      || element_mode (type2) == BFmode)
    return N_("operation not permitted on type %<bfloat16_t%>");

  if (VECTOR_TYPE_P (type1)
      && VECTOR_TYPE_P (type2)
      && !TYPE_INDIVISIBLE_P (type1)
      && !TYPE_INDIVISIBLE_P (type2)
      && (aarch64_sve::builtin_type_p (type1)
	  != aarch64_sve::builtin_type_p (type2)))
    return N_("cannot combine GNU and SVE vectors in a binary operation");

  /* Operation allowed.  */
  return NULL;
}

/* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES.  Here we tell the rest of the
   compiler that we automatically ignore the top byte of our pointers, which
   allows using -fsanitize=hwaddress.  */

bool
aarch64_can_tag_addresses ()
{
  return !TARGET_ILP32;
}
25267 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
25268 section at the end if needed. */
25269 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
25270 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
25271 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
25273 aarch64_file_end_indicate_exec_stack ()
25275 file_end_indicate_exec_stack ();
25277 unsigned feature_1_and
= 0;
25278 if (aarch64_bti_enabled ())
25279 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
25281 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
25282 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
25286 /* Generate .note.gnu.property section. */
25287 switch_to_section (get_section (".note.gnu.property",
25288 SECTION_NOTYPE
, NULL
));
25290 /* PT_NOTE header: namesz, descsz, type.
25291 namesz = 4 ("GNU\0")
25292 descsz = 16 (Size of the program property array)
25293 [(12 + padding) * Number of array elements]
25294 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
25295 assemble_align (POINTER_SIZE
);
25296 assemble_integer (GEN_INT (4), 4, 32, 1);
25297 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
25298 assemble_integer (GEN_INT (5), 4, 32, 1);
25300 /* PT_NOTE name. */
25301 assemble_string ("GNU", 4);
25303 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
25304 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
25306 data = feature_1_and. */
25307 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
25308 assemble_integer (GEN_INT (4), 4, 32, 1);
25309 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
25311 /* Pad the size of the note to the required alignment. */
25312 assemble_align (POINTER_SIZE
);
25315 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
25316 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
25317 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
/* Helper function for straight line speculation.
   Return what barrier should be emitted for straight line speculation
   mitigation.
   When not mitigating against straight line speculation this function
   returns an empty string.
   When mitigating against straight line speculation, use:
   * SB when the v8.5-A SB extension is enabled.
   * DSB+ISB otherwise.  */
const char *
aarch64_sls_barrier (int mitigation_required)
{
  return mitigation_required
    ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
    : "";
}
[30];
25336 static GTY (()) bool aarch64_sls_shared_thunks_needed
= false;
25337 const char *indirect_symbol_names
[30] = {
25338 "__call_indirect_x0",
25339 "__call_indirect_x1",
25340 "__call_indirect_x2",
25341 "__call_indirect_x3",
25342 "__call_indirect_x4",
25343 "__call_indirect_x5",
25344 "__call_indirect_x6",
25345 "__call_indirect_x7",
25346 "__call_indirect_x8",
25347 "__call_indirect_x9",
25348 "__call_indirect_x10",
25349 "__call_indirect_x11",
25350 "__call_indirect_x12",
25351 "__call_indirect_x13",
25352 "__call_indirect_x14",
25353 "__call_indirect_x15",
25354 "", /* "__call_indirect_x16", */
25355 "", /* "__call_indirect_x17", */
25356 "__call_indirect_x18",
25357 "__call_indirect_x19",
25358 "__call_indirect_x20",
25359 "__call_indirect_x21",
25360 "__call_indirect_x22",
25361 "__call_indirect_x23",
25362 "__call_indirect_x24",
25363 "__call_indirect_x25",
25364 "__call_indirect_x26",
25365 "__call_indirect_x27",
25366 "__call_indirect_x28",
25367 "__call_indirect_x29",
/* Function to create a BLR thunk.  This thunk is used to mitigate straight
   line speculation.  Instead of a simple BLR that can be speculated past,
   we emit a BL to this thunk, and this thunk contains a BR to the relevant
   register.  These thunks have the relevant speculation barriers put after
   their indirect branch so that speculation is blocked.

   We use such a thunk so the speculation barriers are kept off the
   architecturally executed path in order to reduce the performance overhead.

   When optimizing for size we use stubs shared by the linked object.
   When optimizing for performance we emit stubs for each function in the
   hope that the branch predictor can better train on jumps specific for a
   given function.  */
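/* Concretely, with -mharden-sls=blr an indirect call that would have been

       blr  x1

   is instead emitted as "bl __call_indirect_x1", where the stub contains

       __call_indirect_x1:
	       mov  x16, x1
	       br   x16
	       <speculation barrier>

   so a mispredicted straight-line path after the BL runs into the barrier
   rather than into code chosen by an attacker.  */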
25384 aarch64_sls_create_blr_label (int regnum
)
25386 gcc_assert (STUB_REGNUM_P (regnum
));
25387 if (optimize_function_for_size_p (cfun
))
25389 /* For the thunks shared between different functions in this compilation
25390 unit we use a named symbol -- this is just for users to more easily
25391 understand the generated assembly. */
25392 aarch64_sls_shared_thunks_needed
= true;
25393 const char *thunk_name
= indirect_symbol_names
[regnum
];
25394 if (aarch64_sls_shared_thunks
[regnum
] == NULL
)
25396 /* Build a decl representing this function stub and record it for
25397 later. We build a decl here so we can use the GCC machinery for
25398 handling sections automatically (through `get_named_section` and
25399 `make_decl_one_only`). That saves us a lot of trouble handling
25400 the specifics of different output file formats. */
25401 tree decl
= build_decl (BUILTINS_LOCATION
, FUNCTION_DECL
,
25402 get_identifier (thunk_name
),
25403 build_function_type_list (void_type_node
,
25405 DECL_RESULT (decl
) = build_decl (BUILTINS_LOCATION
, RESULT_DECL
,
25406 NULL_TREE
, void_type_node
);
25407 TREE_PUBLIC (decl
) = 1;
25408 TREE_STATIC (decl
) = 1;
25409 DECL_IGNORED_P (decl
) = 1;
25410 DECL_ARTIFICIAL (decl
) = 1;
25411 make_decl_one_only (decl
, DECL_ASSEMBLER_NAME (decl
));
25412 resolve_unique_section (decl
, 0, false);
25413 aarch64_sls_shared_thunks
[regnum
] = decl
;
25416 return gen_rtx_SYMBOL_REF (Pmode
, thunk_name
);
25419 if (cfun
->machine
->call_via
[regnum
] == NULL
)
25420 cfun
->machine
->call_via
[regnum
]
25421 = gen_rtx_LABEL_REF (Pmode
, gen_label_rtx ());
25422 return cfun
->machine
->call_via
[regnum
];
25425 /* Helper function for aarch64_sls_emit_blr_function_thunks and
25426 aarch64_sls_emit_shared_blr_thunks below. */
25428 aarch64_sls_emit_function_stub (FILE *out_file
, int regnum
)
25430 /* Save in x16 and branch to that function so this transformation does
25431 not prevent jumping to `BTI c` instructions. */
25432 asm_fprintf (out_file
, "\tmov\tx16, x%d\n", regnum
);
25433 asm_fprintf (out_file
, "\tbr\tx16\n");
25436 /* Emit all BLR stubs for this particular function.
25437 Here we emit all the BLR stubs needed for the current function. Since we
25438 emit these stubs in a consecutive block we know there will be no speculation
25439 gadgets between each stub, and hence we only emit a speculation barrier at
25440 the end of the stub sequences.
25442 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
25444 aarch64_sls_emit_blr_function_thunks (FILE *out_file
)
25446 if (! aarch64_harden_sls_blr_p ())
25449 bool any_functions_emitted
= false;
25450 /* We must save and restore the current function section since this assembly
25451 is emitted at the end of the function. This means it can be emitted *just
25452 after* the cold section of a function. That cold part would be emitted in
25453 a different section. That switch would trigger a `.cfi_endproc` directive
25454 to be emitted in the original section and a `.cfi_startproc` directive to
25455 be emitted in the new section. Switching to the original section without
25456 restoring would mean that the `.cfi_endproc` emitted as a function ends
25457 would happen in a different section -- leaving an unmatched
25458 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
25459 in the standard text section. */
25460 section
*save_text_section
= in_section
;
25461 switch_to_section (function_section (current_function_decl
));
25462 for (int regnum
= 0; regnum
< 30; ++regnum
)
25464 rtx specu_label
= cfun
->machine
->call_via
[regnum
];
25465 if (specu_label
== NULL
)
25468 targetm
.asm_out
.print_operand (out_file
, specu_label
, 0);
25469 asm_fprintf (out_file
, ":\n");
25470 aarch64_sls_emit_function_stub (out_file
, regnum
);
25471 any_functions_emitted
= true;
25473 if (any_functions_emitted
)
25474 /* Can use the SB if needs be here, since this stub will only be used
25475 by the current function, and hence for the current target. */
25476 asm_fprintf (out_file
, "\t%s\n", aarch64_sls_barrier (true));
25477 switch_to_section (save_text_section
);
25480 /* Emit shared BLR stubs for the current compilation unit.
25481 Over the course of compiling this unit we may have converted some BLR
25482 instructions to a BL to a shared stub function. This is where we emit those
25484 This function is for the stubs shared between different functions in this
25485 compilation unit. We share when optimizing for size instead of speed.
25487 This function is called through the TARGET_ASM_FILE_END hook. */
25489 aarch64_sls_emit_shared_blr_thunks (FILE *out_file
)
25491 if (! aarch64_sls_shared_thunks_needed
)
25494 for (int regnum
= 0; regnum
< 30; ++regnum
)
25496 tree decl
= aarch64_sls_shared_thunks
[regnum
];
25500 const char *name
= indirect_symbol_names
[regnum
];
25501 switch_to_section (get_named_section (decl
, NULL
, 0));
25502 ASM_OUTPUT_ALIGN (out_file
, 2);
25503 targetm
.asm_out
.globalize_label (out_file
, name
);
25504 /* Only emits if the compiler is configured for an assembler that can
25505 handle visibility directives. */
25506 targetm
.asm_out
.assemble_visibility (decl
, VISIBILITY_HIDDEN
);
25507 ASM_OUTPUT_TYPE_DIRECTIVE (out_file
, name
, "function");
25508 ASM_OUTPUT_LABEL (out_file
, name
);
25509 aarch64_sls_emit_function_stub (out_file
, regnum
);
25510 /* Use the most conservative target to ensure it can always be used by any
25511 function in the translation unit. */
25512 asm_fprintf (out_file
, "\tdsb\tsy\n\tisb\n");
25513 ASM_DECLARE_FUNCTION_SIZE (out_file
, name
, decl
);
25517 /* Implement TARGET_ASM_FILE_END. */
25519 aarch64_asm_file_end ()
25521 aarch64_sls_emit_shared_blr_thunks (asm_out_file
);
25522 /* Since this function will be called for the ASM_FILE_END hook, we ensure
25523 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
25524 for FreeBSD) still gets called. */
25525 #ifdef TARGET_ASM_FILE_END
25526 TARGET_ASM_FILE_END ();
25531 aarch64_indirect_call_asm (rtx addr
)
25533 gcc_assert (REG_P (addr
));
25534 if (aarch64_harden_sls_blr_p ())
25536 rtx stub_label
= aarch64_sls_create_blr_label (REGNO (addr
));
25537 output_asm_insn ("bl\t%0", &stub_label
);
25540 output_asm_insn ("blr\t%0", &addr
);
25544 /* Target-specific selftests. */
25548 namespace selftest
{
25550 /* Selftest for the RTL loader.
25551 Verify that the RTL loader copes with a dump from
25552 print_rtx_function. This is essentially just a test that class
25553 function_reader can handle a real dump, but it also verifies
25554 that lookup_reg_by_dump_name correctly handles hard regs.
25555 The presence of hard reg names in the dump means that the test is
25556 target-specific, hence it is in this file. */
static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */
static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
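
/* Illustrative example (not part of the hook definition): given
     struct s { char c; int : 4; };
   returning true means 's' inherits the alignment of the unnamed bitfield's
   declared type, so it is expected to get 4-byte alignment rather than the
   1-byte alignment its named members alone would give.  */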

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
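
/* Illustrative sketch of the effect: with this hook returning true, the C++
   front end tests and sets only bit 0 of the Itanium-ABI guard variable for
   a function-local static, i.e. the fast path is roughly
     if ((guard & 1) == 0)
       ... call __cxa_guard_acquire ...
   instead of comparing the guard's whole first byte against zero.  */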

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_INVALID_CONVERSION
#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion

#undef TARGET_INVALID_UNARY_OP
#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op

#undef TARGET_INVALID_BINARY_OP
#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op

#undef TARGET_VERIFY_TYPE_CONTEXT
#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
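
/* Illustrative example: for
     struct s { volatile unsigned int f : 8; } *p;
   returning false means an access to p->f uses the declared container type
   (a 32-bit load/store) rather than the narrowest mode that covers the
   bitfield (a byte access).  */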

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS aarch64_offload_options

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST aarch64_init_cost

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST aarch64_finish_cost

#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA aarch64_destroy_cost_data

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  aarch64_autovectorize_vector_modes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
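
/* For context (a sketch of the reasoning above): the unsigned load/store
   immediate scales with the access size, so a byte access such as
	ldrb	w0, [x0, #4095]
   tops out at 4095, while e.g. an 8-byte LDR can reach 32760; since the
   access size at an anchor is not known, the conservative byte limit is the
   safe choice.  */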

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
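
/* Illustrative note: 4 is (1 << 2), so bit 2 tags a descriptor pointer for a
   nested function, which works because bits 0 and 1 of a code address are
   reserved as noted above.  */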

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
#define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END aarch64_asm_file_end

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"