gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2020 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #define IN_TARGET_CODE 1
  22
  23 #include "config.h"
  24 #define INCLUDE_STRING
  25 #include "system.h"
  26 #include "coretypes.h"
  27 #include "backend.h"
  28 #include "target.h"
  29 #include "rtl.h"
  30 #include "tree.h"
  31 #include "memmodel.h"
  32 #include "gimple.h"
  33 #include "cfghooks.h"
  34 #include "cfgloop.h"
  35 #include "df.h"
  36 #include "tm_p.h"
  37 #include "stringpool.h"
  38 #include "attribs.h"
  39 #include "optabs.h"
  40 #include "regs.h"
  41 #include "emit-rtl.h"
  42 #include "recog.h"
  43 #include "cgraph.h"
  44 #include "diagnostic.h"
  45 #include "insn-attr.h"
  46 #include "alias.h"
  47 #include "fold-const.h"
  48 #include "stor-layout.h"
  49 #include "calls.h"
  50 #include "varasm.h"
  51 #include "output.h"
  52 #include "flags.h"
  53 #include "explow.h"
  54 #include "expr.h"
  55 #include "reload.h"
  56 #include "langhooks.h"
  57 #include "opts.h"
  58 #include "gimplify.h"
  59 #include "dwarf2.h"
  60 #include "gimple-iterator.h"
  61 #include "tree-vectorizer.h"
  62 #include "aarch64-cost-tables.h"
  63 #include "dumpfile.h"
  64 #include "builtins.h"
  65 #include "rtl-iter.h"
  66 #include "tm-constrs.h"
  67 #include "sched-int.h"
  68 #include "target-globals.h"
  69 #include "common/common-target.h"
  70 #include "cfgrtl.h"
  71 #include "selftest.h"
  72 #include "selftest-rtl.h"
  73 #include "rtx-vector-builder.h"
  74 #include "intl.h"
  75 #include "expmed.h"
  76 #include "function-abi.h"
  77
  78 /* This file should be included last.  */
  79 #include "target-def.h"
  80
  81 /* Defined for convenience.  */
  82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  83
  84 /* Information about a legitimate vector immediate operand.  */
  85 struct simd_immediate_info
  86 {
  87   enum insn_type { MOV, MVN, INDEX, PTRUE };
  88   enum modifier_type { LSL, MSL };
  89
  90   simd_immediate_info () {}
  91   simd_immediate_info (scalar_float_mode, rtx);
  92   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
  93                        insn_type = MOV, modifier_type = LSL,
  94                        unsigned int = 0);
  95   simd_immediate_info (scalar_mode, rtx, rtx);
  96   simd_immediate_info (scalar_int_mode, aarch64_svpattern);
  97
  98   /* The mode of the elements.  */
  99   scalar_mode elt_mode;
 100
 101   /* The instruction to use to move the immediate into a vector.  */
 102   insn_type insn;
 103
 104   union
 105   {
 106     /* For MOV and MVN.  */
 107     struct
 108     {
 109       /* The value of each element.  */
 110       rtx value;
 111
 112       /* The kind of shift modifier to use, and the number of bits to shift.
 113          This is (LSL, 0) if no shift is needed.  */
 114       modifier_type modifier;
 115       unsigned int shift;
 116     } mov;
 117
 118     /* For INDEX.  */
 119     struct
 120     {
 121       /* The value of the first element and the step to be added for each
 122          subsequent element.  */
 123       rtx base, step;
 124     } index;
 125
 126     /* For PTRUE.  */
 127     aarch64_svpattern pattern;
 128   } u;
 129 };
 130
 131 /* Construct a floating-point immediate in which each element has mode
 132    ELT_MODE_IN and value VALUE_IN.  */
 133 inline simd_immediate_info
 134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
 135   : elt_mode (elt_mode_in), insn (MOV)
 136 {
 137   u.mov.value = value_in;
 138   u.mov.modifier = LSL;
 139   u.mov.shift = 0;
 140 }
 141
 142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 143    and value VALUE_IN.  The other parameters are as for the structure
 144    fields.  */
 145 inline simd_immediate_info
 146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
 147                        unsigned HOST_WIDE_INT value_in,
 148                        insn_type insn_in, modifier_type modifier_in,
 149                        unsigned int shift_in)
 150   : elt_mode (elt_mode_in), insn (insn_in)
 151 {
 152   u.mov.value = gen_int_mode (value_in, elt_mode_in);
 153   u.mov.modifier = modifier_in;
 154   u.mov.shift = shift_in;
 155 }
 156
 157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 158    and where element I is equal to BASE_IN + I * STEP_IN.  */
 159 inline simd_immediate_info
 160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
 161   : elt_mode (elt_mode_in), insn (INDEX)
 162 {
 163   u.index.base = base_in;
 164   u.index.step = step_in;
 165 }
 166
 167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
 168    and has PTRUE pattern PATTERN_IN.  */
 169 inline simd_immediate_info
 170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
 171                        aarch64_svpattern pattern_in)
 172   : elt_mode (elt_mode_in), insn (PTRUE)
 173 {
 174   u.pattern = pattern_in;
 175 }
 176
 177 namespace {
 178
 179 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
 180 class pure_scalable_type_info
 181 {
 182 public:
 183   /* Represents the result of analyzing a type.  All values are nonzero,
 184      in the possibly forlorn hope that accidental conversions to bool
 185      trigger a warning.  */
 186   enum analysis_result
 187   {
 188     /* The type does not have an ABI identity; i.e. it doesn't contain
 189        at least one object whose type is a Fundamental Data Type.  */
 190     NO_ABI_IDENTITY = 1,
 191
 192     /* The type is definitely a Pure Scalable Type.  */
 193     IS_PST,
 194
 195     /* The type is definitely not a Pure Scalable Type.  */
 196     ISNT_PST,
 197
 198     /* It doesn't matter for PCS purposes whether the type is a Pure
 199        Scalable Type or not, since the type will be handled the same
 200        way regardless.
 201
 202        Specifically, this means that if the type is a Pure Scalable Type,
 203        there aren't enough argument registers to hold it, and so it will
 204        need to be passed or returned in memory.  If the type isn't a
 205        Pure Scalable Type, it's too big to be passed or returned in core
 206        or SIMD&FP registers, and so again will need to go in memory.  */
 207     DOESNT_MATTER
 208   };
 209
 210   /* Aggregates of 17 bytes or more are normally passed and returned
 211      in memory, so aggregates of that size can safely be analyzed as
 212      DOESNT_MATTER.  We need to be able to collect enough pieces to
 213      represent a PST that is smaller than that.  Since predicates are
 214      2 bytes in size for -msve-vector-bits=128, that means we need to be
 215      able to store at least 8 pieces.
 216
 217      We also need to be able to store enough pieces to represent
 218      a single vector in each vector argument register and a single
 219      predicate in each predicate argument register.  This means that
 220      we need at least 12 pieces.  */
 221   static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
 222 #if __cplusplus >= 201103L
 223   static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
 224 #endif
 225
 226   /* Describes one piece of a PST.  Each piece is one of:
 227
 228      - a single Scalable Vector Type (SVT)
 229      - a single Scalable Predicate Type (SPT)
 230      - a PST containing 2, 3 or 4 SVTs, with no padding
 231
 232      It either represents a single built-in type or a PST formed from
 233      multiple homogeneous built-in types.  */
 234   struct piece
 235   {
 236     rtx get_rtx (unsigned int, unsigned int) const;
 237
 238     /* The number of vector and predicate registers that the piece
 239        occupies.  One of the two is always zero.  */
 240     unsigned int num_zr;
 241     unsigned int num_pr;
 242
 243     /* The mode of the registers described above.  */
 244     machine_mode mode;
 245
 246     /* If this piece is formed from multiple homogeneous built-in types,
 247        this is the mode of the built-in types, otherwise it is MODE.  */
 248     machine_mode orig_mode;
 249
 250     /* The offset in bytes of the piece from the start of the type.  */
 251     poly_uint64_pod offset;
 252   };
 253
 254   /* Divides types analyzed as IS_PST into individual pieces.  The pieces
 255      are in memory order.  */
 256   auto_vec<piece, MAX_PIECES> pieces;
 257
 258   unsigned int num_zr () const;
 259   unsigned int num_pr () const;
 260
 261   rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
 262
 263   analysis_result analyze (const_tree);
 264   bool analyze_registers (const_tree);
 265
 266 private:
 267   analysis_result analyze_array (const_tree);
 268   analysis_result analyze_record (const_tree);
 269   void add_piece (const piece &);
 270 };
 271 }
 272
 273 /* The current code model.  */
 274 enum aarch64_code_model aarch64_cmodel;
 275
 276 /* The number of 64-bit elements in an SVE vector.  */
 277 poly_uint16 aarch64_sve_vg;
 278
 279 #ifdef HAVE_AS_TLS
 280 #undef TARGET_HAVE_TLS
 281 #define TARGET_HAVE_TLS 1
 282 #endif
 283
 284 static bool aarch64_composite_type_p (const_tree, machine_mode);
 285 static bool aarch64_return_in_memory_1 (const_tree);
 286 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 287                                                      const_tree,
 288                                                      machine_mode *, int *,
 289                                                      bool *, bool);
 290 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 291 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 292 static void aarch64_override_options_after_change (void);
 293 static bool aarch64_vector_mode_supported_p (machine_mode);
 294 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 295 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 296                                                          const_tree type,
 297                                                          int misalignment,
 298                                                          bool is_packed);
 299 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 300 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 301                                             aarch64_addr_query_type);
 302 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
 303
 304 /* Major revision number of the ARM Architecture implemented by the target.  */
 305 unsigned aarch64_architecture_version;
 306
 307 /* The processor for which instructions should be scheduled.  */
 308 enum aarch64_processor aarch64_tune = cortexa53;
 309
 310 /* Mask to specify which instruction scheduling options should be used.  */
 311 uint64_t aarch64_tune_flags = 0;
 312
 313 /* Global flag for PC relative loads.  */
 314 bool aarch64_pcrelative_literal_loads;
 315
 316 /* Global flag for whether frame pointer is enabled.  */
 317 bool aarch64_use_frame_pointer;
 318
 319 #define BRANCH_PROTECT_STR_MAX 255
 320 char *accepted_branch_protection_string = NULL;
 321
 322 static enum aarch64_parse_opt_result
 323 aarch64_parse_branch_protection (const char*, char**);
 324
 325 /* Support for command line parsing of boolean flags in the tuning
 326    structures.  */
 327 struct aarch64_flag_desc
 328 {
 329   const char* name;
 330   unsigned int flag;
 331 };
 332
 333 #define AARCH64_FUSION_PAIR(name, internal_name) \
 334   { name, AARCH64_FUSE_##internal_name },
 335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 336 {
 337   { "none", AARCH64_FUSE_NOTHING },
 338 #include "aarch64-fusion-pairs.def"
 339   { "all", AARCH64_FUSE_ALL },
 340   { NULL, AARCH64_FUSE_NOTHING }
 341 };
 342
 343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 344   { name, AARCH64_EXTRA_TUNE_##internal_name },
 345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 346 {
 347   { "none", AARCH64_EXTRA_TUNE_NONE },
 348 #include "aarch64-tuning-flags.def"
 349   { "all", AARCH64_EXTRA_TUNE_ALL },
 350   { NULL, AARCH64_EXTRA_TUNE_NONE }
 351 };
 352
 353 /* Tuning parameters.  */
 354
 355 static const struct cpu_addrcost_table generic_addrcost_table =
 356 {
 357     {
 358       1, /* hi  */
 359       0, /* si  */
 360       0, /* di  */
 361       1, /* ti  */
 362     },
 363   0, /* pre_modify  */
 364   0, /* post_modify  */
 365   0, /* register_offset  */
 366   0, /* register_sextend  */
 367   0, /* register_zextend  */
 368   0 /* imm_offset  */
 369 };
 370
 371 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 372 {
 373     {
 374       0, /* hi  */
 375       0, /* si  */
 376       0, /* di  */
 377       2, /* ti  */
 378     },
 379   0, /* pre_modify  */
 380   0, /* post_modify  */
 381   1, /* register_offset  */
 382   1, /* register_sextend  */
 383   2, /* register_zextend  */
 384   0, /* imm_offset  */
 385 };
 386
 387 static const struct cpu_addrcost_table xgene1_addrcost_table =
 388 {
 389     {
 390       1, /* hi  */
 391       0, /* si  */
 392       0, /* di  */
 393       1, /* ti  */
 394     },
 395   1, /* pre_modify  */
 396   1, /* post_modify  */
 397   0, /* register_offset  */
 398   1, /* register_sextend  */
 399   1, /* register_zextend  */
 400   0, /* imm_offset  */
 401 };
 402
 403 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 404 {
 405     {
 406       1, /* hi  */
 407       1, /* si  */
 408       1, /* di  */
 409       2, /* ti  */
 410     },
 411   0, /* pre_modify  */
 412   0, /* post_modify  */
 413   2, /* register_offset  */
 414   3, /* register_sextend  */
 415   3, /* register_zextend  */
 416   0, /* imm_offset  */
 417 };
 418
 419 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
 420 {
 421     {
 422       1, /* hi  */
 423       1, /* si  */
 424       1, /* di  */
 425       2, /* ti  */
 426     },
 427   0, /* pre_modify  */
 428   0, /* post_modify  */
 429   2, /* register_offset  */
 430   3, /* register_sextend  */
 431   3, /* register_zextend  */
 432   0, /* imm_offset  */
 433 };
 434
 435 static const struct cpu_addrcost_table tsv110_addrcost_table =
 436 {
 437     {
 438       1, /* hi  */
 439       0, /* si  */
 440       0, /* di  */
 441       1, /* ti  */
 442     },
 443   0, /* pre_modify  */
 444   0, /* post_modify  */
 445   0, /* register_offset  */
 446   1, /* register_sextend  */
 447   1, /* register_zextend  */
 448   0, /* imm_offset  */
 449 };
 450
 451 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 452 {
 453     {
 454       1, /* hi  */
 455       1, /* si  */
 456       1, /* di  */
 457       2, /* ti  */
 458     },
 459   1, /* pre_modify  */
 460   1, /* post_modify  */
 461   3, /* register_offset  */
 462   3, /* register_sextend  */
 463   3, /* register_zextend  */
 464   2, /* imm_offset  */
 465 };
 466
 467 static const struct cpu_regmove_cost generic_regmove_cost =
 468 {
 469   1, /* GP2GP  */
 470   /* Avoid the use of slow int<->fp moves for spilling by setting
 471      their cost higher than memmov_cost.  */
 472   5, /* GP2FP  */
 473   5, /* FP2GP  */
 474   2 /* FP2FP  */
 475 };
 476
 477 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 478 {
 479   1, /* GP2GP  */
 480   /* Avoid the use of slow int<->fp moves for spilling by setting
 481      their cost higher than memmov_cost.  */
 482   5, /* GP2FP  */
 483   5, /* FP2GP  */
 484   2 /* FP2FP  */
 485 };
 486
 487 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 488 {
 489   1, /* GP2GP  */
 490   /* Avoid the use of slow int<->fp moves for spilling by setting
 491      their cost higher than memmov_cost.  */
 492   5, /* GP2FP  */
 493   5, /* FP2GP  */
 494   2 /* FP2FP  */
 495 };
 496
 497 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 498 {
 499   1, /* GP2GP  */
 500   /* Avoid the use of slow int<->fp moves for spilling by setting
 501      their cost higher than memmov_cost (actual, 4 and 9).  */
 502   9, /* GP2FP  */
 503   9, /* FP2GP  */
 504   1 /* FP2FP  */
 505 };
 506
 507 static const struct cpu_regmove_cost thunderx_regmove_cost =
 508 {
 509   2, /* GP2GP  */
 510   2, /* GP2FP  */
 511   6, /* FP2GP  */
 512   4 /* FP2FP  */
 513 };
 514
 515 static const struct cpu_regmove_cost xgene1_regmove_cost =
 516 {
 517   1, /* GP2GP  */
 518   /* Avoid the use of slow int<->fp moves for spilling by setting
 519      their cost higher than memmov_cost.  */
 520   8, /* GP2FP  */
 521   8, /* FP2GP  */
 522   2 /* FP2FP  */
 523 };
 524
 525 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 526 {
 527   2, /* GP2GP  */
 528   /* Avoid the use of int<->fp moves for spilling.  */
 529   6, /* GP2FP  */
 530   6, /* FP2GP  */
 531   4 /* FP2FP  */
 532 };
 533
 534 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 535 {
 536   1, /* GP2GP  */
 537   /* Avoid the use of int<->fp moves for spilling.  */
 538   5, /* GP2FP  */
 539   6, /* FP2GP  */
 540   3, /* FP2FP  */
 541 };
 542
 543 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
 544 {
 545   1, /* GP2GP  */
 546   /* Avoid the use of int<->fp moves for spilling.  */
 547   4, /* GP2FP  */
 548   5, /* FP2GP  */
 549   4  /* FP2FP  */
 550 };
 551
 552 static const struct cpu_regmove_cost tsv110_regmove_cost =
 553 {
 554   1, /* GP2GP  */
 555   /* Avoid the use of slow int<->fp moves for spilling by setting
 556      their cost higher than memmov_cost.  */
 557   2, /* GP2FP  */
 558   3, /* FP2GP  */
 559   2  /* FP2FP  */
 560 };
 561
 562 /* Generic costs for vector insn classes.  */
 563 static const struct cpu_vector_cost generic_vector_cost =
 564 {
 565   1, /* scalar_int_stmt_cost  */
 566   1, /* scalar_fp_stmt_cost  */
 567   1, /* scalar_load_cost  */
 568   1, /* scalar_store_cost  */
 569   1, /* vec_int_stmt_cost  */
 570   1, /* vec_fp_stmt_cost  */
 571   2, /* vec_permute_cost  */
 572   2, /* vec_to_scalar_cost  */
 573   1, /* scalar_to_vec_cost  */
 574   1, /* vec_align_load_cost  */
 575   1, /* vec_unalign_load_cost  */
 576   1, /* vec_unalign_store_cost  */
 577   1, /* vec_store_cost  */
 578   3, /* cond_taken_branch_cost  */
 579   1 /* cond_not_taken_branch_cost  */
 580 };
 581
 582 /* QDF24XX costs for vector insn classes.  */
 583 static const struct cpu_vector_cost qdf24xx_vector_cost =
 584 {
 585   1, /* scalar_int_stmt_cost  */
 586   1, /* scalar_fp_stmt_cost  */
 587   1, /* scalar_load_cost  */
 588   1, /* scalar_store_cost  */
 589   1, /* vec_int_stmt_cost  */
 590   3, /* vec_fp_stmt_cost  */
 591   2, /* vec_permute_cost  */
 592   1, /* vec_to_scalar_cost  */
 593   1, /* scalar_to_vec_cost  */
 594   1, /* vec_align_load_cost  */
 595   1, /* vec_unalign_load_cost  */
 596   1, /* vec_unalign_store_cost  */
 597   1, /* vec_store_cost  */
 598   3, /* cond_taken_branch_cost  */
 599   1 /* cond_not_taken_branch_cost  */
 600 };
 601
 602 /* ThunderX costs for vector insn classes.  */
 603 static const struct cpu_vector_cost thunderx_vector_cost =
 604 {
 605   1, /* scalar_int_stmt_cost  */
 606   1, /* scalar_fp_stmt_cost  */
 607   3, /* scalar_load_cost  */
 608   1, /* scalar_store_cost  */
 609   4, /* vec_int_stmt_cost  */
 610   1, /* vec_fp_stmt_cost  */
 611   4, /* vec_permute_cost  */
 612   2, /* vec_to_scalar_cost  */
 613   2, /* scalar_to_vec_cost  */
 614   3, /* vec_align_load_cost  */
 615   5, /* vec_unalign_load_cost  */
 616   5, /* vec_unalign_store_cost  */
 617   1, /* vec_store_cost  */
 618   3, /* cond_taken_branch_cost  */
 619   3 /* cond_not_taken_branch_cost  */
 620 };
 621
 622 static const struct cpu_vector_cost tsv110_vector_cost =
 623 {
 624   1, /* scalar_int_stmt_cost  */
 625   1, /* scalar_fp_stmt_cost  */
 626   5, /* scalar_load_cost  */
 627   1, /* scalar_store_cost  */
 628   2, /* vec_int_stmt_cost  */
 629   2, /* vec_fp_stmt_cost  */
 630   2, /* vec_permute_cost  */
 631   3, /* vec_to_scalar_cost  */
 632   2, /* scalar_to_vec_cost  */
 633   5, /* vec_align_load_cost  */
 634   5, /* vec_unalign_load_cost  */
 635   1, /* vec_unalign_store_cost  */
 636   1, /* vec_store_cost  */
 637   1, /* cond_taken_branch_cost  */
 638   1 /* cond_not_taken_branch_cost  */
 639 };
 640
 641 /* Generic costs for vector insn classes.  */
 642 static const struct cpu_vector_cost cortexa57_vector_cost =
 643 {
 644   1, /* scalar_int_stmt_cost  */
 645   1, /* scalar_fp_stmt_cost  */
 646   4, /* scalar_load_cost  */
 647   1, /* scalar_store_cost  */
 648   2, /* vec_int_stmt_cost  */
 649   2, /* vec_fp_stmt_cost  */
 650   3, /* vec_permute_cost  */
 651   8, /* vec_to_scalar_cost  */
 652   8, /* scalar_to_vec_cost  */
 653   4, /* vec_align_load_cost  */
 654   4, /* vec_unalign_load_cost  */
 655   1, /* vec_unalign_store_cost  */
 656   1, /* vec_store_cost  */
 657   1, /* cond_taken_branch_cost  */
 658   1 /* cond_not_taken_branch_cost  */
 659 };
 660
 661 static const struct cpu_vector_cost exynosm1_vector_cost =
 662 {
 663   1, /* scalar_int_stmt_cost  */
 664   1, /* scalar_fp_stmt_cost  */
 665   5, /* scalar_load_cost  */
 666   1, /* scalar_store_cost  */
 667   3, /* vec_int_stmt_cost  */
 668   3, /* vec_fp_stmt_cost  */
 669   3, /* vec_permute_cost  */
 670   3, /* vec_to_scalar_cost  */
 671   3, /* scalar_to_vec_cost  */
 672   5, /* vec_align_load_cost  */
 673   5, /* vec_unalign_load_cost  */
 674   1, /* vec_unalign_store_cost  */
 675   1, /* vec_store_cost  */
 676   1, /* cond_taken_branch_cost  */
 677   1 /* cond_not_taken_branch_cost  */
 678 };
 679
 680 /* Generic costs for vector insn classes.  */
 681 static const struct cpu_vector_cost xgene1_vector_cost =
 682 {
 683   1, /* scalar_int_stmt_cost  */
 684   1, /* scalar_fp_stmt_cost  */
 685   5, /* scalar_load_cost  */
 686   1, /* scalar_store_cost  */
 687   2, /* vec_int_stmt_cost  */
 688   2, /* vec_fp_stmt_cost  */
 689   2, /* vec_permute_cost  */
 690   4, /* vec_to_scalar_cost  */
 691   4, /* scalar_to_vec_cost  */
 692   10, /* vec_align_load_cost  */
 693   10, /* vec_unalign_load_cost  */
 694   2, /* vec_unalign_store_cost  */
 695   2, /* vec_store_cost  */
 696   2, /* cond_taken_branch_cost  */
 697   1 /* cond_not_taken_branch_cost  */
 698 };
 699
 700 /* Costs for vector insn classes for Vulcan.  */
 701 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 702 {
 703   1, /* scalar_int_stmt_cost  */
 704   6, /* scalar_fp_stmt_cost  */
 705   4, /* scalar_load_cost  */
 706   1, /* scalar_store_cost  */
 707   4, /* vec_int_stmt_cost  */
 708   5, /* vec_fp_stmt_cost  */
 709   10, /* vec_permute_cost  */
 710   6, /* vec_to_scalar_cost  */
 711   5, /* scalar_to_vec_cost  */
 712   4, /* vec_align_load_cost  */
 713   4, /* vec_unalign_load_cost  */
 714   1, /* vec_unalign_store_cost  */
 715   1, /* vec_store_cost  */
 716   2, /* cond_taken_branch_cost  */
 717   1  /* cond_not_taken_branch_cost  */
 718 };
 719
 720 static const struct cpu_vector_cost thunderx3t110_vector_cost =
 721 {
 722   1, /* scalar_int_stmt_cost  */
 723   5, /* scalar_fp_stmt_cost  */
 724   4, /* scalar_load_cost  */
 725   1, /* scalar_store_cost  */
 726   5, /* vec_int_stmt_cost  */
 727   5, /* vec_fp_stmt_cost  */
 728   10, /* vec_permute_cost  */
 729   5, /* vec_to_scalar_cost  */
 730   5, /* scalar_to_vec_cost  */
 731   4, /* vec_align_load_cost  */
 732   4, /* vec_unalign_load_cost  */
 733   4, /* vec_unalign_store_cost  */
 734   4, /* vec_store_cost  */
 735   2, /* cond_taken_branch_cost  */
 736   1  /* cond_not_taken_branch_cost  */
 737 };
 738
 739
 740 /* Generic costs for branch instructions.  */
 741 static const struct cpu_branch_cost generic_branch_cost =
 742 {
 743   1,  /* Predictable.  */
 744   3   /* Unpredictable.  */
 745 };
 746
 747 /* Generic approximation modes.  */
 748 static const cpu_approx_modes generic_approx_modes =
 749 {
 750   AARCH64_APPROX_NONE,  /* division  */
 751   AARCH64_APPROX_NONE,  /* sqrt  */
 752   AARCH64_APPROX_NONE   /* recip_sqrt  */
 753 };
 754
 755 /* Approximation modes for Exynos M1.  */
 756 static const cpu_approx_modes exynosm1_approx_modes =
 757 {
 758   AARCH64_APPROX_NONE,  /* division  */
 759   AARCH64_APPROX_ALL,   /* sqrt  */
 760   AARCH64_APPROX_ALL    /* recip_sqrt  */
 761 };
 762
 763 /* Approximation modes for X-Gene 1.  */
 764 static const cpu_approx_modes xgene1_approx_modes =
 765 {
 766   AARCH64_APPROX_NONE,  /* division  */
 767   AARCH64_APPROX_NONE,  /* sqrt  */
 768   AARCH64_APPROX_ALL    /* recip_sqrt  */
 769 };
 770
 771 /* Generic prefetch settings (which disable prefetch).  */
 772 static const cpu_prefetch_tune generic_prefetch_tune =
 773 {
 774   0,                    /* num_slots  */
 775   -1,                   /* l1_cache_size  */
 776   -1,                   /* l1_cache_line_size  */
 777   -1,                   /* l2_cache_size  */
 778   true,                 /* prefetch_dynamic_strides */
 779   -1,                   /* minimum_stride */
 780   -1                    /* default_opt_level  */
 781 };
 782
 783 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 784 {
 785   0,                    /* num_slots  */
 786   -1,                   /* l1_cache_size  */
 787   64,                   /* l1_cache_line_size  */
 788   -1,                   /* l2_cache_size  */
 789   true,                 /* prefetch_dynamic_strides */
 790   -1,                   /* minimum_stride */
 791   -1                    /* default_opt_level  */
 792 };
 793
 794 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 795 {
 796   4,                    /* num_slots  */
 797   32,                   /* l1_cache_size  */
 798   64,                   /* l1_cache_line_size  */
 799   512,                  /* l2_cache_size  */
 800   false,                /* prefetch_dynamic_strides */
 801   2048,                 /* minimum_stride */
 802   3                     /* default_opt_level  */
 803 };
 804
 805 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 806 {
 807   8,                    /* num_slots  */
 808   32,                   /* l1_cache_size  */
 809   128,                  /* l1_cache_line_size  */
 810   16*1024,              /* l2_cache_size  */
 811   true,                 /* prefetch_dynamic_strides */
 812   -1,                   /* minimum_stride */
 813   3                     /* default_opt_level  */
 814 };
 815
 816 static const cpu_prefetch_tune thunderx_prefetch_tune =
 817 {
 818   8,                    /* num_slots  */
 819   32,                   /* l1_cache_size  */
 820   128,                  /* l1_cache_line_size  */
 821   -1,                   /* l2_cache_size  */
 822   true,                 /* prefetch_dynamic_strides */
 823   -1,                   /* minimum_stride */
 824   -1                    /* default_opt_level  */
 825 };
 826
 827 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 828 {
 829   8,                    /* num_slots  */
 830   32,                   /* l1_cache_size  */
 831   64,                   /* l1_cache_line_size  */
 832   256,                  /* l2_cache_size  */
 833   true,                 /* prefetch_dynamic_strides */
 834   -1,                   /* minimum_stride */
 835   -1                    /* default_opt_level  */
 836 };
 837
 838 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
 839 {
 840   8,                    /* num_slots  */
 841   32,                   /* l1_cache_size  */
 842   64,                   /* l1_cache_line_size  */
 843   256,                  /* l2_cache_size  */
 844   true,                 /* prefetch_dynamic_strides */
 845   -1,                   /* minimum_stride */
 846   -1                    /* default_opt_level  */
 847 };
 848
 849 static const cpu_prefetch_tune tsv110_prefetch_tune =
 850 {
 851   0,                    /* num_slots  */
 852   64,                   /* l1_cache_size  */
 853   64,                   /* l1_cache_line_size  */
 854   512,                  /* l2_cache_size  */
 855   true,                 /* prefetch_dynamic_strides */
 856   -1,                   /* minimum_stride */
 857   -1                    /* default_opt_level  */
 858 };
 859
 860 static const cpu_prefetch_tune xgene1_prefetch_tune =
 861 {
 862   8,                    /* num_slots  */
 863   32,                   /* l1_cache_size  */
 864   64,                   /* l1_cache_line_size  */
 865   256,                  /* l2_cache_size  */
 866   true,                 /* prefetch_dynamic_strides */
 867   -1,                   /* minimum_stride */
 868   -1                    /* default_opt_level  */
 869 };
 870
 871 static const struct tune_params generic_tunings =
 872 {
 873   &cortexa57_extra_costs,
 874   &generic_addrcost_table,
 875   &generic_regmove_cost,
 876   &generic_vector_cost,
 877   &generic_branch_cost,
 878   &generic_approx_modes,
 879   SVE_NOT_IMPLEMENTED, /* sve_width  */
 880   4, /* memmov_cost  */
 881   2, /* issue_rate  */
 882   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
 883   "16:12",      /* function_align.  */
 884   "4",  /* jump_align.  */
 885   "8",  /* loop_align.  */
 886   2,    /* int_reassoc_width.  */
 887   4,    /* fp_reassoc_width.  */
 888   1,    /* vec_reassoc_width.  */
 889   2,    /* min_div_recip_mul_sf.  */
 890   2,    /* min_div_recip_mul_df.  */
 891   0,    /* max_case_values.  */
 892   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 893   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 894   &generic_prefetch_tune
 895 };
 896
 897 static const struct tune_params cortexa35_tunings =
 898 {
 899   &cortexa53_extra_costs,
 900   &generic_addrcost_table,
 901   &cortexa53_regmove_cost,
 902   &generic_vector_cost,
 903   &generic_branch_cost,
 904   &generic_approx_modes,
 905   SVE_NOT_IMPLEMENTED, /* sve_width  */
 906   4, /* memmov_cost  */
 907   1, /* issue_rate  */
 908   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 909    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 910   "16", /* function_align.  */
 911   "4",  /* jump_align.  */
 912   "8",  /* loop_align.  */
 913   2,    /* int_reassoc_width.  */
 914   4,    /* fp_reassoc_width.  */
 915   1,    /* vec_reassoc_width.  */
 916   2,    /* min_div_recip_mul_sf.  */
 917   2,    /* min_div_recip_mul_df.  */
 918   0,    /* max_case_values.  */
 919   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 920   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 921   &generic_prefetch_tune
 922 };
 923
 924 static const struct tune_params cortexa53_tunings =
 925 {
 926   &cortexa53_extra_costs,
 927   &generic_addrcost_table,
 928   &cortexa53_regmove_cost,
 929   &generic_vector_cost,
 930   &generic_branch_cost,
 931   &generic_approx_modes,
 932   SVE_NOT_IMPLEMENTED, /* sve_width  */
 933   4, /* memmov_cost  */
 934   2, /* issue_rate  */
 935   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 936    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 937   "16", /* function_align.  */
 938   "4",  /* jump_align.  */
 939   "8",  /* loop_align.  */
 940   2,    /* int_reassoc_width.  */
 941   4,    /* fp_reassoc_width.  */
 942   1,    /* vec_reassoc_width.  */
 943   2,    /* min_div_recip_mul_sf.  */
 944   2,    /* min_div_recip_mul_df.  */
 945   0,    /* max_case_values.  */
 946   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 947   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 948   &generic_prefetch_tune
 949 };
 950
 951 static const struct tune_params cortexa57_tunings =
 952 {
 953   &cortexa57_extra_costs,
 954   &generic_addrcost_table,
 955   &cortexa57_regmove_cost,
 956   &cortexa57_vector_cost,
 957   &generic_branch_cost,
 958   &generic_approx_modes,
 959   SVE_NOT_IMPLEMENTED, /* sve_width  */
 960   4, /* memmov_cost  */
 961   3, /* issue_rate  */
 962   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 963    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 964   "16", /* function_align.  */
 965   "4",  /* jump_align.  */
 966   "8",  /* loop_align.  */
 967   2,    /* int_reassoc_width.  */
 968   4,    /* fp_reassoc_width.  */
 969   1,    /* vec_reassoc_width.  */
 970   2,    /* min_div_recip_mul_sf.  */
 971   2,    /* min_div_recip_mul_df.  */
 972   0,    /* max_case_values.  */
 973   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 974   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 975   &generic_prefetch_tune
 976 };
 977
 978 static const struct tune_params cortexa72_tunings =
 979 {
 980   &cortexa57_extra_costs,
 981   &generic_addrcost_table,
 982   &cortexa57_regmove_cost,
 983   &cortexa57_vector_cost,
 984   &generic_branch_cost,
 985   &generic_approx_modes,
 986   SVE_NOT_IMPLEMENTED, /* sve_width  */
 987   4, /* memmov_cost  */
 988   3, /* issue_rate  */
 989   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 990    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 991   "16", /* function_align.  */
 992   "4",  /* jump_align.  */
 993   "8",  /* loop_align.  */
 994   2,    /* int_reassoc_width.  */
 995   4,    /* fp_reassoc_width.  */
 996   1,    /* vec_reassoc_width.  */
 997   2,    /* min_div_recip_mul_sf.  */
 998   2,    /* min_div_recip_mul_df.  */
 999   0,    /* max_case_values.  */
1000   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1001   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1002   &generic_prefetch_tune
1003 };
1004
1005 static const struct tune_params cortexa73_tunings =
1006 {
1007   &cortexa57_extra_costs,
1008   &generic_addrcost_table,
1009   &cortexa57_regmove_cost,
1010   &cortexa57_vector_cost,
1011   &generic_branch_cost,
1012   &generic_approx_modes,
1013   SVE_NOT_IMPLEMENTED, /* sve_width  */
1014   4, /* memmov_cost.  */
1015   2, /* issue_rate.  */
1016   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1018   "16", /* function_align.  */
1019   "4",  /* jump_align.  */
1020   "8",  /* loop_align.  */
1021   2,    /* int_reassoc_width.  */
1022   4,    /* fp_reassoc_width.  */
1023   1,    /* vec_reassoc_width.  */
1024   2,    /* min_div_recip_mul_sf.  */
1025   2,    /* min_div_recip_mul_df.  */
1026   0,    /* max_case_values.  */
1027   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1028   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1029   &generic_prefetch_tune
1030 };
1031
1032
1033
1034 static const struct tune_params exynosm1_tunings =
1035 {
1036   &exynosm1_extra_costs,
1037   &exynosm1_addrcost_table,
1038   &exynosm1_regmove_cost,
1039   &exynosm1_vector_cost,
1040   &generic_branch_cost,
1041   &exynosm1_approx_modes,
1042   SVE_NOT_IMPLEMENTED, /* sve_width  */
1043   4,    /* memmov_cost  */
1044   3,    /* issue_rate  */
1045   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
1046   "4",  /* function_align.  */
1047   "4",  /* jump_align.  */
1048   "4",  /* loop_align.  */
1049   2,    /* int_reassoc_width.  */
1050   4,    /* fp_reassoc_width.  */
1051   1,    /* vec_reassoc_width.  */
1052   2,    /* min_div_recip_mul_sf.  */
1053   2,    /* min_div_recip_mul_df.  */
1054   48,   /* max_case_values.  */
1055   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
1056   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
1057   &exynosm1_prefetch_tune
1058 };
1059
1060 static const struct tune_params thunderxt88_tunings =
1061 {
1062   &thunderx_extra_costs,
1063   &generic_addrcost_table,
1064   &thunderx_regmove_cost,
1065   &thunderx_vector_cost,
1066   &generic_branch_cost,
1067   &generic_approx_modes,
1068   SVE_NOT_IMPLEMENTED, /* sve_width  */
1069   6, /* memmov_cost  */
1070   2, /* issue_rate  */
1071   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1072   "8",  /* function_align.  */
1073   "8",  /* jump_align.  */
1074   "8",  /* loop_align.  */
1075   2,    /* int_reassoc_width.  */
1076   4,    /* fp_reassoc_width.  */
1077   1,    /* vec_reassoc_width.  */
1078   2,    /* min_div_recip_mul_sf.  */
1079   2,    /* min_div_recip_mul_df.  */
1080   0,    /* max_case_values.  */
1081   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
1082   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
1083   &thunderxt88_prefetch_tune
1084 };
1085
1086 static const struct tune_params thunderx_tunings =
1087 {
1088   &thunderx_extra_costs,
1089   &generic_addrcost_table,
1090   &thunderx_regmove_cost,
1091   &thunderx_vector_cost,
1092   &generic_branch_cost,
1093   &generic_approx_modes,
1094   SVE_NOT_IMPLEMENTED, /* sve_width  */
1095   6, /* memmov_cost  */
1096   2, /* issue_rate  */
1097   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1098   "8",  /* function_align.  */
1099   "8",  /* jump_align.  */
1100   "8",  /* loop_align.  */
1101   2,    /* int_reassoc_width.  */
1102   4,    /* fp_reassoc_width.  */
1103   1,    /* vec_reassoc_width.  */
1104   2,    /* min_div_recip_mul_sf.  */
1105   2,    /* min_div_recip_mul_df.  */
1106   0,    /* max_case_values.  */
1107   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
1108   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1109    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
1110   &thunderx_prefetch_tune
1111 };
1112
1113 static const struct tune_params tsv110_tunings =
1114 {
1115   &tsv110_extra_costs,
1116   &tsv110_addrcost_table,
1117   &tsv110_regmove_cost,
1118   &tsv110_vector_cost,
1119   &generic_branch_cost,
1120   &generic_approx_modes,
1121   SVE_NOT_IMPLEMENTED, /* sve_width  */
1122   4,    /* memmov_cost  */
1123   4,    /* issue_rate  */
1124   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1125    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1126   "16", /* function_align.  */
1127   "4",  /* jump_align.  */
1128   "8",  /* loop_align.  */
1129   2,    /* int_reassoc_width.  */
1130   4,    /* fp_reassoc_width.  */
1131   1,    /* vec_reassoc_width.  */
1132   2,    /* min_div_recip_mul_sf.  */
1133   2,    /* min_div_recip_mul_df.  */
1134   0,    /* max_case_values.  */
1135   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1136   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
1137   &tsv110_prefetch_tune
1138 };
1139
1140 static const struct tune_params xgene1_tunings =
1141 {
1142   &xgene1_extra_costs,
1143   &xgene1_addrcost_table,
1144   &xgene1_regmove_cost,
1145   &xgene1_vector_cost,
1146   &generic_branch_cost,
1147   &xgene1_approx_modes,
1148   SVE_NOT_IMPLEMENTED, /* sve_width  */
1149   6, /* memmov_cost  */
1150   4, /* issue_rate  */
1151   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1152   "16", /* function_align.  */
1153   "16", /* jump_align.  */
1154   "16", /* loop_align.  */
1155   2,    /* int_reassoc_width.  */
1156   4,    /* fp_reassoc_width.  */
1157   1,    /* vec_reassoc_width.  */
1158   2,    /* min_div_recip_mul_sf.  */
1159   2,    /* min_div_recip_mul_df.  */
1160   17,   /* max_case_values.  */
1161   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
1162   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
1163   &xgene1_prefetch_tune
1164 };
1165
1166 static const struct tune_params emag_tunings =
1167 {
1168   &xgene1_extra_costs,
1169   &xgene1_addrcost_table,
1170   &xgene1_regmove_cost,
1171   &xgene1_vector_cost,
1172   &generic_branch_cost,
1173   &xgene1_approx_modes,
1174   SVE_NOT_IMPLEMENTED,
1175   6, /* memmov_cost  */
1176   4, /* issue_rate  */
1177   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1178   "16", /* function_align.  */
1179   "16", /* jump_align.  */
1180   "16", /* loop_align.  */
1181   2,    /* int_reassoc_width.  */
1182   4,    /* fp_reassoc_width.  */
1183   1,    /* vec_reassoc_width.  */
1184   2,    /* min_div_recip_mul_sf.  */
1185   2,    /* min_div_recip_mul_df.  */
1186   17,   /* max_case_values.  */
1187   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
1188   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),        /* tune_flags.  */
1189   &xgene1_prefetch_tune
1190 };
1191
1192 static const struct tune_params qdf24xx_tunings =
1193 {
1194   &qdf24xx_extra_costs,
1195   &qdf24xx_addrcost_table,
1196   &qdf24xx_regmove_cost,
1197   &qdf24xx_vector_cost,
1198   &generic_branch_cost,
1199   &generic_approx_modes,
1200   SVE_NOT_IMPLEMENTED, /* sve_width  */
1201   4, /* memmov_cost  */
1202   4, /* issue_rate  */
1203   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1204    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1205   "16", /* function_align.  */
1206   "8",  /* jump_align.  */
1207   "16", /* loop_align.  */
1208   2,    /* int_reassoc_width.  */
1209   4,    /* fp_reassoc_width.  */
1210   1,    /* vec_reassoc_width.  */
1211   2,    /* min_div_recip_mul_sf.  */
1212   2,    /* min_div_recip_mul_df.  */
1213   0,    /* max_case_values.  */
1214   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1215   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1216   &qdf24xx_prefetch_tune
1217 };
1218
1219 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
1220    for now.  */
1221 static const struct tune_params saphira_tunings =
1222 {
1223   &generic_extra_costs,
1224   &generic_addrcost_table,
1225   &generic_regmove_cost,
1226   &generic_vector_cost,
1227   &generic_branch_cost,
1228   &generic_approx_modes,
1229   SVE_NOT_IMPLEMENTED, /* sve_width  */
1230   4, /* memmov_cost  */
1231   4, /* issue_rate  */
1232   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1233    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1234   "16", /* function_align.  */
1235   "8",  /* jump_align.  */
1236   "16", /* loop_align.  */
1237   2,    /* int_reassoc_width.  */
1238   4,    /* fp_reassoc_width.  */
1239   1,    /* vec_reassoc_width.  */
1240   2,    /* min_div_recip_mul_sf.  */
1241   2,    /* min_div_recip_mul_df.  */
1242   0,    /* max_case_values.  */
1243   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1244   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
1245   &generic_prefetch_tune
1246 };
1247
1248 static const struct tune_params thunderx2t99_tunings =
1249 {
1250   &thunderx2t99_extra_costs,
1251   &thunderx2t99_addrcost_table,
1252   &thunderx2t99_regmove_cost,
1253   &thunderx2t99_vector_cost,
1254   &generic_branch_cost,
1255   &generic_approx_modes,
1256   SVE_NOT_IMPLEMENTED, /* sve_width  */
1257   4, /* memmov_cost.  */
1258   4, /* issue_rate.  */
1259   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1260    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1261   "16", /* function_align.  */
1262   "8",  /* jump_align.  */
1263   "16", /* loop_align.  */
1264   3,    /* int_reassoc_width.  */
1265   2,    /* fp_reassoc_width.  */
1266   2,    /* vec_reassoc_width.  */
1267   2,    /* min_div_recip_mul_sf.  */
1268   2,    /* min_div_recip_mul_df.  */
1269   0,    /* max_case_values.  */
1270   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1271   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1272   &thunderx2t99_prefetch_tune
1273 };
1274
1275 static const struct tune_params thunderx3t110_tunings =
1276 {
1277   &thunderx3t110_extra_costs,
1278   &thunderx3t110_addrcost_table,
1279   &thunderx3t110_regmove_cost,
1280   &thunderx3t110_vector_cost,
1281   &generic_branch_cost,
1282   &generic_approx_modes,
1283   SVE_NOT_IMPLEMENTED, /* sve_width  */
1284   4, /* memmov_cost.  */
1285   6, /* issue_rate.  */
1286   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1287    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1288   "16", /* function_align.  */
1289   "8",  /* jump_align.  */
1290   "16", /* loop_align.  */
1291   3,    /* int_reassoc_width.  */
1292   2,    /* fp_reassoc_width.  */
1293   2,    /* vec_reassoc_width.  */
1294   2,    /* min_div_recip_mul_sf.  */
1295   2,    /* min_div_recip_mul_df.  */
1296   0,    /* max_case_values.  */
1297   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1298   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1299   &thunderx3t110_prefetch_tune
1300 };
1301
1302 static const struct tune_params neoversen1_tunings =
1303 {
1304   &cortexa57_extra_costs,
1305   &generic_addrcost_table,
1306   &generic_regmove_cost,
1307   &cortexa57_vector_cost,
1308   &generic_branch_cost,
1309   &generic_approx_modes,
1310   SVE_NOT_IMPLEMENTED, /* sve_width  */
1311   4, /* memmov_cost  */
1312   3, /* issue_rate  */
1313   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1314   "32:16",      /* function_align.  */
1315   "4",          /* jump_align.  */
1316   "32:16",      /* loop_align.  */
1317   2,    /* int_reassoc_width.  */
1318   4,    /* fp_reassoc_width.  */
1319   2,    /* vec_reassoc_width.  */
1320   2,    /* min_div_recip_mul_sf.  */
1321   2,    /* min_div_recip_mul_df.  */
1322   0,    /* max_case_values.  */
1323   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1324   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
1325   &generic_prefetch_tune
1326 };
1327
1328 /* Support for fine-grained override of the tuning structures.  */
1329 struct aarch64_tuning_override_function
1330 {
1331   const char* name;
1332   void (*parse_override)(const char*, struct tune_params*);
1333 };
1334
1335 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1336 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1337 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1338
1339 static const struct aarch64_tuning_override_function
1340 aarch64_tuning_override_functions[] =
1341 {
1342   { "fuse", aarch64_parse_fuse_string },
1343   { "tune", aarch64_parse_tune_string },
1344   { "sve_width", aarch64_parse_sve_width_string },
1345   { NULL, NULL }
1346 };
1347
1348 /* A processor implementing AArch64.  */
1349 struct processor
1350 {
1351   const char *const name;
1352   enum aarch64_processor ident;
1353   enum aarch64_processor sched_core;
1354   enum aarch64_arch arch;
1355   unsigned architecture_version;
1356   const uint64_t flags;
1357   const struct tune_params *const tune;
1358 };
1359
1360 /* Architectures implementing AArch64.  */
1361 static const struct processor all_architectures[] =
1362 {
1363 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1364   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1365 #include "aarch64-arches.def"
1366   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1367 };
1368
1369 /* Processor cores implementing AArch64.  */
1370 static const struct processor all_cores[] =
1371 {
1372 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1373   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
1374   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
1375   FLAGS, &COSTS##_tunings},
1376 #include "aarch64-cores.def"
1377   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1378     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1379   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1380 };
1381
1382
1383 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1384    handling code or by target attributes.  */
1385 static const struct processor *selected_arch;
1386 static const struct processor *selected_cpu;
1387 static const struct processor *selected_tune;
1388
1389 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1390
1391 /* The current tuning set.  */
1392 struct tune_params aarch64_tune_params = generic_tunings;
1393
1394 /* Check whether an 'aarch64_vector_pcs' attribute is valid.  */
1395
1396 static tree
1397 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1398                                      int, bool *no_add_attrs)
1399 {
1400   /* Since we set fn_type_req to true, the caller should have checked
1401      this for us.  */
1402   gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1403   switch ((arm_pcs) fntype_abi (*node).id ())
1404     {
1405     case ARM_PCS_AAPCS64:
1406     case ARM_PCS_SIMD:
1407       return NULL_TREE;
1408
1409     case ARM_PCS_SVE:
1410       error ("the %qE attribute cannot be applied to an SVE function type",
1411              name);
1412       *no_add_attrs = true;
1413       return NULL_TREE;
1414
1415     case ARM_PCS_TLSDESC:
1416     case ARM_PCS_UNKNOWN:
1417       break;
1418     }
1419   gcc_unreachable ();
1420 }
1421
1422 /* Table of machine attributes.  */
1423 static const struct attribute_spec aarch64_attribute_table[] =
1424 {
1425   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1426        affects_type_identity, handler, exclude } */
1427   { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,
1428                           handle_aarch64_vector_pcs_attribute, NULL },
1429   { "arm_sve_vector_bits", 1, 1, false, true,  false, true,
1430                           aarch64_sve::handle_arm_sve_vector_bits_attribute,
1431                           NULL },
1432   { "Advanced SIMD type", 1, 1, false, true,  false, true,  NULL, NULL },
1433   { "SVE type",           3, 3, false, true,  false, true,  NULL, NULL },
1434   { "SVE sizeless type",  0, 0, false, true,  false, true,  NULL, NULL },
1435   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
1436 };
1437
1438 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1439
1440 /* An ISA extension in the co-processor and main instruction set space.  */
1441 struct aarch64_option_extension
1442 {
1443   const char *const name;
1444   const unsigned long flags_on;
1445   const unsigned long flags_off;
1446 };
1447
1448 typedef enum aarch64_cond_code
1449 {
1450   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1451   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1452   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1453 }
1454 aarch64_cc;
1455
1456 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1457
1458 struct aarch64_branch_protect_type
1459 {
1460   /* The type's name that the user passes to the branch-protection option
1461     string.  */
1462   const char* name;
1463   /* Function to handle the protection type and set global variables.
1464     First argument is the string token corresponding with this type and the
1465     second argument is the next token in the option string.
1466     Return values:
1467     * AARCH64_PARSE_OK: Handling was sucessful.
1468     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1469       should print an error.
1470     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1471       own error.  */
1472   enum aarch64_parse_opt_result (*handler)(char*, char*);
1473   /* A list of types that can follow this type in the option string.  */
1474   const aarch64_branch_protect_type* subtypes;
1475   unsigned int num_subtypes;
1476 };
1477
1478 static enum aarch64_parse_opt_result
1479 aarch64_handle_no_branch_protection (char* str, char* rest)
1480 {
1481   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1482   aarch64_enable_bti = 0;
1483   if (rest)
1484     {
1485       error ("unexpected %<%s%> after %<%s%>", rest, str);
1486       return AARCH64_PARSE_INVALID_FEATURE;
1487     }
1488   return AARCH64_PARSE_OK;
1489 }
1490
1491 static enum aarch64_parse_opt_result
1492 aarch64_handle_standard_branch_protection (char* str, char* rest)
1493 {
1494   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1495   aarch64_ra_sign_key = AARCH64_KEY_A;
1496   aarch64_enable_bti = 1;
1497   if (rest)
1498     {
1499       error ("unexpected %<%s%> after %<%s%>", rest, str);
1500       return AARCH64_PARSE_INVALID_FEATURE;
1501     }
1502   return AARCH64_PARSE_OK;
1503 }
1504
1505 static enum aarch64_parse_opt_result
1506 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1507                                     char* rest ATTRIBUTE_UNUSED)
1508 {
1509   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1510   aarch64_ra_sign_key = AARCH64_KEY_A;
1511   return AARCH64_PARSE_OK;
1512 }
1513
1514 static enum aarch64_parse_opt_result
1515 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1516                               char* rest ATTRIBUTE_UNUSED)
1517 {
1518   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1519   return AARCH64_PARSE_OK;
1520 }
1521
1522 static enum aarch64_parse_opt_result
1523 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1524                               char* rest ATTRIBUTE_UNUSED)
1525 {
1526   aarch64_ra_sign_key = AARCH64_KEY_B;
1527   return AARCH64_PARSE_OK;
1528 }
1529
1530 static enum aarch64_parse_opt_result
1531 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1532                                     char* rest ATTRIBUTE_UNUSED)
1533 {
1534   aarch64_enable_bti = 1;
1535   return AARCH64_PARSE_OK;
1536 }
1537
1538 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1539   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1540   { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1541   { NULL, NULL, NULL, 0 }
1542 };
1543
1544 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1545   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1546   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1547   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1548     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1549   { "bti", aarch64_handle_bti_protection, NULL, 0 },
1550   { NULL, NULL, NULL, 0 }
1551 };
1552
1553 /* The condition codes of the processor, and the inverse function.  */
1554 static const char * const aarch64_condition_codes[] =
1555 {
1556   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1557   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1558 };
1559
1560 /* The preferred condition codes for SVE conditions.  */
1561 static const char *const aarch64_sve_condition_codes[] =
1562 {
1563   "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1564   "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1565 };
1566
1567 /* Return the assembly token for svpattern value VALUE.  */
1568
1569 static const char *
1570 svpattern_token (enum aarch64_svpattern pattern)
1571 {
1572   switch (pattern)
1573     {
1574 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1575     AARCH64_FOR_SVPATTERN (CASE)
1576 #undef CASE
1577     case AARCH64_NUM_SVPATTERNS:
1578       break;
1579     }
1580   gcc_unreachable ();
1581 }
1582
1583 /* Return the location of a piece that is known to be passed or returned
1584    in registers.  FIRST_ZR is the first unused vector argument register
1585    and FIRST_PR is the first unused predicate argument register.  */
1586
1587 rtx
1588 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1589                                          unsigned int first_pr) const
1590 {
1591   gcc_assert (VECTOR_MODE_P (mode)
1592               && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1593               && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1594
1595   if (num_zr > 0 && num_pr == 0)
1596     return gen_rtx_REG (mode, first_zr);
1597
1598   if (num_zr == 0 && num_pr == 1)
1599     return gen_rtx_REG (mode, first_pr);
1600
1601   gcc_unreachable ();
1602 }
1603
1604 /* Return the total number of vector registers required by the PST.  */
1605
1606 unsigned int
1607 pure_scalable_type_info::num_zr () const
1608 {
1609   unsigned int res = 0;
1610   for (unsigned int i = 0; i < pieces.length (); ++i)
1611     res += pieces[i].num_zr;
1612   return res;
1613 }
1614
1615 /* Return the total number of predicate registers required by the PST.  */
1616
1617 unsigned int
1618 pure_scalable_type_info::num_pr () const
1619 {
1620   unsigned int res = 0;
1621   for (unsigned int i = 0; i < pieces.length (); ++i)
1622     res += pieces[i].num_pr;
1623   return res;
1624 }
1625
1626 /* Return the location of a PST that is known to be passed or returned
1627    in registers.  FIRST_ZR is the first unused vector argument register
1628    and FIRST_PR is the first unused predicate argument register.  */
1629
1630 rtx
1631 pure_scalable_type_info::get_rtx (machine_mode mode,
1632                                   unsigned int first_zr,
1633                                   unsigned int first_pr) const
1634 {
1635   /* Try to return a single REG if possible.  This leads to better
1636      code generation; it isn't required for correctness.  */
1637   if (mode == pieces[0].mode)
1638     {
1639       gcc_assert (pieces.length () == 1);
1640       return pieces[0].get_rtx (first_zr, first_pr);
1641     }
1642
1643   /* Build up a PARALLEL that contains the individual pieces.  */
1644   rtvec rtxes = rtvec_alloc (pieces.length ());
1645   for (unsigned int i = 0; i < pieces.length (); ++i)
1646     {
1647       rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1648       rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1649       RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1650       first_zr += pieces[i].num_zr;
1651       first_pr += pieces[i].num_pr;
1652     }
1653   return gen_rtx_PARALLEL (mode, rtxes);
1654 }
1655
1656 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1657    in the AAPCS64.  */
1658
1659 pure_scalable_type_info::analysis_result
1660 pure_scalable_type_info::analyze (const_tree type)
1661 {
1662   /* Prevent accidental reuse.  */
1663   gcc_assert (pieces.is_empty ());
1664
1665   /* No code will be generated for erroneous types, so we won't establish
1666      an ABI mapping.  */
1667   if (type == error_mark_node)
1668     return NO_ABI_IDENTITY;
1669
1670   /* Zero-sized types disappear in the language->ABI mapping.  */
1671   if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1672     return NO_ABI_IDENTITY;
1673
1674   /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
1675   piece p = {};
1676   if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1677     {
1678       machine_mode mode = TYPE_MODE_RAW (type);
1679       gcc_assert (VECTOR_MODE_P (mode)
1680                   && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1681
1682       p.mode = p.orig_mode = mode;
1683       add_piece (p);
1684       return IS_PST;
1685     }
1686
1687   /* Check for user-defined PSTs.  */
1688   if (TREE_CODE (type) == ARRAY_TYPE)
1689     return analyze_array (type);
1690   if (TREE_CODE (type) == RECORD_TYPE)
1691     return analyze_record (type);
1692
1693   return ISNT_PST;
1694 }
1695
1696 /* Analyze a type that is known not to be passed or returned in memory.
1697    Return true if it has an ABI identity and is a Pure Scalable Type.  */
1698
1699 bool
1700 pure_scalable_type_info::analyze_registers (const_tree type)
1701 {
1702   analysis_result result = analyze (type);
1703   gcc_assert (result != DOESNT_MATTER);
1704   return result == IS_PST;
1705 }
1706
1707 /* Subroutine of analyze for handling ARRAY_TYPEs.  */
1708
1709 pure_scalable_type_info::analysis_result
1710 pure_scalable_type_info::analyze_array (const_tree type)
1711 {
1712   /* Analyze the element type.  */
1713   pure_scalable_type_info element_info;
1714   analysis_result result = element_info.analyze (TREE_TYPE (type));
1715   if (result != IS_PST)
1716     return result;
1717
1718   /* An array of unknown, flexible or variable length will be passed and
1719      returned by reference whatever we do.  */
1720   tree nelts_minus_one = array_type_nelts (type);
1721   if (!tree_fits_uhwi_p (nelts_minus_one))
1722     return DOESNT_MATTER;
1723
1724   /* Likewise if the array is constant-sized but too big to be interesting.
1725      The double checks against MAX_PIECES are to protect against overflow.  */
1726   unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1727   if (count > MAX_PIECES)
1728     return DOESNT_MATTER;
1729   count += 1;
1730   if (count * element_info.pieces.length () > MAX_PIECES)
1731     return DOESNT_MATTER;
1732
1733   /* The above checks should have weeded out elements of unknown size.  */
1734   poly_uint64 element_bytes;
1735   if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1736     gcc_unreachable ();
1737
1738   /* Build up the list of individual vectors and predicates.  */
1739   gcc_assert (!element_info.pieces.is_empty ());
1740   for (unsigned int i = 0; i < count; ++i)
1741     for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1742       {
1743         piece p = element_info.pieces[j];
1744         p.offset += i * element_bytes;
1745         add_piece (p);
1746       }
1747   return IS_PST;
1748 }
1749
1750 /* Subroutine of analyze for handling RECORD_TYPEs.  */
1751
1752 pure_scalable_type_info::analysis_result
1753 pure_scalable_type_info::analyze_record (const_tree type)
1754 {
1755   for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1756     {
1757       if (TREE_CODE (field) != FIELD_DECL)
1758         continue;
1759
1760       /* Zero-sized fields disappear in the language->ABI mapping.  */
1761       if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1762         continue;
1763
1764       /* All fields with an ABI identity must be PSTs for the record as
1765          a whole to be a PST.  If any individual field is too big to be
1766          interesting then the record is too.  */
1767       pure_scalable_type_info field_info;
1768       analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1769       if (subresult == NO_ABI_IDENTITY)
1770         continue;
1771       if (subresult != IS_PST)
1772         return subresult;
1773
1774       /* Since all previous fields are PSTs, we ought to be able to track
1775          the field offset using poly_ints.  */
1776       tree bitpos = bit_position (field);
1777       gcc_assert (poly_int_tree_p (bitpos));
1778
1779       /* For the same reason, it shouldn't be possible to create a PST field
1780          whose offset isn't byte-aligned.  */
1781       poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1782                                                 BITS_PER_UNIT);
1783
1784       /* Punt if the record is too big to be interesting.  */
1785       poly_uint64 bytepos;
1786       if (!wide_bytepos.to_uhwi (&bytepos)
1787           || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1788         return DOESNT_MATTER;
1789
1790       /* Add the individual vectors and predicates in the field to the
1791          record's list.  */
1792       gcc_assert (!field_info.pieces.is_empty ());
1793       for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1794         {
1795           piece p = field_info.pieces[i];
1796           p.offset += bytepos;
1797           add_piece (p);
1798         }
1799     }
1800   /* Empty structures disappear in the language->ABI mapping.  */
1801   return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1802 }
1803
1804 /* Add P to the list of pieces in the type.  */
1805
1806 void
1807 pure_scalable_type_info::add_piece (const piece &p)
1808 {
1809   /* Try to fold the new piece into the previous one to form a
1810      single-mode PST.  For example, if we see three consecutive vectors
1811      of the same mode, we can represent them using the corresponding
1812      3-tuple mode.
1813
1814      This is purely an optimization.  */
1815   if (!pieces.is_empty ())
1816     {
1817       piece &prev = pieces.last ();
1818       gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1819       unsigned int nelems1, nelems2;
1820       if (prev.orig_mode == p.orig_mode
1821           && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1822           && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1823                                   GET_MODE_NUNITS (p.orig_mode), &nelems1)
1824           && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1825                                   GET_MODE_NUNITS (p.orig_mode), &nelems2)
1826           && targetm.array_mode (p.orig_mode,
1827                                  nelems1 + nelems2).exists (&prev.mode))
1828         {
1829           prev.num_zr += p.num_zr;
1830           prev.num_pr += p.num_pr;
1831           return;
1832         }
1833     }
1834   pieces.quick_push (p);
1835 }
1836
1837 /* Return true if at least one possible value of type TYPE includes at
1838    least one object of Pure Scalable Type, in the sense of the AAPCS64.
1839
1840    This is a relatively expensive test for some types, so it should
1841    generally be made as late as possible.  */
1842
1843 static bool
1844 aarch64_some_values_include_pst_objects_p (const_tree type)
1845 {
1846   if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1847     return false;
1848
1849   if (aarch64_sve::builtin_type_p (type))
1850     return true;
1851
1852   if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1853     return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1854
1855   if (RECORD_OR_UNION_TYPE_P (type))
1856     for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1857       if (TREE_CODE (field) == FIELD_DECL
1858           && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1859         return true;
1860
1861   return false;
1862 }
1863
1864 /* Return the descriptor of the SIMD ABI.  */
1865
1866 static const predefined_function_abi &
1867 aarch64_simd_abi (void)
1868 {
1869   predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1870   if (!simd_abi.initialized_p ())
1871     {
1872       HARD_REG_SET full_reg_clobbers
1873         = default_function_abi.full_reg_clobbers ();
1874       for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1875         if (FP_SIMD_SAVED_REGNUM_P (regno))
1876           CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1877       simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1878     }
1879   return simd_abi;
1880 }
1881
1882 /* Return the descriptor of the SVE PCS.  */
1883
1884 static const predefined_function_abi &
1885 aarch64_sve_abi (void)
1886 {
1887   predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1888   if (!sve_abi.initialized_p ())
1889     {
1890       HARD_REG_SET full_reg_clobbers
1891         = default_function_abi.full_reg_clobbers ();
1892       for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1893         CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1894       for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1895         CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1896       sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1897     }
1898   return sve_abi;
1899 }
1900
1901 /* Generate code to enable conditional branches in functions over 1 MiB.  */
1902 const char *
1903 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1904                         const char * branch_format)
1905 {
1906     rtx_code_label * tmp_label = gen_label_rtx ();
1907     char label_buf[256];
1908     char buffer[128];
1909     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1910                                  CODE_LABEL_NUMBER (tmp_label));
1911     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1912     rtx dest_label = operands[pos_label];
1913     operands[pos_label] = tmp_label;
1914
1915     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1916     output_asm_insn (buffer, operands);
1917
1918     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1919     operands[pos_label] = dest_label;
1920     output_asm_insn (buffer, operands);
1921     return "";
1922 }
1923
1924 void
1925 aarch64_err_no_fpadvsimd (machine_mode mode)
1926 {
1927   if (TARGET_GENERAL_REGS_ONLY)
1928     if (FLOAT_MODE_P (mode))
1929       error ("%qs is incompatible with the use of floating-point types",
1930              "-mgeneral-regs-only");
1931     else
1932       error ("%qs is incompatible with the use of vector types",
1933              "-mgeneral-regs-only");
1934   else
1935     if (FLOAT_MODE_P (mode))
1936       error ("%qs feature modifier is incompatible with the use of"
1937              " floating-point types", "+nofp");
1938     else
1939       error ("%qs feature modifier is incompatible with the use of"
1940              " vector types", "+nofp");
1941 }
1942
1943 /* Report when we try to do something that requires SVE when SVE is disabled.
1944    This is an error of last resort and isn't very high-quality.  It usually
1945    involves attempts to measure the vector length in some way.  */
1946 static void
1947 aarch64_report_sve_required (void)
1948 {
1949   static bool reported_p = false;
1950
1951   /* Avoid reporting a slew of messages for a single oversight.  */
1952   if (reported_p)
1953     return;
1954
1955   error ("this operation requires the SVE ISA extension");
1956   inform (input_location, "you can enable SVE using the command-line"
1957           " option %<-march%>, or by using the %<target%>"
1958           " attribute or pragma");
1959   reported_p = true;
1960 }
1961
1962 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1963    registers.  */
1964 inline bool
1965 pr_or_ffr_regnum_p (unsigned int regno)
1966 {
1967   return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1968 }
1969
1970 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1971    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1972    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1973    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1974    and GENERAL_REGS is lower than the memory cost (in this case the best class
1975    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
1976    cost results in bad allocations with many redundant int<->FP moves which
1977    are expensive on various cores.
1978    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1979    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
1980    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
1981    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
1982    The result of this is that it is no longer inefficient to have a higher
1983    memory move cost than the register move cost.
1984 */
1985
1986 static reg_class_t
1987 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1988                                          reg_class_t best_class)
1989 {
1990   machine_mode mode;
1991
1992   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1993       || !reg_class_subset_p (FP_REGS, allocno_class))
1994     return allocno_class;
1995
1996   if (!reg_class_subset_p (GENERAL_REGS, best_class)
1997       || !reg_class_subset_p (FP_REGS, best_class))
1998     return best_class;
1999
2000   mode = PSEUDO_REGNO_MODE (regno);
2001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2002 }
2003
2004 static unsigned int
2005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
2006 {
2007   if (GET_MODE_UNIT_SIZE (mode) == 4)
2008     return aarch64_tune_params.min_div_recip_mul_sf;
2009   return aarch64_tune_params.min_div_recip_mul_df;
2010 }
2011
2012 /* Return the reassociation width of treeop OPC with mode MODE.  */
2013 static int
2014 aarch64_reassociation_width (unsigned opc, machine_mode mode)
2015 {
2016   if (VECTOR_MODE_P (mode))
2017     return aarch64_tune_params.vec_reassoc_width;
2018   if (INTEGRAL_MODE_P (mode))
2019     return aarch64_tune_params.int_reassoc_width;
2020   /* Avoid reassociating floating point addition so we emit more FMAs.  */
2021   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
2022     return aarch64_tune_params.fp_reassoc_width;
2023   return 1;
2024 }
2025
2026 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
2027 unsigned
2028 aarch64_dbx_register_number (unsigned regno)
2029 {
2030    if (GP_REGNUM_P (regno))
2031      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2032    else if (regno == SP_REGNUM)
2033      return AARCH64_DWARF_SP;
2034    else if (FP_REGNUM_P (regno))
2035      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
2036    else if (PR_REGNUM_P (regno))
2037      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2038    else if (regno == VG_REGNUM)
2039      return AARCH64_DWARF_VG;
2040
2041    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2042       equivalent DWARF register.  */
2043    return DWARF_FRAME_REGISTERS;
2044 }
2045
2046 /* If X is a CONST_DOUBLE, return its bit representation as a constant
2047    integer, otherwise return X unmodified.  */
2048 static rtx
2049 aarch64_bit_representation (rtx x)
2050 {
2051   if (CONST_DOUBLE_P (x))
2052     x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2053   return x;
2054 }
2055
2056 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
2057 static bool
2058 aarch64_advsimd_struct_mode_p (machine_mode mode)
2059 {
2060   return (TARGET_SIMD
2061           && (mode == OImode || mode == CImode || mode == XImode));
2062 }
2063
2064 /* Return true if MODE is an SVE predicate mode.  */
2065 static bool
2066 aarch64_sve_pred_mode_p (machine_mode mode)
2067 {
2068   return (TARGET_SVE
2069           && (mode == VNx16BImode
2070               || mode == VNx8BImode
2071               || mode == VNx4BImode
2072               || mode == VNx2BImode));
2073 }
2074
2075 /* Three mutually-exclusive flags describing a vector or predicate type.  */
2076 const unsigned int VEC_ADVSIMD  = 1;
2077 const unsigned int VEC_SVE_DATA = 2;
2078 const unsigned int VEC_SVE_PRED = 4;
2079 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2080    a structure of 2, 3 or 4 vectors.  */
2081 const unsigned int VEC_STRUCT   = 8;
2082 /* Can be used in combination with VEC_SVE_DATA to indicate that the
2083    vector has fewer significant bytes than a full SVE vector.  */
2084 const unsigned int VEC_PARTIAL  = 16;
2085 /* Useful combinations of the above.  */
2086 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
2087 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2088
2089 /* Return a set of flags describing the vector properties of mode MODE.
2090    Ignore modes that are not supported by the current target.  */
2091 static unsigned int
2092 aarch64_classify_vector_mode (machine_mode mode)
2093 {
2094   if (aarch64_advsimd_struct_mode_p (mode))
2095     return VEC_ADVSIMD | VEC_STRUCT;
2096
2097   if (aarch64_sve_pred_mode_p (mode))
2098     return VEC_SVE_PRED;
2099
2100   /* Make the decision based on the mode's enum value rather than its
2101      properties, so that we keep the correct classification regardless
2102      of -msve-vector-bits.  */
2103   switch (mode)
2104     {
2105     /* Partial SVE QI vectors.  */
2106     case E_VNx2QImode:
2107     case E_VNx4QImode:
2108     case E_VNx8QImode:
2109     /* Partial SVE HI vectors.  */
2110     case E_VNx2HImode:
2111     case E_VNx4HImode:
2112     /* Partial SVE SI vector.  */
2113     case E_VNx2SImode:
2114     /* Partial SVE HF vectors.  */
2115     case E_VNx2HFmode:
2116     case E_VNx4HFmode:
2117     /* Partial SVE SF vector.  */
2118     case E_VNx2SFmode:
2119       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2120
2121     case E_VNx16QImode:
2122     case E_VNx8HImode:
2123     case E_VNx4SImode:
2124     case E_VNx2DImode:
2125     case E_VNx8BFmode:
2126     case E_VNx8HFmode:
2127     case E_VNx4SFmode:
2128     case E_VNx2DFmode:
2129       return TARGET_SVE ? VEC_SVE_DATA : 0;
2130
2131     /* x2 SVE vectors.  */
2132     case E_VNx32QImode:
2133     case E_VNx16HImode:
2134     case E_VNx8SImode:
2135     case E_VNx4DImode:
2136     case E_VNx16BFmode:
2137     case E_VNx16HFmode:
2138     case E_VNx8SFmode:
2139     case E_VNx4DFmode:
2140     /* x3 SVE vectors.  */
2141     case E_VNx48QImode:
2142     case E_VNx24HImode:
2143     case E_VNx12SImode:
2144     case E_VNx6DImode:
2145     case E_VNx24BFmode:
2146     case E_VNx24HFmode:
2147     case E_VNx12SFmode:
2148     case E_VNx6DFmode:
2149     /* x4 SVE vectors.  */
2150     case E_VNx64QImode:
2151     case E_VNx32HImode:
2152     case E_VNx16SImode:
2153     case E_VNx8DImode:
2154     case E_VNx32BFmode:
2155     case E_VNx32HFmode:
2156     case E_VNx16SFmode:
2157     case E_VNx8DFmode:
2158       return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2159
2160     /* 64-bit Advanced SIMD vectors.  */
2161     case E_V8QImode:
2162     case E_V4HImode:
2163     case E_V2SImode:
2164     /* ...E_V1DImode doesn't exist.  */
2165     case E_V4HFmode:
2166     case E_V4BFmode:
2167     case E_V2SFmode:
2168     case E_V1DFmode:
2169     /* 128-bit Advanced SIMD vectors.  */
2170     case E_V16QImode:
2171     case E_V8HImode:
2172     case E_V4SImode:
2173     case E_V2DImode:
2174     case E_V8HFmode:
2175     case E_V8BFmode:
2176     case E_V4SFmode:
2177     case E_V2DFmode:
2178       return TARGET_SIMD ? VEC_ADVSIMD : 0;
2179
2180     default:
2181       return 0;
2182     }
2183 }
2184
2185 /* Return true if MODE is any of the data vector modes, including
2186    structure modes.  */
2187 static bool
2188 aarch64_vector_data_mode_p (machine_mode mode)
2189 {
2190   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2191 }
2192
2193 /* Return true if MODE is any form of SVE mode, including predicates,
2194    vectors and structures.  */
2195 bool
2196 aarch64_sve_mode_p (machine_mode mode)
2197 {
2198   return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2199 }
2200
2201 /* Return true if MODE is an SVE data vector mode; either a single vector
2202    or a structure of vectors.  */
2203 static bool
2204 aarch64_sve_data_mode_p (machine_mode mode)
2205 {
2206   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2207 }
2208
2209 /* Return the number of defined bytes in one constituent vector of
2210    SVE mode MODE, which has vector flags VEC_FLAGS.  */
2211 static poly_int64
2212 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2213 {
2214   if (vec_flags & VEC_PARTIAL)
2215     /* A single partial vector.  */
2216     return GET_MODE_SIZE (mode);
2217
2218   if (vec_flags & VEC_SVE_DATA)
2219     /* A single vector or a tuple.  */
2220     return BYTES_PER_SVE_VECTOR;
2221
2222   /* A single predicate.  */
2223   gcc_assert (vec_flags & VEC_SVE_PRED);
2224   return BYTES_PER_SVE_PRED;
2225 }
2226
2227 /* Implement target hook TARGET_ARRAY_MODE.  */
2228 static opt_machine_mode
2229 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2230 {
2231   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2232       && IN_RANGE (nelems, 2, 4))
2233     return mode_for_vector (GET_MODE_INNER (mode),
2234                             GET_MODE_NUNITS (mode) * nelems);
2235
2236   return opt_machine_mode ();
2237 }
2238
2239 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
2240 static bool
2241 aarch64_array_mode_supported_p (machine_mode mode,
2242                                 unsigned HOST_WIDE_INT nelems)
2243 {
2244   if (TARGET_SIMD
2245       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2246           || AARCH64_VALID_SIMD_DREG_MODE (mode))
2247       && (nelems >= 2 && nelems <= 4))
2248     return true;
2249
2250   return false;
2251 }
2252
2253 /* MODE is some form of SVE vector mode.  For data modes, return the number
2254    of vector register bits that each element of MODE occupies, such as 64
2255    for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2256    in a 64-bit container).  For predicate modes, return the number of
2257    data bits controlled by each significant predicate bit.  */
2258
2259 static unsigned int
2260 aarch64_sve_container_bits (machine_mode mode)
2261 {
2262   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2263   poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2264                              ? BITS_PER_SVE_VECTOR
2265                              : GET_MODE_BITSIZE (mode));
2266   return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2267 }
2268
2269 /* Return the SVE predicate mode to use for elements that have
2270    ELEM_NBYTES bytes, if such a mode exists.  */
2271
2272 opt_machine_mode
2273 aarch64_sve_pred_mode (unsigned int elem_nbytes)
2274 {
2275   if (TARGET_SVE)
2276     {
2277       if (elem_nbytes == 1)
2278         return VNx16BImode;
2279       if (elem_nbytes == 2)
2280         return VNx8BImode;
2281       if (elem_nbytes == 4)
2282         return VNx4BImode;
2283       if (elem_nbytes == 8)
2284         return VNx2BImode;
2285     }
2286   return opt_machine_mode ();
2287 }
2288
2289 /* Return the SVE predicate mode that should be used to control
2290    SVE mode MODE.  */
2291
2292 machine_mode
2293 aarch64_sve_pred_mode (machine_mode mode)
2294 {
2295   unsigned int bits = aarch64_sve_container_bits (mode);
2296   return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2297 }
2298
2299 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
2300
2301 static opt_machine_mode
2302 aarch64_get_mask_mode (machine_mode mode)
2303 {
2304   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2305   if (vec_flags & VEC_SVE_DATA)
2306     return aarch64_sve_pred_mode (mode);
2307
2308   return default_get_mask_mode (mode);
2309 }
2310
2311 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
2312
2313 opt_machine_mode
2314 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2315 {
2316   enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2317                             ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2318   machine_mode mode;
2319   FOR_EACH_MODE_IN_CLASS (mode, mclass)
2320     if (inner_mode == GET_MODE_INNER (mode)
2321         && known_eq (nunits, GET_MODE_NUNITS (mode))
2322         && aarch64_sve_data_mode_p (mode))
2323       return mode;
2324   return opt_machine_mode ();
2325 }
2326
2327 /* Return the integer element mode associated with SVE mode MODE.  */
2328
2329 static scalar_int_mode
2330 aarch64_sve_element_int_mode (machine_mode mode)
2331 {
2332   poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2333                              ? BITS_PER_SVE_VECTOR
2334                              : GET_MODE_BITSIZE (mode));
2335   unsigned int elt_bits = vector_element_size (vector_bits,
2336                                                GET_MODE_NUNITS (mode));
2337   return int_mode_for_size (elt_bits, 0).require ();
2338 }
2339
2340 /* Return an integer element mode that contains exactly
2341    aarch64_sve_container_bits (MODE) bits.  This is wider than
2342    aarch64_sve_element_int_mode if MODE is a partial vector,
2343    otherwise it's the same.  */
2344
2345 static scalar_int_mode
2346 aarch64_sve_container_int_mode (machine_mode mode)
2347 {
2348   return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2349 }
2350
2351 /* Return the integer vector mode associated with SVE mode MODE.
2352    Unlike related_int_vector_mode, this can handle the case in which
2353    MODE is a predicate (and thus has a different total size).  */
2354
2355 machine_mode
2356 aarch64_sve_int_mode (machine_mode mode)
2357 {
2358   scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2359   return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2360 }
2361
2362 /* Implement TARGET_VECTORIZE_RELATED_MODE.  */
2363
2364 static opt_machine_mode
2365 aarch64_vectorize_related_mode (machine_mode vector_mode,
2366                                 scalar_mode element_mode,
2367                                 poly_uint64 nunits)
2368 {
2369   unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2370
2371   /* If we're operating on SVE vectors, try to return an SVE mode.  */
2372   poly_uint64 sve_nunits;
2373   if ((vec_flags & VEC_SVE_DATA)
2374       && multiple_p (BYTES_PER_SVE_VECTOR,
2375                      GET_MODE_SIZE (element_mode), &sve_nunits))
2376     {
2377       machine_mode sve_mode;
2378       if (maybe_ne (nunits, 0U))
2379         {
2380           /* Try to find a full or partial SVE mode with exactly
2381              NUNITS units.  */
2382           if (multiple_p (sve_nunits, nunits)
2383               && aarch64_sve_data_mode (element_mode,
2384                                         nunits).exists (&sve_mode))
2385             return sve_mode;
2386         }
2387       else
2388         {
2389           /* Take the preferred number of units from the number of bytes
2390              that fit in VECTOR_MODE.  We always start by "autodetecting"
2391              a full vector mode with preferred_simd_mode, so vectors
2392              chosen here will also be full vector modes.  Then
2393              autovectorize_vector_modes tries smaller starting modes
2394              and thus smaller preferred numbers of units.  */
2395           sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2396           if (aarch64_sve_data_mode (element_mode,
2397                                      sve_nunits).exists (&sve_mode))
2398             return sve_mode;
2399         }
2400     }
2401
2402   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
2403   if ((vec_flags & VEC_ADVSIMD)
2404       && known_eq (nunits, 0U)
2405       && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2406       && maybe_ge (GET_MODE_BITSIZE (element_mode)
2407                    * GET_MODE_NUNITS (vector_mode), 128U))
2408     {
2409       machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2410       if (VECTOR_MODE_P (res))
2411         return res;
2412     }
2413
2414   return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2415 }
2416
2417 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
2418    prefer to use the first arithmetic operand as the else value if
2419    the else value doesn't matter, since that exactly matches the SVE
2420    destructive merging form.  For ternary operations we could either
2421    pick the first operand and use FMAD-like instructions or the last
2422    operand and use FMLA-like instructions; the latter seems more
2423    natural.  */
2424
2425 static tree
2426 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2427 {
2428   return nops == 3 ? ops[2] : ops[0];
2429 }
2430
2431 /* Implement TARGET_HARD_REGNO_NREGS.  */
2432
2433 static unsigned int
2434 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2435 {
2436   /* ??? Logically we should only need to provide a value when
2437      HARD_REGNO_MODE_OK says that the combination is valid,
2438      but at the moment we need to handle all modes.  Just ignore
2439      any runtime parts for registers that can't store them.  */
2440   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2441   switch (aarch64_regno_regclass (regno))
2442     {
2443     case FP_REGS:
2444     case FP_LO_REGS:
2445     case FP_LO8_REGS:
2446       {
2447         unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2448         if (vec_flags & VEC_SVE_DATA)
2449           return exact_div (GET_MODE_SIZE (mode),
2450                             aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2451         return CEIL (lowest_size, UNITS_PER_VREG);
2452       }
2453     case PR_REGS:
2454     case PR_LO_REGS:
2455     case PR_HI_REGS:
2456     case FFR_REGS:
2457     case PR_AND_FFR_REGS:
2458       return 1;
2459     default:
2460       return CEIL (lowest_size, UNITS_PER_WORD);
2461     }
2462   gcc_unreachable ();
2463 }
2464
2465 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
2466
2467 static bool
2468 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2469 {
2470   if (GET_MODE_CLASS (mode) == MODE_CC)
2471     return regno == CC_REGNUM;
2472
2473   if (regno == VG_REGNUM)
2474     /* This must have the same size as _Unwind_Word.  */
2475     return mode == DImode;
2476
2477   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2478   if (vec_flags & VEC_SVE_PRED)
2479     return pr_or_ffr_regnum_p (regno);
2480
2481   if (pr_or_ffr_regnum_p (regno))
2482     return false;
2483
2484   if (regno == SP_REGNUM)
2485     /* The purpose of comparing with ptr_mode is to support the
2486        global register variable associated with the stack pointer
2487        register via the syntax of asm ("wsp") in ILP32.  */
2488     return mode == Pmode || mode == ptr_mode;
2489
2490   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2491     return mode == Pmode;
2492
2493   if (GP_REGNUM_P (regno))
2494     {
2495       if (vec_flags & VEC_ANY_SVE)
2496         return false;
2497       if (known_le (GET_MODE_SIZE (mode), 8))
2498         return true;
2499       if (known_le (GET_MODE_SIZE (mode), 16))
2500         return (regno & 1) == 0;
2501     }
2502   else if (FP_REGNUM_P (regno))
2503     {
2504       if (vec_flags & VEC_STRUCT)
2505         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2506       else
2507         return !VECTOR_MODE_P (mode) || vec_flags != 0;
2508     }
2509
2510   return false;
2511 }
2512
2513 /* Return true if a function with type FNTYPE returns its value in
2514    SVE vector or predicate registers.  */
2515
2516 static bool
2517 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2518 {
2519   tree return_type = TREE_TYPE (fntype);
2520
2521   pure_scalable_type_info pst_info;
2522   switch (pst_info.analyze (return_type))
2523     {
2524     case pure_scalable_type_info::IS_PST:
2525       return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2526               && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2527
2528     case pure_scalable_type_info::DOESNT_MATTER:
2529       gcc_assert (aarch64_return_in_memory_1 (return_type));
2530       return false;
2531
2532     case pure_scalable_type_info::NO_ABI_IDENTITY:
2533     case pure_scalable_type_info::ISNT_PST:
2534       return false;
2535     }
2536   gcc_unreachable ();
2537 }
2538
2539 /* Return true if a function with type FNTYPE takes arguments in
2540    SVE vector or predicate registers.  */
2541
2542 static bool
2543 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2544 {
2545   CUMULATIVE_ARGS args_so_far_v;
2546   aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2547                                 NULL_TREE, 0, true);
2548   cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2549
2550   for (tree chain = TYPE_ARG_TYPES (fntype);
2551        chain && chain != void_list_node;
2552        chain = TREE_CHAIN (chain))
2553     {
2554       tree arg_type = TREE_VALUE (chain);
2555       if (arg_type == error_mark_node)
2556         return false;
2557
2558       function_arg_info arg (arg_type, /*named=*/true);
2559       apply_pass_by_reference_rules (&args_so_far_v, arg);
2560       pure_scalable_type_info pst_info;
2561       if (pst_info.analyze_registers (arg.type))
2562         {
2563           unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2564           unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2565           gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2566           return true;
2567         }
2568
2569       targetm.calls.function_arg_advance (args_so_far, arg);
2570     }
2571   return false;
2572 }
2573
2574 /* Implement TARGET_FNTYPE_ABI.  */
2575
2576 static const predefined_function_abi &
2577 aarch64_fntype_abi (const_tree fntype)
2578 {
2579   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2580     return aarch64_simd_abi ();
2581
2582   if (aarch64_returns_value_in_sve_regs_p (fntype)
2583       || aarch64_takes_arguments_in_sve_regs_p (fntype))
2584     return aarch64_sve_abi ();
2585
2586   return default_function_abi;
2587 }
2588
2589 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */
2590
2591 static bool
2592 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2593 {
2594   return (aarch64_sve::builtin_type_p (type1)
2595           == aarch64_sve::builtin_type_p (type2));
2596 }
2597
2598 /* Return true if we should emit CFI for register REGNO.  */
2599
2600 static bool
2601 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2602 {
2603   return (GP_REGNUM_P (regno)
2604           || !default_function_abi.clobbers_full_reg_p (regno));
2605 }
2606
2607 /* Return the mode we should use to save and restore register REGNO.  */
2608
2609 static machine_mode
2610 aarch64_reg_save_mode (unsigned int regno)
2611 {
2612   if (GP_REGNUM_P (regno))
2613     return DImode;
2614
2615   if (FP_REGNUM_P (regno))
2616     switch (crtl->abi->id ())
2617       {
2618       case ARM_PCS_AAPCS64:
2619         /* Only the low 64 bits are saved by the base PCS.  */
2620         return DFmode;
2621
2622       case ARM_PCS_SIMD:
2623         /* The vector PCS saves the low 128 bits (which is the full
2624            register on non-SVE targets).  */
2625         return TFmode;
2626
2627       case ARM_PCS_SVE:
2628         /* Use vectors of DImode for registers that need frame
2629            information, so that the first 64 bytes of the save slot
2630            are always the equivalent of what storing D<n> would give.  */
2631         if (aarch64_emit_cfi_for_reg_p (regno))
2632           return VNx2DImode;
2633
2634         /* Use vectors of bytes otherwise, so that the layout is
2635            endian-agnostic, and so that we can use LDR and STR for
2636            big-endian targets.  */
2637         return VNx16QImode;
2638
2639       case ARM_PCS_TLSDESC:
2640       case ARM_PCS_UNKNOWN:
2641         break;
2642       }
2643
2644   if (PR_REGNUM_P (regno))
2645     /* Save the full predicate register.  */
2646     return VNx16BImode;
2647
2648   gcc_unreachable ();
2649 }
2650
2651 /* Implement TARGET_INSN_CALLEE_ABI.  */
2652
2653 const predefined_function_abi &
2654 aarch64_insn_callee_abi (const rtx_insn *insn)
2655 {
2656   rtx pat = PATTERN (insn);
2657   gcc_assert (GET_CODE (pat) == PARALLEL);
2658   rtx unspec = XVECEXP (pat, 0, 1);
2659   gcc_assert (GET_CODE (unspec) == UNSPEC
2660               && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2661   return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2662 }
2663
2664 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
2665    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
2666    clobbers the top 64 bits when restoring the bottom 64 bits.  */
2667
2668 static bool
2669 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2670                                         unsigned int regno,
2671                                         machine_mode mode)
2672 {
2673   if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2674     {
2675       poly_int64 per_register_size = GET_MODE_SIZE (mode);
2676       unsigned int nregs = hard_regno_nregs (regno, mode);
2677       if (nregs > 1)
2678         per_register_size = exact_div (per_register_size, nregs);
2679       if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2680         return maybe_gt (per_register_size, 16);
2681       return maybe_gt (per_register_size, 8);
2682     }
2683   return false;
2684 }
2685
2686 /* Implement REGMODE_NATURAL_SIZE.  */
2687 poly_uint64
2688 aarch64_regmode_natural_size (machine_mode mode)
2689 {
2690   /* The natural size for SVE data modes is one SVE data vector,
2691      and similarly for predicates.  We can't independently modify
2692      anything smaller than that.  */
2693   /* ??? For now, only do this for variable-width SVE registers.
2694      Doing it for constant-sized registers breaks lower-subreg.c.  */
2695   /* ??? And once that's fixed, we should probably have similar
2696      code for Advanced SIMD.  */
2697   if (!aarch64_sve_vg.is_constant ())
2698     {
2699       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2700       if (vec_flags & VEC_SVE_PRED)
2701         return BYTES_PER_SVE_PRED;
2702       if (vec_flags & VEC_SVE_DATA)
2703         return BYTES_PER_SVE_VECTOR;
2704     }
2705   return UNITS_PER_WORD;
2706 }
2707
2708 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
2709 machine_mode
2710 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2711                                      machine_mode mode)
2712 {
2713   /* The predicate mode determines which bits are significant and
2714      which are "don't care".  Decreasing the number of lanes would
2715      lose data while increasing the number of lanes would make bits
2716      unnecessarily significant.  */
2717   if (PR_REGNUM_P (regno))
2718     return mode;
2719   if (known_ge (GET_MODE_SIZE (mode), 4))
2720     return mode;
2721   else
2722     return SImode;
2723 }
2724
2725 /* Return true if I's bits are consecutive ones from the MSB.  */
2726 bool
2727 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2728 {
2729   return exact_log2 (-i) != HOST_WIDE_INT_M1;
2730 }
2731
2732 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
2733    that strcpy from constants will be faster.  */
2734
2735 static HOST_WIDE_INT
2736 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2737 {
2738   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2739     return MAX (align, BITS_PER_WORD);
2740   return align;
2741 }
2742
2743 /* Return true if calls to DECL should be treated as
2744    long-calls (ie called via a register).  */
2745 static bool
2746 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2747 {
2748   return false;
2749 }
2750
2751 /* Return true if calls to symbol-ref SYM should be treated as
2752    long-calls (ie called via a register).  */
2753 bool
2754 aarch64_is_long_call_p (rtx sym)
2755 {
2756   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2757 }
2758
2759 /* Return true if calls to symbol-ref SYM should not go through
2760    plt stubs.  */
2761
2762 bool
2763 aarch64_is_noplt_call_p (rtx sym)
2764 {
2765   const_tree decl = SYMBOL_REF_DECL (sym);
2766
2767   if (flag_pic
2768       && decl
2769       && (!flag_plt
2770           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2771       && !targetm.binds_local_p (decl))
2772     return true;
2773
2774   return false;
2775 }
2776
2777 /* Return true if the offsets to a zero/sign-extract operation
2778    represent an expression that matches an extend operation.  The
2779    operands represent the parameters from
2780
2781    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
2782 bool
2783 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2784                                 rtx extract_imm)
2785 {
2786   HOST_WIDE_INT mult_val, extract_val;
2787
2788   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2789     return false;
2790
2791   mult_val = INTVAL (mult_imm);
2792   extract_val = INTVAL (extract_imm);
2793
2794   if (extract_val > 8
2795       && extract_val < GET_MODE_BITSIZE (mode)
2796       && exact_log2 (extract_val & ~7) > 0
2797       && (extract_val & 7) <= 4
2798       && mult_val == (1 << (extract_val & 7)))
2799     return true;
2800
2801   return false;
2802 }
2803
2804 /* Emit an insn that's a simple single-set.  Both the operands must be
2805    known to be valid.  */
2806 inline static rtx_insn *
2807 emit_set_insn (rtx x, rtx y)
2808 {
2809   return emit_insn (gen_rtx_SET (x, y));
2810 }
2811
2812 /* X and Y are two things to compare using CODE.  Emit the compare insn and
2813    return the rtx for register 0 in the proper mode.  */
2814 rtx
2815 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2816 {
2817   machine_mode cmp_mode = GET_MODE (x);
2818   machine_mode cc_mode;
2819   rtx cc_reg;
2820
2821   if (cmp_mode == TImode)
2822     {
2823       gcc_assert (code == NE);
2824
2825       cc_mode = CCmode;
2826       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2827
2828       rtx x_lo = operand_subword (x, 0, 0, TImode);
2829       rtx y_lo = operand_subword (y, 0, 0, TImode);
2830       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2831
2832       rtx x_hi = operand_subword (x, 1, 0, TImode);
2833       rtx y_hi = operand_subword (y, 1, 0, TImode);
2834       emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2835                                gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2836                                GEN_INT (AARCH64_EQ)));
2837     }
2838   else
2839     {
2840       cc_mode = SELECT_CC_MODE (code, x, y);
2841       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2842       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2843     }
2844   return cc_reg;
2845 }
2846
2847 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
2848
2849 static rtx
2850 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2851                                   machine_mode y_mode)
2852 {
2853   if (y_mode == E_QImode || y_mode == E_HImode)
2854     {
2855       if (CONST_INT_P (y))
2856         {
2857           y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2858           y_mode = SImode;
2859         }
2860       else
2861         {
2862           rtx t, cc_reg;
2863           machine_mode cc_mode;
2864
2865           t = gen_rtx_ZERO_EXTEND (SImode, y);
2866           t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2867           cc_mode = CC_SWPmode;
2868           cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2869           emit_set_insn (cc_reg, t);
2870           return cc_reg;
2871         }
2872     }
2873
2874   if (!aarch64_plus_operand (y, y_mode))
2875     y = force_reg (y_mode, y);
2876
2877   return aarch64_gen_compare_reg (code, x, y);
2878 }
2879
2880 /* Build the SYMBOL_REF for __tls_get_addr.  */
2881
2882 static GTY(()) rtx tls_get_addr_libfunc;
2883
2884 rtx
2885 aarch64_tls_get_addr (void)
2886 {
2887   if (!tls_get_addr_libfunc)
2888     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2889   return tls_get_addr_libfunc;
2890 }
2891
2892 /* Return the TLS model to use for ADDR.  */
2893
2894 static enum tls_model
2895 tls_symbolic_operand_type (rtx addr)
2896 {
2897   enum tls_model tls_kind = TLS_MODEL_NONE;
2898   if (GET_CODE (addr) == CONST)
2899     {
2900       poly_int64 addend;
2901       rtx sym = strip_offset (addr, &addend);
2902       if (GET_CODE (sym) == SYMBOL_REF)
2903         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2904     }
2905   else if (GET_CODE (addr) == SYMBOL_REF)
2906     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2907
2908   return tls_kind;
2909 }
2910
2911 /* We'll allow lo_sum's in addresses in our legitimate addresses
2912    so that combine would take care of combining addresses where
2913    necessary, but for generation purposes, we'll generate the address
2914    as :
2915    RTL                               Absolute
2916    tmp = hi (symbol_ref);            adrp  x1, foo
2917    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
2918                                      nop
2919
2920    PIC                               TLS
2921    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
2922    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
2923                                      bl   __tls_get_addr
2924                                      nop
2925
2926    Load TLS symbol, depending on TLS mechanism and TLS access model.
2927
2928    Global Dynamic - Traditional TLS:
2929    adrp tmp, :tlsgd:imm
2930    add  dest, tmp, #:tlsgd_lo12:imm
2931    bl   __tls_get_addr
2932
2933    Global Dynamic - TLS Descriptors:
2934    adrp dest, :tlsdesc:imm
2935    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
2936    add  dest, dest, #:tlsdesc_lo12:imm
2937    blr  tmp
2938    mrs  tp, tpidr_el0
2939    add  dest, dest, tp
2940
2941    Initial Exec:
2942    mrs  tp, tpidr_el0
2943    adrp tmp, :gottprel:imm
2944    ldr  dest, [tmp, #:gottprel_lo12:imm]
2945    add  dest, dest, tp
2946
2947    Local Exec:
2948    mrs  tp, tpidr_el0
2949    add  t0, tp, #:tprel_hi12:imm, lsl #12
2950    add  t0, t0, #:tprel_lo12_nc:imm
2951 */
2952
2953 static void
2954 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2955                                    enum aarch64_symbol_type type)
2956 {
2957   switch (type)
2958     {
2959     case SYMBOL_SMALL_ABSOLUTE:
2960       {
2961         /* In ILP32, the mode of dest can be either SImode or DImode.  */
2962         rtx tmp_reg = dest;
2963         machine_mode mode = GET_MODE (dest);
2964
2965         gcc_assert (mode == Pmode || mode == ptr_mode);
2966
2967         if (can_create_pseudo_p ())
2968           tmp_reg = gen_reg_rtx (mode);
2969
2970         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2971         emit_insn (gen_add_losym (dest, tmp_reg, imm));
2972         return;
2973       }
2974
2975     case SYMBOL_TINY_ABSOLUTE:
2976       emit_insn (gen_rtx_SET (dest, imm));
2977       return;
2978
2979     case SYMBOL_SMALL_GOT_28K:
2980       {
2981         machine_mode mode = GET_MODE (dest);
2982         rtx gp_rtx = pic_offset_table_rtx;
2983         rtx insn;
2984         rtx mem;
2985
2986         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2987            here before rtl expand.  Tree IVOPT will generate rtl pattern to
2988            decide rtx costs, in which case pic_offset_table_rtx is not
2989            initialized.  For that case no need to generate the first adrp
2990            instruction as the final cost for global variable access is
2991            one instruction.  */
2992         if (gp_rtx != NULL)
2993           {
2994             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2995                using the page base as GOT base, the first page may be wasted,
2996                in the worst scenario, there is only 28K space for GOT).
2997
2998                The generate instruction sequence for accessing global variable
2999                is:
3000
3001                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3002
3003                Only one instruction needed. But we must initialize
3004                pic_offset_table_rtx properly.  We generate initialize insn for
3005                every global access, and allow CSE to remove all redundant.
3006
3007                The final instruction sequences will look like the following
3008                for multiply global variables access.
3009
3010                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3011
3012                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3013                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3014                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3015                  ...  */
3016
3017             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3018             crtl->uses_pic_offset_table = 1;
3019             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3020
3021             if (mode != GET_MODE (gp_rtx))
3022              gp_rtx = gen_lowpart (mode, gp_rtx);
3023
3024           }
3025
3026         if (mode == ptr_mode)
3027           {
3028             if (mode == DImode)
3029               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3030             else
3031               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3032
3033             mem = XVECEXP (SET_SRC (insn), 0, 0);
3034           }
3035         else
3036           {
3037             gcc_assert (mode == Pmode);
3038
3039             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3040             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3041           }
3042
3043         /* The operand is expected to be MEM.  Whenever the related insn
3044            pattern changed, above code which calculate mem should be
3045            updated.  */
3046         gcc_assert (GET_CODE (mem) == MEM);
3047         MEM_READONLY_P (mem) = 1;
3048         MEM_NOTRAP_P (mem) = 1;
3049         emit_insn (insn);
3050         return;
3051       }
3052
3053     case SYMBOL_SMALL_GOT_4G:
3054       {
3055         /* In ILP32, the mode of dest can be either SImode or DImode,
3056            while the got entry is always of SImode size.  The mode of
3057            dest depends on how dest is used: if dest is assigned to a
3058            pointer (e.g. in the memory), it has SImode; it may have
3059            DImode if dest is dereferenced to access the memeory.
3060            This is why we have to handle three different ldr_got_small
3061            patterns here (two patterns for ILP32).  */
3062
3063         rtx insn;
3064         rtx mem;
3065         rtx tmp_reg = dest;
3066         machine_mode mode = GET_MODE (dest);
3067
3068         if (can_create_pseudo_p ())
3069           tmp_reg = gen_reg_rtx (mode);
3070
3071         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3072         if (mode == ptr_mode)
3073           {
3074             if (mode == DImode)
3075               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3076             else
3077               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3078
3079             mem = XVECEXP (SET_SRC (insn), 0, 0);
3080           }
3081         else
3082           {
3083             gcc_assert (mode == Pmode);
3084
3085             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3086             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3087           }
3088
3089         gcc_assert (GET_CODE (mem) == MEM);
3090         MEM_READONLY_P (mem) = 1;
3091         MEM_NOTRAP_P (mem) = 1;
3092         emit_insn (insn);
3093         return;
3094       }
3095
3096     case SYMBOL_SMALL_TLSGD:
3097       {
3098         rtx_insn *insns;
3099         /* The return type of __tls_get_addr is the C pointer type
3100            so use ptr_mode.  */
3101         rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3102         rtx tmp_reg = dest;
3103
3104         if (GET_MODE (dest) != ptr_mode)
3105           tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3106
3107         start_sequence ();
3108         if (ptr_mode == SImode)
3109           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3110         else
3111           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3112         insns = get_insns ();
3113         end_sequence ();
3114
3115         RTL_CONST_CALL_P (insns) = 1;
3116         emit_libcall_block (insns, tmp_reg, result, imm);
3117         /* Convert back to the mode of the dest adding a zero_extend
3118            from SImode (ptr_mode) to DImode (Pmode). */
3119         if (dest != tmp_reg)
3120           convert_move (dest, tmp_reg, true);
3121         return;
3122       }
3123
3124     case SYMBOL_SMALL_TLSDESC:
3125       {
3126         machine_mode mode = GET_MODE (dest);
3127         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3128         rtx tp;
3129
3130         gcc_assert (mode == Pmode || mode == ptr_mode);
3131
3132         /* In ILP32, the got entry is always of SImode size.  Unlike
3133            small GOT, the dest is fixed at reg 0.  */
3134         if (TARGET_ILP32)
3135           emit_insn (gen_tlsdesc_small_si (imm));
3136         else
3137           emit_insn (gen_tlsdesc_small_di (imm));
3138         tp = aarch64_load_tp (NULL);
3139
3140         if (mode != Pmode)
3141           tp = gen_lowpart (mode, tp);
3142
3143         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3144         if (REG_P (dest))
3145           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3146         return;
3147       }
3148
3149     case SYMBOL_SMALL_TLSIE:
3150       {
3151         /* In ILP32, the mode of dest can be either SImode or DImode,
3152            while the got entry is always of SImode size.  The mode of
3153            dest depends on how dest is used: if dest is assigned to a
3154            pointer (e.g. in the memory), it has SImode; it may have
3155            DImode if dest is dereferenced to access the memeory.
3156            This is why we have to handle three different tlsie_small
3157            patterns here (two patterns for ILP32).  */
3158         machine_mode mode = GET_MODE (dest);
3159         rtx tmp_reg = gen_reg_rtx (mode);
3160         rtx tp = aarch64_load_tp (NULL);
3161
3162         if (mode == ptr_mode)
3163           {
3164             if (mode == DImode)
3165               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3166             else
3167               {
3168                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3169                 tp = gen_lowpart (mode, tp);
3170               }
3171           }
3172         else
3173           {
3174             gcc_assert (mode == Pmode);
3175             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3176           }
3177
3178         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3179         if (REG_P (dest))
3180           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3181         return;
3182       }
3183
3184     case SYMBOL_TLSLE12:
3185     case SYMBOL_TLSLE24:
3186     case SYMBOL_TLSLE32:
3187     case SYMBOL_TLSLE48:
3188       {
3189         machine_mode mode = GET_MODE (dest);
3190         rtx tp = aarch64_load_tp (NULL);
3191
3192         if (mode != Pmode)
3193           tp = gen_lowpart (mode, tp);
3194
3195         switch (type)
3196           {
3197           case SYMBOL_TLSLE12:
3198             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3199                         (dest, tp, imm));
3200             break;
3201           case SYMBOL_TLSLE24:
3202             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3203                         (dest, tp, imm));
3204           break;
3205           case SYMBOL_TLSLE32:
3206             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3207                         (dest, imm));
3208             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3209                         (dest, dest, tp));
3210           break;
3211           case SYMBOL_TLSLE48:
3212             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3213                         (dest, imm));
3214             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3215                         (dest, dest, tp));
3216             break;
3217           default:
3218             gcc_unreachable ();
3219           }
3220
3221         if (REG_P (dest))
3222           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3223         return;
3224       }
3225
3226     case SYMBOL_TINY_GOT:
3227       {
3228         rtx insn;
3229         machine_mode mode = GET_MODE (dest);
3230
3231         if (mode == ptr_mode)
3232           insn = gen_ldr_got_tiny (mode, dest, imm);
3233         else
3234           {
3235             gcc_assert (mode == Pmode);
3236             insn = gen_ldr_got_tiny_sidi (dest, imm);
3237           }
3238
3239         emit_insn (insn);
3240         return;
3241       }
3242
3243     case SYMBOL_TINY_TLSIE:
3244       {
3245         machine_mode mode = GET_MODE (dest);
3246         rtx tp = aarch64_load_tp (NULL);
3247
3248         if (mode == ptr_mode)
3249           {
3250             if (mode == DImode)
3251               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3252             else
3253               {
3254                 tp = gen_lowpart (mode, tp);
3255                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3256               }
3257           }
3258         else
3259           {
3260             gcc_assert (mode == Pmode);
3261             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3262           }
3263
3264         if (REG_P (dest))
3265           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3266         return;
3267       }
3268
3269     default:
3270       gcc_unreachable ();
3271     }
3272 }
3273
3274 /* Emit a move from SRC to DEST.  Assume that the move expanders can
3275    handle all moves if !can_create_pseudo_p ().  The distinction is
3276    important because, unlike emit_move_insn, the move expanders know
3277    how to force Pmode objects into the constant pool even when the
3278    constant pool address is not itself legitimate.  */
3279 static rtx
3280 aarch64_emit_move (rtx dest, rtx src)
3281 {
3282   return (can_create_pseudo_p ()
3283           ? emit_move_insn (dest, src)
3284           : emit_move_insn_1 (dest, src));
3285 }
3286
3287 /* Apply UNOPTAB to OP and store the result in DEST.  */
3288
3289 static void
3290 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3291 {
3292   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3293   if (dest != tmp)
3294     emit_move_insn (dest, tmp);
3295 }
3296
3297 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
3298
3299 static void
3300 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3301 {
3302   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3303                           OPTAB_DIRECT);
3304   if (dest != tmp)
3305     emit_move_insn (dest, tmp);
3306 }
3307
3308 /* Split a 128-bit move operation into two 64-bit move operations,
3309    taking care to handle partial overlap of register to register
3310    copies.  Special cases are needed when moving between GP regs and
3311    FP regs.  SRC can be a register, constant or memory; DST a register
3312    or memory.  If either operand is memory it must not have any side
3313    effects.  */
3314 void
3315 aarch64_split_128bit_move (rtx dst, rtx src)
3316 {
3317   rtx dst_lo, dst_hi;
3318   rtx src_lo, src_hi;
3319
3320   machine_mode mode = GET_MODE (dst);
3321
3322   gcc_assert (mode == TImode || mode == TFmode);
3323   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3324   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3325
3326   if (REG_P (dst) && REG_P (src))
3327     {
3328       int src_regno = REGNO (src);
3329       int dst_regno = REGNO (dst);
3330
3331       /* Handle FP <-> GP regs.  */
3332       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3333         {
3334           src_lo = gen_lowpart (word_mode, src);
3335           src_hi = gen_highpart (word_mode, src);
3336
3337           emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3338           emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3339           return;
3340         }
3341       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3342         {
3343           dst_lo = gen_lowpart (word_mode, dst);
3344           dst_hi = gen_highpart (word_mode, dst);
3345
3346           emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3347           emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3348           return;
3349         }
3350     }
3351
3352   dst_lo = gen_lowpart (word_mode, dst);
3353   dst_hi = gen_highpart (word_mode, dst);
3354   src_lo = gen_lowpart (word_mode, src);
3355   src_hi = gen_highpart_mode (word_mode, mode, src);
3356
3357   /* At most one pairing may overlap.  */
3358   if (reg_overlap_mentioned_p (dst_lo, src_hi))
3359     {
3360       aarch64_emit_move (dst_hi, src_hi);
3361       aarch64_emit_move (dst_lo, src_lo);
3362     }
3363   else
3364     {
3365       aarch64_emit_move (dst_lo, src_lo);
3366       aarch64_emit_move (dst_hi, src_hi);
3367     }
3368 }
3369
3370 bool
3371 aarch64_split_128bit_move_p (rtx dst, rtx src)
3372 {
3373   return (! REG_P (src)
3374           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
3375 }
3376
3377 /* Split a complex SIMD combine.  */
3378
3379 void
3380 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3381 {
3382   machine_mode src_mode = GET_MODE (src1);
3383   machine_mode dst_mode = GET_MODE (dst);
3384
3385   gcc_assert (VECTOR_MODE_P (dst_mode));
3386   gcc_assert (register_operand (dst, dst_mode)
3387               && register_operand (src1, src_mode)
3388               && register_operand (src2, src_mode));
3389
3390   emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3391   return;
3392 }
3393
3394 /* Split a complex SIMD move.  */
3395
3396 void
3397 aarch64_split_simd_move (rtx dst, rtx src)
3398 {
3399   machine_mode src_mode = GET_MODE (src);
3400   machine_mode dst_mode = GET_MODE (dst);
3401
3402   gcc_assert (VECTOR_MODE_P (dst_mode));
3403
3404   if (REG_P (dst) && REG_P (src))
3405     {
3406       gcc_assert (VECTOR_MODE_P (src_mode));
3407       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3408     }
3409 }
3410
3411 bool
3412 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3413                               machine_mode ymode, rtx y)
3414 {
3415   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3416   gcc_assert (r != NULL);
3417   return rtx_equal_p (x, r);
3418 }
3419
3420 /* Return TARGET if it is nonnull and a register of mode MODE.
3421    Otherwise, return a fresh register of mode MODE if we can,
3422    or TARGET reinterpreted as MODE if we can't.  */
3423
3424 static rtx
3425 aarch64_target_reg (rtx target, machine_mode mode)
3426 {
3427   if (target && REG_P (target) && GET_MODE (target) == mode)
3428     return target;
3429   if (!can_create_pseudo_p ())
3430     {
3431       gcc_assert (target);
3432       return gen_lowpart (mode, target);
3433     }
3434   return gen_reg_rtx (mode);
3435 }
3436
3437 /* Return a register that contains the constant in BUILDER, given that
3438    the constant is a legitimate move operand.  Use TARGET as the register
3439    if it is nonnull and convenient.  */
3440
3441 static rtx
3442 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3443 {
3444   rtx src = builder.build ();
3445   target = aarch64_target_reg (target, GET_MODE (src));
3446   emit_insn (gen_rtx_SET (target, src));
3447   return target;
3448 }
3449
3450 static rtx
3451 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3452 {
3453   if (can_create_pseudo_p ())
3454     return force_reg (mode, value);
3455   else
3456     {
3457       gcc_assert (x);
3458       aarch64_emit_move (x, value);
3459       return x;
3460     }
3461 }
3462
3463 /* Return true if predicate value X is a constant in which every element
3464    is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
3465    value, i.e. as a predicate in which all bits are significant.  */
3466
3467 static bool
3468 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3469 {
3470   if (GET_CODE (x) != CONST_VECTOR)
3471     return false;
3472
3473   unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3474                                              GET_MODE_NUNITS (GET_MODE (x)));
3475   unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3476   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3477   builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3478
3479   unsigned int nelts = const_vector_encoded_nelts (x);
3480   for (unsigned int i = 0; i < nelts; ++i)
3481     {
3482       rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3483       if (!CONST_INT_P (elt))
3484         return false;
3485
3486       builder.quick_push (elt);
3487       for (unsigned int j = 1; j < factor; ++j)
3488         builder.quick_push (const0_rtx);
3489     }
3490   builder.finalize ();
3491   return true;
3492 }
3493
3494 /* BUILDER contains a predicate constant of mode VNx16BI.  Return the
3495    widest predicate element size it can have (that is, the largest size
3496    for which each element would still be 0 or 1).  */
3497
3498 unsigned int
3499 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3500 {
3501   /* Start with the most optimistic assumption: that we only need
3502      one bit per pattern.  This is what we will use if only the first
3503      bit in each pattern is ever set.  */
3504   unsigned int mask = GET_MODE_SIZE (DImode);
3505   mask |= builder.npatterns ();
3506
3507   /* Look for set bits.  */
3508   unsigned int nelts = builder.encoded_nelts ();
3509   for (unsigned int i = 1; i < nelts; ++i)
3510     if (INTVAL (builder.elt (i)) != 0)
3511       {
3512         if (i & 1)
3513           return 1;
3514         mask |= i;
3515       }
3516   return mask & -mask;
3517 }
3518
3519 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3520    return that predicate mode, otherwise return opt_machine_mode ().  */
3521
3522 opt_machine_mode
3523 aarch64_ptrue_all_mode (rtx x)
3524 {
3525   gcc_assert (GET_MODE (x) == VNx16BImode);
3526   if (GET_CODE (x) != CONST_VECTOR
3527       || !CONST_VECTOR_DUPLICATE_P (x)
3528       || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3529       || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3530     return opt_machine_mode ();
3531
3532   unsigned int nelts = const_vector_encoded_nelts (x);
3533   for (unsigned int i = 1; i < nelts; ++i)
3534     if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3535       return opt_machine_mode ();
3536
3537   return aarch64_sve_pred_mode (nelts);
3538 }
3539
3540 /* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
3541    that the constant would have with predicate element size ELT_SIZE
3542    (ignoring the upper bits in each element) and return:
3543
3544    * -1 if all bits are set
3545    * N if the predicate has N leading set bits followed by all clear bits
3546    * 0 if the predicate does not have any of these forms.  */
3547
3548 int
3549 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3550                               unsigned int elt_size)
3551 {
3552   /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3553      followed by set bits.  */
3554   if (builder.nelts_per_pattern () == 3)
3555     return 0;
3556
3557   /* Skip over leading set bits.  */
3558   unsigned int nelts = builder.encoded_nelts ();
3559   unsigned int i = 0;
3560   for (; i < nelts; i += elt_size)
3561     if (INTVAL (builder.elt (i)) == 0)
3562       break;
3563   unsigned int vl = i / elt_size;
3564
3565   /* Check for the all-true case.  */
3566   if (i == nelts)
3567     return -1;
3568
3569   /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3570      repeating pattern of set bits followed by clear bits.  */
3571   if (builder.nelts_per_pattern () != 2)
3572     return 0;
3573
3574   /* We have a "foreground" value and a duplicated "background" value.
3575      If the background might repeat and the last set bit belongs to it,
3576      we might have set bits followed by clear bits followed by set bits.  */
3577   if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3578     return 0;
3579
3580   /* Make sure that the rest are all clear.  */
3581   for (; i < nelts; i += elt_size)
3582     if (INTVAL (builder.elt (i)) != 0)
3583       return 0;
3584
3585   return vl;
3586 }
3587
3588 /* See if there is an svpattern that encodes an SVE predicate of mode
3589    PRED_MODE in which the first VL bits are set and the rest are clear.
3590    Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3591    A VL of -1 indicates an all-true vector.  */
3592
3593 aarch64_svpattern
3594 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3595 {
3596   if (vl < 0)
3597     return AARCH64_SV_ALL;
3598
3599   if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3600     return AARCH64_NUM_SVPATTERNS;
3601
3602   if (vl >= 1 && vl <= 8)
3603     return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3604
3605   if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3606     return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3607
3608   int max_vl;
3609   if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3610     {
3611       if (vl == (max_vl / 3) * 3)
3612         return AARCH64_SV_MUL3;
3613       /* These would only trigger for non-power-of-2 lengths.  */
3614       if (vl == (max_vl & -4))
3615         return AARCH64_SV_MUL4;
3616       if (vl == (1 << floor_log2 (max_vl)))
3617         return AARCH64_SV_POW2;
3618       if (vl == max_vl)
3619         return AARCH64_SV_ALL;
3620     }
3621   return AARCH64_NUM_SVPATTERNS;
3622 }
3623
3624 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3625    bits has the lowest bit set and the upper bits clear.  This is the
3626    VNx16BImode equivalent of a PTRUE for controlling elements of
3627    ELT_SIZE bytes.  However, because the constant is VNx16BImode,
3628    all bits are significant, even the upper zeros.  */
3629
3630 rtx
3631 aarch64_ptrue_all (unsigned int elt_size)
3632 {
3633   rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3634   builder.quick_push (const1_rtx);
3635   for (unsigned int i = 1; i < elt_size; ++i)
3636     builder.quick_push (const0_rtx);
3637   return builder.build ();
3638 }
3639
3640 /* Return an all-true predicate register of mode MODE.  */
3641
3642 rtx
3643 aarch64_ptrue_reg (machine_mode mode)
3644 {
3645   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3646   rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3647   return gen_lowpart (mode, reg);
3648 }
3649
3650 /* Return an all-false predicate register of mode MODE.  */
3651
3652 rtx
3653 aarch64_pfalse_reg (machine_mode mode)
3654 {
3655   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3656   rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3657   return gen_lowpart (mode, reg);
3658 }
3659
3660 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3661    true, or alternatively if we know that the operation predicated by
3662    PRED1[0] is safe to perform whenever PRED2 is true.  PRED1[1] is a
3663    aarch64_sve_gp_strictness operand that describes the operation
3664    predicated by PRED1[0].  */
3665
3666 bool
3667 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3668 {
3669   machine_mode mode = GET_MODE (pred2);
3670   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3671               && mode == GET_MODE (pred1[0])
3672               && aarch64_sve_gp_strictness (pred1[1], SImode));
3673   return (pred1[0] == CONSTM1_RTX (mode)
3674           || INTVAL (pred1[1]) == SVE_RELAXED_GP
3675           || rtx_equal_p (pred1[0], pred2));
3676 }
3677
3678 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3679    for it.  PRED2[0] is the predicate for the instruction whose result
3680    is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3681    for it.  Return true if we can prove that the two predicates are
3682    equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3683    with PRED1[0] without changing behavior.  */
3684
3685 bool
3686 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3687 {
3688   machine_mode mode = GET_MODE (pred1[0]);
3689   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3690               && mode == GET_MODE (pred2[0])
3691               && aarch64_sve_ptrue_flag (pred1[1], SImode)
3692               && aarch64_sve_ptrue_flag (pred2[1], SImode));
3693
3694   bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3695                    || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3696   bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3697                    || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3698   return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3699 }
3700
3701 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3702    DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3703    Use TARGET as the target register if nonnull and convenient.  */
3704
3705 static rtx
3706 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3707                           machine_mode data_mode, rtx op1, rtx op2)
3708 {
3709   insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3710   expand_operand ops[5];
3711   create_output_operand (&ops[0], target, pred_mode);
3712   create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3713   create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3714   create_input_operand (&ops[3], op1, data_mode);
3715   create_input_operand (&ops[4], op2, data_mode);
3716   expand_insn (icode, 5, ops);
3717   return ops[0].value;
3718 }
3719
3720 /* Use a comparison to convert integer vector SRC into MODE, which is
3721    the corresponding SVE predicate mode.  Use TARGET for the result
3722    if it's nonnull and convenient.  */
3723
3724 rtx
3725 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3726 {
3727   machine_mode src_mode = GET_MODE (src);
3728   return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3729                                    src, CONST0_RTX (src_mode));
3730 }
3731
3732 /* Return the assembly token for svprfop value PRFOP.  */
3733
3734 static const char *
3735 svprfop_token (enum aarch64_svprfop prfop)
3736 {
3737   switch (prfop)
3738     {
3739 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3740     AARCH64_FOR_SVPRFOP (CASE)
3741 #undef CASE
3742     case AARCH64_NUM_SVPRFOPS:
3743       break;
3744     }
3745   gcc_unreachable ();
3746 }
3747
3748 /* Return the assembly string for an SVE prefetch operation with
3749    mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3750    and that SUFFIX is the format for the remaining operands.  */
3751
3752 char *
3753 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3754                              const char *suffix)
3755 {
3756   static char buffer[128];
3757   aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3758   unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3759                                    mnemonic, svprfop_token (prfop), suffix);
3760   gcc_assert (written < sizeof (buffer));
3761   return buffer;
3762 }
3763
3764 /* Check whether we can calculate the number of elements in PATTERN
3765    at compile time, given that there are NELTS_PER_VQ elements per
3766    128-bit block.  Return the value if so, otherwise return -1.  */
3767
3768 HOST_WIDE_INT
3769 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3770 {
3771   unsigned int vl, const_vg;
3772   if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3773     vl = 1 + (pattern - AARCH64_SV_VL1);
3774   else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3775     vl = 16 << (pattern - AARCH64_SV_VL16);
3776   else if (aarch64_sve_vg.is_constant (&const_vg))
3777     {
3778       /* There are two vector granules per quadword.  */
3779       unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3780       switch (pattern)
3781         {
3782         case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3783         case AARCH64_SV_MUL4: return nelts & -4;
3784         case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3785         case AARCH64_SV_ALL: return nelts;
3786         default: gcc_unreachable ();
3787         }
3788     }
3789   else
3790     return -1;
3791
3792   /* There are two vector granules per quadword.  */
3793   poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3794   if (known_le (vl, nelts_all))
3795     return vl;
3796
3797   /* Requesting more elements than are available results in a PFALSE.  */
3798   if (known_gt (vl, nelts_all))
3799     return 0;
3800
3801   return -1;
3802 }
3803
3804 /* Return true if we can move VALUE into a register using a single
3805    CNT[BHWD] instruction.  */
3806
3807 static bool
3808 aarch64_sve_cnt_immediate_p (poly_int64 value)
3809 {
3810   HOST_WIDE_INT factor = value.coeffs[0];
3811   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
3812   return (value.coeffs[1] == factor
3813           && IN_RANGE (factor, 2, 16 * 16)
3814           && (factor & 1) == 0
3815           && factor <= 16 * (factor & -factor));
3816 }
3817
3818 /* Likewise for rtx X.  */
3819
3820 bool
3821 aarch64_sve_cnt_immediate_p (rtx x)
3822 {
3823   poly_int64 value;
3824   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3825 }
3826
3827 /* Return the asm string for an instruction with a CNT-like vector size
3828    operand (a vector pattern followed by a multiplier in the range [1, 16]).
3829    PREFIX is the mnemonic without the size suffix and OPERANDS is the
3830    first part of the operands template (the part that comes before the
3831    vector size itself).  PATTERN is the pattern to use.  FACTOR is the
3832    number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
3833    in each quadword.  If it is zero, we can use any element size.  */
3834
3835 static char *
3836 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3837                                   aarch64_svpattern pattern,
3838                                   unsigned int factor,
3839                                   unsigned int nelts_per_vq)
3840 {
3841   static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3842
3843   if (nelts_per_vq == 0)
3844     /* There is some overlap in the ranges of the four CNT instructions.
3845        Here we always use the smallest possible element size, so that the
3846        multiplier is 1 whereever possible.  */
3847     nelts_per_vq = factor & -factor;
3848   int shift = std::min (exact_log2 (nelts_per_vq), 4);
3849   gcc_assert (IN_RANGE (shift, 1, 4));
3850   char suffix = "dwhb"[shift - 1];
3851
3852   factor >>= shift;
3853   unsigned int written;
3854   if (pattern == AARCH64_SV_ALL && factor == 1)
3855     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3856                         prefix, suffix, operands);
3857   else if (factor == 1)
3858     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3859                         prefix, suffix, operands, svpattern_token (pattern));
3860   else
3861     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3862                         prefix, suffix, operands, svpattern_token (pattern),
3863                         factor);
3864   gcc_assert (written < sizeof (buffer));
3865   return buffer;
3866 }
3867
3868 /* Return the asm string for an instruction with a CNT-like vector size
3869    operand (a vector pattern followed by a multiplier in the range [1, 16]).
3870    PREFIX is the mnemonic without the size suffix and OPERANDS is the
3871    first part of the operands template (the part that comes before the
3872    vector size itself).  X is the value of the vector size operand,
3873    as a polynomial integer rtx; we need to convert this into an "all"
3874    pattern with a multiplier.  */
3875
3876 char *
3877 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3878                                   rtx x)
3879 {
3880   poly_int64 value = rtx_to_poly_int64 (x);
3881   gcc_assert (aarch64_sve_cnt_immediate_p (value));
3882   return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3883                                            value.coeffs[1], 0);
3884 }
3885
3886 /* Return the asm string for an instruction with a CNT-like vector size
3887    operand (a vector pattern followed by a multiplier in the range [1, 16]).
3888    PREFIX is the mnemonic without the size suffix and OPERANDS is the
3889    first part of the operands template (the part that comes before the
3890    vector size itself).  CNT_PAT[0..2] are the operands of the
3891    UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */
3892
3893 char *
3894 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3895                                       const char *operands, rtx *cnt_pat)
3896 {
3897   aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3898   unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3899   unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3900   return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3901                                            factor, nelts_per_vq);
3902 }
3903
3904 /* Return true if we can add X using a single SVE INC or DEC instruction.  */
3905
3906 bool
3907 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3908 {
3909   poly_int64 value;
3910   return (poly_int_rtx_p (x, &value)
3911           && (aarch64_sve_cnt_immediate_p (value)
3912               || aarch64_sve_cnt_immediate_p (-value)));
3913 }
3914
3915 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3916    operand 0.  */
3917
3918 char *
3919 aarch64_output_sve_scalar_inc_dec (rtx offset)
3920 {
3921   poly_int64 offset_value = rtx_to_poly_int64 (offset);
3922   gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3923   if (offset_value.coeffs[1] > 0)
3924     return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3925                                              offset_value.coeffs[1], 0);
3926   else
3927     return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3928                                              -offset_value.coeffs[1], 0);
3929 }
3930
3931 /* Return true if we can add VALUE to a register using a single ADDVL
3932    or ADDPL instruction.  */
3933
3934 static bool
3935 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3936 {
3937   HOST_WIDE_INT factor = value.coeffs[0];
3938   if (factor == 0 || value.coeffs[1] != factor)
3939     return false;
3940   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3941      and a value of 16 is one vector width.  */
3942   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3943           || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3944 }
3945
3946 /* Likewise for rtx X.  */
3947
3948 bool
3949 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3950 {
3951   poly_int64 value;
3952   return (poly_int_rtx_p (x, &value)
3953           && aarch64_sve_addvl_addpl_immediate_p (value));
3954 }
3955
3956 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3957    to operand 1 and storing the result in operand 0.  */
3958
3959 char *
3960 aarch64_output_sve_addvl_addpl (rtx offset)
3961 {
3962   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3963   poly_int64 offset_value = rtx_to_poly_int64 (offset);
3964   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3965
3966   int factor = offset_value.coeffs[1];
3967   if ((factor & 15) == 0)
3968     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3969   else
3970     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3971   return buffer;
3972 }
3973
3974 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3975    instruction.  If it is, store the number of elements in each vector
3976    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3977    factor in *FACTOR_OUT (if nonnull).  */
3978
3979 bool
3980 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3981                                         unsigned int *nelts_per_vq_out)
3982 {
3983   rtx elt;
3984   poly_int64 value;
3985
3986   if (!const_vec_duplicate_p (x, &elt)
3987       || !poly_int_rtx_p (elt, &value))
3988     return false;
3989
3990   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3991   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3992     /* There's no vector INCB.  */
3993     return false;
3994
3995   HOST_WIDE_INT factor = value.coeffs[0];
3996   if (value.coeffs[1] != factor)
3997     return false;
3998
3999   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
4000   if ((factor % nelts_per_vq) != 0
4001       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4002     return false;
4003
4004   if (factor_out)
4005     *factor_out = factor;
4006   if (nelts_per_vq_out)
4007     *nelts_per_vq_out = nelts_per_vq;
4008   return true;
4009 }
4010
4011 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4012    instruction.  */
4013
4014 bool
4015 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4016 {
4017   return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4018 }
4019
4020 /* Return the asm template for an SVE vector INC or DEC instruction.
4021    OPERANDS gives the operands before the vector count and X is the
4022    value of the vector count operand itself.  */
4023
4024 char *
4025 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4026 {
4027   int factor;
4028   unsigned int nelts_per_vq;
4029   if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4030     gcc_unreachable ();
4031   if (factor < 0)
4032     return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4033                                              -factor, nelts_per_vq);
4034   else
4035     return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4036                                              factor, nelts_per_vq);
4037 }
4038
4039 static int
4040 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4041                                 scalar_int_mode mode)
4042 {
4043   int i;
4044   unsigned HOST_WIDE_INT val, val2, mask;
4045   int one_match, zero_match;
4046   int num_insns;
4047
4048   val = INTVAL (imm);
4049
4050   if (aarch64_move_imm (val, mode))
4051     {
4052       if (generate)
4053         emit_insn (gen_rtx_SET (dest, imm));
4054       return 1;
4055     }
4056
4057   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4058      (with XXXX non-zero). In that case check to see if the move can be done in
4059      a smaller mode.  */
4060   val2 = val & 0xffffffff;
4061   if (mode == DImode
4062       && aarch64_move_imm (val2, SImode)
4063       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4064     {
4065       if (generate)
4066         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4067
4068       /* Check if we have to emit a second instruction by checking to see
4069          if any of the upper 32 bits of the original DI mode value is set.  */
4070       if (val == val2)
4071         return 1;
4072
4073       i = (val >> 48) ? 48 : 32;
4074
4075       if (generate)
4076          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4077                                     GEN_INT ((val >> i) & 0xffff)));
4078
4079       return 2;
4080     }
4081
4082   if ((val >> 32) == 0 || mode == SImode)
4083     {
4084       if (generate)
4085         {
4086           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4087           if (mode == SImode)
4088             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4089                                        GEN_INT ((val >> 16) & 0xffff)));
4090           else
4091             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4092                                        GEN_INT ((val >> 16) & 0xffff)));
4093         }
4094       return 2;
4095     }
4096
4097   /* Remaining cases are all for DImode.  */
4098
4099   mask = 0xffff;
4100   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4101     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4102   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4103     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4104
4105   if (zero_match != 2 && one_match != 2)
4106     {
4107       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4108          For a 64-bit bitmask try whether changing 16 bits to all ones or
4109          zeroes creates a valid bitmask.  To check any repeated bitmask,
4110          try using 16 bits from the other 32-bit half of val.  */
4111
4112       for (i = 0; i < 64; i += 16, mask <<= 16)
4113         {
4114           val2 = val & ~mask;
4115           if (val2 != val && aarch64_bitmask_imm (val2, mode))
4116             break;
4117           val2 = val | mask;
4118           if (val2 != val && aarch64_bitmask_imm (val2, mode))
4119             break;
4120           val2 = val2 & ~mask;
4121           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4122           if (val2 != val && aarch64_bitmask_imm (val2, mode))
4123             break;
4124         }
4125       if (i != 64)
4126         {
4127           if (generate)
4128             {
4129               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4130               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4131                                          GEN_INT ((val >> i) & 0xffff)));
4132             }
4133           return 2;
4134         }
4135     }
4136
4137   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4138      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
4139      otherwise skip zero bits.  */
4140
4141   num_insns = 1;
4142   mask = 0xffff;
4143   val2 = one_match > zero_match ? ~val : val;
4144   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4145
4146   if (generate)
4147     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4148                                            ? (val | ~(mask << i))
4149                                            : (val & (mask << i)))));
4150   for (i += 16; i < 64; i += 16)
4151     {
4152       if ((val2 & (mask << i)) == 0)
4153         continue;
4154       if (generate)
4155         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4156                                    GEN_INT ((val >> i) & 0xffff)));
4157       num_insns ++;
4158     }
4159
4160   return num_insns;
4161 }
4162
4163 /* Return whether imm is a 128-bit immediate which is simple enough to
4164    expand inline.  */
4165 bool
4166 aarch64_mov128_immediate (rtx imm)
4167 {
4168   if (GET_CODE (imm) == CONST_INT)
4169     return true;
4170
4171   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4172
4173   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4174   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4175
4176   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4177          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4178 }
4179
4180
4181 /* Return the number of temporary registers that aarch64_add_offset_1
4182    would need to add OFFSET to a register.  */
4183
4184 static unsigned int
4185 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4186 {
4187   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4188 }
4189
4190 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
4191    a non-polynomial OFFSET.  MODE is the mode of the addition.
4192    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4193    be set and CFA adjustments added to the generated instructions.
4194
4195    TEMP1, if nonnull, is a register of mode MODE that can be used as a
4196    temporary if register allocation is already complete.  This temporary
4197    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
4198    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4199    the immediate again.
4200
4201    Since this function may be used to adjust the stack pointer, we must
4202    ensure that it cannot cause transient stack deallocation (for example
4203    by first incrementing SP and then decrementing when adjusting by a
4204    large immediate).  */
4205
4206 static void
4207 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4208                       rtx src, HOST_WIDE_INT offset, rtx temp1,
4209                       bool frame_related_p, bool emit_move_imm)
4210 {
4211   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4212   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4213
4214   unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4215   rtx_insn *insn;
4216
4217   if (!moffset)
4218     {
4219       if (!rtx_equal_p (dest, src))
4220         {
4221           insn = emit_insn (gen_rtx_SET (dest, src));
4222           RTX_FRAME_RELATED_P (insn) = frame_related_p;
4223         }
4224       return;
4225     }
4226
4227   /* Single instruction adjustment.  */
4228   if (aarch64_uimm12_shift (moffset))
4229     {
4230       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4231       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4232       return;
4233     }
4234
4235   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4236      and either:
4237
4238      a) the offset cannot be loaded by a 16-bit move or
4239      b) there is no spare register into which we can move it.  */
4240   if (moffset < 0x1000000
4241       && ((!temp1 && !can_create_pseudo_p ())
4242           || !aarch64_move_imm (moffset, mode)))
4243     {
4244       HOST_WIDE_INT low_off = moffset & 0xfff;
4245
4246       low_off = offset < 0 ? -low_off : low_off;
4247       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4248       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4249       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4250       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4251       return;
4252     }
4253
4254   /* Emit a move immediate if required and an addition/subtraction.  */
4255   if (emit_move_imm)
4256     {
4257       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4258       temp1 = aarch64_force_temporary (mode, temp1,
4259                                        gen_int_mode (moffset, mode));
4260     }
4261   insn = emit_insn (offset < 0
4262                     ? gen_sub3_insn (dest, src, temp1)
4263                     : gen_add3_insn (dest, src, temp1));
4264   if (frame_related_p)
4265     {
4266       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4267       rtx adj = plus_constant (mode, src, offset);
4268       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4269     }
4270 }
4271
4272 /* Return the number of temporary registers that aarch64_add_offset
4273    would need to move OFFSET into a register or add OFFSET to a register;
4274    ADD_P is true if we want the latter rather than the former.  */
4275
4276 static unsigned int
4277 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4278 {
4279   /* This follows the same structure as aarch64_add_offset.  */
4280   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4281     return 0;
4282
4283   unsigned int count = 0;
4284   HOST_WIDE_INT factor = offset.coeffs[1];
4285   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4286   poly_int64 poly_offset (factor, factor);
4287   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4288     /* Need one register for the ADDVL/ADDPL result.  */
4289     count += 1;
4290   else if (factor != 0)
4291     {
4292       factor = abs (factor);
4293       if (factor > 16 * (factor & -factor))
4294         /* Need one register for the CNT result and one for the multiplication
4295            factor.  If necessary, the second temporary can be reused for the
4296            constant part of the offset.  */
4297         return 2;
4298       /* Need one register for the CNT result (which might then
4299          be shifted).  */
4300       count += 1;
4301     }
4302   return count + aarch64_add_offset_1_temporaries (constant);
4303 }
4304
4305 /* If X can be represented as a poly_int64, return the number
4306    of temporaries that are required to add it to a register.
4307    Return -1 otherwise.  */
4308
4309 int
4310 aarch64_add_offset_temporaries (rtx x)
4311 {
4312   poly_int64 offset;
4313   if (!poly_int_rtx_p (x, &offset))
4314     return -1;
4315   return aarch64_offset_temporaries (true, offset);
4316 }
4317
4318 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
4319    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4320    be set and CFA adjustments added to the generated instructions.
4321
4322    TEMP1, if nonnull, is a register of mode MODE that can be used as a
4323    temporary if register allocation is already complete.  This temporary
4324    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4325    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4326    false to avoid emitting the immediate again.
4327
4328    TEMP2, if nonnull, is a second temporary register that doesn't
4329    overlap either DEST or REG.
4330
4331    Since this function may be used to adjust the stack pointer, we must
4332    ensure that it cannot cause transient stack deallocation (for example
4333    by first incrementing SP and then decrementing when adjusting by a
4334    large immediate).  */
4335
4336 static void
4337 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4338                     poly_int64 offset, rtx temp1, rtx temp2,
4339                     bool frame_related_p, bool emit_move_imm = true)
4340 {
4341   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4342   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4343   gcc_assert (temp1 == NULL_RTX
4344               || !frame_related_p
4345               || !reg_overlap_mentioned_p (temp1, dest));
4346   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4347
4348   /* Try using ADDVL or ADDPL to add the whole value.  */
4349   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4350     {
4351       rtx offset_rtx = gen_int_mode (offset, mode);
4352       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4353       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4354       return;
4355     }
4356
4357   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4358      SVE vector register, over and above the minimum size of 128 bits.
4359      This is equivalent to half the value returned by CNTD with a
4360      vector shape of ALL.  */
4361   HOST_WIDE_INT factor = offset.coeffs[1];
4362   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4363
4364   /* Try using ADDVL or ADDPL to add the VG-based part.  */
4365   poly_int64 poly_offset (factor, factor);
4366   if (src != const0_rtx
4367       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4368     {
4369       rtx offset_rtx = gen_int_mode (poly_offset, mode);
4370       if (frame_related_p)
4371         {
4372           rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4373           RTX_FRAME_RELATED_P (insn) = true;
4374           src = dest;
4375         }
4376       else
4377         {
4378           rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4379           src = aarch64_force_temporary (mode, temp1, addr);
4380           temp1 = temp2;
4381           temp2 = NULL_RTX;
4382         }
4383     }
4384   /* Otherwise use a CNT-based sequence.  */
4385   else if (factor != 0)
4386     {
4387       /* Use a subtraction if we have a negative factor.  */
4388       rtx_code code = PLUS;
4389       if (factor < 0)
4390         {
4391           factor = -factor;
4392           code = MINUS;
4393         }
4394
4395       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
4396          into the multiplication.  */
4397       rtx val;
4398       int shift = 0;
4399       if (factor & 1)
4400         /* Use a right shift by 1.  */
4401         shift = -1;
4402       else
4403         factor /= 2;
4404       HOST_WIDE_INT low_bit = factor & -factor;
4405       if (factor <= 16 * low_bit)
4406         {
4407           if (factor > 16 * 8)
4408             {
4409               /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4410                  the value with the minimum multiplier and shift it into
4411                  position.  */
4412               int extra_shift = exact_log2 (low_bit);
4413               shift += extra_shift;
4414               factor >>= extra_shift;
4415             }
4416           val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4417         }
4418       else
4419         {
4420           /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4421              directly, since that should increase the chances of being
4422              able to use a shift and add sequence.  If LOW_BIT itself
4423              is out of range, just use CNTD.  */
4424           if (low_bit <= 16 * 8)
4425             factor /= low_bit;
4426           else
4427             low_bit = 1;
4428
4429           val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4430           val = aarch64_force_temporary (mode, temp1, val);
4431
4432           if (can_create_pseudo_p ())
4433             {
4434               rtx coeff1 = gen_int_mode (factor, mode);
4435               val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4436             }
4437           else
4438             {
4439               /* Go back to using a negative multiplication factor if we have
4440                  no register from which to subtract.  */
4441               if (code == MINUS && src == const0_rtx)
4442                 {
4443                   factor = -factor;
4444                   code = PLUS;
4445                 }
4446               rtx coeff1 = gen_int_mode (factor, mode);
4447               coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4448               val = gen_rtx_MULT (mode, val, coeff1);
4449             }
4450         }
4451
4452       if (shift > 0)
4453         {
4454           /* Multiply by 1 << SHIFT.  */
4455           val = aarch64_force_temporary (mode, temp1, val);
4456           val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4457         }
4458       else if (shift == -1)
4459         {
4460           /* Divide by 2.  */
4461           val = aarch64_force_temporary (mode, temp1, val);
4462           val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4463         }
4464
4465       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
4466       if (src != const0_rtx)
4467         {
4468           val = aarch64_force_temporary (mode, temp1, val);
4469           val = gen_rtx_fmt_ee (code, mode, src, val);
4470         }
4471       else if (code == MINUS)
4472         {
4473           val = aarch64_force_temporary (mode, temp1, val);
4474           val = gen_rtx_NEG (mode, val);
4475         }
4476
4477       if (constant == 0 || frame_related_p)
4478         {
4479           rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4480           if (frame_related_p)
4481             {
4482               RTX_FRAME_RELATED_P (insn) = true;
4483               add_reg_note (insn, REG_CFA_ADJUST_CFA,
4484                             gen_rtx_SET (dest, plus_constant (Pmode, src,
4485                                                               poly_offset)));
4486             }
4487           src = dest;
4488           if (constant == 0)
4489             return;
4490         }
4491       else
4492         {
4493           src = aarch64_force_temporary (mode, temp1, val);
4494           temp1 = temp2;
4495           temp2 = NULL_RTX;
4496         }
4497
4498       emit_move_imm = true;
4499     }
4500
4501   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4502                         frame_related_p, emit_move_imm);
4503 }
4504
4505 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4506    than a poly_int64.  */
4507
4508 void
4509 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4510                           rtx offset_rtx, rtx temp1, rtx temp2)
4511 {
4512   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4513                       temp1, temp2, false);
4514 }
4515
4516 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4517    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
4518    if TEMP1 already contains abs (DELTA).  */
4519
4520 static inline void
4521 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4522 {
4523   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4524                       temp1, temp2, true, emit_move_imm);
4525 }
4526
4527 /* Subtract DELTA from the stack pointer, marking the instructions
4528    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
4529    if nonnull.  */
4530
4531 static inline void
4532 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4533                 bool emit_move_imm = true)
4534 {
4535   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4536                       temp1, temp2, frame_related_p, emit_move_imm);
4537 }
4538
4539 /* Set DEST to (vec_series BASE STEP).  */
4540
4541 static void
4542 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4543 {
4544   machine_mode mode = GET_MODE (dest);
4545   scalar_mode inner = GET_MODE_INNER (mode);
4546
4547   /* Each operand can be a register or an immediate in the range [-16, 15].  */
4548   if (!aarch64_sve_index_immediate_p (base))
4549     base = force_reg (inner, base);
4550   if (!aarch64_sve_index_immediate_p (step))
4551     step = force_reg (inner, step);
4552
4553   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4554 }
4555
4556 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4557    register of mode MODE.  Use TARGET for the result if it's nonnull
4558    and convenient.
4559
4560    The two vector modes must have the same element mode.  The behavior
4561    is to duplicate architectural lane N of SRC into architectural lanes
4562    N + I * STEP of the result.  On big-endian targets, architectural
4563    lane 0 of an Advanced SIMD vector is the last element of the vector
4564    in memory layout, so for big-endian targets this operation has the
4565    effect of reversing SRC before duplicating it.  Callers need to
4566    account for this.  */
4567
4568 rtx
4569 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4570 {
4571   machine_mode src_mode = GET_MODE (src);
4572   gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4573   insn_code icode = (BYTES_BIG_ENDIAN
4574                      ? code_for_aarch64_vec_duplicate_vq_be (mode)
4575                      : code_for_aarch64_vec_duplicate_vq_le (mode));
4576
4577   unsigned int i = 0;
4578   expand_operand ops[3];
4579   create_output_operand (&ops[i++], target, mode);
4580   create_output_operand (&ops[i++], src, src_mode);
4581   if (BYTES_BIG_ENDIAN)
4582     {
4583       /* Create a PARALLEL describing the reversal of SRC.  */
4584       unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4585       rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4586                                                   nelts_per_vq - 1, -1);
4587       create_fixed_operand (&ops[i++], sel);
4588     }
4589   expand_insn (icode, i, ops);
4590   return ops[0].value;
4591 }
4592
4593 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4594    the memory image into DEST.  Return true on success.  */
4595
4596 static bool
4597 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4598 {
4599   src = force_const_mem (GET_MODE (src), src);
4600   if (!src)
4601     return false;
4602
4603   /* Make sure that the address is legitimate.  */
4604   if (!aarch64_sve_ld1rq_operand_p (src))
4605     {
4606       rtx addr = force_reg (Pmode, XEXP (src, 0));
4607       src = replace_equiv_address (src, addr);
4608     }
4609
4610   machine_mode mode = GET_MODE (dest);
4611   machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4612   rtx ptrue = aarch64_ptrue_reg (pred_mode);
4613   emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4614   return true;
4615 }
4616
4617 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4618    SVE data mode and isn't a legitimate constant.  Use TARGET for the
4619    result if convenient.
4620
4621    The returned register can have whatever mode seems most natural
4622    given the contents of SRC.  */
4623
4624 static rtx
4625 aarch64_expand_sve_const_vector (rtx target, rtx src)
4626 {
4627   machine_mode mode = GET_MODE (src);
4628   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4629   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4630   scalar_mode elt_mode = GET_MODE_INNER (mode);
4631   unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4632   unsigned int container_bits = aarch64_sve_container_bits (mode);
4633   unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4634
4635   if (nelts_per_pattern == 1
4636       && encoded_bits <= 128
4637       && container_bits != elt_bits)
4638     {
4639       /* We have a partial vector mode and a constant whose full-vector
4640          equivalent would occupy a repeating 128-bit sequence.  Build that
4641          full-vector equivalent instead, so that we have the option of
4642          using LD1RQ and Advanced SIMD operations.  */
4643       unsigned int repeat = container_bits / elt_bits;
4644       machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4645       rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4646       for (unsigned int i = 0; i < npatterns; ++i)
4647         for (unsigned int j = 0; j < repeat; ++j)
4648           builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4649       target = aarch64_target_reg (target, full_mode);
4650       return aarch64_expand_sve_const_vector (target, builder.build ());
4651     }
4652
4653   if (nelts_per_pattern == 1 && encoded_bits == 128)
4654     {
4655       /* The constant is a duplicated quadword but can't be narrowed
4656          beyond a quadword.  Get the memory image of the first quadword
4657          as a 128-bit vector and try using LD1RQ to load it from memory.
4658
4659          The effect for both endiannesses is to load memory lane N into
4660          architectural lanes N + I * STEP of the result.  On big-endian
4661          targets, the layout of the 128-bit vector in an Advanced SIMD
4662          register would be different from its layout in an SVE register,
4663          but this 128-bit vector is a memory value only.  */
4664       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4665       rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4666       if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4667         return target;
4668     }
4669
4670   if (nelts_per_pattern == 1 && encoded_bits < 128)
4671     {
4672       /* The vector is a repeating sequence of 64 bits or fewer.
4673          See if we can load them using an Advanced SIMD move and then
4674          duplicate it to fill a vector.  This is better than using a GPR
4675          move because it keeps everything in the same register file.  */
4676       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4677       rtx_vector_builder builder (vq_mode, npatterns, 1);
4678       for (unsigned int i = 0; i < npatterns; ++i)
4679         {
4680           /* We want memory lane N to go into architectural lane N,
4681              so reverse for big-endian targets.  The DUP .Q pattern
4682              has a compensating reverse built-in.  */
4683           unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4684           builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4685         }
4686       rtx vq_src = builder.build ();
4687       if (aarch64_simd_valid_immediate (vq_src, NULL))
4688         {
4689           vq_src = force_reg (vq_mode, vq_src);
4690           return aarch64_expand_sve_dupq (target, mode, vq_src);
4691         }
4692
4693       /* Get an integer representation of the repeating part of Advanced
4694          SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
4695          which for big-endian targets is lane-swapped wrt a normal
4696          Advanced SIMD vector.  This means that for both endiannesses,
4697          memory lane N of SVE vector SRC corresponds to architectural
4698          lane N of a register holding VQ_SRC.  This in turn means that
4699          memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4700          as a single 128-bit value) and thus that memory lane 0 of SRC is
4701          in the lsb of the integer.  Duplicating the integer therefore
4702          ensures that memory lane N of SRC goes into architectural lane
4703          N + I * INDEX of the SVE register.  */
4704       scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4705       rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4706       if (elt_value)
4707         {
4708           /* Pretend that we had a vector of INT_MODE to start with.  */
4709           elt_mode = int_mode;
4710           mode = aarch64_full_sve_mode (int_mode).require ();
4711
4712           /* If the integer can be moved into a general register by a
4713              single instruction, do that and duplicate the result.  */
4714           if (CONST_INT_P (elt_value)
4715               && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4716             {
4717               elt_value = force_reg (elt_mode, elt_value);
4718               return expand_vector_broadcast (mode, elt_value);
4719             }
4720         }
4721       else if (npatterns == 1)
4722         /* We're duplicating a single value, but can't do better than
4723            force it to memory and load from there.  This handles things
4724            like symbolic constants.  */
4725         elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4726
4727       if (elt_value)
4728         {
4729           /* Load the element from memory if we can, otherwise move it into
4730              a register and use a DUP.  */
4731           rtx op = force_const_mem (elt_mode, elt_value);
4732           if (!op)
4733             op = force_reg (elt_mode, elt_value);
4734           return expand_vector_broadcast (mode, op);
4735         }
4736     }
4737
4738   /* Try using INDEX.  */
4739   rtx base, step;
4740   if (const_vec_series_p (src, &base, &step))
4741     {
4742       aarch64_expand_vec_series (target, base, step);
4743       return target;
4744     }
4745
4746   /* From here on, it's better to force the whole constant to memory
4747      if we can.  */
4748   if (GET_MODE_NUNITS (mode).is_constant ())
4749     return NULL_RTX;
4750
4751   /* Expand each pattern individually.  */
4752   gcc_assert (npatterns > 1);
4753   rtx_vector_builder builder;
4754   auto_vec<rtx, 16> vectors (npatterns);
4755   for (unsigned int i = 0; i < npatterns; ++i)
4756     {
4757       builder.new_vector (mode, 1, nelts_per_pattern);
4758       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4759         builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4760       vectors.quick_push (force_reg (mode, builder.build ()));
4761     }
4762
4763   /* Use permutes to interleave the separate vectors.  */
4764   while (npatterns > 1)
4765     {
4766       npatterns /= 2;
4767       for (unsigned int i = 0; i < npatterns; ++i)
4768         {
4769           rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4770           rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4771           emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4772           vectors[i] = tmp;
4773         }
4774     }
4775   gcc_assert (vectors[0] == target);
4776   return target;
4777 }
4778
4779 /* Use WHILE to set a predicate register of mode MODE in which the first
4780    VL bits are set and the rest are clear.  Use TARGET for the register
4781    if it's nonnull and convenient.  */
4782
4783 static rtx
4784 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4785                                  unsigned int vl)
4786 {
4787   rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4788   target = aarch64_target_reg (target, mode);
4789   emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4790                         target, const0_rtx, limit));
4791   return target;
4792 }
4793
4794 static rtx
4795 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4796
4797 /* BUILDER is a constant predicate in which the index of every set bit
4798    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
4799    by inverting every element at a multiple of ELT_SIZE and EORing the
4800    result with an ELT_SIZE PTRUE.
4801
4802    Return a register that contains the constant on success, otherwise
4803    return null.  Use TARGET as the register if it is nonnull and
4804    convenient.  */
4805
4806 static rtx
4807 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4808                                    unsigned int elt_size)
4809 {
4810   /* Invert every element at a multiple of ELT_SIZE, keeping the
4811      other bits zero.  */
4812   rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4813                                   builder.nelts_per_pattern ());
4814   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4815     if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4816       inv_builder.quick_push (const1_rtx);
4817     else
4818       inv_builder.quick_push (const0_rtx);
4819   inv_builder.finalize ();
4820
4821   /* See if we can load the constant cheaply.  */
4822   rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4823   if (!inv)
4824     return NULL_RTX;
4825
4826   /* EOR the result with an ELT_SIZE PTRUE.  */
4827   rtx mask = aarch64_ptrue_all (elt_size);
4828   mask = force_reg (VNx16BImode, mask);
4829   inv = gen_lowpart (VNx16BImode, inv);
4830   target = aarch64_target_reg (target, VNx16BImode);
4831   emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4832   return target;
4833 }
4834
4835 /* BUILDER is a constant predicate in which the index of every set bit
4836    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
4837    using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
4838    register on success, otherwise return null.  Use TARGET as the register
4839    if nonnull and convenient.  */
4840
4841 static rtx
4842 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4843                                    unsigned int elt_size,
4844                                    unsigned int permute_size)
4845 {
4846   /* We're going to split the constant into two new constants A and B,
4847      with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4848      and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4849
4850      A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4851      B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4852
4853      where _ indicates elements that will be discarded by the permute.
4854
4855      First calculate the ELT_SIZEs for A and B.  */
4856   unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4857   unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4858   for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4859     if (INTVAL (builder.elt (i)) != 0)
4860       {
4861         if (i & permute_size)
4862           b_elt_size |= i - permute_size;
4863         else
4864           a_elt_size |= i;
4865       }
4866   a_elt_size &= -a_elt_size;
4867   b_elt_size &= -b_elt_size;
4868
4869   /* Now construct the vectors themselves.  */
4870   rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4871                                 builder.nelts_per_pattern ());
4872   rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4873                                 builder.nelts_per_pattern ());
4874   unsigned int nelts = builder.encoded_nelts ();
4875   for (unsigned int i = 0; i < nelts; ++i)
4876     if (i & (elt_size - 1))
4877       {
4878         a_builder.quick_push (const0_rtx);
4879         b_builder.quick_push (const0_rtx);
4880       }
4881     else if ((i & permute_size) == 0)
4882       {
4883         /* The A and B elements are significant.  */
4884         a_builder.quick_push (builder.elt (i));
4885         b_builder.quick_push (builder.elt (i + permute_size));
4886       }
4887     else
4888       {
4889         /* The A and B elements are going to be discarded, so pick whatever
4890            is likely to give a nice constant.  We are targeting element
4891            sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4892            with the aim of each being a sequence of ones followed by
4893            a sequence of zeros.  So:
4894
4895            * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4896              duplicate the last X_ELT_SIZE element, to extend the
4897              current sequence of ones or zeros.
4898
4899            * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4900              zero, so that the constant really does have X_ELT_SIZE and
4901              not a smaller size.  */
4902         if (a_elt_size > permute_size)
4903           a_builder.quick_push (const0_rtx);
4904         else
4905           a_builder.quick_push (a_builder.elt (i - a_elt_size));
4906         if (b_elt_size > permute_size)
4907           b_builder.quick_push (const0_rtx);
4908         else
4909           b_builder.quick_push (b_builder.elt (i - b_elt_size));
4910       }
4911   a_builder.finalize ();
4912   b_builder.finalize ();
4913
4914   /* Try loading A into a register.  */
4915   rtx_insn *last = get_last_insn ();
4916   rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4917   if (!a)
4918     return NULL_RTX;
4919
4920   /* Try loading B into a register.  */
4921   rtx b = a;
4922   if (a_builder != b_builder)
4923     {
4924       b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4925       if (!b)
4926         {
4927           delete_insns_since (last);
4928           return NULL_RTX;
4929         }
4930     }
4931
4932   /* Emit the TRN1 itself.  */
4933   machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4934   target = aarch64_target_reg (target, mode);
4935   emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4936                               gen_lowpart (mode, a),
4937                               gen_lowpart (mode, b)));
4938   return target;
4939 }
4940
4941 /* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
4942    constant in BUILDER into an SVE predicate register.  Return the register
4943    on success, otherwise return null.  Use TARGET for the register if
4944    nonnull and convenient.
4945
4946    ALLOW_RECURSE_P is true if we can use methods that would call this
4947    function recursively.  */
4948
4949 static rtx
4950 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4951                                  bool allow_recurse_p)
4952 {
4953   if (builder.encoded_nelts () == 1)
4954     /* A PFALSE or a PTRUE .B ALL.  */
4955     return aarch64_emit_set_immediate (target, builder);
4956
4957   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4958   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4959     {
4960       /* If we can load the constant using PTRUE, use it as-is.  */
4961       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4962       if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4963         return aarch64_emit_set_immediate (target, builder);
4964
4965       /* Otherwise use WHILE to set the first VL bits.  */
4966       return aarch64_sve_move_pred_via_while (target, mode, vl);
4967     }
4968
4969   if (!allow_recurse_p)
4970     return NULL_RTX;
4971
4972   /* Try inverting the vector in element size ELT_SIZE and then EORing
4973      the result with an ELT_SIZE PTRUE.  */
4974   if (INTVAL (builder.elt (0)) == 0)
4975     if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4976                                                      elt_size))
4977       return res;
4978
4979   /* Try using TRN1 to permute two simpler constants.  */
4980   for (unsigned int i = elt_size; i <= 8; i *= 2)
4981     if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4982                                                      elt_size, i))
4983       return res;
4984
4985   return NULL_RTX;
4986 }
4987
4988 /* Return an SVE predicate register that contains the VNx16BImode
4989    constant in BUILDER, without going through the move expanders.
4990
4991    The returned register can have whatever mode seems most natural
4992    given the contents of BUILDER.  Use TARGET for the result if
4993    convenient.  */
4994
4995 static rtx
4996 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4997 {
4998   /* Try loading the constant using pure predicate operations.  */
4999   if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5000     return res;
5001
5002   /* Try forcing the constant to memory.  */
5003   if (builder.full_nelts ().is_constant ())
5004     if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5005       {
5006         target = aarch64_target_reg (target, VNx16BImode);
5007         emit_move_insn (target, mem);
5008         return target;
5009       }
5010
5011   /* The last resort is to load the constant as an integer and then
5012      compare it against zero.  Use -1 for set bits in order to increase
5013      the changes of using SVE DUPM or an Advanced SIMD byte mask.  */
5014   rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5015                                   builder.nelts_per_pattern ());
5016   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5017     int_builder.quick_push (INTVAL (builder.elt (i))
5018                             ? constm1_rtx : const0_rtx);
5019   return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5020                                            int_builder.build ());
5021 }
5022
5023 /* Set DEST to immediate IMM.  */
5024
5025 void
5026 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5027 {
5028   machine_mode mode = GET_MODE (dest);
5029
5030   /* Check on what type of symbol it is.  */
5031   scalar_int_mode int_mode;
5032   if ((GET_CODE (imm) == SYMBOL_REF
5033        || GET_CODE (imm) == LABEL_REF
5034        || GET_CODE (imm) == CONST
5035        || GET_CODE (imm) == CONST_POLY_INT)
5036       && is_a <scalar_int_mode> (mode, &int_mode))
5037     {
5038       rtx mem;
5039       poly_int64 offset;
5040       HOST_WIDE_INT const_offset;
5041       enum aarch64_symbol_type sty;
5042
5043       /* If we have (const (plus symbol offset)), separate out the offset
5044          before we start classifying the symbol.  */
5045       rtx base = strip_offset (imm, &offset);
5046
5047       /* We must always add an offset involving VL separately, rather than
5048          folding it into the relocation.  */
5049       if (!offset.is_constant (&const_offset))
5050         {
5051           if (!TARGET_SVE)
5052             {
5053               aarch64_report_sve_required ();
5054               return;
5055             }
5056           if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5057             emit_insn (gen_rtx_SET (dest, imm));
5058           else
5059             {
5060               /* Do arithmetic on 32-bit values if the result is smaller
5061                  than that.  */
5062               if (partial_subreg_p (int_mode, SImode))
5063                 {
5064                   /* It is invalid to do symbol calculations in modes
5065                      narrower than SImode.  */
5066                   gcc_assert (base == const0_rtx);
5067                   dest = gen_lowpart (SImode, dest);
5068                   int_mode = SImode;
5069                 }
5070               if (base != const0_rtx)
5071                 {
5072                   base = aarch64_force_temporary (int_mode, dest, base);
5073                   aarch64_add_offset (int_mode, dest, base, offset,
5074                                       NULL_RTX, NULL_RTX, false);
5075                 }
5076               else
5077                 aarch64_add_offset (int_mode, dest, base, offset,
5078                                     dest, NULL_RTX, false);
5079             }
5080           return;
5081         }
5082
5083       sty = aarch64_classify_symbol (base, const_offset);
5084       switch (sty)
5085         {
5086         case SYMBOL_FORCE_TO_MEM:
5087           if (const_offset != 0
5088               && targetm.cannot_force_const_mem (int_mode, imm))
5089             {
5090               gcc_assert (can_create_pseudo_p ());
5091               base = aarch64_force_temporary (int_mode, dest, base);
5092               aarch64_add_offset (int_mode, dest, base, const_offset,
5093                                   NULL_RTX, NULL_RTX, false);
5094               return;
5095             }
5096
5097           mem = force_const_mem (ptr_mode, imm);
5098           gcc_assert (mem);
5099
5100           /* If we aren't generating PC relative literals, then
5101              we need to expand the literal pool access carefully.
5102              This is something that needs to be done in a number
5103              of places, so could well live as a separate function.  */
5104           if (!aarch64_pcrelative_literal_loads)
5105             {
5106               gcc_assert (can_create_pseudo_p ());
5107               base = gen_reg_rtx (ptr_mode);
5108               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5109               if (ptr_mode != Pmode)
5110                 base = convert_memory_address (Pmode, base);
5111               mem = gen_rtx_MEM (ptr_mode, base);
5112             }
5113
5114           if (int_mode != ptr_mode)
5115             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5116
5117           emit_insn (gen_rtx_SET (dest, mem));
5118
5119           return;
5120
5121         case SYMBOL_SMALL_TLSGD:
5122         case SYMBOL_SMALL_TLSDESC:
5123         case SYMBOL_SMALL_TLSIE:
5124         case SYMBOL_SMALL_GOT_28K:
5125         case SYMBOL_SMALL_GOT_4G:
5126         case SYMBOL_TINY_GOT:
5127         case SYMBOL_TINY_TLSIE:
5128           if (const_offset != 0)
5129             {
5130               gcc_assert(can_create_pseudo_p ());
5131               base = aarch64_force_temporary (int_mode, dest, base);
5132               aarch64_add_offset (int_mode, dest, base, const_offset,
5133                                   NULL_RTX, NULL_RTX, false);
5134               return;
5135             }
5136           /* FALLTHRU */
5137
5138         case SYMBOL_SMALL_ABSOLUTE:
5139         case SYMBOL_TINY_ABSOLUTE:
5140         case SYMBOL_TLSLE12:
5141         case SYMBOL_TLSLE24:
5142         case SYMBOL_TLSLE32:
5143         case SYMBOL_TLSLE48:
5144           aarch64_load_symref_appropriately (dest, imm, sty);
5145           return;
5146
5147         default:
5148           gcc_unreachable ();
5149         }
5150     }
5151
5152   if (!CONST_INT_P (imm))
5153     {
5154       if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5155         {
5156           /* Only the low bit of each .H, .S and .D element is defined,
5157              so we can set the upper bits to whatever we like.  If the
5158              predicate is all-true in MODE, prefer to set all the undefined
5159              bits as well, so that we can share a single .B predicate for
5160              all modes.  */
5161           if (imm == CONSTM1_RTX (mode))
5162             imm = CONSTM1_RTX (VNx16BImode);
5163
5164           /* All methods for constructing predicate modes wider than VNx16BI
5165              will set the upper bits of each element to zero.  Expose this
5166              by moving such constants as a VNx16BI, so that all bits are
5167              significant and so that constants for different modes can be
5168              shared.  The wider constant will still be available as a
5169              REG_EQUAL note.  */
5170           rtx_vector_builder builder;
5171           if (aarch64_get_sve_pred_bits (builder, imm))
5172             {
5173               rtx res = aarch64_expand_sve_const_pred (dest, builder);
5174               if (dest != res)
5175                 emit_move_insn (dest, gen_lowpart (mode, res));
5176               return;
5177             }
5178         }
5179
5180       if (GET_CODE (imm) == HIGH
5181           || aarch64_simd_valid_immediate (imm, NULL))
5182         {
5183           emit_insn (gen_rtx_SET (dest, imm));
5184           return;
5185         }
5186
5187       if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5188         if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5189           {
5190             if (dest != res)
5191               emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5192             return;
5193           }
5194
5195       rtx mem = force_const_mem (mode, imm);
5196       gcc_assert (mem);
5197       emit_move_insn (dest, mem);
5198       return;
5199     }
5200
5201   aarch64_internal_mov_immediate (dest, imm, true,
5202                                   as_a <scalar_int_mode> (mode));
5203 }
5204
5205 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
5206    that is known to contain PTRUE.  */
5207
5208 void
5209 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5210 {
5211   expand_operand ops[3];
5212   machine_mode mode = GET_MODE (dest);
5213   create_output_operand (&ops[0], dest, mode);
5214   create_input_operand (&ops[1], pred, GET_MODE(pred));
5215   create_input_operand (&ops[2], src, mode);
5216   temporary_volatile_ok v (true);
5217   expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5218 }
5219
5220 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5221    operand is in memory.  In this case we need to use the predicated LD1
5222    and ST1 instead of LDR and STR, both for correctness on big-endian
5223    targets and because LD1 and ST1 support a wider range of addressing modes.
5224    PRED_MODE is the mode of the predicate.
5225
5226    See the comment at the head of aarch64-sve.md for details about the
5227    big-endian handling.  */
5228
5229 void
5230 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5231 {
5232   machine_mode mode = GET_MODE (dest);
5233   rtx ptrue = aarch64_ptrue_reg (pred_mode);
5234   if (!register_operand (src, mode)
5235       && !register_operand (dest, mode))
5236     {
5237       rtx tmp = gen_reg_rtx (mode);
5238       if (MEM_P (src))
5239         aarch64_emit_sve_pred_move (tmp, ptrue, src);
5240       else
5241         emit_move_insn (tmp, src);
5242       src = tmp;
5243     }
5244   aarch64_emit_sve_pred_move (dest, ptrue, src);
5245 }
5246
5247 /* Called only on big-endian targets.  See whether an SVE vector move
5248    from SRC to DEST is effectively a REV[BHW] instruction, because at
5249    least one operand is a subreg of an SVE vector that has wider or
5250    narrower elements.  Return true and emit the instruction if so.
5251
5252    For example:
5253
5254      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5255
5256    represents a VIEW_CONVERT between the following vectors, viewed
5257    in memory order:
5258
5259      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
5260      R1: { [0],      [1],      [2],      [3],     ... }
5261
5262    The high part of lane X in R2 should therefore correspond to lane X*2
5263    of R1, but the register representations are:
5264
5265          msb                                      lsb
5266      R2: ...... [1].high  [1].low   [0].high  [0].low
5267      R1: ...... [3]       [2]       [1]       [0]
5268
5269    where the low part of lane X in R2 corresponds to lane X*2 in R1.
5270    We therefore need a reverse operation to swap the high and low values
5271    around.
5272
5273    This is purely an optimization.  Without it we would spill the
5274    subreg operand to the stack in one mode and reload it in the
5275    other mode, which has the same effect as the REV.  */
5276
5277 bool
5278 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5279 {
5280   gcc_assert (BYTES_BIG_ENDIAN);
5281   if (GET_CODE (dest) == SUBREG)
5282     dest = SUBREG_REG (dest);
5283   if (GET_CODE (src) == SUBREG)
5284     src = SUBREG_REG (src);
5285
5286   /* The optimization handles two single SVE REGs with different element
5287      sizes.  */
5288   if (!REG_P (dest)
5289       || !REG_P (src)
5290       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5291       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5292       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5293           == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5294     return false;
5295
5296   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
5297   rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5298   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5299                                UNSPEC_REV_SUBREG);
5300   emit_insn (gen_rtx_SET (dest, unspec));
5301   return true;
5302 }
5303
5304 /* Return a copy of X with mode MODE, without changing its other
5305    attributes.  Unlike gen_lowpart, this doesn't care whether the
5306    mode change is valid.  */
5307
5308 rtx
5309 aarch64_replace_reg_mode (rtx x, machine_mode mode)
5310 {
5311   if (GET_MODE (x) == mode)
5312     return x;
5313
5314   x = shallow_copy_rtx (x);
5315   set_mode_and_regno (x, mode, REGNO (x));
5316   return x;
5317 }
5318
5319 /* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE
5320    stored in wider integer containers.  */
5321
5322 static unsigned int
5323 aarch64_sve_rev_unspec (machine_mode mode)
5324 {
5325   switch (GET_MODE_UNIT_SIZE (mode))
5326     {
5327     case 1: return UNSPEC_REVB;
5328     case 2: return UNSPEC_REVH;
5329     case 4: return UNSPEC_REVW;
5330     }
5331   gcc_unreachable ();
5332 }
5333
5334 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5335    operands.  */
5336
5337 void
5338 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5339 {
5340   /* Decide which REV operation we need.  The mode with wider elements
5341      determines the mode of the operands and the mode with the narrower
5342      elements determines the reverse width.  */
5343   machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5344   machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
5345   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5346       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5347     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5348
5349   unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
5350   machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
5351
5352   /* Get the operands in the appropriate modes and emit the instruction.  */
5353   ptrue = gen_lowpart (pred_mode, ptrue);
5354   dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5355   src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5356   emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5357                                dest, ptrue, src));
5358 }
5359
5360 static bool
5361 aarch64_function_ok_for_sibcall (tree, tree exp)
5362 {
5363   if (crtl->abi->id () != expr_callee_abi (exp).id ())
5364     return false;
5365
5366   return true;
5367 }
5368
5369 /* Subroutine of aarch64_pass_by_reference for arguments that are not
5370    passed in SVE registers.  */
5371
5372 static bool
5373 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
5374                              const function_arg_info &arg)
5375 {
5376   HOST_WIDE_INT size;
5377   machine_mode dummymode;
5378   int nregs;
5379
5380   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
5381   if (arg.mode == BLKmode && arg.type)
5382     size = int_size_in_bytes (arg.type);
5383   else
5384     /* No frontends can create types with variable-sized modes, so we
5385        shouldn't be asked to pass or return them.  */
5386     size = GET_MODE_SIZE (arg.mode).to_constant ();
5387
5388   /* Aggregates are passed by reference based on their size.  */
5389   if (arg.aggregate_type_p ())
5390     size = int_size_in_bytes (arg.type);
5391
5392   /* Variable sized arguments are always returned by reference.  */
5393   if (size < 0)
5394     return true;
5395
5396   /* Can this be a candidate to be passed in fp/simd register(s)?  */
5397   if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
5398                                                &dummymode, &nregs, NULL,
5399                                                !pcum || pcum->silent_p))
5400     return false;
5401
5402   /* Arguments which are variable sized or larger than 2 registers are
5403      passed by reference unless they are a homogenous floating point
5404      aggregate.  */
5405   return size > 2 * UNITS_PER_WORD;
5406 }
5407
5408 /* Implement TARGET_PASS_BY_REFERENCE.  */
5409
5410 static bool
5411 aarch64_pass_by_reference (cumulative_args_t pcum_v,
5412                            const function_arg_info &arg)
5413 {
5414   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5415
5416   if (!arg.type)
5417     return aarch64_pass_by_reference_1 (pcum, arg);
5418
5419   pure_scalable_type_info pst_info;
5420   switch (pst_info.analyze (arg.type))
5421     {
5422     case pure_scalable_type_info::IS_PST:
5423       if (pcum && !pcum->silent_p && !TARGET_SVE)
5424         /* We can't gracefully recover at this point, so make this a
5425            fatal error.  */
5426         fatal_error (input_location, "arguments of type %qT require"
5427                      " the SVE ISA extension", arg.type);
5428
5429       /* Variadic SVE types are passed by reference.  Normal non-variadic
5430          arguments are too if we've run out of registers.  */
5431       return (!arg.named
5432               || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5433               || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5434
5435     case pure_scalable_type_info::DOESNT_MATTER:
5436       gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
5437       return true;
5438
5439     case pure_scalable_type_info::NO_ABI_IDENTITY:
5440     case pure_scalable_type_info::ISNT_PST:
5441       return aarch64_pass_by_reference_1 (pcum, arg);
5442     }
5443   gcc_unreachable ();
5444 }
5445
5446 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
5447 static bool
5448 aarch64_return_in_msb (const_tree valtype)
5449 {
5450   machine_mode dummy_mode;
5451   int dummy_int;
5452
5453   /* Never happens in little-endian mode.  */
5454   if (!BYTES_BIG_ENDIAN)
5455     return false;
5456
5457   /* Only composite types smaller than or equal to 16 bytes can
5458      be potentially returned in registers.  */
5459   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5460       || int_size_in_bytes (valtype) <= 0
5461       || int_size_in_bytes (valtype) > 16)
5462     return false;
5463
5464   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5465      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5466      is always passed/returned in the least significant bits of fp/simd
5467      register(s).  */
5468   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
5469                                                &dummy_mode, &dummy_int, NULL,
5470                                                false))
5471     return false;
5472
5473   /* Likewise pure scalable types for SVE vector and predicate registers.  */
5474   pure_scalable_type_info pst_info;
5475   if (pst_info.analyze_registers (valtype))
5476     return false;
5477
5478   return true;
5479 }
5480
5481 /* Implement TARGET_FUNCTION_VALUE.
5482    Define how to find the value returned by a function.  */
5483
5484 static rtx
5485 aarch64_function_value (const_tree type, const_tree func,
5486                         bool outgoing ATTRIBUTE_UNUSED)
5487 {
5488   machine_mode mode;
5489   int unsignedp;
5490
5491   mode = TYPE_MODE (type);
5492   if (INTEGRAL_TYPE_P (type))
5493     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5494
5495   pure_scalable_type_info pst_info;
5496   if (type && pst_info.analyze_registers (type))
5497     return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
5498
5499   /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5500      are returned in memory, not by value.  */
5501   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5502   bool sve_p = (vec_flags & VEC_ANY_SVE);
5503
5504   if (aarch64_return_in_msb (type))
5505     {
5506       HOST_WIDE_INT size = int_size_in_bytes (type);
5507
5508       if (size % UNITS_PER_WORD != 0)
5509         {
5510           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
5511           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
5512         }
5513     }
5514
5515   int count;
5516   machine_mode ag_mode;
5517   if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
5518                                                NULL, false))
5519     {
5520       gcc_assert (!sve_p);
5521       if (!aarch64_composite_type_p (type, mode))
5522         {
5523           gcc_assert (count == 1 && mode == ag_mode);
5524           return gen_rtx_REG (mode, V0_REGNUM);
5525         }
5526       else
5527         {
5528           int i;
5529           rtx par;
5530
5531           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5532           for (i = 0; i < count; i++)
5533             {
5534               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5535               rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5536               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5537               XVECEXP (par, 0, i) = tmp;
5538             }
5539           return par;
5540         }
5541     }
5542   else
5543     {
5544       if (sve_p)
5545         {
5546           /* Vector types can acquire a partial SVE mode using things like
5547              __attribute__((vector_size(N))), and this is potentially useful.
5548              However, the choice of mode doesn't affect the type's ABI
5549              identity, so we should treat the types as though they had
5550              the associated integer mode, just like they did before SVE
5551              was introduced.
5552
5553              We know that the vector must be 128 bits or smaller,
5554              otherwise we'd have returned it in memory instead.  */
5555           gcc_assert (type
5556                       && (aarch64_some_values_include_pst_objects_p (type)
5557                           || (vec_flags & VEC_PARTIAL)));
5558
5559           scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5560           rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5561           rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5562           return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5563         }
5564       return gen_rtx_REG (mode, R0_REGNUM);
5565     }
5566 }
5567
5568 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5569    Return true if REGNO is the number of a hard register in which the values
5570    of called function may come back.  */
5571
5572 static bool
5573 aarch64_function_value_regno_p (const unsigned int regno)
5574 {
5575   /* Maximum of 16 bytes can be returned in the general registers.  Examples
5576      of 16-byte return values are: 128-bit integers and 16-byte small
5577      structures (excluding homogeneous floating-point aggregates).  */
5578   if (regno == R0_REGNUM || regno == R1_REGNUM)
5579     return true;
5580
5581   /* Up to four fp/simd registers can return a function value, e.g. a
5582      homogeneous floating-point aggregate having four members.  */
5583   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5584     return TARGET_FLOAT;
5585
5586   return false;
5587 }
5588
5589 /* Subroutine for aarch64_return_in_memory for types that are not returned
5590    in SVE registers.  */
5591
5592 static bool
5593 aarch64_return_in_memory_1 (const_tree type)
5594 {
5595   HOST_WIDE_INT size;
5596   machine_mode ag_mode;
5597   int count;
5598
5599   if (!AGGREGATE_TYPE_P (type)
5600       && TREE_CODE (type) != COMPLEX_TYPE
5601       && TREE_CODE (type) != VECTOR_TYPE)
5602     /* Simple scalar types always returned in registers.  */
5603     return false;
5604
5605   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5606                                                &ag_mode, &count, NULL, false))
5607     return false;
5608
5609   /* Types larger than 2 registers returned in memory.  */
5610   size = int_size_in_bytes (type);
5611   return (size < 0 || size > 2 * UNITS_PER_WORD);
5612 }
5613
5614 /* Implement TARGET_RETURN_IN_MEMORY.
5615
5616    If the type T of the result of a function is such that
5617      void func (T arg)
5618    would require that arg be passed as a value in a register (or set of
5619    registers) according to the parameter passing rules, then the result
5620    is returned in the same registers as would be used for such an
5621    argument.  */
5622
5623 static bool
5624 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5625 {
5626   pure_scalable_type_info pst_info;
5627   switch (pst_info.analyze (type))
5628     {
5629     case pure_scalable_type_info::IS_PST:
5630       return (pst_info.num_zr () > NUM_FP_ARG_REGS
5631               || pst_info.num_pr () > NUM_PR_ARG_REGS);
5632
5633     case pure_scalable_type_info::DOESNT_MATTER:
5634       gcc_assert (aarch64_return_in_memory_1 (type));
5635       return true;
5636
5637     case pure_scalable_type_info::NO_ABI_IDENTITY:
5638     case pure_scalable_type_info::ISNT_PST:
5639       return aarch64_return_in_memory_1 (type);
5640     }
5641   gcc_unreachable ();
5642 }
5643
5644 static bool
5645 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5646                                const_tree type, int *nregs)
5647 {
5648   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5649   return aarch64_vfp_is_call_or_return_candidate (mode, type,
5650                                                   &pcum->aapcs_vfp_rmode,
5651                                                   nregs, NULL, pcum->silent_p);
5652 }
5653
5654 /* Given MODE and TYPE of a function argument, return the alignment in
5655    bits.  The idea is to suppress any stronger alignment requested by
5656    the user and opt for the natural alignment (specified in AAPCS64 \S
5657    4.1).  ABI_BREAK is set to true if the alignment was incorrectly
5658    calculated in versions of GCC prior to GCC-9.  This is a helper
5659    function for local use only.  */
5660
5661 static unsigned int
5662 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5663                                 bool *abi_break)
5664 {
5665   *abi_break = false;
5666   if (!type)
5667     return GET_MODE_ALIGNMENT (mode);
5668
5669   if (integer_zerop (TYPE_SIZE (type)))
5670     return 0;
5671
5672   gcc_assert (TYPE_MODE (type) == mode);
5673
5674   if (!AGGREGATE_TYPE_P (type))
5675     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5676
5677   if (TREE_CODE (type) == ARRAY_TYPE)
5678     return TYPE_ALIGN (TREE_TYPE (type));
5679
5680   unsigned int alignment = 0;
5681   unsigned int bitfield_alignment = 0;
5682   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5683     if (TREE_CODE (field) == FIELD_DECL)
5684       {
5685         /* Note that we explicitly consider zero-sized fields here,
5686            even though they don't map to AAPCS64 machine types.
5687            For example, in:
5688
5689                struct __attribute__((aligned(8))) empty {};
5690
5691                struct s {
5692                  [[no_unique_address]] empty e;
5693                  int x;
5694                };
5695
5696            "s" contains only one Fundamental Data Type (the int field)
5697            but gains 8-byte alignment and size thanks to "e".  */
5698         alignment = std::max (alignment, DECL_ALIGN (field));
5699         if (DECL_BIT_FIELD_TYPE (field))
5700           bitfield_alignment
5701             = std::max (bitfield_alignment,
5702                         TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5703       }
5704
5705   if (bitfield_alignment > alignment)
5706     {
5707       *abi_break = true;
5708       return bitfield_alignment;
5709     }
5710
5711   return alignment;
5712 }
5713
5714 /* Layout a function argument according to the AAPCS64 rules.  The rule
5715    numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
5716    mode that was originally given to us by the target hook, whereas the
5717    mode in ARG might be the result of replacing partial SVE modes with
5718    the equivalent integer mode.  */
5719
5720 static void
5721 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5722 {
5723   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5724   tree type = arg.type;
5725   machine_mode mode = arg.mode;
5726   int ncrn, nvrn, nregs;
5727   bool allocate_ncrn, allocate_nvrn;
5728   HOST_WIDE_INT size;
5729   bool abi_break;
5730
5731   /* We need to do this once per argument.  */
5732   if (pcum->aapcs_arg_processed)
5733     return;
5734
5735   pcum->aapcs_arg_processed = true;
5736
5737   pure_scalable_type_info pst_info;
5738   if (type && pst_info.analyze_registers (type))
5739     {
5740       /* The PCS says that it is invalid to pass an SVE value to an
5741          unprototyped function.  There is no ABI-defined location we
5742          can return in this case, so we have no real choice but to raise
5743          an error immediately, even though this is only a query function.  */
5744       if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5745         {
5746           gcc_assert (!pcum->silent_p);
5747           error ("SVE type %qT cannot be passed to an unprototyped function",
5748                  arg.type);
5749           /* Avoid repeating the message, and avoid tripping the assert
5750              below.  */
5751           pcum->pcs_variant = ARM_PCS_SVE;
5752         }
5753
5754       /* We would have converted the argument into pass-by-reference
5755          form if it didn't fit in registers.  */
5756       pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5757       pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
5758       gcc_assert (arg.named
5759                   && pcum->pcs_variant == ARM_PCS_SVE
5760                   && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5761                   && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5762       pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5763                                           P0_REGNUM + pcum->aapcs_nprn);
5764       return;
5765     }
5766
5767   /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5768      are passed by reference, not by value.  */
5769   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5770   bool sve_p = (vec_flags & VEC_ANY_SVE);
5771   if (sve_p)
5772     /* Vector types can acquire a partial SVE mode using things like
5773        __attribute__((vector_size(N))), and this is potentially useful.
5774        However, the choice of mode doesn't affect the type's ABI
5775        identity, so we should treat the types as though they had
5776        the associated integer mode, just like they did before SVE
5777        was introduced.
5778
5779        We know that the vector must be 128 bits or smaller,
5780        otherwise we'd have passed it in memory instead.  */
5781     gcc_assert (type
5782                 && (aarch64_some_values_include_pst_objects_p (type)
5783                     || (vec_flags & VEC_PARTIAL)));
5784
5785   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
5786   if (type)
5787     size = int_size_in_bytes (type);
5788   else
5789     /* No frontends can create types with variable-sized modes, so we
5790        shouldn't be asked to pass or return them.  */
5791     size = GET_MODE_SIZE (mode).to_constant ();
5792   size = ROUND_UP (size, UNITS_PER_WORD);
5793
5794   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5795   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5796                                                  mode,
5797                                                  type,
5798                                                  &nregs);
5799   gcc_assert (!sve_p || !allocate_nvrn);
5800
5801   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5802      The following code thus handles passing by SIMD/FP registers first.  */
5803
5804   nvrn = pcum->aapcs_nvrn;
5805
5806   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
5807      and homogenous short-vector aggregates (HVA).  */
5808   if (allocate_nvrn)
5809     {
5810       if (!pcum->silent_p && !TARGET_FLOAT)
5811         aarch64_err_no_fpadvsimd (mode);
5812
5813       if (nvrn + nregs <= NUM_FP_ARG_REGS)
5814         {
5815           pcum->aapcs_nextnvrn = nvrn + nregs;
5816           if (!aarch64_composite_type_p (type, mode))
5817             {
5818               gcc_assert (nregs == 1);
5819               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5820             }
5821           else
5822             {
5823               rtx par;
5824               int i;
5825               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5826               for (i = 0; i < nregs; i++)
5827                 {
5828                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5829                                          V0_REGNUM + nvrn + i);
5830                   rtx offset = gen_int_mode
5831                     (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5832                   tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5833                   XVECEXP (par, 0, i) = tmp;
5834                 }
5835               pcum->aapcs_reg = par;
5836             }
5837           return;
5838         }
5839       else
5840         {
5841           /* C.3 NSRN is set to 8.  */
5842           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5843           goto on_stack;
5844         }
5845     }
5846
5847   ncrn = pcum->aapcs_ncrn;
5848   nregs = size / UNITS_PER_WORD;
5849
5850   /* C6 - C9.  though the sign and zero extension semantics are
5851      handled elsewhere.  This is the case where the argument fits
5852      entirely general registers.  */
5853   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5854     {
5855       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5856
5857       /* C.8 if the argument has an alignment of 16 then the NGRN is
5858          rounded up to the next even number.  */
5859       if (nregs == 2
5860           && ncrn % 2
5861           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5862              comparison is there because for > 16 * BITS_PER_UNIT
5863              alignment nregs should be > 2 and therefore it should be
5864              passed by reference rather than value.  */
5865           && (aarch64_function_arg_alignment (mode, type, &abi_break)
5866               == 16 * BITS_PER_UNIT))
5867         {
5868           if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5869             inform (input_location, "parameter passing for argument of type "
5870                     "%qT changed in GCC 9.1", type);
5871           ++ncrn;
5872           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5873         }
5874
5875       /* If an argument with an SVE mode needs to be shifted up to the
5876          high part of the register, treat it as though it had an integer mode.
5877          Using the normal (parallel [...]) would suppress the shifting.  */
5878       if (sve_p
5879           && BYTES_BIG_ENDIAN
5880           && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
5881           && aarch64_pad_reg_upward (mode, type, false))
5882         {
5883           mode = int_mode_for_mode (mode).require ();
5884           sve_p = false;
5885         }
5886
5887       /* NREGS can be 0 when e.g. an empty structure is to be passed.
5888          A reg is still generated for it, but the caller should be smart
5889          enough not to use it.  */
5890       if (nregs == 0
5891           || (nregs == 1 && !sve_p)
5892           || GET_MODE_CLASS (mode) == MODE_INT)
5893         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5894       else
5895         {
5896           rtx par;
5897           int i;
5898
5899           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5900           for (i = 0; i < nregs; i++)
5901             {
5902               scalar_int_mode reg_mode = word_mode;
5903               if (nregs == 1)
5904                 reg_mode = int_mode_for_mode (mode).require ();
5905               rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
5906               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5907                                        GEN_INT (i * UNITS_PER_WORD));
5908               XVECEXP (par, 0, i) = tmp;
5909             }
5910           pcum->aapcs_reg = par;
5911         }
5912
5913       pcum->aapcs_nextncrn = ncrn + nregs;
5914       return;
5915     }
5916
5917   /* C.11  */
5918   pcum->aapcs_nextncrn = NUM_ARG_REGS;
5919
5920   /* The argument is passed on stack; record the needed number of words for
5921      this argument and align the total size if necessary.  */
5922 on_stack:
5923   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5924
5925   if (aarch64_function_arg_alignment (mode, type, &abi_break)
5926       == 16 * BITS_PER_UNIT)
5927     {
5928       int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5929       if (pcum->aapcs_stack_size != new_size)
5930         {
5931           if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5932             inform (input_location, "parameter passing for argument of type "
5933                     "%qT changed in GCC 9.1", type);
5934           pcum->aapcs_stack_size = new_size;
5935         }
5936     }
5937   return;
5938 }
5939
5940 /* Implement TARGET_FUNCTION_ARG.  */
5941
5942 static rtx
5943 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5944 {
5945   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5946   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5947               || pcum->pcs_variant == ARM_PCS_SIMD
5948               || pcum->pcs_variant == ARM_PCS_SVE);
5949
5950   if (arg.end_marker_p ())
5951     return gen_int_mode (pcum->pcs_variant, DImode);
5952
5953   aarch64_layout_arg (pcum_v, arg);
5954   return pcum->aapcs_reg;
5955 }
5956
5957 void
5958 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5959                               const_tree fntype,
5960                               rtx libname ATTRIBUTE_UNUSED,
5961                               const_tree fndecl ATTRIBUTE_UNUSED,
5962                               unsigned n_named ATTRIBUTE_UNUSED,
5963                               bool silent_p)
5964 {
5965   pcum->aapcs_ncrn = 0;
5966   pcum->aapcs_nvrn = 0;
5967   pcum->aapcs_nprn = 0;
5968   pcum->aapcs_nextncrn = 0;
5969   pcum->aapcs_nextnvrn = 0;
5970   pcum->aapcs_nextnprn = 0;
5971   if (fntype)
5972     pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5973   else
5974     pcum->pcs_variant = ARM_PCS_AAPCS64;
5975   pcum->aapcs_reg = NULL_RTX;
5976   pcum->aapcs_arg_processed = false;
5977   pcum->aapcs_stack_words = 0;
5978   pcum->aapcs_stack_size = 0;
5979   pcum->silent_p = silent_p;
5980
5981   if (!silent_p
5982       && !TARGET_FLOAT
5983       && fndecl && TREE_PUBLIC (fndecl)
5984       && fntype && fntype != error_mark_node)
5985     {
5986       const_tree type = TREE_TYPE (fntype);
5987       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
5988       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
5989       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5990                                                    &mode, &nregs, NULL, false))
5991         aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5992     }
5993
5994   if (!silent_p
5995       && !TARGET_SVE
5996       && pcum->pcs_variant == ARM_PCS_SVE)
5997     {
5998       /* We can't gracefully recover at this point, so make this a
5999          fatal error.  */
6000       if (fndecl)
6001         fatal_error (input_location, "%qE requires the SVE ISA extension",
6002                      fndecl);
6003       else
6004         fatal_error (input_location, "calls to functions of type %qT require"
6005                      " the SVE ISA extension", fntype);
6006     }
6007 }
6008
6009 static void
6010 aarch64_function_arg_advance (cumulative_args_t pcum_v,
6011                               const function_arg_info &arg)
6012 {
6013   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6014   if (pcum->pcs_variant == ARM_PCS_AAPCS64
6015       || pcum->pcs_variant == ARM_PCS_SIMD
6016       || pcum->pcs_variant == ARM_PCS_SVE)
6017     {
6018       aarch64_layout_arg (pcum_v, arg);
6019       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6020                   != (pcum->aapcs_stack_words != 0));
6021       pcum->aapcs_arg_processed = false;
6022       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6023       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
6024       pcum->aapcs_nprn = pcum->aapcs_nextnprn;
6025       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6026       pcum->aapcs_stack_words = 0;
6027       pcum->aapcs_reg = NULL_RTX;
6028     }
6029 }
6030
6031 bool
6032 aarch64_function_arg_regno_p (unsigned regno)
6033 {
6034   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6035           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6036 }
6037
6038 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
6039    PARM_BOUNDARY bits of alignment, but will be given anything up
6040    to STACK_BOUNDARY bits if the type requires it.  This makes sure
6041    that both before and after the layout of each argument, the Next
6042    Stacked Argument Address (NSAA) will have a minimum alignment of
6043    8 bytes.  */
6044
6045 static unsigned int
6046 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
6047 {
6048   bool abi_break;
6049   unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6050                                                            &abi_break);
6051   if (abi_break & warn_psabi)
6052     inform (input_location, "parameter passing for argument of type "
6053             "%qT changed in GCC 9.1", type);
6054
6055   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
6056 }
6057
6058 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
6059
6060 static fixed_size_mode
6061 aarch64_get_reg_raw_mode (int regno)
6062 {
6063   if (TARGET_SVE && FP_REGNUM_P (regno))
6064     /* Don't use the SVE part of the register for __builtin_apply and
6065        __builtin_return.  The SVE registers aren't used by the normal PCS,
6066        so using them there would be a waste of time.  The PCS extensions
6067        for SVE types are fundamentally incompatible with the
6068        __builtin_return/__builtin_apply interface.  */
6069     return as_a <fixed_size_mode> (V16QImode);
6070   return default_get_reg_raw_mode (regno);
6071 }
6072
6073 /* Implement TARGET_FUNCTION_ARG_PADDING.
6074
6075    Small aggregate types are placed in the lowest memory address.
6076
6077    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
6078
6079 static pad_direction
6080 aarch64_function_arg_padding (machine_mode mode, const_tree type)
6081 {
6082   /* On little-endian targets, the least significant byte of every stack
6083      argument is passed at the lowest byte address of the stack slot.  */
6084   if (!BYTES_BIG_ENDIAN)
6085     return PAD_UPWARD;
6086
6087   /* Otherwise, integral, floating-point and pointer types are padded downward:
6088      the least significant byte of a stack argument is passed at the highest
6089      byte address of the stack slot.  */
6090   if (type
6091       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6092          || POINTER_TYPE_P (type))
6093       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
6094     return PAD_DOWNWARD;
6095
6096   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
6097   return PAD_UPWARD;
6098 }
6099
6100 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6101
6102    It specifies padding for the last (may also be the only)
6103    element of a block move between registers and memory.  If
6104    assuming the block is in the memory, padding upward means that
6105    the last element is padded after its highest significant byte,
6106    while in downward padding, the last element is padded at the
6107    its least significant byte side.
6108
6109    Small aggregates and small complex types are always padded
6110    upwards.
6111
6112    We don't need to worry about homogeneous floating-point or
6113    short-vector aggregates; their move is not affected by the
6114    padding direction determined here.  Regardless of endianness,
6115    each element of such an aggregate is put in the least
6116    significant bits of a fp/simd register.
6117
6118    Return !BYTES_BIG_ENDIAN if the least significant byte of the
6119    register has useful data, and return the opposite if the most
6120    significant byte does.  */
6121
6122 bool
6123 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6124                      bool first ATTRIBUTE_UNUSED)
6125 {
6126
6127   /* Aside from pure scalable types, small composite types are always
6128      padded upward.  */
6129   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6130     {
6131       HOST_WIDE_INT size;
6132       if (type)
6133         size = int_size_in_bytes (type);
6134       else
6135         /* No frontends can create types with variable-sized modes, so we
6136            shouldn't be asked to pass or return them.  */
6137         size = GET_MODE_SIZE (mode).to_constant ();
6138       if (size < 2 * UNITS_PER_WORD)
6139         {
6140           pure_scalable_type_info pst_info;
6141           if (pst_info.analyze_registers (type))
6142             return false;
6143           return true;
6144         }
6145     }
6146
6147   /* Otherwise, use the default padding.  */
6148   return !BYTES_BIG_ENDIAN;
6149 }
6150
6151 static scalar_int_mode
6152 aarch64_libgcc_cmp_return_mode (void)
6153 {
6154   return SImode;
6155 }
6156
6157 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6158
6159 /* We use the 12-bit shifted immediate arithmetic instructions so values
6160    must be multiple of (1 << 12), i.e. 4096.  */
6161 #define ARITH_FACTOR 4096
6162
6163 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6164 #error Cannot use simple address calculation for stack probing
6165 #endif
6166
6167 /* The pair of scratch registers used for stack probing.  */
6168 #define PROBE_STACK_FIRST_REG  R9_REGNUM
6169 #define PROBE_STACK_SECOND_REG R10_REGNUM
6170
6171 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6172    inclusive.  These are offsets from the current stack pointer.  */
6173
6174 static void
6175 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6176 {
6177   HOST_WIDE_INT size;
6178   if (!poly_size.is_constant (&size))
6179     {
6180       sorry ("stack probes for SVE frames");
6181       return;
6182     }
6183
6184   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
6185
6186   /* See the same assertion on PROBE_INTERVAL above.  */
6187   gcc_assert ((first % ARITH_FACTOR) == 0);
6188
6189   /* See if we have a constant small number of probes to generate.  If so,
6190      that's the easy case.  */
6191   if (size <= PROBE_INTERVAL)
6192     {
6193       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6194
6195       emit_set_insn (reg1,
6196                      plus_constant (Pmode,
6197                                     stack_pointer_rtx, -(first + base)));
6198       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6199     }
6200
6201   /* The run-time loop is made up of 8 insns in the generic case while the
6202      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
6203   else if (size <= 4 * PROBE_INTERVAL)
6204     {
6205       HOST_WIDE_INT i, rem;
6206
6207       emit_set_insn (reg1,
6208                      plus_constant (Pmode,
6209                                     stack_pointer_rtx,
6210                                     -(first + PROBE_INTERVAL)));
6211       emit_stack_probe (reg1);
6212
6213       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6214          it exceeds SIZE.  If only two probes are needed, this will not
6215          generate any code.  Then probe at FIRST + SIZE.  */
6216       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6217         {
6218           emit_set_insn (reg1,
6219                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6220           emit_stack_probe (reg1);
6221         }
6222
6223       rem = size - (i - PROBE_INTERVAL);
6224       if (rem > 256)
6225         {
6226           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6227
6228           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6229           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6230         }
6231       else
6232         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6233     }
6234
6235   /* Otherwise, do the same as above, but in a loop.  Note that we must be
6236      extra careful with variables wrapping around because we might be at
6237      the very top (or the very bottom) of the address space and we have
6238      to be able to handle this case properly; in particular, we use an
6239      equality test for the loop condition.  */
6240   else
6241     {
6242       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
6243
6244       /* Step 1: round SIZE to the previous multiple of the interval.  */
6245
6246       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6247
6248
6249       /* Step 2: compute initial and final value of the loop counter.  */
6250
6251       /* TEST_ADDR = SP + FIRST.  */
6252       emit_set_insn (reg1,
6253                      plus_constant (Pmode, stack_pointer_rtx, -first));
6254
6255       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
6256       HOST_WIDE_INT adjustment = - (first + rounded_size);
6257       if (! aarch64_uimm12_shift (adjustment))
6258         {
6259           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6260                                           true, Pmode);
6261           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6262         }
6263       else
6264         emit_set_insn (reg2,
6265                        plus_constant (Pmode, stack_pointer_rtx, adjustment));
6266
6267       /* Step 3: the loop
6268
6269          do
6270            {
6271              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6272              probe at TEST_ADDR
6273            }
6274          while (TEST_ADDR != LAST_ADDR)
6275
6276          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6277          until it is equal to ROUNDED_SIZE.  */
6278
6279       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6280
6281
6282       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6283          that SIZE is equal to ROUNDED_SIZE.  */
6284
6285       if (size != rounded_size)
6286         {
6287           HOST_WIDE_INT rem = size - rounded_size;
6288
6289           if (rem > 256)
6290             {
6291               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6292
6293               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6294               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6295             }
6296           else
6297             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6298         }
6299     }
6300
6301   /* Make sure nothing is scheduled before we are done.  */
6302   emit_insn (gen_blockage ());
6303 }
6304
6305 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
6306    absolute addresses.  */
6307
6308 const char *
6309 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6310 {
6311   static int labelno = 0;
6312   char loop_lab[32];
6313   rtx xops[2];
6314
6315   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6316
6317   /* Loop.  */
6318   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6319
6320   HOST_WIDE_INT stack_clash_probe_interval
6321     = 1 << param_stack_clash_protection_guard_size;
6322
6323   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
6324   xops[0] = reg1;
6325   HOST_WIDE_INT interval;
6326   if (flag_stack_clash_protection)
6327     interval = stack_clash_probe_interval;
6328   else
6329     interval = PROBE_INTERVAL;
6330
6331   gcc_assert (aarch64_uimm12_shift (interval));
6332   xops[1] = GEN_INT (interval);
6333
6334   output_asm_insn ("sub\t%0, %0, %1", xops);
6335
6336   /* If doing stack clash protection then we probe up by the ABI specified
6337      amount.  We do this because we're dropping full pages at a time in the
6338      loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
6339   if (flag_stack_clash_protection)
6340     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6341   else
6342     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6343
6344   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
6345      by this amount for each iteration.  */
6346   output_asm_insn ("str\txzr, [%0, %1]", xops);
6347
6348   /* Test if TEST_ADDR == LAST_ADDR.  */
6349   xops[1] = reg2;
6350   output_asm_insn ("cmp\t%0, %1", xops);
6351
6352   /* Branch.  */
6353   fputs ("\tb.ne\t", asm_out_file);
6354   assemble_name_raw (asm_out_file, loop_lab);
6355   fputc ('\n', asm_out_file);
6356
6357   return "";
6358 }
6359
6360 /* Emit the probe loop for doing stack clash probes and stack adjustments for
6361    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6362    of GUARD_SIZE.  When a probe is emitted it is done at most
6363    MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6364    at most MIN_PROBE_THRESHOLD.  By the end of this function
6365    BASE = BASE - ADJUSTMENT.  */
6366
6367 const char *
6368 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6369                                       rtx min_probe_threshold, rtx guard_size)
6370 {
6371   /* This function is not allowed to use any instruction generation function
6372      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
6373      so instead emit the code you want using output_asm_insn.  */
6374   gcc_assert (flag_stack_clash_protection);
6375   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6376   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6377
6378   /* The minimum required allocation before the residual requires probing.  */
6379   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6380
6381   /* Clamp the value down to the nearest value that can be used with a cmp.  */
6382   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6383   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6384
6385   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6386   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6387
6388   static int labelno = 0;
6389   char loop_start_lab[32];
6390   char loop_end_lab[32];
6391   rtx xops[2];
6392
6393   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6394   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6395
6396   /* Emit loop start label.  */
6397   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6398
6399   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
6400   xops[0] = adjustment;
6401   xops[1] = probe_offset_value_rtx;
6402   output_asm_insn ("cmp\t%0, %1", xops);
6403
6404   /* Branch to end if not enough adjustment to probe.  */
6405   fputs ("\tb.lt\t", asm_out_file);
6406   assemble_name_raw (asm_out_file, loop_end_lab);
6407   fputc ('\n', asm_out_file);
6408
6409   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
6410   xops[0] = base;
6411   xops[1] = probe_offset_value_rtx;
6412   output_asm_insn ("sub\t%0, %0, %1", xops);
6413
6414   /* Probe at BASE.  */
6415   xops[1] = const0_rtx;
6416   output_asm_insn ("str\txzr, [%0, %1]", xops);
6417
6418   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
6419   xops[0] = adjustment;
6420   xops[1] = probe_offset_value_rtx;
6421   output_asm_insn ("sub\t%0, %0, %1", xops);
6422
6423   /* Branch to start if still more bytes to allocate.  */
6424   fputs ("\tb\t", asm_out_file);
6425   assemble_name_raw (asm_out_file, loop_start_lab);
6426   fputc ('\n', asm_out_file);
6427
6428   /* No probe leave.  */
6429   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6430
6431   /* BASE = BASE - ADJUSTMENT.  */
6432   xops[0] = base;
6433   xops[1] = adjustment;
6434   output_asm_insn ("sub\t%0, %0, %1", xops);
6435   return "";
6436 }
6437
6438 /* Determine whether a frame chain needs to be generated.  */
6439 static bool
6440 aarch64_needs_frame_chain (void)
6441 {
6442   /* Force a frame chain for EH returns so the return address is at FP+8.  */
6443   if (frame_pointer_needed || crtl->calls_eh_return)
6444     return true;
6445
6446   /* A leaf function cannot have calls or write LR.  */
6447   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6448
6449   /* Don't use a frame chain in leaf functions if leaf frame pointers
6450      are disabled.  */
6451   if (flag_omit_leaf_frame_pointer && is_leaf)
6452     return false;
6453
6454   return aarch64_use_frame_pointer;
6455 }
6456
6457 /* Mark the registers that need to be saved by the callee and calculate
6458    the size of the callee-saved registers area and frame record (both FP
6459    and LR may be omitted).  */
6460 static void
6461 aarch64_layout_frame (void)
6462 {
6463   poly_int64 offset = 0;
6464   int regno, last_fp_reg = INVALID_REGNUM;
6465   machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6466   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6467   bool frame_related_fp_reg_p = false;
6468   aarch64_frame &frame = cfun->machine->frame;
6469
6470   frame.emit_frame_chain = aarch64_needs_frame_chain ();
6471
6472   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
6473      the mid-end is doing.  */
6474   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6475
6476 #define SLOT_NOT_REQUIRED (-2)
6477 #define SLOT_REQUIRED     (-1)
6478
6479   frame.wb_candidate1 = INVALID_REGNUM;
6480   frame.wb_candidate2 = INVALID_REGNUM;
6481   frame.spare_pred_reg = INVALID_REGNUM;
6482
6483   /* First mark all the registers that really need to be saved...  */
6484   for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6485     frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
6486
6487   /* ... that includes the eh data registers (if needed)...  */
6488   if (crtl->calls_eh_return)
6489     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
6490       frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
6491
6492   /* ... and any callee saved register that dataflow says is live.  */
6493   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6494     if (df_regs_ever_live_p (regno)
6495         && !fixed_regs[regno]
6496         && (regno == R30_REGNUM
6497             || !crtl->abi->clobbers_full_reg_p (regno)))
6498       frame.reg_offset[regno] = SLOT_REQUIRED;
6499
6500   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6501     if (df_regs_ever_live_p (regno)
6502         && !fixed_regs[regno]
6503         && !crtl->abi->clobbers_full_reg_p (regno))
6504       {
6505         frame.reg_offset[regno] = SLOT_REQUIRED;
6506         last_fp_reg = regno;
6507         if (aarch64_emit_cfi_for_reg_p (regno))
6508           frame_related_fp_reg_p = true;
6509       }
6510
6511   /* Big-endian SVE frames need a spare predicate register in order
6512      to save Z8-Z15.  Decide which register they should use.  Prefer
6513      an unused argument register if possible, so that we don't force P4
6514      to be saved unnecessarily.  */
6515   if (frame_related_fp_reg_p
6516       && crtl->abi->id () == ARM_PCS_SVE
6517       && BYTES_BIG_ENDIAN)
6518     {
6519       bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6520       bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6521       for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6522         if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6523           break;
6524       gcc_assert (regno <= P7_REGNUM);
6525       frame.spare_pred_reg = regno;
6526       df_set_regs_ever_live (regno, true);
6527     }
6528
6529   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6530     if (df_regs_ever_live_p (regno)
6531         && !fixed_regs[regno]
6532         && !crtl->abi->clobbers_full_reg_p (regno))
6533       frame.reg_offset[regno] = SLOT_REQUIRED;
6534
6535   /* With stack-clash, LR must be saved in non-leaf functions.  The saving of
6536      LR counts as an implicit probe which allows us to maintain the invariant
6537      described in the comment at expand_prologue.  */
6538   gcc_assert (crtl->is_leaf
6539               || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6540
6541   /* Now assign stack slots for the registers.  Start with the predicate
6542      registers, since predicate LDR and STR have a relatively small
6543      offset range.  These saves happen below the hard frame pointer.  */
6544   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6545     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6546       {
6547         frame.reg_offset[regno] = offset;
6548         offset += BYTES_PER_SVE_PRED;
6549       }
6550
6551   if (maybe_ne (offset, 0))
6552     {
6553       /* If we have any vector registers to save above the predicate registers,
6554          the offset of the vector register save slots need to be a multiple
6555          of the vector size.  This lets us use the immediate forms of LDR/STR
6556          (or LD1/ST1 for big-endian).
6557
6558          A vector register is 8 times the size of a predicate register,
6559          and we need to save a maximum of 12 predicate registers, so the
6560          first vector register will be at either #1, MUL VL or #2, MUL VL.
6561
6562          If we don't have any vector registers to save, and we know how
6563          big the predicate save area is, we can just round it up to the
6564          next 16-byte boundary.  */
6565       if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6566         offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6567       else
6568         {
6569           if (known_le (offset, vector_save_size))
6570             offset = vector_save_size;
6571           else if (known_le (offset, vector_save_size * 2))
6572             offset = vector_save_size * 2;
6573           else
6574             gcc_unreachable ();
6575         }
6576     }
6577
6578   /* If we need to save any SVE vector registers, add them next.  */
6579   if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6580     for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6581       if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6582         {
6583           frame.reg_offset[regno] = offset;
6584           offset += vector_save_size;
6585         }
6586
6587   /* OFFSET is now the offset of the hard frame pointer from the bottom
6588      of the callee save area.  */
6589   bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6590   frame.below_hard_fp_saved_regs_size = offset;
6591   if (frame.emit_frame_chain)
6592     {
6593       /* FP and LR are placed in the linkage record.  */
6594       frame.reg_offset[R29_REGNUM] = offset;
6595       frame.wb_candidate1 = R29_REGNUM;
6596       frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6597       frame.wb_candidate2 = R30_REGNUM;
6598       offset += 2 * UNITS_PER_WORD;
6599     }
6600
6601   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6602     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6603       {
6604         frame.reg_offset[regno] = offset;
6605         if (frame.wb_candidate1 == INVALID_REGNUM)
6606           frame.wb_candidate1 = regno;
6607         else if (frame.wb_candidate2 == INVALID_REGNUM)
6608           frame.wb_candidate2 = regno;
6609         offset += UNITS_PER_WORD;
6610       }
6611
6612   poly_int64 max_int_offset = offset;
6613   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6614   bool has_align_gap = maybe_ne (offset, max_int_offset);
6615
6616   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6617     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6618       {
6619         /* If there is an alignment gap between integer and fp callee-saves,
6620            allocate the last fp register to it if possible.  */
6621         if (regno == last_fp_reg
6622             && has_align_gap
6623             && known_eq (vector_save_size, 8)
6624             && multiple_p (offset, 16))
6625           {
6626             frame.reg_offset[regno] = max_int_offset;
6627             break;
6628           }
6629
6630         frame.reg_offset[regno] = offset;
6631         if (frame.wb_candidate1 == INVALID_REGNUM)
6632           frame.wb_candidate1 = regno;
6633         else if (frame.wb_candidate2 == INVALID_REGNUM
6634                  && frame.wb_candidate1 >= V0_REGNUM)
6635           frame.wb_candidate2 = regno;
6636         offset += vector_save_size;
6637       }
6638
6639   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6640
6641   frame.saved_regs_size = offset;
6642
6643   poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6644
6645   poly_int64 above_outgoing_args
6646     = aligned_upper_bound (varargs_and_saved_regs_size
6647                            + get_frame_size (),
6648                            STACK_BOUNDARY / BITS_PER_UNIT);
6649
6650   frame.hard_fp_offset
6651     = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6652
6653   /* Both these values are already aligned.  */
6654   gcc_assert (multiple_p (crtl->outgoing_args_size,
6655                           STACK_BOUNDARY / BITS_PER_UNIT));
6656   frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6657
6658   frame.locals_offset = frame.saved_varargs_size;
6659
6660   frame.initial_adjust = 0;
6661   frame.final_adjust = 0;
6662   frame.callee_adjust = 0;
6663   frame.sve_callee_adjust = 0;
6664   frame.callee_offset = 0;
6665
6666   HOST_WIDE_INT max_push_offset = 0;
6667   if (frame.wb_candidate2 != INVALID_REGNUM)
6668     max_push_offset = 512;
6669   else if (frame.wb_candidate1 != INVALID_REGNUM)
6670     max_push_offset = 256;
6671
6672   HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6673   HOST_WIDE_INT const_saved_regs_size;
6674   if (frame.frame_size.is_constant (&const_size)
6675       && const_size < max_push_offset
6676       && known_eq (frame.hard_fp_offset, const_size))
6677     {
6678       /* Simple, small frame with no outgoing arguments:
6679
6680          stp reg1, reg2, [sp, -frame_size]!
6681          stp reg3, reg4, [sp, 16]  */
6682       frame.callee_adjust = const_size;
6683     }
6684   else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6685            && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6686            && const_outgoing_args_size + const_saved_regs_size < 512
6687            /* We could handle this case even with outgoing args, provided
6688               that the number of args left us with valid offsets for all
6689               predicate and vector save slots.  It's such a rare case that
6690               it hardly seems worth the effort though.  */
6691            && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6692            && !(cfun->calls_alloca
6693                 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6694                 && const_fp_offset < max_push_offset))
6695     {
6696       /* Frame with small outgoing arguments:
6697
6698          sub sp, sp, frame_size
6699          stp reg1, reg2, [sp, outgoing_args_size]
6700          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
6701       frame.initial_adjust = frame.frame_size;
6702       frame.callee_offset = const_outgoing_args_size;
6703     }
6704   else if (saves_below_hard_fp_p
6705            && known_eq (frame.saved_regs_size,
6706                         frame.below_hard_fp_saved_regs_size))
6707     {
6708       /* Frame in which all saves are SVE saves:
6709
6710          sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6711          save SVE registers relative to SP
6712          sub sp, sp, outgoing_args_size  */
6713       frame.initial_adjust = (frame.hard_fp_offset
6714                               + frame.below_hard_fp_saved_regs_size);
6715       frame.final_adjust = crtl->outgoing_args_size;
6716     }
6717   else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6718            && const_fp_offset < max_push_offset)
6719     {
6720       /* Frame with large outgoing arguments or SVE saves, but with
6721          a small local area:
6722
6723          stp reg1, reg2, [sp, -hard_fp_offset]!
6724          stp reg3, reg4, [sp, 16]
6725          [sub sp, sp, below_hard_fp_saved_regs_size]
6726          [save SVE registers relative to SP]
6727          sub sp, sp, outgoing_args_size  */
6728       frame.callee_adjust = const_fp_offset;
6729       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6730       frame.final_adjust = crtl->outgoing_args_size;
6731     }
6732   else
6733     {
6734       /* Frame with large local area and outgoing arguments or SVE saves,
6735          using frame pointer:
6736
6737          sub sp, sp, hard_fp_offset
6738          stp x29, x30, [sp, 0]
6739          add x29, sp, 0
6740          stp reg3, reg4, [sp, 16]
6741          [sub sp, sp, below_hard_fp_saved_regs_size]
6742          [save SVE registers relative to SP]
6743          sub sp, sp, outgoing_args_size  */
6744       frame.initial_adjust = frame.hard_fp_offset;
6745       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6746       frame.final_adjust = crtl->outgoing_args_size;
6747     }
6748
6749   /* Make sure the individual adjustments add up to the full frame size.  */
6750   gcc_assert (known_eq (frame.initial_adjust
6751                         + frame.callee_adjust
6752                         + frame.sve_callee_adjust
6753                         + frame.final_adjust, frame.frame_size));
6754
6755   if (!frame.emit_frame_chain && frame.callee_adjust == 0)
6756     {
6757       /* We've decided not to associate any register saves with the initial
6758          stack allocation.  */
6759       frame.wb_candidate1 = INVALID_REGNUM;
6760       frame.wb_candidate2 = INVALID_REGNUM;
6761     }
6762
6763   frame.laid_out = true;
6764 }
6765
6766 /* Return true if the register REGNO is saved on entry to
6767    the current function.  */
6768
6769 static bool
6770 aarch64_register_saved_on_entry (int regno)
6771 {
6772   return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6773 }
6774
6775 /* Return the next register up from REGNO up to LIMIT for the callee
6776    to save.  */
6777
6778 static unsigned
6779 aarch64_next_callee_save (unsigned regno, unsigned limit)
6780 {
6781   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6782     regno ++;
6783   return regno;
6784 }
6785
6786 /* Push the register number REGNO of mode MODE to the stack with write-back
6787    adjusting the stack by ADJUSTMENT.  */
6788
6789 static void
6790 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6791                            HOST_WIDE_INT adjustment)
6792  {
6793   rtx base_rtx = stack_pointer_rtx;
6794   rtx insn, reg, mem;
6795
6796   reg = gen_rtx_REG (mode, regno);
6797   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6798                             plus_constant (Pmode, base_rtx, -adjustment));
6799   mem = gen_frame_mem (mode, mem);
6800
6801   insn = emit_move_insn (mem, reg);
6802   RTX_FRAME_RELATED_P (insn) = 1;
6803 }
6804
6805 /* Generate and return an instruction to store the pair of registers
6806    REG and REG2 of mode MODE to location BASE with write-back adjusting
6807    the stack location BASE by ADJUSTMENT.  */
6808
6809 static rtx
6810 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6811                           HOST_WIDE_INT adjustment)
6812 {
6813   switch (mode)
6814     {
6815     case E_DImode:
6816       return gen_storewb_pairdi_di (base, base, reg, reg2,
6817                                     GEN_INT (-adjustment),
6818                                     GEN_INT (UNITS_PER_WORD - adjustment));
6819     case E_DFmode:
6820       return gen_storewb_pairdf_di (base, base, reg, reg2,
6821                                     GEN_INT (-adjustment),
6822                                     GEN_INT (UNITS_PER_WORD - adjustment));
6823     case E_TFmode:
6824       return gen_storewb_pairtf_di (base, base, reg, reg2,
6825                                     GEN_INT (-adjustment),
6826                                     GEN_INT (UNITS_PER_VREG - adjustment));
6827     default:
6828       gcc_unreachable ();
6829     }
6830 }
6831
6832 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6833    stack pointer by ADJUSTMENT.  */
6834
6835 static void
6836 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6837 {
6838   rtx_insn *insn;
6839   machine_mode mode = aarch64_reg_save_mode (regno1);
6840
6841   if (regno2 == INVALID_REGNUM)
6842     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6843
6844   rtx reg1 = gen_rtx_REG (mode, regno1);
6845   rtx reg2 = gen_rtx_REG (mode, regno2);
6846
6847   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6848                                               reg2, adjustment));
6849   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6850   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6851   RTX_FRAME_RELATED_P (insn) = 1;
6852 }
6853
6854 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
6855    adjusting it by ADJUSTMENT afterwards.  */
6856
6857 static rtx
6858 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6859                          HOST_WIDE_INT adjustment)
6860 {
6861   switch (mode)
6862     {
6863     case E_DImode:
6864       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6865                                    GEN_INT (UNITS_PER_WORD));
6866     case E_DFmode:
6867       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6868                                    GEN_INT (UNITS_PER_WORD));
6869     case E_TFmode:
6870       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6871                                    GEN_INT (UNITS_PER_VREG));
6872     default:
6873       gcc_unreachable ();
6874     }
6875 }
6876
6877 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6878    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6879    into CFI_OPS.  */
6880
6881 static void
6882 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6883                   rtx *cfi_ops)
6884 {
6885   machine_mode mode = aarch64_reg_save_mode (regno1);
6886   rtx reg1 = gen_rtx_REG (mode, regno1);
6887
6888   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6889
6890   if (regno2 == INVALID_REGNUM)
6891     {
6892       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6893       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6894       emit_move_insn (reg1, gen_frame_mem (mode, mem));
6895     }
6896   else
6897     {
6898       rtx reg2 = gen_rtx_REG (mode, regno2);
6899       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6900       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6901                                           reg2, adjustment));
6902     }
6903 }
6904
6905 /* Generate and return a store pair instruction of mode MODE to store
6906    register REG1 to MEM1 and register REG2 to MEM2.  */
6907
6908 static rtx
6909 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6910                         rtx reg2)
6911 {
6912   switch (mode)
6913     {
6914     case E_DImode:
6915       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6916
6917     case E_DFmode:
6918       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6919
6920     case E_TFmode:
6921       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6922
6923     default:
6924       gcc_unreachable ();
6925     }
6926 }
6927
6928 /* Generate and regurn a load pair isntruction of mode MODE to load register
6929    REG1 from MEM1 and register REG2 from MEM2.  */
6930
6931 static rtx
6932 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6933                        rtx mem2)
6934 {
6935   switch (mode)
6936     {
6937     case E_DImode:
6938       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6939
6940     case E_DFmode:
6941       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6942
6943     case E_TFmode:
6944       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6945
6946     default:
6947       gcc_unreachable ();
6948     }
6949 }
6950
6951 /* Return TRUE if return address signing should be enabled for the current
6952    function, otherwise return FALSE.  */
6953
6954 bool
6955 aarch64_return_address_signing_enabled (void)
6956 {
6957   /* This function should only be called after frame laid out.   */
6958   gcc_assert (cfun->machine->frame.laid_out);
6959
6960   /* Turn return address signing off in any function that uses
6961      __builtin_eh_return.  The address passed to __builtin_eh_return
6962      is not signed so either it has to be signed (with original sp)
6963      or the code path that uses it has to avoid authenticating it.
6964      Currently eh return introduces a return to anywhere gadget, no
6965      matter what we do here since it uses ret with user provided
6966      address. An ideal fix for that is to use indirect branch which
6967      can be protected with BTI j (to some extent).  */
6968   if (crtl->calls_eh_return)
6969     return false;
6970
6971   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6972      if its LR is pushed onto stack.  */
6973   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6974           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6975               && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6976 }
6977
6978 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
6979 bool
6980 aarch64_bti_enabled (void)
6981 {
6982   return (aarch64_enable_bti == 1);
6983 }
6984
6985 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6986    register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6987    the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:
6988
6989      (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6990          or LD1D address
6991
6992      (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6993          if the variable isn't already nonnull
6994
6995    (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6996    Handle this case using a temporary base register that is suitable for
6997    all offsets in that range.  Use ANCHOR_REG as this base register if it
6998    is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */
6999
7000 static inline void
7001 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7002                                      rtx &anchor_reg, poly_int64 &offset,
7003                                      rtx &ptrue)
7004 {
7005   if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7006     {
7007       /* This is the maximum valid offset of the anchor from the base.
7008          Lower values would be valid too.  */
7009       poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7010       if (!anchor_reg)
7011         {
7012           anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7013           emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7014                                     gen_int_mode (anchor_offset, Pmode)));
7015         }
7016       base_rtx = anchor_reg;
7017       offset -= anchor_offset;
7018     }
7019   if (!ptrue)
7020     {
7021       int pred_reg = cfun->machine->frame.spare_pred_reg;
7022       emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7023                       CONSTM1_RTX (VNx16BImode));
7024       ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7025     }
7026 }
7027
7028 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7029    is saved at BASE + OFFSET.  */
7030
7031 static void
7032 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7033                             rtx base, poly_int64 offset)
7034 {
7035   rtx mem = gen_frame_mem (GET_MODE (reg),
7036                            plus_constant (Pmode, base, offset));
7037   add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7038 }
7039
7040 /* Emit code to save the callee-saved registers from register number START
7041    to LIMIT to the stack at the location starting at offset START_OFFSET,
7042    skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
7043    is true if the hard frame pointer has been set up.  */
7044
7045 static void
7046 aarch64_save_callee_saves (poly_int64 start_offset,
7047                            unsigned start, unsigned limit, bool skip_wb,
7048                            bool hard_fp_valid_p)
7049 {
7050   rtx_insn *insn;
7051   unsigned regno;
7052   unsigned regno2;
7053   rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7054
7055   for (regno = aarch64_next_callee_save (start, limit);
7056        regno <= limit;
7057        regno = aarch64_next_callee_save (regno + 1, limit))
7058     {
7059       rtx reg, mem;
7060       poly_int64 offset;
7061       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7062
7063       if (skip_wb
7064           && (regno == cfun->machine->frame.wb_candidate1
7065               || regno == cfun->machine->frame.wb_candidate2))
7066         continue;
7067
7068       if (cfun->machine->reg_is_wrapped_separately[regno])
7069         continue;
7070
7071       machine_mode mode = aarch64_reg_save_mode (regno);
7072       reg = gen_rtx_REG (mode, regno);
7073       offset = start_offset + cfun->machine->frame.reg_offset[regno];
7074       rtx base_rtx = stack_pointer_rtx;
7075       poly_int64 sp_offset = offset;
7076
7077       HOST_WIDE_INT const_offset;
7078       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7079         aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7080                                              offset, ptrue);
7081       else if (GP_REGNUM_P (regno)
7082                && (!offset.is_constant (&const_offset) || const_offset >= 512))
7083         {
7084           gcc_assert (known_eq (start_offset, 0));
7085           poly_int64 fp_offset
7086             = cfun->machine->frame.below_hard_fp_saved_regs_size;
7087           if (hard_fp_valid_p)
7088             base_rtx = hard_frame_pointer_rtx;
7089           else
7090             {
7091               if (!anchor_reg)
7092                 {
7093                   anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7094                   emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7095                                             gen_int_mode (fp_offset, Pmode)));
7096                 }
7097               base_rtx = anchor_reg;
7098             }
7099           offset -= fp_offset;
7100         }
7101       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7102       bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
7103
7104       if (!aarch64_sve_mode_p (mode)
7105           && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7106           && !cfun->machine->reg_is_wrapped_separately[regno2]
7107           && known_eq (GET_MODE_SIZE (mode),
7108                        cfun->machine->frame.reg_offset[regno2]
7109                        - cfun->machine->frame.reg_offset[regno]))
7110         {
7111           rtx reg2 = gen_rtx_REG (mode, regno2);
7112           rtx mem2;
7113
7114           offset += GET_MODE_SIZE (mode);
7115           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7116           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7117                                                     reg2));
7118
7119           /* The first part of a frame-related parallel insn is
7120              always assumed to be relevant to the frame
7121              calculations; subsequent parts, are only
7122              frame-related if explicitly marked.  */
7123           if (aarch64_emit_cfi_for_reg_p (regno2))
7124             {
7125               if (need_cfa_note_p)
7126                 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7127                                             sp_offset + GET_MODE_SIZE (mode));
7128               else
7129                 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7130             }
7131
7132           regno = regno2;
7133         }
7134       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7135         {
7136           insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7137           need_cfa_note_p = true;
7138         }
7139       else if (aarch64_sve_mode_p (mode))
7140         insn = emit_insn (gen_rtx_SET (mem, reg));
7141       else
7142         insn = emit_move_insn (mem, reg);
7143
7144       RTX_FRAME_RELATED_P (insn) = frame_related_p;
7145       if (frame_related_p && need_cfa_note_p)
7146         aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7147     }
7148 }
7149
7150 /* Emit code to restore the callee registers from register number START
7151    up to and including LIMIT.  Restore from the stack offset START_OFFSET,
7152    skipping any write-back candidates if SKIP_WB is true.  Write the
7153    appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
7154
7155 static void
7156 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7157                               unsigned limit, bool skip_wb, rtx *cfi_ops)
7158 {
7159   unsigned regno;
7160   unsigned regno2;
7161   poly_int64 offset;
7162   rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7163
7164   for (regno = aarch64_next_callee_save (start, limit);
7165        regno <= limit;
7166        regno = aarch64_next_callee_save (regno + 1, limit))
7167     {
7168       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7169       if (cfun->machine->reg_is_wrapped_separately[regno])
7170         continue;
7171
7172       rtx reg, mem;
7173
7174       if (skip_wb
7175           && (regno == cfun->machine->frame.wb_candidate1
7176               || regno == cfun->machine->frame.wb_candidate2))
7177         continue;
7178
7179       machine_mode mode = aarch64_reg_save_mode (regno);
7180       reg = gen_rtx_REG (mode, regno);
7181       offset = start_offset + cfun->machine->frame.reg_offset[regno];
7182       rtx base_rtx = stack_pointer_rtx;
7183       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7184         aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7185                                              offset, ptrue);
7186       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7187
7188       if (!aarch64_sve_mode_p (mode)
7189           && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7190           && !cfun->machine->reg_is_wrapped_separately[regno2]
7191           && known_eq (GET_MODE_SIZE (mode),
7192                        cfun->machine->frame.reg_offset[regno2]
7193                        - cfun->machine->frame.reg_offset[regno]))
7194         {
7195           rtx reg2 = gen_rtx_REG (mode, regno2);
7196           rtx mem2;
7197
7198           offset += GET_MODE_SIZE (mode);
7199           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7200           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7201
7202           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7203           regno = regno2;
7204         }
7205       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7206         emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7207       else if (aarch64_sve_mode_p (mode))
7208         emit_insn (gen_rtx_SET (reg, mem));
7209       else
7210         emit_move_insn (reg, mem);
7211       if (frame_related_p)
7212         *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7213     }
7214 }
7215
7216 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
7217    of MODE.  */
7218
7219 static inline bool
7220 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7221 {
7222   HOST_WIDE_INT multiple;
7223   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7224           && IN_RANGE (multiple, -8, 7));
7225 }
7226
7227 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
7228    of MODE.  */
7229
7230 static inline bool
7231 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7232 {
7233   HOST_WIDE_INT multiple;
7234   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7235           && IN_RANGE (multiple, 0, 63));
7236 }
7237
7238 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
7239    of MODE.  */
7240
7241 bool
7242 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7243 {
7244   HOST_WIDE_INT multiple;
7245   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7246           && IN_RANGE (multiple, -64, 63));
7247 }
7248
7249 /* Return true if OFFSET is a signed 9-bit value.  */
7250
7251 bool
7252 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7253                                        poly_int64 offset)
7254 {
7255   HOST_WIDE_INT const_offset;
7256   return (offset.is_constant (&const_offset)
7257           && IN_RANGE (const_offset, -256, 255));
7258 }
7259
7260 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
7261    of MODE.  */
7262
7263 static inline bool
7264 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7265 {
7266   HOST_WIDE_INT multiple;
7267   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7268           && IN_RANGE (multiple, -256, 255));
7269 }
7270
7271 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7272    of MODE.  */
7273
7274 static inline bool
7275 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7276 {
7277   HOST_WIDE_INT multiple;
7278   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7279           && IN_RANGE (multiple, 0, 4095));
7280 }
7281
7282 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
7283
7284 static sbitmap
7285 aarch64_get_separate_components (void)
7286 {
7287   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7288   bitmap_clear (components);
7289
7290   /* The registers we need saved to the frame.  */
7291   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7292     if (aarch64_register_saved_on_entry (regno))
7293       {
7294         /* Punt on saves and restores that use ST1D and LD1D.  We could
7295            try to be smarter, but it would involve making sure that the
7296            spare predicate register itself is safe to use at the save
7297            and restore points.  Also, when a frame pointer is being used,
7298            the slots are often out of reach of ST1D and LD1D anyway.  */
7299         machine_mode mode = aarch64_reg_save_mode (regno);
7300         if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7301           continue;
7302
7303         poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7304
7305         /* If the register is saved in the first SVE save slot, we use
7306            it as a stack probe for -fstack-clash-protection.  */
7307         if (flag_stack_clash_protection
7308             && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7309             && known_eq (offset, 0))
7310           continue;
7311
7312         /* Get the offset relative to the register we'll use.  */
7313         if (frame_pointer_needed)
7314           offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7315         else
7316           offset += crtl->outgoing_args_size;
7317
7318         /* Check that we can access the stack slot of the register with one
7319            direct load with no adjustments needed.  */
7320         if (aarch64_sve_mode_p (mode)
7321             ? offset_9bit_signed_scaled_p (mode, offset)
7322             : offset_12bit_unsigned_scaled_p (mode, offset))
7323           bitmap_set_bit (components, regno);
7324       }
7325
7326   /* Don't mess with the hard frame pointer.  */
7327   if (frame_pointer_needed)
7328     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7329
7330   /* If the spare predicate register used by big-endian SVE code
7331      is call-preserved, it must be saved in the main prologue
7332      before any saves that use it.  */
7333   if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7334     bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7335
7336   unsigned reg1 = cfun->machine->frame.wb_candidate1;
7337   unsigned reg2 = cfun->machine->frame.wb_candidate2;
7338   /* If registers have been chosen to be stored/restored with
7339      writeback don't interfere with them to avoid having to output explicit
7340      stack adjustment instructions.  */
7341   if (reg2 != INVALID_REGNUM)
7342     bitmap_clear_bit (components, reg2);
7343   if (reg1 != INVALID_REGNUM)
7344     bitmap_clear_bit (components, reg1);
7345
7346   bitmap_clear_bit (components, LR_REGNUM);
7347   bitmap_clear_bit (components, SP_REGNUM);
7348
7349   return components;
7350 }
7351
7352 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
7353
7354 static sbitmap
7355 aarch64_components_for_bb (basic_block bb)
7356 {
7357   bitmap in = DF_LIVE_IN (bb);
7358   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7359   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7360
7361   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7362   bitmap_clear (components);
7363
7364   /* Clobbered registers don't generate values in any meaningful sense,
7365      since nothing after the clobber can rely on their value.  And we can't
7366      say that partially-clobbered registers are unconditionally killed,
7367      because whether they're killed or not depends on the mode of the
7368      value they're holding.  Thus partially call-clobbered registers
7369      appear in neither the kill set nor the gen set.
7370
7371      Check manually for any calls that clobber more of a register than the
7372      current function can.  */
7373   function_abi_aggregator callee_abis;
7374   rtx_insn *insn;
7375   FOR_BB_INSNS (bb, insn)
7376     if (CALL_P (insn))
7377       callee_abis.note_callee_abi (insn_callee_abi (insn));
7378   HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7379
7380   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
7381   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7382     if (!fixed_regs[regno]
7383         && !crtl->abi->clobbers_full_reg_p (regno)
7384         && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7385             || bitmap_bit_p (in, regno)
7386             || bitmap_bit_p (gen, regno)
7387             || bitmap_bit_p (kill, regno)))
7388       {
7389         bitmap_set_bit (components, regno);
7390
7391         /* If there is a callee-save at an adjacent offset, add it too
7392            to increase the use of LDP/STP.  */
7393         poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7394         unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
7395
7396         if (regno2 <= LAST_SAVED_REGNUM)
7397           {
7398             poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7399             if (regno < regno2
7400                 ? known_eq (offset + 8, offset2)
7401                 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
7402               bitmap_set_bit (components, regno2);
7403           }
7404       }
7405
7406   return components;
7407 }
7408
7409 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7410    Nothing to do for aarch64.  */
7411
7412 static void
7413 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7414 {
7415 }
7416
7417 /* Return the next set bit in BMP from START onwards.  Return the total number
7418    of bits in BMP if no set bit is found at or after START.  */
7419
7420 static unsigned int
7421 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7422 {
7423   unsigned int nbits = SBITMAP_SIZE (bmp);
7424   if (start == nbits)
7425     return start;
7426
7427   gcc_assert (start < nbits);
7428   for (unsigned int i = start; i < nbits; i++)
7429     if (bitmap_bit_p (bmp, i))
7430       return i;
7431
7432   return nbits;
7433 }
7434
7435 /* Do the work for aarch64_emit_prologue_components and
7436    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
7437    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7438    for these components or the epilogue sequence.  That is, it determines
7439    whether we should emit stores or loads and what kind of CFA notes to attach
7440    to the insns.  Otherwise the logic for the two sequences is very
7441    similar.  */
7442
7443 static void
7444 aarch64_process_components (sbitmap components, bool prologue_p)
7445 {
7446   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7447                              ? HARD_FRAME_POINTER_REGNUM
7448                              : STACK_POINTER_REGNUM);
7449
7450   unsigned last_regno = SBITMAP_SIZE (components);
7451   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7452   rtx_insn *insn = NULL;
7453
7454   while (regno != last_regno)
7455     {
7456       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7457       machine_mode mode = aarch64_reg_save_mode (regno);
7458
7459       rtx reg = gen_rtx_REG (mode, regno);
7460       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7461       if (frame_pointer_needed)
7462         offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7463       else
7464         offset += crtl->outgoing_args_size;
7465
7466       rtx addr = plus_constant (Pmode, ptr_reg, offset);
7467       rtx mem = gen_frame_mem (mode, addr);
7468
7469       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7470       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7471       /* No more registers to handle after REGNO.
7472          Emit a single save/restore and exit.  */
7473       if (regno2 == last_regno)
7474         {
7475           insn = emit_insn (set);
7476           if (frame_related_p)
7477             {
7478               RTX_FRAME_RELATED_P (insn) = 1;
7479               if (prologue_p)
7480                 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7481               else
7482                 add_reg_note (insn, REG_CFA_RESTORE, reg);
7483             }
7484           break;
7485         }
7486
7487       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7488       /* The next register is not of the same class or its offset is not
7489          mergeable with the current one into a pair.  */
7490       if (aarch64_sve_mode_p (mode)
7491           || !satisfies_constraint_Ump (mem)
7492           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
7493           || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
7494           || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7495                        GET_MODE_SIZE (mode)))
7496         {
7497           insn = emit_insn (set);
7498           if (frame_related_p)
7499             {
7500               RTX_FRAME_RELATED_P (insn) = 1;
7501               if (prologue_p)
7502                 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7503               else
7504                 add_reg_note (insn, REG_CFA_RESTORE, reg);
7505             }
7506
7507           regno = regno2;
7508           continue;
7509         }
7510
7511       bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7512
7513       /* REGNO2 can be saved/restored in a pair with REGNO.  */
7514       rtx reg2 = gen_rtx_REG (mode, regno2);
7515       if (frame_pointer_needed)
7516         offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7517       else
7518         offset2 += crtl->outgoing_args_size;
7519       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7520       rtx mem2 = gen_frame_mem (mode, addr2);
7521       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7522                              : gen_rtx_SET (reg2, mem2);
7523
7524       if (prologue_p)
7525         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7526       else
7527         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7528
7529       if (frame_related_p || frame_related2_p)
7530         {
7531           RTX_FRAME_RELATED_P (insn) = 1;
7532           if (prologue_p)
7533             {
7534               if (frame_related_p)
7535                 add_reg_note (insn, REG_CFA_OFFSET, set);
7536               if (frame_related2_p)
7537                 add_reg_note (insn, REG_CFA_OFFSET, set2);
7538             }
7539           else
7540             {
7541               if (frame_related_p)
7542                 add_reg_note (insn, REG_CFA_RESTORE, reg);
7543               if (frame_related2_p)
7544                 add_reg_note (insn, REG_CFA_RESTORE, reg2);
7545             }
7546         }
7547
7548       regno = aarch64_get_next_set_bit (components, regno2 + 1);
7549     }
7550 }
7551
7552 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
7553
7554 static void
7555 aarch64_emit_prologue_components (sbitmap components)
7556 {
7557   aarch64_process_components (components, true);
7558 }
7559
7560 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
7561
7562 static void
7563 aarch64_emit_epilogue_components (sbitmap components)
7564 {
7565   aarch64_process_components (components, false);
7566 }
7567
7568 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
7569
7570 static void
7571 aarch64_set_handled_components (sbitmap components)
7572 {
7573   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7574     if (bitmap_bit_p (components, regno))
7575       cfun->machine->reg_is_wrapped_separately[regno] = true;
7576 }
7577
7578 /* On AArch64 we have an ABI defined safe buffer.  This constant is used to
7579    determining the probe offset for alloca.  */
7580
7581 static HOST_WIDE_INT
7582 aarch64_stack_clash_protection_alloca_probe_range (void)
7583 {
7584   return STACK_CLASH_CALLER_GUARD;
7585 }
7586
7587
7588 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7589    registers.  If POLY_SIZE is not large enough to require a probe this function
7590    will only adjust the stack.  When allocating the stack space
7591    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7592    FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7593    arguments.  If we are then we ensure that any allocation larger than the ABI
7594    defined buffer needs a probe so that the invariant of having a 1KB buffer is
7595    maintained.
7596
7597    We emit barriers after each stack adjustment to prevent optimizations from
7598    breaking the invariant that we never drop the stack more than a page.  This
7599    invariant is needed to make it easier to correctly handle asynchronous
7600    events, e.g. if we were to allow the stack to be dropped by more than a page
7601    and then have multiple probes up and we take a signal somewhere in between
7602    then the signal handler doesn't know the state of the stack and can make no
7603    assumptions about which pages have been probed.  */
7604
7605 static void
7606 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7607                                         poly_int64 poly_size,
7608                                         bool frame_related_p,
7609                                         bool final_adjustment_p)
7610 {
7611   HOST_WIDE_INT guard_size
7612     = 1 << param_stack_clash_protection_guard_size;
7613   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7614   HOST_WIDE_INT min_probe_threshold
7615     = (final_adjustment_p
7616        ? guard_used_by_caller
7617        : guard_size - guard_used_by_caller);
7618   /* When doing the final adjustment for the outgoing arguments, take into
7619      account any unprobed space there is above the current SP.  There are
7620      two cases:
7621
7622      - When saving SVE registers below the hard frame pointer, we force
7623        the lowest save to take place in the prologue before doing the final
7624        adjustment (i.e. we don't allow the save to be shrink-wrapped).
7625        This acts as a probe at SP, so there is no unprobed space.
7626
7627      - When there are no SVE register saves, we use the store of the link
7628        register as a probe.  We can't assume that LR was saved at position 0
7629        though, so treat any space below it as unprobed.  */
7630   if (final_adjustment_p
7631       && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7632     {
7633       poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7634       if (known_ge (lr_offset, 0))
7635         min_probe_threshold -= lr_offset.to_constant ();
7636       else
7637         gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7638     }
7639
7640   poly_int64 frame_size = cfun->machine->frame.frame_size;
7641
7642   /* We should always have a positive probe threshold.  */
7643   gcc_assert (min_probe_threshold > 0);
7644
7645   if (flag_stack_clash_protection && !final_adjustment_p)
7646     {
7647       poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7648       poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7649       poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7650
7651       if (known_eq (frame_size, 0))
7652         {
7653           dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7654         }
7655       else if (known_lt (initial_adjust + sve_callee_adjust,
7656                          guard_size - guard_used_by_caller)
7657                && known_lt (final_adjust, guard_used_by_caller))
7658         {
7659           dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7660         }
7661     }
7662
7663   /* If SIZE is not large enough to require probing, just adjust the stack and
7664      exit.  */
7665   if (known_lt (poly_size, min_probe_threshold)
7666       || !flag_stack_clash_protection)
7667     {
7668       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7669       return;
7670     }
7671
7672   HOST_WIDE_INT size;
7673   /* Handle the SVE non-constant case first.  */
7674   if (!poly_size.is_constant (&size))
7675     {
7676      if (dump_file)
7677       {
7678         fprintf (dump_file, "Stack clash SVE prologue: ");
7679         print_dec (poly_size, dump_file);
7680         fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7681       }
7682
7683       /* First calculate the amount of bytes we're actually spilling.  */
7684       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7685                           poly_size, temp1, temp2, false, true);
7686
7687       rtx_insn *insn = get_last_insn ();
7688
7689       if (frame_related_p)
7690         {
7691           /* This is done to provide unwinding information for the stack
7692              adjustments we're about to do, however to prevent the optimizers
7693              from removing the R11 move and leaving the CFA note (which would be
7694              very wrong) we tie the old and new stack pointer together.
7695              The tie will expand to nothing but the optimizers will not touch
7696              the instruction.  */
7697           rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7698           emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7699           emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7700
7701           /* We want the CFA independent of the stack pointer for the
7702              duration of the loop.  */
7703           add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7704           RTX_FRAME_RELATED_P (insn) = 1;
7705         }
7706
7707       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7708       rtx guard_const = gen_int_mode (guard_size, Pmode);
7709
7710       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7711                                                    stack_pointer_rtx, temp1,
7712                                                    probe_const, guard_const));
7713
7714       /* Now reset the CFA register if needed.  */
7715       if (frame_related_p)
7716         {
7717           add_reg_note (insn, REG_CFA_DEF_CFA,
7718                         gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7719                                       gen_int_mode (poly_size, Pmode)));
7720           RTX_FRAME_RELATED_P (insn) = 1;
7721         }
7722
7723       return;
7724     }
7725
7726   if (dump_file)
7727     fprintf (dump_file,
7728              "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7729              " bytes, probing will be required.\n", size);
7730
7731   /* Round size to the nearest multiple of guard_size, and calculate the
7732      residual as the difference between the original size and the rounded
7733      size.  */
7734   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7735   HOST_WIDE_INT residual = size - rounded_size;
7736
7737   /* We can handle a small number of allocations/probes inline.  Otherwise
7738      punt to a loop.  */
7739   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7740     {
7741       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7742         {
7743           aarch64_sub_sp (NULL, temp2, guard_size, true);
7744           emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7745                                            guard_used_by_caller));
7746           emit_insn (gen_blockage ());
7747         }
7748       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7749     }
7750   else
7751     {
7752       /* Compute the ending address.  */
7753       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7754                           temp1, NULL, false, true);
7755       rtx_insn *insn = get_last_insn ();
7756
7757       /* For the initial allocation, we don't have a frame pointer
7758          set up, so we always need CFI notes.  If we're doing the
7759          final allocation, then we may have a frame pointer, in which
7760          case it is the CFA, otherwise we need CFI notes.
7761
7762          We can determine which allocation we are doing by looking at
7763          the value of FRAME_RELATED_P since the final allocations are not
7764          frame related.  */
7765       if (frame_related_p)
7766         {
7767           /* We want the CFA independent of the stack pointer for the
7768              duration of the loop.  */
7769           add_reg_note (insn, REG_CFA_DEF_CFA,
7770                         plus_constant (Pmode, temp1, rounded_size));
7771           RTX_FRAME_RELATED_P (insn) = 1;
7772         }
7773
7774       /* This allocates and probes the stack.  Note that this re-uses some of
7775          the existing Ada stack protection code.  However we are guaranteed not
7776          to enter the non loop or residual branches of that code.
7777
7778          The non-loop part won't be entered because if our allocation amount
7779          doesn't require a loop, the case above would handle it.
7780
7781          The residual amount won't be entered because TEMP1 is a mutliple of
7782          the allocation size.  The residual will always be 0.  As such, the only
7783          part we are actually using from that code is the loop setup.  The
7784          actual probing is done in aarch64_output_probe_stack_range.  */
7785       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7786                                                stack_pointer_rtx, temp1));
7787
7788       /* Now reset the CFA register if needed.  */
7789       if (frame_related_p)
7790         {
7791           add_reg_note (insn, REG_CFA_DEF_CFA,
7792                         plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7793           RTX_FRAME_RELATED_P (insn) = 1;
7794         }
7795
7796       emit_insn (gen_blockage ());
7797       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7798     }
7799
7800   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
7801      be probed.  This maintains the requirement that each page is probed at
7802      least once.  For initial probing we probe only if the allocation is
7803      more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7804      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
7805      GUARD_SIZE.  This works that for any allocation that is large enough to
7806      trigger a probe here, we'll have at least one, and if they're not large
7807      enough for this code to emit anything for them, The page would have been
7808      probed by the saving of FP/LR either by this function or any callees.  If
7809      we don't have any callees then we won't have more stack adjustments and so
7810      are still safe.  */
7811   if (residual)
7812     {
7813       HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7814       /* If we're doing final adjustments, and we've done any full page
7815          allocations then any residual needs to be probed.  */
7816       if (final_adjustment_p && rounded_size != 0)
7817         min_probe_threshold = 0;
7818       /* If doing a small final adjustment, we always probe at offset 0.
7819          This is done to avoid issues when LR is not at position 0 or when
7820          the final adjustment is smaller than the probing offset.  */
7821       else if (final_adjustment_p && rounded_size == 0)
7822         residual_probe_offset = 0;
7823
7824       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7825       if (residual >= min_probe_threshold)
7826         {
7827           if (dump_file)
7828             fprintf (dump_file,
7829                      "Stack clash AArch64 prologue residuals: "
7830                      HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7831                      "\n", residual);
7832
7833             emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7834                                              residual_probe_offset));
7835           emit_insn (gen_blockage ());
7836         }
7837     }
7838 }
7839
7840 /* Return 1 if the register is used by the epilogue.  We need to say the
7841    return register is used, but only after epilogue generation is complete.
7842    Note that in the case of sibcalls, the values "used by the epilogue" are
7843    considered live at the start of the called function.
7844
7845    For SIMD functions we need to return 1 for FP registers that are saved and
7846    restored by a function but are not zero in call_used_regs.  If we do not do
7847    this optimizations may remove the restore of the register.  */
7848
7849 int
7850 aarch64_epilogue_uses (int regno)
7851 {
7852   if (epilogue_completed)
7853     {
7854       if (regno == LR_REGNUM)
7855         return 1;
7856     }
7857   return 0;
7858 }
7859
7860 /* AArch64 stack frames generated by this compiler look like:
7861
7862         +-------------------------------+
7863         |                               |
7864         |  incoming stack arguments     |
7865         |                               |
7866         +-------------------------------+
7867         |                               | <-- incoming stack pointer (aligned)
7868         |  callee-allocated save area   |
7869         |  for register varargs         |
7870         |                               |
7871         +-------------------------------+
7872         |  local variables              | <-- frame_pointer_rtx
7873         |                               |
7874         +-------------------------------+
7875         |  padding                      | \
7876         +-------------------------------+  |
7877         |  callee-saved registers       |  | frame.saved_regs_size
7878         +-------------------------------+  |
7879         |  LR'                          |  |
7880         +-------------------------------+  |
7881         |  FP'                          |  |
7882         +-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
7883         |  SVE vector registers         |  | \
7884         +-------------------------------+  |  | below_hard_fp_saved_regs_size
7885         |  SVE predicate registers      | /  /
7886         +-------------------------------+
7887         |  dynamic allocation           |
7888         +-------------------------------+
7889         |  padding                      |
7890         +-------------------------------+
7891         |  outgoing stack arguments     | <-- arg_pointer
7892         |                               |
7893         +-------------------------------+
7894         |                               | <-- stack_pointer_rtx (aligned)
7895
7896    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7897    but leave frame_pointer_rtx and hard_frame_pointer_rtx
7898    unchanged.
7899
7900    By default for stack-clash we assume the guard is at least 64KB, but this
7901    value is configurable to either 4KB or 64KB.  We also force the guard size to
7902    be the same as the probing interval and both values are kept in sync.
7903
7904    With those assumptions the callee can allocate up to 63KB (or 3KB depending
7905    on the guard size) of stack space without probing.
7906
7907    When probing is needed, we emit a probe at the start of the prologue
7908    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7909
7910    We have to track how much space has been allocated and the only stores
7911    to the stack we track as implicit probes are the FP/LR stores.
7912
7913    For outgoing arguments we probe if the size is larger than 1KB, such that
7914    the ABI specified buffer is maintained for the next callee.
7915
7916    The following registers are reserved during frame layout and should not be
7917    used for any other purpose:
7918
7919    - r11: Used by stack clash protection when SVE is enabled, and also
7920           as an anchor register when saving and restoring registers
7921    - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7922    - r14 and r15: Used for speculation tracking.
7923    - r16(IP0), r17(IP1): Used by indirect tailcalls.
7924    - r30(LR), r29(FP): Used by standard frame layout.
7925
7926    These registers must be avoided in frame layout related code unless the
7927    explicit intention is to interact with one of the features listed above.  */
7928
7929 /* Generate the prologue instructions for entry into a function.
7930    Establish the stack frame by decreasing the stack pointer with a
7931    properly calculated size and, if necessary, create a frame record
7932    filled with the values of LR and previous frame pointer.  The
7933    current FP is also set up if it is in use.  */
7934
7935 void
7936 aarch64_expand_prologue (void)
7937 {
7938   poly_int64 frame_size = cfun->machine->frame.frame_size;
7939   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7940   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7941   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7942   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7943   poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7944   poly_int64 below_hard_fp_saved_regs_size
7945     = cfun->machine->frame.below_hard_fp_saved_regs_size;
7946   unsigned reg1 = cfun->machine->frame.wb_candidate1;
7947   unsigned reg2 = cfun->machine->frame.wb_candidate2;
7948   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7949   rtx_insn *insn;
7950
7951   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7952     {
7953       /* Fold the SVE allocation into the initial allocation.
7954          We don't do this in aarch64_layout_arg to avoid pessimizing
7955          the epilogue code.  */
7956       initial_adjust += sve_callee_adjust;
7957       sve_callee_adjust = 0;
7958     }
7959
7960   /* Sign return address for functions.  */
7961   if (aarch64_return_address_signing_enabled ())
7962     {
7963       switch (aarch64_ra_sign_key)
7964         {
7965           case AARCH64_KEY_A:
7966             insn = emit_insn (gen_paciasp ());
7967             break;
7968           case AARCH64_KEY_B:
7969             insn = emit_insn (gen_pacibsp ());
7970             break;
7971           default:
7972             gcc_unreachable ();
7973         }
7974       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7975       RTX_FRAME_RELATED_P (insn) = 1;
7976     }
7977
7978   if (flag_stack_usage_info)
7979     current_function_static_stack_size = constant_lower_bound (frame_size);
7980
7981   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7982     {
7983       if (crtl->is_leaf && !cfun->calls_alloca)
7984         {
7985           if (maybe_gt (frame_size, PROBE_INTERVAL)
7986               && maybe_gt (frame_size, get_stack_check_protect ()))
7987             aarch64_emit_probe_stack_range (get_stack_check_protect (),
7988                                             (frame_size
7989                                              - get_stack_check_protect ()));
7990         }
7991       else if (maybe_gt (frame_size, 0))
7992         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7993     }
7994
7995   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7996   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7997
7998   /* In theory we should never have both an initial adjustment
7999      and a callee save adjustment.  Verify that is the case since the
8000      code below does not handle it for -fstack-clash-protection.  */
8001   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8002
8003   /* Will only probe if the initial adjustment is larger than the guard
8004      less the amount of the guard reserved for use by the caller's
8005      outgoing args.  */
8006   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
8007                                           true, false);
8008
8009   if (callee_adjust != 0)
8010     aarch64_push_regs (reg1, reg2, callee_adjust);
8011
8012   /* The offset of the frame chain record (if any) from the current SP.  */
8013   poly_int64 chain_offset = (initial_adjust + callee_adjust
8014                              - cfun->machine->frame.hard_fp_offset);
8015   gcc_assert (known_ge (chain_offset, 0));
8016
8017   /* The offset of the bottom of the save area from the current SP.  */
8018   poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8019
8020   if (emit_frame_chain)
8021     {
8022       if (callee_adjust == 0)
8023         {
8024           reg1 = R29_REGNUM;
8025           reg2 = R30_REGNUM;
8026           aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8027                                      false, false);
8028         }
8029       else
8030         gcc_assert (known_eq (chain_offset, 0));
8031       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
8032                           stack_pointer_rtx, chain_offset,
8033                           tmp1_rtx, tmp0_rtx, frame_pointer_needed);
8034       if (frame_pointer_needed && !frame_size.is_constant ())
8035         {
8036           /* Variable-sized frames need to describe the save slot
8037              address using DW_CFA_expression rather than DW_CFA_offset.
8038              This means that, without taking further action, the
8039              locations of the registers that we've already saved would
8040              remain based on the stack pointer even after we redefine
8041              the CFA based on the frame pointer.  We therefore need new
8042              DW_CFA_expressions to re-express the save slots with addresses
8043              based on the frame pointer.  */
8044           rtx_insn *insn = get_last_insn ();
8045           gcc_assert (RTX_FRAME_RELATED_P (insn));
8046
8047           /* Add an explicit CFA definition if this was previously
8048              implicit.  */
8049           if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8050             {
8051               rtx src = plus_constant (Pmode, stack_pointer_rtx,
8052                                        callee_offset);
8053               add_reg_note (insn, REG_CFA_ADJUST_CFA,
8054                             gen_rtx_SET (hard_frame_pointer_rtx, src));
8055             }
8056
8057           /* Change the save slot expressions for the registers that
8058              we've already saved.  */
8059           aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8060                                       hard_frame_pointer_rtx, UNITS_PER_WORD);
8061           aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8062                                       hard_frame_pointer_rtx, 0);
8063         }
8064       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
8065     }
8066
8067   aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8068                              callee_adjust != 0 || emit_frame_chain,
8069                              emit_frame_chain);
8070   if (maybe_ne (sve_callee_adjust, 0))
8071     {
8072       gcc_assert (!flag_stack_clash_protection
8073                   || known_eq (initial_adjust, 0));
8074       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8075                                               sve_callee_adjust,
8076                                               !frame_pointer_needed, false);
8077       saved_regs_offset += sve_callee_adjust;
8078     }
8079   aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8080                              false, emit_frame_chain);
8081   aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8082                              callee_adjust != 0 || emit_frame_chain,
8083                              emit_frame_chain);
8084
8085   /* We may need to probe the final adjustment if it is larger than the guard
8086      that is assumed by the called.  */
8087   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
8088                                           !frame_pointer_needed, true);
8089 }
8090
8091 /* Return TRUE if we can use a simple_return insn.
8092
8093    This function checks whether the callee saved stack is empty, which
8094    means no restore actions are need. The pro_and_epilogue will use
8095    this to check whether shrink-wrapping opt is feasible.  */
8096
8097 bool
8098 aarch64_use_return_insn_p (void)
8099 {
8100   if (!reload_completed)
8101     return false;
8102
8103   if (crtl->profile)
8104     return false;
8105
8106   return known_eq (cfun->machine->frame.frame_size, 0);
8107 }
8108
8109 /* Generate the epilogue instructions for returning from a function.
8110    This is almost exactly the reverse of the prolog sequence, except
8111    that we need to insert barriers to avoid scheduling loads that read
8112    from a deallocated stack, and we optimize the unwind records by
8113    emitting them all together if possible.  */
8114 void
8115 aarch64_expand_epilogue (bool for_sibcall)
8116 {
8117   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8118   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8119   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8120   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8121   poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8122   poly_int64 below_hard_fp_saved_regs_size
8123     = cfun->machine->frame.below_hard_fp_saved_regs_size;
8124   unsigned reg1 = cfun->machine->frame.wb_candidate1;
8125   unsigned reg2 = cfun->machine->frame.wb_candidate2;
8126   rtx cfi_ops = NULL;
8127   rtx_insn *insn;
8128   /* A stack clash protection prologue may not have left EP0_REGNUM or
8129      EP1_REGNUM in a usable state.  The same is true for allocations
8130      with an SVE component, since we then need both temporary registers
8131      for each allocation.  For stack clash we are in a usable state if
8132      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
8133   HOST_WIDE_INT guard_size
8134     = 1 << param_stack_clash_protection_guard_size;
8135   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8136
8137   /* We can re-use the registers when:
8138
8139      (a) the deallocation amount is the same as the corresponding
8140          allocation amount (which is false if we combine the initial
8141          and SVE callee save allocations in the prologue); and
8142
8143      (b) the allocation amount doesn't need a probe (which is false
8144          if the amount is guard_size - guard_used_by_caller or greater).
8145
8146      In such situations the register should remain live with the correct
8147      value.  */
8148   bool can_inherit_p = (initial_adjust.is_constant ()
8149                         && final_adjust.is_constant ()
8150                         && (!flag_stack_clash_protection
8151                             || (known_lt (initial_adjust,
8152                                           guard_size - guard_used_by_caller)
8153                                 && known_eq (sve_callee_adjust, 0))));
8154
8155   /* We need to add memory barrier to prevent read from deallocated stack.  */
8156   bool need_barrier_p
8157     = maybe_ne (get_frame_size ()
8158                 + cfun->machine->frame.saved_varargs_size, 0);
8159
8160   /* Emit a barrier to prevent loads from a deallocated stack.  */
8161   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8162       || cfun->calls_alloca
8163       || crtl->calls_eh_return)
8164     {
8165       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8166       need_barrier_p = false;
8167     }
8168
8169   /* Restore the stack pointer from the frame pointer if it may not
8170      be the same as the stack pointer.  */
8171   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8172   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8173   if (frame_pointer_needed
8174       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8175     /* If writeback is used when restoring callee-saves, the CFA
8176        is restored on the instruction doing the writeback.  */
8177     aarch64_add_offset (Pmode, stack_pointer_rtx,
8178                         hard_frame_pointer_rtx,
8179                         -callee_offset - below_hard_fp_saved_regs_size,
8180                         tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8181   else
8182      /* The case where we need to re-use the register here is very rare, so
8183         avoid the complicated condition and just always emit a move if the
8184         immediate doesn't fit.  */
8185      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8186
8187   /* Restore the vector registers before the predicate registers,
8188      so that we can use P4 as a temporary for big-endian SVE frames.  */
8189   aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8190                                 callee_adjust != 0, &cfi_ops);
8191   aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8192                                 false, &cfi_ops);
8193   if (maybe_ne (sve_callee_adjust, 0))
8194     aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8195   aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8196                                 R0_REGNUM, R30_REGNUM,
8197                                 callee_adjust != 0, &cfi_ops);
8198
8199   if (need_barrier_p)
8200     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8201
8202   if (callee_adjust != 0)
8203     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8204
8205   /* If we have no register restore information, the CFA must have been
8206      defined in terms of the stack pointer since the end of the prologue.  */
8207   gcc_assert (cfi_ops || !frame_pointer_needed);
8208
8209   if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
8210     {
8211       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
8212       insn = get_last_insn ();
8213       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8214       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8215       RTX_FRAME_RELATED_P (insn) = 1;
8216       cfi_ops = NULL;
8217     }
8218
8219   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
8220      add restriction on emit_move optimization to leaf functions.  */
8221   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8222                   (!can_inherit_p || !crtl->is_leaf
8223                    || df_regs_ever_live_p (EP0_REGNUM)));
8224
8225   if (cfi_ops)
8226     {
8227       /* Emit delayed restores and reset the CFA to be SP.  */
8228       insn = get_last_insn ();
8229       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8230       REG_NOTES (insn) = cfi_ops;
8231       RTX_FRAME_RELATED_P (insn) = 1;
8232     }
8233
8234   /* We prefer to emit the combined return/authenticate instruction RETAA,
8235      however there are three cases in which we must instead emit an explicit
8236      authentication instruction.
8237
8238         1) Sibcalls don't return in a normal way, so if we're about to call one
8239            we must authenticate.
8240
8241         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8242            generating code for !TARGET_ARMV8_3 we can't use it and must
8243            explicitly authenticate.
8244
8245         3) On an eh_return path we make extra stack adjustments to update the
8246            canonical frame address to be the exception handler's CFA.  We want
8247            to authenticate using the CFA of the function which calls eh_return.
8248     */
8249   if (aarch64_return_address_signing_enabled ()
8250       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8251     {
8252       switch (aarch64_ra_sign_key)
8253         {
8254           case AARCH64_KEY_A:
8255             insn = emit_insn (gen_autiasp ());
8256             break;
8257           case AARCH64_KEY_B:
8258             insn = emit_insn (gen_autibsp ());
8259             break;
8260           default:
8261             gcc_unreachable ();
8262         }
8263       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8264       RTX_FRAME_RELATED_P (insn) = 1;
8265     }
8266
8267   /* Stack adjustment for exception handler.  */
8268   if (crtl->calls_eh_return && !for_sibcall)
8269     {
8270       /* We need to unwind the stack by the offset computed by
8271          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
8272          to be SP; letting the CFA move during this adjustment
8273          is just as correct as retaining the CFA from the body
8274          of the function.  Therefore, do nothing special.  */
8275       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8276     }
8277
8278   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8279   if (!for_sibcall)
8280     emit_jump_insn (ret_rtx);
8281 }
8282
8283 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
8284    normally or return to a previous frame after unwinding.
8285
8286    An EH return uses a single shared return sequence.  The epilogue is
8287    exactly like a normal epilogue except that it has an extra input
8288    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8289    that must be applied after the frame has been destroyed.  An extra label
8290    is inserted before the epilogue which initializes this register to zero,
8291    and this is the entry point for a normal return.
8292
8293    An actual EH return updates the return address, initializes the stack
8294    adjustment and jumps directly into the epilogue (bypassing the zeroing
8295    of the adjustment).  Since the return address is typically saved on the
8296    stack when a function makes a call, the saved LR must be updated outside
8297    the epilogue.
8298
8299    This poses problems as the store is generated well before the epilogue,
8300    so the offset of LR is not known yet.  Also optimizations will remove the
8301    store as it appears dead, even after the epilogue is generated (as the
8302    base or offset for loading LR is different in many cases).
8303
8304    To avoid these problems this implementation forces the frame pointer
8305    in eh_return functions so that the location of LR is fixed and known early.
8306    It also marks the store volatile, so no optimization is permitted to
8307    remove the store.  */
8308 rtx
8309 aarch64_eh_return_handler_rtx (void)
8310 {
8311   rtx tmp = gen_frame_mem (Pmode,
8312     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8313
8314   /* Mark the store volatile, so no optimization is permitted to remove it.  */
8315   MEM_VOLATILE_P (tmp) = true;
8316   return tmp;
8317 }
8318
8319 /* Output code to add DELTA to the first argument, and then jump
8320    to FUNCTION.  Used for C++ multiple inheritance.  */
8321 static void
8322 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8323                          HOST_WIDE_INT delta,
8324                          HOST_WIDE_INT vcall_offset,
8325                          tree function)
8326 {
8327   /* The this pointer is always in x0.  Note that this differs from
8328      Arm where the this pointer maybe bumped to r1 if r0 is required
8329      to return a pointer to an aggregate.  On AArch64 a result value
8330      pointer will be in x8.  */
8331   int this_regno = R0_REGNUM;
8332   rtx this_rtx, temp0, temp1, addr, funexp;
8333   rtx_insn *insn;
8334   const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
8335
8336   if (aarch64_bti_enabled ())
8337     emit_insn (gen_bti_c());
8338
8339   reload_completed = 1;
8340   emit_note (NOTE_INSN_PROLOGUE_END);
8341
8342   this_rtx = gen_rtx_REG (Pmode, this_regno);
8343   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8344   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
8345
8346   if (vcall_offset == 0)
8347     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
8348   else
8349     {
8350       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
8351
8352       addr = this_rtx;
8353       if (delta != 0)
8354         {
8355           if (delta >= -256 && delta < 256)
8356             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8357                                        plus_constant (Pmode, this_rtx, delta));
8358           else
8359             aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8360                                 temp1, temp0, false);
8361         }
8362
8363       if (Pmode == ptr_mode)
8364         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8365       else
8366         aarch64_emit_move (temp0,
8367                            gen_rtx_ZERO_EXTEND (Pmode,
8368                                                 gen_rtx_MEM (ptr_mode, addr)));
8369
8370       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
8371           addr = plus_constant (Pmode, temp0, vcall_offset);
8372       else
8373         {
8374           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8375                                           Pmode);
8376           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
8377         }
8378
8379       if (Pmode == ptr_mode)
8380         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
8381       else
8382         aarch64_emit_move (temp1,
8383                            gen_rtx_SIGN_EXTEND (Pmode,
8384                                                 gen_rtx_MEM (ptr_mode, addr)));
8385
8386       emit_insn (gen_add2_insn (this_rtx, temp1));
8387     }
8388
8389   /* Generate a tail call to the target function.  */
8390   if (!TREE_USED (function))
8391     {
8392       assemble_external (function);
8393       TREE_USED (function) = 1;
8394     }
8395   funexp = XEXP (DECL_RTL (function), 0);
8396   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
8397   rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8398   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
8399   SIBLING_CALL_P (insn) = 1;
8400
8401   insn = get_insns ();
8402   shorten_branches (insn);
8403
8404   assemble_start_function (thunk, fnname);
8405   final_start_function (insn, file, 1);
8406   final (insn, file, 1);
8407   final_end_function ();
8408   assemble_end_function (thunk, fnname);
8409
8410   /* Stop pretending to be a post-reload pass.  */
8411   reload_completed = 0;
8412 }
8413
8414 static bool
8415 aarch64_tls_referenced_p (rtx x)
8416 {
8417   if (!TARGET_HAVE_TLS)
8418     return false;
8419   subrtx_iterator::array_type array;
8420   FOR_EACH_SUBRTX (iter, array, x, ALL)
8421     {
8422       const_rtx x = *iter;
8423       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
8424         return true;
8425       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8426          TLS offsets, not real symbol references.  */
8427       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8428         iter.skip_subrtxes ();
8429     }
8430   return false;
8431 }
8432
8433
8434 /* Return true if val can be encoded as a 12-bit unsigned immediate with
8435    a left shift of 0 or 12 bits.  */
8436 bool
8437 aarch64_uimm12_shift (HOST_WIDE_INT val)
8438 {
8439   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8440           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8441           );
8442 }
8443
8444 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
8445    that can be created with a left shift of 0 or 12.  */
8446 static HOST_WIDE_INT
8447 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8448 {
8449   /* Check to see if the value fits in 24 bits, as that is the maximum we can
8450      handle correctly.  */
8451   gcc_assert ((val & 0xffffff) == val);
8452
8453   if (((val & 0xfff) << 0) == val)
8454     return val;
8455
8456   return val & (0xfff << 12);
8457 }
8458
8459 /* Return true if val is an immediate that can be loaded into a
8460    register by a MOVZ instruction.  */
8461 static bool
8462 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
8463 {
8464   if (GET_MODE_SIZE (mode) > 4)
8465     {
8466       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8467           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8468         return 1;
8469     }
8470   else
8471     {
8472       /* Ignore sign extension.  */
8473       val &= (HOST_WIDE_INT) 0xffffffff;
8474     }
8475   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8476           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8477 }
8478
8479 /* Test whether:
8480
8481      X = (X & AND_VAL) | IOR_VAL;
8482
8483    can be implemented using:
8484
8485      MOVK X, #(IOR_VAL >> shift), LSL #shift
8486
8487    Return the shift if so, otherwise return -1.  */
8488 int
8489 aarch64_movk_shift (const wide_int_ref &and_val,
8490                     const wide_int_ref &ior_val)
8491 {
8492   unsigned int precision = and_val.get_precision ();
8493   unsigned HOST_WIDE_INT mask = 0xffff;
8494   for (unsigned int shift = 0; shift < precision; shift += 16)
8495     {
8496       if (and_val == ~mask && (ior_val & mask) == ior_val)
8497         return shift;
8498       mask <<= 16;
8499     }
8500   return -1;
8501 }
8502
8503 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
8504    64-bit (DImode) integer.  */
8505
8506 static unsigned HOST_WIDE_INT
8507 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8508 {
8509   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8510   while (size < 64)
8511     {
8512       val &= (HOST_WIDE_INT_1U << size) - 1;
8513       val |= val << size;
8514       size *= 2;
8515     }
8516   return val;
8517 }
8518
8519 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
8520
8521 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8522   {
8523     0x0000000100000001ull,
8524     0x0001000100010001ull,
8525     0x0101010101010101ull,
8526     0x1111111111111111ull,
8527     0x5555555555555555ull,
8528   };
8529
8530
8531 /* Return true if val is a valid bitmask immediate.  */
8532
8533 bool
8534 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
8535 {
8536   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8537   int bits;
8538
8539   /* Check for a single sequence of one bits and return quickly if so.
8540      The special cases of all ones and all zeroes returns false.  */
8541   val = aarch64_replicate_bitmask_imm (val_in, mode);
8542   tmp = val + (val & -val);
8543
8544   if (tmp == (tmp & -tmp))
8545     return (val + 1) > 1;
8546
8547   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
8548   if (mode == SImode)
8549     val = (val << 32) | (val & 0xffffffff);
8550
8551   /* Invert if the immediate doesn't start with a zero bit - this means we
8552      only need to search for sequences of one bits.  */
8553   if (val & 1)
8554     val = ~val;
8555
8556   /* Find the first set bit and set tmp to val with the first sequence of one
8557      bits removed.  Return success if there is a single sequence of ones.  */
8558   first_one = val & -val;
8559   tmp = val & (val + first_one);
8560
8561   if (tmp == 0)
8562     return true;
8563
8564   /* Find the next set bit and compute the difference in bit position.  */
8565   next_one = tmp & -tmp;
8566   bits = clz_hwi (first_one) - clz_hwi (next_one);
8567   mask = val ^ tmp;
8568
8569   /* Check the bit position difference is a power of 2, and that the first
8570      sequence of one bits fits within 'bits' bits.  */
8571   if ((mask >> bits) != 0 || bits != (bits & -bits))
8572     return false;
8573
8574   /* Check the sequence of one bits is repeated 64/bits times.  */
8575   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8576 }
8577
8578 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
8579    Assumed precondition: VAL_IN Is not zero.  */
8580
8581 unsigned HOST_WIDE_INT
8582 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8583 {
8584   int lowest_bit_set = ctz_hwi (val_in);
8585   int highest_bit_set = floor_log2 (val_in);
8586   gcc_assert (val_in != 0);
8587
8588   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8589           (HOST_WIDE_INT_1U << lowest_bit_set));
8590 }
8591
8592 /* Create constant where bits outside of lowest bit set to highest bit set
8593    are set to 1.  */
8594
8595 unsigned HOST_WIDE_INT
8596 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8597 {
8598   return val_in | ~aarch64_and_split_imm1 (val_in);
8599 }
8600
8601 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
8602
8603 bool
8604 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8605 {
8606   scalar_int_mode int_mode;
8607   if (!is_a <scalar_int_mode> (mode, &int_mode))
8608     return false;
8609
8610   if (aarch64_bitmask_imm (val_in, int_mode))
8611     return false;
8612
8613   if (aarch64_move_imm (val_in, int_mode))
8614     return false;
8615
8616   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8617
8618   return aarch64_bitmask_imm (imm2, int_mode);
8619 }
8620
8621 /* Return true if val is an immediate that can be loaded into a
8622    register in a single instruction.  */
8623 bool
8624 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8625 {
8626   scalar_int_mode int_mode;
8627   if (!is_a <scalar_int_mode> (mode, &int_mode))
8628     return false;
8629
8630   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8631     return 1;
8632   return aarch64_bitmask_imm (val, int_mode);
8633 }
8634
8635 static bool
8636 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8637 {
8638   rtx base, offset;
8639
8640   if (GET_CODE (x) == HIGH)
8641     return true;
8642
8643   /* There's no way to calculate VL-based values using relocations.  */
8644   subrtx_iterator::array_type array;
8645   FOR_EACH_SUBRTX (iter, array, x, ALL)
8646     if (GET_CODE (*iter) == CONST_POLY_INT)
8647       return true;
8648
8649   split_const (x, &base, &offset);
8650   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8651     {
8652       if (aarch64_classify_symbol (base, INTVAL (offset))
8653           != SYMBOL_FORCE_TO_MEM)
8654         return true;
8655       else
8656         /* Avoid generating a 64-bit relocation in ILP32; leave
8657            to aarch64_expand_mov_immediate to handle it properly.  */
8658         return mode != ptr_mode;
8659     }
8660
8661   return aarch64_tls_referenced_p (x);
8662 }
8663
8664 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8665    The expansion for a table switch is quite expensive due to the number
8666    of instructions, the table lookup and hard to predict indirect jump.
8667    When optimizing for speed, and -O3 enabled, use the per-core tuning if
8668    set, otherwise use tables for > 16 cases as a tradeoff between size and
8669    performance.  When optimizing for size, use the default setting.  */
8670
8671 static unsigned int
8672 aarch64_case_values_threshold (void)
8673 {
8674   /* Use the specified limit for the number of cases before using jump
8675      tables at higher optimization levels.  */
8676   if (optimize > 2
8677       && selected_cpu->tune->max_case_values != 0)
8678     return selected_cpu->tune->max_case_values;
8679   else
8680     return optimize_size ? default_case_values_threshold () : 17;
8681 }
8682
8683 /* Return true if register REGNO is a valid index register.
8684    STRICT_P is true if REG_OK_STRICT is in effect.  */
8685
8686 bool
8687 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8688 {
8689   if (!HARD_REGISTER_NUM_P (regno))
8690     {
8691       if (!strict_p)
8692         return true;
8693
8694       if (!reg_renumber)
8695         return false;
8696
8697       regno = reg_renumber[regno];
8698     }
8699   return GP_REGNUM_P (regno);
8700 }
8701
8702 /* Return true if register REGNO is a valid base register for mode MODE.
8703    STRICT_P is true if REG_OK_STRICT is in effect.  */
8704
8705 bool
8706 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8707 {
8708   if (!HARD_REGISTER_NUM_P (regno))
8709     {
8710       if (!strict_p)
8711         return true;
8712
8713       if (!reg_renumber)
8714         return false;
8715
8716       regno = reg_renumber[regno];
8717     }
8718
8719   /* The fake registers will be eliminated to either the stack or
8720      hard frame pointer, both of which are usually valid base registers.
8721      Reload deals with the cases where the eliminated form isn't valid.  */
8722   return (GP_REGNUM_P (regno)
8723           || regno == SP_REGNUM
8724           || regno == FRAME_POINTER_REGNUM
8725           || regno == ARG_POINTER_REGNUM);
8726 }
8727
8728 /* Return true if X is a valid base register for mode MODE.
8729    STRICT_P is true if REG_OK_STRICT is in effect.  */
8730
8731 static bool
8732 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8733 {
8734   if (!strict_p
8735       && GET_CODE (x) == SUBREG
8736       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8737     x = SUBREG_REG (x);
8738
8739   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8740 }
8741
8742 /* Return true if address offset is a valid index.  If it is, fill in INFO
8743    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
8744
8745 static bool
8746 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8747                         machine_mode mode, bool strict_p)
8748 {
8749   enum aarch64_address_type type;
8750   rtx index;
8751   int shift;
8752
8753   /* (reg:P) */
8754   if ((REG_P (x) || GET_CODE (x) == SUBREG)
8755       && GET_MODE (x) == Pmode)
8756     {
8757       type = ADDRESS_REG_REG;
8758       index = x;
8759       shift = 0;
8760     }
8761   /* (sign_extend:DI (reg:SI)) */
8762   else if ((GET_CODE (x) == SIGN_EXTEND
8763             || GET_CODE (x) == ZERO_EXTEND)
8764            && GET_MODE (x) == DImode
8765            && GET_MODE (XEXP (x, 0)) == SImode)
8766     {
8767       type = (GET_CODE (x) == SIGN_EXTEND)
8768         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8769       index = XEXP (x, 0);
8770       shift = 0;
8771     }
8772   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8773   else if (GET_CODE (x) == MULT
8774            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8775                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8776            && GET_MODE (XEXP (x, 0)) == DImode
8777            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8778            && CONST_INT_P (XEXP (x, 1)))
8779     {
8780       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8781         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8782       index = XEXP (XEXP (x, 0), 0);
8783       shift = exact_log2 (INTVAL (XEXP (x, 1)));
8784     }
8785   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8786   else if (GET_CODE (x) == ASHIFT
8787            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8788                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8789            && GET_MODE (XEXP (x, 0)) == DImode
8790            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8791            && CONST_INT_P (XEXP (x, 1)))
8792     {
8793       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8794         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8795       index = XEXP (XEXP (x, 0), 0);
8796       shift = INTVAL (XEXP (x, 1));
8797     }
8798   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8799   else if ((GET_CODE (x) == SIGN_EXTRACT
8800             || GET_CODE (x) == ZERO_EXTRACT)
8801            && GET_MODE (x) == DImode
8802            && GET_CODE (XEXP (x, 0)) == MULT
8803            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8804            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8805     {
8806       type = (GET_CODE (x) == SIGN_EXTRACT)
8807         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8808       index = XEXP (XEXP (x, 0), 0);
8809       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8810       if (INTVAL (XEXP (x, 1)) != 32 + shift
8811           || INTVAL (XEXP (x, 2)) != 0)
8812         shift = -1;
8813     }
8814   /* (and:DI (mult:DI (reg:DI) (const_int scale))
8815      (const_int 0xffffffff<<shift)) */
8816   else if (GET_CODE (x) == AND
8817            && GET_MODE (x) == DImode
8818            && GET_CODE (XEXP (x, 0)) == MULT
8819            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8820            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8821            && CONST_INT_P (XEXP (x, 1)))
8822     {
8823       type = ADDRESS_REG_UXTW;
8824       index = XEXP (XEXP (x, 0), 0);
8825       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8826       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8827         shift = -1;
8828     }
8829   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8830   else if ((GET_CODE (x) == SIGN_EXTRACT
8831             || GET_CODE (x) == ZERO_EXTRACT)
8832            && GET_MODE (x) == DImode
8833            && GET_CODE (XEXP (x, 0)) == ASHIFT
8834            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8835            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8836     {
8837       type = (GET_CODE (x) == SIGN_EXTRACT)
8838         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8839       index = XEXP (XEXP (x, 0), 0);
8840       shift = INTVAL (XEXP (XEXP (x, 0), 1));
8841       if (INTVAL (XEXP (x, 1)) != 32 + shift
8842           || INTVAL (XEXP (x, 2)) != 0)
8843         shift = -1;
8844     }
8845   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8846      (const_int 0xffffffff<<shift)) */
8847   else if (GET_CODE (x) == AND
8848            && GET_MODE (x) == DImode
8849            && GET_CODE (XEXP (x, 0)) == ASHIFT
8850            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8851            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8852            && CONST_INT_P (XEXP (x, 1)))
8853     {
8854       type = ADDRESS_REG_UXTW;
8855       index = XEXP (XEXP (x, 0), 0);
8856       shift = INTVAL (XEXP (XEXP (x, 0), 1));
8857       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8858         shift = -1;
8859     }
8860   /* (mult:P (reg:P) (const_int scale)) */
8861   else if (GET_CODE (x) == MULT
8862            && GET_MODE (x) == Pmode
8863            && GET_MODE (XEXP (x, 0)) == Pmode
8864            && CONST_INT_P (XEXP (x, 1)))
8865     {
8866       type = ADDRESS_REG_REG;
8867       index = XEXP (x, 0);
8868       shift = exact_log2 (INTVAL (XEXP (x, 1)));
8869     }
8870   /* (ashift:P (reg:P) (const_int shift)) */
8871   else if (GET_CODE (x) == ASHIFT
8872            && GET_MODE (x) == Pmode
8873            && GET_MODE (XEXP (x, 0)) == Pmode
8874            && CONST_INT_P (XEXP (x, 1)))
8875     {
8876       type = ADDRESS_REG_REG;
8877       index = XEXP (x, 0);
8878       shift = INTVAL (XEXP (x, 1));
8879     }
8880   else
8881     return false;
8882
8883   if (!strict_p
8884       && GET_CODE (index) == SUBREG
8885       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8886     index = SUBREG_REG (index);
8887
8888   if (aarch64_sve_data_mode_p (mode))
8889     {
8890       if (type != ADDRESS_REG_REG
8891           || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8892         return false;
8893     }
8894   else
8895     {
8896       if (shift != 0
8897           && !(IN_RANGE (shift, 1, 3)
8898                && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8899         return false;
8900     }
8901
8902   if (REG_P (index)
8903       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8904     {
8905       info->type = type;
8906       info->offset = index;
8907       info->shift = shift;
8908       return true;
8909     }
8910
8911   return false;
8912 }
8913
8914 /* Return true if MODE is one of the modes for which we
8915    support LDP/STP operations.  */
8916
8917 static bool
8918 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8919 {
8920   return mode == SImode || mode == DImode
8921          || mode == SFmode || mode == DFmode
8922          || (aarch64_vector_mode_supported_p (mode)
8923              && (known_eq (GET_MODE_SIZE (mode), 8)
8924                  || (known_eq (GET_MODE_SIZE (mode), 16)
8925                     && (aarch64_tune_params.extra_tuning_flags
8926                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8927 }
8928
8929 /* Return true if REGNO is a virtual pointer register, or an eliminable
8930    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
8931    include stack_pointer or hard_frame_pointer.  */
8932 static bool
8933 virt_or_elim_regno_p (unsigned regno)
8934 {
8935   return ((regno >= FIRST_VIRTUAL_REGISTER
8936            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8937           || regno == FRAME_POINTER_REGNUM
8938           || regno == ARG_POINTER_REGNUM);
8939 }
8940
8941 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8942    If it is, fill in INFO appropriately.  STRICT_P is true if
8943    REG_OK_STRICT is in effect.  */
8944
8945 bool
8946 aarch64_classify_address (struct aarch64_address_info *info,
8947                           rtx x, machine_mode mode, bool strict_p,
8948                           aarch64_addr_query_type type)
8949 {
8950   enum rtx_code code = GET_CODE (x);
8951   rtx op0, op1;
8952   poly_int64 offset;
8953
8954   HOST_WIDE_INT const_size;
8955
8956   /* Whether a vector mode is partial doesn't affect address legitimacy.
8957      Partial vectors like VNx8QImode allow the same indexed addressing
8958      mode and MUL VL addressing mode as full vectors like VNx16QImode;
8959      in both cases, MUL VL counts multiples of GET_MODE_SIZE.  */
8960   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8961   vec_flags &= ~VEC_PARTIAL;
8962
8963   /* On BE, we use load/store pair for all large int mode load/stores.
8964      TI/TFmode may also use a load/store pair.  */
8965   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8966   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8967                             || type == ADDR_QUERY_LDP_STP_N
8968                             || mode == TImode
8969                             || mode == TFmode
8970                             || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8971
8972   /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
8973      corresponds to the actual size of the memory being loaded/stored and the
8974      mode of the corresponding addressing mode is half of that.  */
8975   if (type == ADDR_QUERY_LDP_STP_N
8976       && known_eq (GET_MODE_SIZE (mode), 16))
8977     mode = DFmode;
8978
8979   bool allow_reg_index_p = (!load_store_pair_p
8980                             && (known_lt (GET_MODE_SIZE (mode), 16)
8981                                 || vec_flags == VEC_ADVSIMD
8982                                 || vec_flags & VEC_SVE_DATA));
8983
8984   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8985      [Rn, #offset, MUL VL].  */
8986   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8987       && (code != REG && code != PLUS))
8988     return false;
8989
8990   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8991      REG addressing.  */
8992   if (advsimd_struct_p
8993       && !BYTES_BIG_ENDIAN
8994       && (code != POST_INC && code != REG))
8995     return false;
8996
8997   gcc_checking_assert (GET_MODE (x) == VOIDmode
8998                        || SCALAR_INT_MODE_P (GET_MODE (x)));
8999
9000   switch (code)
9001     {
9002     case REG:
9003     case SUBREG:
9004       info->type = ADDRESS_REG_IMM;
9005       info->base = x;
9006       info->offset = const0_rtx;
9007       info->const_offset = 0;
9008       return aarch64_base_register_rtx_p (x, strict_p);
9009
9010     case PLUS:
9011       op0 = XEXP (x, 0);
9012       op1 = XEXP (x, 1);
9013
9014       if (! strict_p
9015           && REG_P (op0)
9016           && virt_or_elim_regno_p (REGNO (op0))
9017           && poly_int_rtx_p (op1, &offset))
9018         {
9019           info->type = ADDRESS_REG_IMM;
9020           info->base = op0;
9021           info->offset = op1;
9022           info->const_offset = offset;
9023
9024           return true;
9025         }
9026
9027       if (maybe_ne (GET_MODE_SIZE (mode), 0)
9028           && aarch64_base_register_rtx_p (op0, strict_p)
9029           && poly_int_rtx_p (op1, &offset))
9030         {
9031           info->type = ADDRESS_REG_IMM;
9032           info->base = op0;
9033           info->offset = op1;
9034           info->const_offset = offset;
9035
9036           /* TImode and TFmode values are allowed in both pairs of X
9037              registers and individual Q registers.  The available
9038              address modes are:
9039              X,X: 7-bit signed scaled offset
9040              Q:   9-bit signed offset
9041              We conservatively require an offset representable in either mode.
9042              When performing the check for pairs of X registers i.e.  LDP/STP
9043              pass down DImode since that is the natural size of the LDP/STP
9044              instruction memory accesses.  */
9045           if (mode == TImode || mode == TFmode)
9046             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9047                     && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9048                         || offset_12bit_unsigned_scaled_p (mode, offset)));
9049
9050           /* A 7bit offset check because OImode will emit a ldp/stp
9051              instruction (only big endian will get here).
9052              For ldp/stp instructions, the offset is scaled for the size of a
9053              single element of the pair.  */
9054           if (mode == OImode)
9055             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9056
9057           /* Three 9/12 bit offsets checks because CImode will emit three
9058              ldr/str instructions (only big endian will get here).  */
9059           if (mode == CImode)
9060             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9061                     && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9062                                                                offset + 32)
9063                         || offset_12bit_unsigned_scaled_p (V16QImode,
9064                                                            offset + 32)));
9065
9066           /* Two 7bit offsets checks because XImode will emit two ldp/stp
9067              instructions (only big endian will get here).  */
9068           if (mode == XImode)
9069             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9070                     && aarch64_offset_7bit_signed_scaled_p (TImode,
9071                                                             offset + 32));
9072
9073           /* Make "m" use the LD1 offset range for SVE data modes, so
9074              that pre-RTL optimizers like ivopts will work to that
9075              instead of the wider LDR/STR range.  */
9076           if (vec_flags == VEC_SVE_DATA)
9077             return (type == ADDR_QUERY_M
9078                     ? offset_4bit_signed_scaled_p (mode, offset)
9079                     : offset_9bit_signed_scaled_p (mode, offset));
9080
9081           if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9082             {
9083               poly_int64 end_offset = (offset
9084                                        + GET_MODE_SIZE (mode)
9085                                        - BYTES_PER_SVE_VECTOR);
9086               return (type == ADDR_QUERY_M
9087                       ? offset_4bit_signed_scaled_p (mode, offset)
9088                       : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9089                          && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9090                                                          end_offset)));
9091             }
9092
9093           if (vec_flags == VEC_SVE_PRED)
9094             return offset_9bit_signed_scaled_p (mode, offset);
9095
9096           if (load_store_pair_p)
9097             return ((known_eq (GET_MODE_SIZE (mode), 4)
9098                      || known_eq (GET_MODE_SIZE (mode), 8)
9099                      || known_eq (GET_MODE_SIZE (mode), 16))
9100                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9101           else
9102             return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9103                     || offset_12bit_unsigned_scaled_p (mode, offset));
9104         }
9105
9106       if (allow_reg_index_p)
9107         {
9108           /* Look for base + (scaled/extended) index register.  */
9109           if (aarch64_base_register_rtx_p (op0, strict_p)
9110               && aarch64_classify_index (info, op1, mode, strict_p))
9111             {
9112               info->base = op0;
9113               return true;
9114             }
9115           if (aarch64_base_register_rtx_p (op1, strict_p)
9116               && aarch64_classify_index (info, op0, mode, strict_p))
9117             {
9118               info->base = op1;
9119               return true;
9120             }
9121         }
9122
9123       return false;
9124
9125     case POST_INC:
9126     case POST_DEC:
9127     case PRE_INC:
9128     case PRE_DEC:
9129       info->type = ADDRESS_REG_WB;
9130       info->base = XEXP (x, 0);
9131       info->offset = NULL_RTX;
9132       return aarch64_base_register_rtx_p (info->base, strict_p);
9133
9134     case POST_MODIFY:
9135     case PRE_MODIFY:
9136       info->type = ADDRESS_REG_WB;
9137       info->base = XEXP (x, 0);
9138       if (GET_CODE (XEXP (x, 1)) == PLUS
9139           && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9140           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9141           && aarch64_base_register_rtx_p (info->base, strict_p))
9142         {
9143           info->offset = XEXP (XEXP (x, 1), 1);
9144           info->const_offset = offset;
9145
9146           /* TImode and TFmode values are allowed in both pairs of X
9147              registers and individual Q registers.  The available
9148              address modes are:
9149              X,X: 7-bit signed scaled offset
9150              Q:   9-bit signed offset
9151              We conservatively require an offset representable in either mode.
9152            */
9153           if (mode == TImode || mode == TFmode)
9154             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9155                     && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9156
9157           if (load_store_pair_p)
9158             return ((known_eq (GET_MODE_SIZE (mode), 4)
9159                      || known_eq (GET_MODE_SIZE (mode), 8)
9160                      || known_eq (GET_MODE_SIZE (mode), 16))
9161                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9162           else
9163             return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9164         }
9165       return false;
9166
9167     case CONST:
9168     case SYMBOL_REF:
9169     case LABEL_REF:
9170       /* load literal: pc-relative constant pool entry.  Only supported
9171          for SI mode or larger.  */
9172       info->type = ADDRESS_SYMBOLIC;
9173
9174       if (!load_store_pair_p
9175           && GET_MODE_SIZE (mode).is_constant (&const_size)
9176           && const_size >= 4)
9177         {
9178           rtx sym, addend;
9179
9180           split_const (x, &sym, &addend);
9181           return ((GET_CODE (sym) == LABEL_REF
9182                    || (GET_CODE (sym) == SYMBOL_REF
9183                        && CONSTANT_POOL_ADDRESS_P (sym)
9184                        && aarch64_pcrelative_literal_loads)));
9185         }
9186       return false;
9187
9188     case LO_SUM:
9189       info->type = ADDRESS_LO_SUM;
9190       info->base = XEXP (x, 0);
9191       info->offset = XEXP (x, 1);
9192       if (allow_reg_index_p
9193           && aarch64_base_register_rtx_p (info->base, strict_p))
9194         {
9195           rtx sym, offs;
9196           split_const (info->offset, &sym, &offs);
9197           if (GET_CODE (sym) == SYMBOL_REF
9198               && (aarch64_classify_symbol (sym, INTVAL (offs))
9199                   == SYMBOL_SMALL_ABSOLUTE))
9200             {
9201               /* The symbol and offset must be aligned to the access size.  */
9202               unsigned int align;
9203
9204               if (CONSTANT_POOL_ADDRESS_P (sym))
9205                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9206               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9207                 {
9208                   tree exp = SYMBOL_REF_DECL (sym);
9209                   align = TYPE_ALIGN (TREE_TYPE (exp));
9210                   align = aarch64_constant_alignment (exp, align);
9211                 }
9212               else if (SYMBOL_REF_DECL (sym))
9213                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9214               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9215                        && SYMBOL_REF_BLOCK (sym) != NULL)
9216                 align = SYMBOL_REF_BLOCK (sym)->alignment;
9217               else
9218                 align = BITS_PER_UNIT;
9219
9220               poly_int64 ref_size = GET_MODE_SIZE (mode);
9221               if (known_eq (ref_size, 0))
9222                 ref_size = GET_MODE_SIZE (DImode);
9223
9224               return (multiple_p (INTVAL (offs), ref_size)
9225                       && multiple_p (align / BITS_PER_UNIT, ref_size));
9226             }
9227         }
9228       return false;
9229
9230     default:
9231       return false;
9232     }
9233 }
9234
9235 /* Return true if the address X is valid for a PRFM instruction.
9236    STRICT_P is true if we should do strict checking with
9237    aarch64_classify_address.  */
9238
9239 bool
9240 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9241 {
9242   struct aarch64_address_info addr;
9243
9244   /* PRFM accepts the same addresses as DImode...  */
9245   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9246   if (!res)
9247     return false;
9248
9249   /* ... except writeback forms.  */
9250   return addr.type != ADDRESS_REG_WB;
9251 }
9252
9253 bool
9254 aarch64_symbolic_address_p (rtx x)
9255 {
9256   rtx offset;
9257
9258   split_const (x, &x, &offset);
9259   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
9260 }
9261
9262 /* Classify the base of symbolic expression X.  */
9263
9264 enum aarch64_symbol_type
9265 aarch64_classify_symbolic_expression (rtx x)
9266 {
9267   rtx offset;
9268
9269   split_const (x, &x, &offset);
9270   return aarch64_classify_symbol (x, INTVAL (offset));
9271 }
9272
9273
9274 /* Return TRUE if X is a legitimate address for accessing memory in
9275    mode MODE.  */
9276 static bool
9277 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9278 {
9279   struct aarch64_address_info addr;
9280
9281   return aarch64_classify_address (&addr, x, mode, strict_p);
9282 }
9283
9284 /* Return TRUE if X is a legitimate address of type TYPE for accessing
9285    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
9286 bool
9287 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9288                               aarch64_addr_query_type type)
9289 {
9290   struct aarch64_address_info addr;
9291
9292   return aarch64_classify_address (&addr, x, mode, strict_p, type);
9293 }
9294
9295 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
9296
9297 static bool
9298 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9299                                          poly_int64 orig_offset,
9300                                          machine_mode mode)
9301 {
9302   HOST_WIDE_INT size;
9303   if (GET_MODE_SIZE (mode).is_constant (&size))
9304     {
9305       HOST_WIDE_INT const_offset, second_offset;
9306
9307       /* A general SVE offset is A * VQ + B.  Remove the A component from
9308          coefficient 0 in order to get the constant B.  */
9309       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9310
9311       /* Split an out-of-range address displacement into a base and
9312          offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
9313          range otherwise to increase opportunities for sharing the base
9314          address of different sizes.  Unaligned accesses use the signed
9315          9-bit range, TImode/TFmode use the intersection of signed
9316          scaled 7-bit and signed 9-bit offset.  */
9317       if (mode == TImode || mode == TFmode)
9318         second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9319       else if ((const_offset & (size - 1)) != 0)
9320         second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9321       else
9322         second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
9323
9324       if (second_offset == 0 || known_eq (orig_offset, second_offset))
9325         return false;
9326
9327       /* Split the offset into second_offset and the rest.  */
9328       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9329       *offset2 = gen_int_mode (second_offset, Pmode);
9330       return true;
9331     }
9332   else
9333     {
9334       /* Get the mode we should use as the basis of the range.  For structure
9335          modes this is the mode of one vector.  */
9336       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9337       machine_mode step_mode
9338         = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9339
9340       /* Get the "mul vl" multiplier we'd like to use.  */
9341       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9342       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9343       if (vec_flags & VEC_SVE_DATA)
9344         /* LDR supports a 9-bit range, but the move patterns for
9345            structure modes require all vectors to be in range of the
9346            same base.  The simplest way of accomodating that while still
9347            promoting reuse of anchor points between different modes is
9348            to use an 8-bit range unconditionally.  */
9349         vnum = ((vnum + 128) & 255) - 128;
9350       else
9351         /* Predicates are only handled singly, so we might as well use
9352            the full range.  */
9353         vnum = ((vnum + 256) & 511) - 256;
9354       if (vnum == 0)
9355         return false;
9356
9357       /* Convert the "mul vl" multiplier into a byte offset.  */
9358       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9359       if (known_eq (second_offset, orig_offset))
9360         return false;
9361
9362       /* Split the offset into second_offset and the rest.  */
9363       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9364       *offset2 = gen_int_mode (second_offset, Pmode);
9365       return true;
9366     }
9367 }
9368
9369 /* Return the binary representation of floating point constant VALUE in INTVAL.
9370    If the value cannot be converted, return false without setting INTVAL.
9371    The conversion is done in the given MODE.  */
9372 bool
9373 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9374 {
9375
9376   /* We make a general exception for 0.  */
9377   if (aarch64_float_const_zero_rtx_p (value))
9378     {
9379       *intval = 0;
9380       return true;
9381     }
9382
9383   scalar_float_mode mode;
9384   if (GET_CODE (value) != CONST_DOUBLE
9385       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
9386       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9387       /* Only support up to DF mode.  */
9388       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9389     return false;
9390
9391   unsigned HOST_WIDE_INT ival = 0;
9392
9393   long res[2];
9394   real_to_target (res,
9395                   CONST_DOUBLE_REAL_VALUE (value),
9396                   REAL_MODE_FORMAT (mode));
9397
9398   if (mode == DFmode)
9399     {
9400       int order = BYTES_BIG_ENDIAN ? 1 : 0;
9401       ival = zext_hwi (res[order], 32);
9402       ival |= (zext_hwi (res[1 - order], 32) << 32);
9403     }
9404   else
9405       ival = zext_hwi (res[0], 32);
9406
9407   *intval = ival;
9408   return true;
9409 }
9410
9411 /* Return TRUE if rtx X is an immediate constant that can be moved using a
9412    single MOV(+MOVK) followed by an FMOV.  */
9413 bool
9414 aarch64_float_const_rtx_p (rtx x)
9415 {
9416   machine_mode mode = GET_MODE (x);
9417   if (mode == VOIDmode)
9418     return false;
9419
9420   /* Determine whether it's cheaper to write float constants as
9421      mov/movk pairs over ldr/adrp pairs.  */
9422   unsigned HOST_WIDE_INT ival;
9423
9424   if (GET_CODE (x) == CONST_DOUBLE
9425       && SCALAR_FLOAT_MODE_P (mode)
9426       && aarch64_reinterpret_float_as_int (x, &ival))
9427     {
9428       scalar_int_mode imode = (mode == HFmode
9429                                ? SImode
9430                                : int_mode_for_mode (mode).require ());
9431       int num_instr = aarch64_internal_mov_immediate
9432                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9433       return num_instr < 3;
9434     }
9435
9436   return false;
9437 }
9438
9439 /* Return TRUE if rtx X is immediate constant 0.0 */
9440 bool
9441 aarch64_float_const_zero_rtx_p (rtx x)
9442 {
9443   if (GET_MODE (x) == VOIDmode)
9444     return false;
9445
9446   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
9447     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
9448   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
9449 }
9450
9451 /* Return TRUE if rtx X is immediate constant that fits in a single
9452    MOVI immediate operation.  */
9453 bool
9454 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9455 {
9456   if (!TARGET_SIMD)
9457      return false;
9458
9459   machine_mode vmode;
9460   scalar_int_mode imode;
9461   unsigned HOST_WIDE_INT ival;
9462
9463   if (GET_CODE (x) == CONST_DOUBLE
9464       && SCALAR_FLOAT_MODE_P (mode))
9465     {
9466       if (!aarch64_reinterpret_float_as_int (x, &ival))
9467         return false;
9468
9469       /* We make a general exception for 0.  */
9470       if (aarch64_float_const_zero_rtx_p (x))
9471         return true;
9472
9473       imode = int_mode_for_mode (mode).require ();
9474     }
9475   else if (GET_CODE (x) == CONST_INT
9476            && is_a <scalar_int_mode> (mode, &imode))
9477     ival = INTVAL (x);
9478   else
9479     return false;
9480
9481    /* use a 64 bit mode for everything except for DI/DF mode, where we use
9482      a 128 bit vector mode.  */
9483   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
9484
9485   vmode = aarch64_simd_container_mode (imode, width);
9486   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9487
9488   return aarch64_simd_valid_immediate (v_op, NULL);
9489 }
9490
9491
9492 /* Return the fixed registers used for condition codes.  */
9493
9494 static bool
9495 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9496 {
9497   *p1 = CC_REGNUM;
9498   *p2 = INVALID_REGNUM;
9499   return true;
9500 }
9501
9502 /* This function is used by the call expanders of the machine description.
9503    RESULT is the register in which the result is returned.  It's NULL for
9504    "call" and "sibcall".
9505    MEM is the location of the function call.
9506    CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
9507    SIBCALL indicates whether this function call is normal call or sibling call.
9508    It will generate different pattern accordingly.  */
9509
9510 void
9511 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
9512 {
9513   rtx call, callee, tmp;
9514   rtvec vec;
9515   machine_mode mode;
9516
9517   gcc_assert (MEM_P (mem));
9518   callee = XEXP (mem, 0);
9519   mode = GET_MODE (callee);
9520   gcc_assert (mode == Pmode);
9521
9522   /* Decide if we should generate indirect calls by loading the
9523      address of the callee into a register before performing
9524      the branch-and-link.  */
9525   if (SYMBOL_REF_P (callee)
9526       ? (aarch64_is_long_call_p (callee)
9527          || aarch64_is_noplt_call_p (callee))
9528       : !REG_P (callee))
9529     XEXP (mem, 0) = force_reg (mode, callee);
9530
9531   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9532
9533   if (result != NULL_RTX)
9534     call = gen_rtx_SET (result, call);
9535
9536   if (sibcall)
9537     tmp = ret_rtx;
9538   else
9539     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9540
9541   gcc_assert (CONST_INT_P (callee_abi));
9542   callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9543                                UNSPEC_CALLEE_ABI);
9544
9545   vec = gen_rtvec (3, call, callee_abi, tmp);
9546   call = gen_rtx_PARALLEL (VOIDmode, vec);
9547
9548   aarch64_emit_call_insn (call);
9549 }
9550
9551 /* Emit call insn with PAT and do aarch64-specific handling.  */
9552
9553 void
9554 aarch64_emit_call_insn (rtx pat)
9555 {
9556   rtx insn = emit_call_insn (pat);
9557
9558   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9559   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9560   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9561 }
9562
9563 machine_mode
9564 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9565 {
9566   machine_mode mode_x = GET_MODE (x);
9567   rtx_code code_x = GET_CODE (x);
9568
9569   /* All floating point compares return CCFP if it is an equality
9570      comparison, and CCFPE otherwise.  */
9571   if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9572     {
9573       switch (code)
9574         {
9575         case EQ:
9576         case NE:
9577         case UNORDERED:
9578         case ORDERED:
9579         case UNLT:
9580         case UNLE:
9581         case UNGT:
9582         case UNGE:
9583         case UNEQ:
9584           return CCFPmode;
9585
9586         case LT:
9587         case LE:
9588         case GT:
9589         case GE:
9590         case LTGT:
9591           return CCFPEmode;
9592
9593         default:
9594           gcc_unreachable ();
9595         }
9596     }
9597
9598   /* Equality comparisons of short modes against zero can be performed
9599      using the TST instruction with the appropriate bitmask.  */
9600   if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9601       && (code == EQ || code == NE)
9602       && (mode_x == HImode || mode_x == QImode))
9603     return CC_NZmode;
9604
9605   /* Similarly, comparisons of zero_extends from shorter modes can
9606      be performed using an ANDS with an immediate mask.  */
9607   if (y == const0_rtx && code_x == ZERO_EXTEND
9608       && (mode_x == SImode || mode_x == DImode)
9609       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9610       && (code == EQ || code == NE))
9611     return CC_NZmode;
9612
9613   if ((mode_x == SImode || mode_x == DImode)
9614       && y == const0_rtx
9615       && (code == EQ || code == NE || code == LT || code == GE)
9616       && (code_x == PLUS || code_x == MINUS || code_x == AND
9617           || code_x == NEG
9618           || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9619               && CONST_INT_P (XEXP (x, 2)))))
9620     return CC_NZmode;
9621
9622   /* A compare with a shifted operand.  Because of canonicalization,
9623      the comparison will have to be swapped when we emit the assembly
9624      code.  */
9625   if ((mode_x == SImode || mode_x == DImode)
9626       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9627       && (code_x == ASHIFT || code_x == ASHIFTRT
9628           || code_x == LSHIFTRT
9629           || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9630     return CC_SWPmode;
9631
9632   /* Similarly for a negated operand, but we can only do this for
9633      equalities.  */
9634   if ((mode_x == SImode || mode_x == DImode)
9635       && (REG_P (y) || GET_CODE (y) == SUBREG)
9636       && (code == EQ || code == NE)
9637       && code_x == NEG)
9638     return CC_Zmode;
9639
9640   /* A test for unsigned overflow from an addition.  */
9641   if ((mode_x == DImode || mode_x == TImode)
9642       && (code == LTU || code == GEU)
9643       && code_x == PLUS
9644       && rtx_equal_p (XEXP (x, 0), y))
9645     return CC_Cmode;
9646
9647   /* A test for unsigned overflow from an add with carry.  */
9648   if ((mode_x == DImode || mode_x == TImode)
9649       && (code == LTU || code == GEU)
9650       && code_x == PLUS
9651       && CONST_SCALAR_INT_P (y)
9652       && (rtx_mode_t (y, mode_x)
9653           == (wi::shwi (1, mode_x)
9654               << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9655     return CC_ADCmode;
9656
9657   /* A test for signed overflow.  */
9658   if ((mode_x == DImode || mode_x == TImode)
9659       && code == NE
9660       && code_x == PLUS
9661       && GET_CODE (y) == SIGN_EXTEND)
9662     return CC_Vmode;
9663
9664   /* For everything else, return CCmode.  */
9665   return CCmode;
9666 }
9667
9668 static int
9669 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9670
9671 int
9672 aarch64_get_condition_code (rtx x)
9673 {
9674   machine_mode mode = GET_MODE (XEXP (x, 0));
9675   enum rtx_code comp_code = GET_CODE (x);
9676
9677   if (GET_MODE_CLASS (mode) != MODE_CC)
9678     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9679   return aarch64_get_condition_code_1 (mode, comp_code);
9680 }
9681
9682 static int
9683 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9684 {
9685   switch (mode)
9686     {
9687     case E_CCFPmode:
9688     case E_CCFPEmode:
9689       switch (comp_code)
9690         {
9691         case GE: return AARCH64_GE;
9692         case GT: return AARCH64_GT;
9693         case LE: return AARCH64_LS;
9694         case LT: return AARCH64_MI;
9695         case NE: return AARCH64_NE;
9696         case EQ: return AARCH64_EQ;
9697         case ORDERED: return AARCH64_VC;
9698         case UNORDERED: return AARCH64_VS;
9699         case UNLT: return AARCH64_LT;
9700         case UNLE: return AARCH64_LE;
9701         case UNGT: return AARCH64_HI;
9702         case UNGE: return AARCH64_PL;
9703         default: return -1;
9704         }
9705       break;
9706
9707     case E_CCmode:
9708       switch (comp_code)
9709         {
9710         case NE: return AARCH64_NE;
9711         case EQ: return AARCH64_EQ;
9712         case GE: return AARCH64_GE;
9713         case GT: return AARCH64_GT;
9714         case LE: return AARCH64_LE;
9715         case LT: return AARCH64_LT;
9716         case GEU: return AARCH64_CS;
9717         case GTU: return AARCH64_HI;
9718         case LEU: return AARCH64_LS;
9719         case LTU: return AARCH64_CC;
9720         default: return -1;
9721         }
9722       break;
9723
9724     case E_CC_SWPmode:
9725       switch (comp_code)
9726         {
9727         case NE: return AARCH64_NE;
9728         case EQ: return AARCH64_EQ;
9729         case GE: return AARCH64_LE;
9730         case GT: return AARCH64_LT;
9731         case LE: return AARCH64_GE;
9732         case LT: return AARCH64_GT;
9733         case GEU: return AARCH64_LS;
9734         case GTU: return AARCH64_CC;
9735         case LEU: return AARCH64_CS;
9736         case LTU: return AARCH64_HI;
9737         default: return -1;
9738         }
9739       break;
9740
9741     case E_CC_NZCmode:
9742       switch (comp_code)
9743         {
9744         case NE: return AARCH64_NE; /* = any */
9745         case EQ: return AARCH64_EQ; /* = none */
9746         case GE: return AARCH64_PL; /* = nfrst */
9747         case LT: return AARCH64_MI; /* = first */
9748         case GEU: return AARCH64_CS; /* = nlast */
9749         case GTU: return AARCH64_HI; /* = pmore */
9750         case LEU: return AARCH64_LS; /* = plast */
9751         case LTU: return AARCH64_CC; /* = last */
9752         default: return -1;
9753         }
9754       break;
9755
9756     case E_CC_NZmode:
9757       switch (comp_code)
9758         {
9759         case NE: return AARCH64_NE;
9760         case EQ: return AARCH64_EQ;
9761         case GE: return AARCH64_PL;
9762         case LT: return AARCH64_MI;
9763         default: return -1;
9764         }
9765       break;
9766
9767     case E_CC_Zmode:
9768       switch (comp_code)
9769         {
9770         case NE: return AARCH64_NE;
9771         case EQ: return AARCH64_EQ;
9772         default: return -1;
9773         }
9774       break;
9775
9776     case E_CC_Cmode:
9777       switch (comp_code)
9778         {
9779         case LTU: return AARCH64_CS;
9780         case GEU: return AARCH64_CC;
9781         default: return -1;
9782         }
9783       break;
9784
9785     case E_CC_ADCmode:
9786       switch (comp_code)
9787         {
9788         case GEU: return AARCH64_CS;
9789         case LTU: return AARCH64_CC;
9790         default: return -1;
9791         }
9792       break;
9793
9794     case E_CC_Vmode:
9795       switch (comp_code)
9796         {
9797         case NE: return AARCH64_VS;
9798         case EQ: return AARCH64_VC;
9799         default: return -1;
9800         }
9801       break;
9802
9803     default:
9804       return -1;
9805     }
9806
9807   return -1;
9808 }
9809
9810 bool
9811 aarch64_const_vec_all_same_in_range_p (rtx x,
9812                                        HOST_WIDE_INT minval,
9813                                        HOST_WIDE_INT maxval)
9814 {
9815   rtx elt;
9816   return (const_vec_duplicate_p (x, &elt)
9817           && CONST_INT_P (elt)
9818           && IN_RANGE (INTVAL (elt), minval, maxval));
9819 }
9820
9821 bool
9822 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9823 {
9824   return aarch64_const_vec_all_same_in_range_p (x, val, val);
9825 }
9826
9827 /* Return true if VEC is a constant in which every element is in the range
9828    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
9829
9830 static bool
9831 aarch64_const_vec_all_in_range_p (rtx vec,
9832                                   HOST_WIDE_INT minval,
9833                                   HOST_WIDE_INT maxval)
9834 {
9835   if (GET_CODE (vec) != CONST_VECTOR
9836       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9837     return false;
9838
9839   int nunits;
9840   if (!CONST_VECTOR_STEPPED_P (vec))
9841     nunits = const_vector_encoded_nelts (vec);
9842   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9843     return false;
9844
9845   for (int i = 0; i < nunits; i++)
9846     {
9847       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9848       if (!CONST_INT_P (vec_elem)
9849           || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9850         return false;
9851     }
9852   return true;
9853 }
9854
9855 /* N Z C V.  */
9856 #define AARCH64_CC_V 1
9857 #define AARCH64_CC_C (1 << 1)
9858 #define AARCH64_CC_Z (1 << 2)
9859 #define AARCH64_CC_N (1 << 3)
9860
9861 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
9862 static const int aarch64_nzcv_codes[] =
9863 {
9864   0,            /* EQ, Z == 1.  */
9865   AARCH64_CC_Z, /* NE, Z == 0.  */
9866   0,            /* CS, C == 1.  */
9867   AARCH64_CC_C, /* CC, C == 0.  */
9868   0,            /* MI, N == 1.  */
9869   AARCH64_CC_N, /* PL, N == 0.  */
9870   0,            /* VS, V == 1.  */
9871   AARCH64_CC_V, /* VC, V == 0.  */
9872   0,            /* HI, C ==1 && Z == 0.  */
9873   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
9874   AARCH64_CC_V, /* GE, N == V.  */
9875   0,            /* LT, N != V.  */
9876   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
9877   0,            /* LE, !(Z == 0 && N == V).  */
9878   0,            /* AL, Any.  */
9879   0             /* NV, Any.  */
9880 };
9881
9882 /* Print floating-point vector immediate operand X to F, negating it
9883    first if NEGATE is true.  Return true on success, false if it isn't
9884    a constant we can handle.  */
9885
9886 static bool
9887 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9888 {
9889   rtx elt;
9890
9891   if (!const_vec_duplicate_p (x, &elt))
9892     return false;
9893
9894   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9895   if (negate)
9896     r = real_value_negate (&r);
9897
9898   /* Handle the SVE single-bit immediates specially, since they have a
9899      fixed form in the assembly syntax.  */
9900   if (real_equal (&r, &dconst0))
9901     asm_fprintf (f, "0.0");
9902   else if (real_equal (&r, &dconst2))
9903     asm_fprintf (f, "2.0");
9904   else if (real_equal (&r, &dconst1))
9905     asm_fprintf (f, "1.0");
9906   else if (real_equal (&r, &dconsthalf))
9907     asm_fprintf (f, "0.5");
9908   else
9909     {
9910       const int buf_size = 20;
9911       char float_buf[buf_size] = {'\0'};
9912       real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9913                                 1, GET_MODE (elt));
9914       asm_fprintf (f, "%s", float_buf);
9915     }
9916
9917   return true;
9918 }
9919
9920 /* Return the equivalent letter for size.  */
9921 static char
9922 sizetochar (int size)
9923 {
9924   switch (size)
9925     {
9926     case 64: return 'd';
9927     case 32: return 's';
9928     case 16: return 'h';
9929     case 8 : return 'b';
9930     default: gcc_unreachable ();
9931     }
9932 }
9933
9934 /* Print operand X to file F in a target specific manner according to CODE.
9935    The acceptable formatting commands given by CODE are:
9936      'c':               An integer or symbol address without a preceding #
9937                         sign.
9938      'C':               Take the duplicated element in a vector constant
9939                         and print it in hex.
9940      'D':               Take the duplicated element in a vector constant
9941                         and print it as an unsigned integer, in decimal.
9942      'e':               Print the sign/zero-extend size as a character 8->b,
9943                         16->h, 32->w.  Can also be used for masks:
9944                         0xff->b, 0xffff->h, 0xffffffff->w.
9945      'I':               If the operand is a duplicated vector constant,
9946                         replace it with the duplicated scalar.  If the
9947                         operand is then a floating-point constant, replace
9948                         it with the integer bit representation.  Print the
9949                         transformed constant as a signed decimal number.
9950      'p':               Prints N such that 2^N == X (X must be power of 2 and
9951                         const int).
9952      'P':               Print the number of non-zero bits in X (a const_int).
9953      'H':               Print the higher numbered register of a pair (TImode)
9954                         of regs.
9955      'm':               Print a condition (eq, ne, etc).
9956      'M':               Same as 'm', but invert condition.
9957      'N':               Take the duplicated element in a vector constant
9958                         and print the negative of it in decimal.
9959      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
9960      'S/T/U/V':         Print a FP/SIMD register name for a register list.
9961                         The register printed is the FP/SIMD register name
9962                         of X + 0/1/2/3 for S/T/U/V.
9963      'R':               Print a scalar Integer/FP/SIMD register name + 1.
9964      'X':               Print bottom 16 bits of integer constant in hex.
9965      'w/x':             Print a general register name or the zero register
9966                         (32-bit or 64-bit).
9967      '0':               Print a normal operand, if it's a general register,
9968                         then we assume DImode.
9969      'k':               Print NZCV for conditional compare instructions.
9970      'A':               Output address constant representing the first
9971                         argument of X, specifying a relocation offset
9972                         if appropriate.
9973      'L':               Output constant address specified by X
9974                         with a relocation offset if appropriate.
9975      'G':               Prints address of X, specifying a PC relative
9976                         relocation mode if appropriate.
9977      'y':               Output address of LDP or STP - this is used for
9978                         some LDP/STPs which don't use a PARALLEL in their
9979                         pattern (so the mode needs to be adjusted).
9980      'z':               Output address of a typical LDP or STP.  */
9981
9982 static void
9983 aarch64_print_operand (FILE *f, rtx x, int code)
9984 {
9985   rtx elt;
9986   switch (code)
9987     {
9988     case 'c':
9989       switch (GET_CODE (x))
9990         {
9991         case CONST_INT:
9992           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9993           break;
9994
9995         case SYMBOL_REF:
9996           output_addr_const (f, x);
9997           break;
9998
9999         case CONST:
10000           if (GET_CODE (XEXP (x, 0)) == PLUS
10001               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
10002             {
10003               output_addr_const (f, x);
10004               break;
10005             }
10006           /* Fall through.  */
10007
10008         default:
10009           output_operand_lossage ("unsupported operand for code '%c'", code);
10010         }
10011       break;
10012
10013     case 'e':
10014       {
10015         x = unwrap_const_vec_duplicate (x);
10016         if (!CONST_INT_P (x))
10017           {
10018             output_operand_lossage ("invalid operand for '%%%c'", code);
10019             return;
10020           }
10021
10022         HOST_WIDE_INT val = INTVAL (x);
10023         if ((val & ~7) == 8 || val == 0xff)
10024           fputc ('b', f);
10025         else if ((val & ~7) == 16 || val == 0xffff)
10026           fputc ('h', f);
10027         else if ((val & ~7) == 32 || val == 0xffffffff)
10028           fputc ('w', f);
10029         else
10030           {
10031             output_operand_lossage ("invalid operand for '%%%c'", code);
10032             return;
10033           }
10034       }
10035       break;
10036
10037     case 'p':
10038       {
10039         int n;
10040
10041         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
10042           {
10043             output_operand_lossage ("invalid operand for '%%%c'", code);
10044             return;
10045           }
10046
10047         asm_fprintf (f, "%d", n);
10048       }
10049       break;
10050
10051     case 'P':
10052       if (!CONST_INT_P (x))
10053         {
10054           output_operand_lossage ("invalid operand for '%%%c'", code);
10055           return;
10056         }
10057
10058       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
10059       break;
10060
10061     case 'H':
10062       if (x == const0_rtx)
10063         {
10064           asm_fprintf (f, "xzr");
10065           break;
10066         }
10067
10068       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
10069         {
10070           output_operand_lossage ("invalid operand for '%%%c'", code);
10071           return;
10072         }
10073
10074       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
10075       break;
10076
10077     case 'I':
10078       {
10079         x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10080         if (CONST_INT_P (x))
10081           asm_fprintf (f, "%wd", INTVAL (x));
10082         else
10083           {
10084             output_operand_lossage ("invalid operand for '%%%c'", code);
10085             return;
10086           }
10087         break;
10088       }
10089
10090     case 'M':
10091     case 'm':
10092       {
10093         int cond_code;
10094         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
10095         if (x == const_true_rtx)
10096           {
10097             if (code == 'M')
10098               fputs ("nv", f);
10099             return;
10100           }
10101
10102         if (!COMPARISON_P (x))
10103           {
10104             output_operand_lossage ("invalid operand for '%%%c'", code);
10105             return;
10106           }
10107
10108         cond_code = aarch64_get_condition_code (x);
10109         gcc_assert (cond_code >= 0);
10110         if (code == 'M')
10111           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
10112         if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10113           fputs (aarch64_sve_condition_codes[cond_code], f);
10114         else
10115           fputs (aarch64_condition_codes[cond_code], f);
10116       }
10117       break;
10118
10119     case 'N':
10120       if (!const_vec_duplicate_p (x, &elt))
10121         {
10122           output_operand_lossage ("invalid vector constant");
10123           return;
10124         }
10125
10126       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10127         asm_fprintf (f, "%wd", -INTVAL (elt));
10128       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10129                && aarch64_print_vector_float_operand (f, x, true))
10130         ;
10131       else
10132         {
10133           output_operand_lossage ("invalid vector constant");
10134           return;
10135         }
10136       break;
10137
10138     case 'b':
10139     case 'h':
10140     case 's':
10141     case 'd':
10142     case 'q':
10143       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10144         {
10145           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10146           return;
10147         }
10148       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10149       break;
10150
10151     case 'S':
10152     case 'T':
10153     case 'U':
10154     case 'V':
10155       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10156         {
10157           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10158           return;
10159         }
10160       asm_fprintf (f, "%c%d",
10161                    aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10162                    REGNO (x) - V0_REGNUM + (code - 'S'));
10163       break;
10164
10165     case 'R':
10166       if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10167         asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10168       else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10169         asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10170       else
10171         output_operand_lossage ("incompatible register operand for '%%%c'",
10172                                 code);
10173       break;
10174
10175     case 'X':
10176       if (!CONST_INT_P (x))
10177         {
10178           output_operand_lossage ("invalid operand for '%%%c'", code);
10179           return;
10180         }
10181       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10182       break;
10183
10184     case 'C':
10185       {
10186         /* Print a replicated constant in hex.  */
10187         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10188           {
10189             output_operand_lossage ("invalid operand for '%%%c'", code);
10190             return;
10191           }
10192         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10193         asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10194       }
10195       break;
10196
10197     case 'D':
10198       {
10199         /* Print a replicated constant in decimal, treating it as
10200            unsigned.  */
10201         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10202           {
10203             output_operand_lossage ("invalid operand for '%%%c'", code);
10204             return;
10205           }
10206         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10207         asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10208       }
10209       break;
10210
10211     case 'w':
10212     case 'x':
10213       if (x == const0_rtx
10214           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10215         {
10216           asm_fprintf (f, "%czr", code);
10217           break;
10218         }
10219
10220       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10221         {
10222           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10223           break;
10224         }
10225
10226       if (REG_P (x) && REGNO (x) == SP_REGNUM)
10227         {
10228           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10229           break;
10230         }
10231
10232       /* Fall through */
10233
10234     case 0:
10235       if (x == NULL)
10236         {
10237           output_operand_lossage ("missing operand");
10238           return;
10239         }
10240
10241       switch (GET_CODE (x))
10242         {
10243         case REG:
10244           if (aarch64_sve_data_mode_p (GET_MODE (x)))
10245             {
10246               if (REG_NREGS (x) == 1)
10247                 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10248               else
10249                 {
10250                   char suffix
10251                     = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10252                   asm_fprintf (f, "{z%d.%c - z%d.%c}",
10253                                REGNO (x) - V0_REGNUM, suffix,
10254                                END_REGNO (x) - V0_REGNUM - 1, suffix);
10255                 }
10256             }
10257           else
10258             asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10259           break;
10260
10261         case MEM:
10262           output_address (GET_MODE (x), XEXP (x, 0));
10263           break;
10264
10265         case LABEL_REF:
10266         case SYMBOL_REF:
10267           output_addr_const (asm_out_file, x);
10268           break;
10269
10270         case CONST_INT:
10271           asm_fprintf (f, "%wd", INTVAL (x));
10272           break;
10273
10274         case CONST:
10275           if (!VECTOR_MODE_P (GET_MODE (x)))
10276             {
10277               output_addr_const (asm_out_file, x);
10278               break;
10279             }
10280           /* fall through */
10281
10282         case CONST_VECTOR:
10283           if (!const_vec_duplicate_p (x, &elt))
10284             {
10285               output_operand_lossage ("invalid vector constant");
10286               return;
10287             }
10288
10289           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10290             asm_fprintf (f, "%wd", INTVAL (elt));
10291           else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10292                    && aarch64_print_vector_float_operand (f, x, false))
10293             ;
10294           else
10295             {
10296               output_operand_lossage ("invalid vector constant");
10297               return;
10298             }
10299           break;
10300
10301         case CONST_DOUBLE:
10302           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10303              be getting CONST_DOUBLEs holding integers.  */
10304           gcc_assert (GET_MODE (x) != VOIDmode);
10305           if (aarch64_float_const_zero_rtx_p (x))
10306             {
10307               fputc ('0', f);
10308               break;
10309             }
10310           else if (aarch64_float_const_representable_p (x))
10311             {
10312 #define buf_size 20
10313               char float_buf[buf_size] = {'\0'};
10314               real_to_decimal_for_mode (float_buf,
10315                                         CONST_DOUBLE_REAL_VALUE (x),
10316                                         buf_size, buf_size,
10317                                         1, GET_MODE (x));
10318               asm_fprintf (asm_out_file, "%s", float_buf);
10319               break;
10320 #undef buf_size
10321             }
10322           output_operand_lossage ("invalid constant");
10323           return;
10324         default:
10325           output_operand_lossage ("invalid operand");
10326           return;
10327         }
10328       break;
10329
10330     case 'A':
10331       if (GET_CODE (x) == HIGH)
10332         x = XEXP (x, 0);
10333
10334       switch (aarch64_classify_symbolic_expression (x))
10335         {
10336         case SYMBOL_SMALL_GOT_4G:
10337           asm_fprintf (asm_out_file, ":got:");
10338           break;
10339
10340         case SYMBOL_SMALL_TLSGD:
10341           asm_fprintf (asm_out_file, ":tlsgd:");
10342           break;
10343
10344         case SYMBOL_SMALL_TLSDESC:
10345           asm_fprintf (asm_out_file, ":tlsdesc:");
10346           break;
10347
10348         case SYMBOL_SMALL_TLSIE:
10349           asm_fprintf (asm_out_file, ":gottprel:");
10350           break;
10351
10352         case SYMBOL_TLSLE24:
10353           asm_fprintf (asm_out_file, ":tprel:");
10354           break;
10355
10356         case SYMBOL_TINY_GOT:
10357           gcc_unreachable ();
10358           break;
10359
10360         default:
10361           break;
10362         }
10363       output_addr_const (asm_out_file, x);
10364       break;
10365
10366     case 'L':
10367       switch (aarch64_classify_symbolic_expression (x))
10368         {
10369         case SYMBOL_SMALL_GOT_4G:
10370           asm_fprintf (asm_out_file, ":lo12:");
10371           break;
10372
10373         case SYMBOL_SMALL_TLSGD:
10374           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10375           break;
10376
10377         case SYMBOL_SMALL_TLSDESC:
10378           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10379           break;
10380
10381         case SYMBOL_SMALL_TLSIE:
10382           asm_fprintf (asm_out_file, ":gottprel_lo12:");
10383           break;
10384
10385         case SYMBOL_TLSLE12:
10386           asm_fprintf (asm_out_file, ":tprel_lo12:");
10387           break;
10388
10389         case SYMBOL_TLSLE24:
10390           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10391           break;
10392
10393         case SYMBOL_TINY_GOT:
10394           asm_fprintf (asm_out_file, ":got:");
10395           break;
10396
10397         case SYMBOL_TINY_TLSIE:
10398           asm_fprintf (asm_out_file, ":gottprel:");
10399           break;
10400
10401         default:
10402           break;
10403         }
10404       output_addr_const (asm_out_file, x);
10405       break;
10406
10407     case 'G':
10408       switch (aarch64_classify_symbolic_expression (x))
10409         {
10410         case SYMBOL_TLSLE24:
10411           asm_fprintf (asm_out_file, ":tprel_hi12:");
10412           break;
10413         default:
10414           break;
10415         }
10416       output_addr_const (asm_out_file, x);
10417       break;
10418
10419     case 'k':
10420       {
10421         HOST_WIDE_INT cond_code;
10422
10423         if (!CONST_INT_P (x))
10424           {
10425             output_operand_lossage ("invalid operand for '%%%c'", code);
10426             return;
10427           }
10428
10429         cond_code = INTVAL (x);
10430         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10431         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
10432       }
10433       break;
10434
10435     case 'y':
10436     case 'z':
10437       {
10438         machine_mode mode = GET_MODE (x);
10439
10440         if (GET_CODE (x) != MEM
10441             || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
10442           {
10443             output_operand_lossage ("invalid operand for '%%%c'", code);
10444             return;
10445           }
10446
10447         if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10448                                             code == 'y'
10449                                             ? ADDR_QUERY_LDP_STP_N
10450                                             : ADDR_QUERY_LDP_STP))
10451           output_operand_lossage ("invalid operand prefix '%%%c'", code);
10452       }
10453       break;
10454
10455     default:
10456       output_operand_lossage ("invalid operand prefix '%%%c'", code);
10457       return;
10458     }
10459 }
10460
10461 /* Print address 'x' of a memory access with mode 'mode'.
10462    'op' is the context required by aarch64_classify_address.  It can either be
10463    MEM for a normal memory access or PARALLEL for LDP/STP.  */
10464 static bool
10465 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10466                                 aarch64_addr_query_type type)
10467 {
10468   struct aarch64_address_info addr;
10469   unsigned int size, vec_flags;
10470
10471   /* Check all addresses are Pmode - including ILP32.  */
10472   if (GET_MODE (x) != Pmode
10473       && (!CONST_INT_P (x)
10474           || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10475     {
10476       output_operand_lossage ("invalid address mode");
10477       return false;
10478     }
10479
10480   if (aarch64_classify_address (&addr, x, mode, true, type))
10481     switch (addr.type)
10482       {
10483       case ADDRESS_REG_IMM:
10484         if (known_eq (addr.const_offset, 0))
10485           {
10486             asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10487             return true;
10488           }
10489
10490         vec_flags = aarch64_classify_vector_mode (mode);
10491         if (vec_flags & VEC_ANY_SVE)
10492           {
10493             HOST_WIDE_INT vnum
10494               = exact_div (addr.const_offset,
10495                            aarch64_vl_bytes (mode, vec_flags)).to_constant ();
10496             asm_fprintf (f, "[%s, #%wd, mul vl]",
10497                          reg_names[REGNO (addr.base)], vnum);
10498             return true;
10499           }
10500
10501         asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10502                      INTVAL (addr.offset));
10503         return true;
10504
10505       case ADDRESS_REG_REG:
10506         if (addr.shift == 0)
10507           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
10508                        reg_names [REGNO (addr.offset)]);
10509         else
10510           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
10511                        reg_names [REGNO (addr.offset)], addr.shift);
10512         return true;
10513
10514       case ADDRESS_REG_UXTW:
10515         if (addr.shift == 0)
10516           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
10517                        REGNO (addr.offset) - R0_REGNUM);
10518         else
10519           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
10520                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
10521         return true;
10522
10523       case ADDRESS_REG_SXTW:
10524         if (addr.shift == 0)
10525           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
10526                        REGNO (addr.offset) - R0_REGNUM);
10527         else
10528           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
10529                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
10530         return true;
10531
10532       case ADDRESS_REG_WB:
10533         /* Writeback is only supported for fixed-width modes.  */
10534         size = GET_MODE_SIZE (mode).to_constant ();
10535         switch (GET_CODE (x))
10536           {
10537           case PRE_INC:
10538             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
10539             return true;
10540           case POST_INC:
10541             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
10542             return true;
10543           case PRE_DEC:
10544             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
10545             return true;
10546           case POST_DEC:
10547             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
10548             return true;
10549           case PRE_MODIFY:
10550             asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
10551                          INTVAL (addr.offset));
10552             return true;
10553           case POST_MODIFY:
10554             asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
10555                          INTVAL (addr.offset));
10556             return true;
10557           default:
10558             break;
10559           }
10560         break;
10561
10562       case ADDRESS_LO_SUM:
10563         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10564         output_addr_const (f, addr.offset);
10565         asm_fprintf (f, "]");
10566         return true;
10567
10568       case ADDRESS_SYMBOLIC:
10569         output_addr_const (f, x);
10570         return true;
10571       }
10572
10573   return false;
10574 }
10575
10576 /* Print address 'x' of a memory access with mode 'mode'.  */
10577 static void
10578 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10579 {
10580   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10581     output_addr_const (f, x);
10582 }
10583
10584 bool
10585 aarch64_label_mentioned_p (rtx x)
10586 {
10587   const char *fmt;
10588   int i;
10589
10590   if (GET_CODE (x) == LABEL_REF)
10591     return true;
10592
10593   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10594      referencing instruction, but they are constant offsets, not
10595      symbols.  */
10596   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10597     return false;
10598
10599   fmt = GET_RTX_FORMAT (GET_CODE (x));
10600   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10601     {
10602       if (fmt[i] == 'E')
10603         {
10604           int j;
10605
10606           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10607             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10608               return 1;
10609         }
10610       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10611         return 1;
10612     }
10613
10614   return 0;
10615 }
10616
10617 /* Implement REGNO_REG_CLASS.  */
10618
10619 enum reg_class
10620 aarch64_regno_regclass (unsigned regno)
10621 {
10622   if (STUB_REGNUM_P (regno))
10623     return STUB_REGS;
10624
10625   if (GP_REGNUM_P (regno))
10626     return GENERAL_REGS;
10627
10628   if (regno == SP_REGNUM)
10629     return STACK_REG;
10630
10631   if (regno == FRAME_POINTER_REGNUM
10632       || regno == ARG_POINTER_REGNUM)
10633     return POINTER_REGS;
10634
10635   if (FP_REGNUM_P (regno))
10636     return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10637             : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10638
10639   if (PR_REGNUM_P (regno))
10640     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10641
10642   if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10643     return FFR_REGS;
10644
10645   return NO_REGS;
10646 }
10647
10648 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10649    If OFFSET is out of range, return an offset of an anchor point
10650    that is in range.  Return 0 otherwise.  */
10651
10652 static HOST_WIDE_INT
10653 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10654                        machine_mode mode)
10655 {
10656   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
10657   if (size > 16)
10658     return (offset + 0x400) & ~0x7f0;
10659
10660   /* For offsets that aren't a multiple of the access size, the limit is
10661      -256...255.  */
10662   if (offset & (size - 1))
10663     {
10664       /* BLKmode typically uses LDP of X-registers.  */
10665       if (mode == BLKmode)
10666         return (offset + 512) & ~0x3ff;
10667       return (offset + 0x100) & ~0x1ff;
10668     }
10669
10670   /* Small negative offsets are supported.  */
10671   if (IN_RANGE (offset, -256, 0))
10672     return 0;
10673
10674   if (mode == TImode || mode == TFmode)
10675     return (offset + 0x100) & ~0x1ff;
10676
10677   /* Use 12-bit offset by access size.  */
10678   return offset & (~0xfff * size);
10679 }
10680
10681 static rtx
10682 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
10683 {
10684   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10685      where mask is selected by alignment and size of the offset.
10686      We try to pick as large a range for the offset as possible to
10687      maximize the chance of a CSE.  However, for aligned addresses
10688      we limit the range to 4k so that structures with different sized
10689      elements are likely to use the same base.  We need to be careful
10690      not to split a CONST for some forms of address expression, otherwise
10691      it will generate sub-optimal code.  */
10692
10693   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10694     {
10695       rtx base = XEXP (x, 0);
10696       rtx offset_rtx = XEXP (x, 1);
10697       HOST_WIDE_INT offset = INTVAL (offset_rtx);
10698
10699       if (GET_CODE (base) == PLUS)
10700         {
10701           rtx op0 = XEXP (base, 0);
10702           rtx op1 = XEXP (base, 1);
10703
10704           /* Force any scaling into a temp for CSE.  */
10705           op0 = force_reg (Pmode, op0);
10706           op1 = force_reg (Pmode, op1);
10707
10708           /* Let the pointer register be in op0.  */
10709           if (REG_POINTER (op1))
10710             std::swap (op0, op1);
10711
10712           /* If the pointer is virtual or frame related, then we know that
10713              virtual register instantiation or register elimination is going
10714              to apply a second constant.  We want the two constants folded
10715              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
10716           if (virt_or_elim_regno_p (REGNO (op0)))
10717             {
10718               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10719                                    NULL_RTX, true, OPTAB_DIRECT);
10720               return gen_rtx_PLUS (Pmode, base, op1);
10721             }
10722
10723           /* Otherwise, in order to encourage CSE (and thence loop strength
10724              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
10725           base = expand_binop (Pmode, add_optab, op0, op1,
10726                                NULL_RTX, true, OPTAB_DIRECT);
10727           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10728         }
10729
10730       HOST_WIDE_INT size;
10731       if (GET_MODE_SIZE (mode).is_constant (&size))
10732         {
10733           HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10734                                                              mode);
10735           if (base_offset != 0)
10736             {
10737               base = plus_constant (Pmode, base, base_offset);
10738               base = force_operand (base, NULL_RTX);
10739               return plus_constant (Pmode, base, offset - base_offset);
10740             }
10741         }
10742     }
10743
10744   return x;
10745 }
10746
10747 static reg_class_t
10748 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10749                           reg_class_t rclass,
10750                           machine_mode mode,
10751                           secondary_reload_info *sri)
10752 {
10753   /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10754      LDR and STR.  See the comment at the head of aarch64-sve.md for
10755      more details about the big-endian handling.  */
10756   if (reg_class_subset_p (rclass, FP_REGS)
10757       && !((REG_P (x) && HARD_REGISTER_P (x))
10758            || aarch64_simd_valid_immediate (x, NULL))
10759       && mode != VNx16QImode)
10760     {
10761       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10762       if ((vec_flags & VEC_SVE_DATA)
10763           && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10764         {
10765           sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10766           return NO_REGS;
10767         }
10768     }
10769
10770   /* If we have to disable direct literal pool loads and stores because the
10771      function is too big, then we need a scratch register.  */
10772   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10773       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10774           || targetm.vector_mode_supported_p (GET_MODE (x)))
10775       && !aarch64_pcrelative_literal_loads)
10776     {
10777       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10778       return NO_REGS;
10779     }
10780
10781   /* Without the TARGET_SIMD instructions we cannot move a Q register
10782      to a Q register directly.  We need a scratch.  */
10783   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10784       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10785       && reg_class_subset_p (rclass, FP_REGS))
10786     {
10787       sri->icode = code_for_aarch64_reload_mov (mode);
10788       return NO_REGS;
10789     }
10790
10791   /* A TFmode or TImode memory access should be handled via an FP_REGS
10792      because AArch64 has richer addressing modes for LDR/STR instructions
10793      than LDP/STP instructions.  */
10794   if (TARGET_FLOAT && rclass == GENERAL_REGS
10795       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10796     return FP_REGS;
10797
10798   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10799       return GENERAL_REGS;
10800
10801   return NO_REGS;
10802 }
10803
10804 static bool
10805 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10806 {
10807   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10808
10809   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10810      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
10811   if (frame_pointer_needed)
10812     return to == HARD_FRAME_POINTER_REGNUM;
10813   return true;
10814 }
10815
10816 poly_int64
10817 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10818 {
10819   if (to == HARD_FRAME_POINTER_REGNUM)
10820     {
10821       if (from == ARG_POINTER_REGNUM)
10822         return cfun->machine->frame.hard_fp_offset;
10823
10824       if (from == FRAME_POINTER_REGNUM)
10825         return cfun->machine->frame.hard_fp_offset
10826                - cfun->machine->frame.locals_offset;
10827     }
10828
10829   if (to == STACK_POINTER_REGNUM)
10830     {
10831       if (from == FRAME_POINTER_REGNUM)
10832           return cfun->machine->frame.frame_size
10833                  - cfun->machine->frame.locals_offset;
10834     }
10835
10836   return cfun->machine->frame.frame_size;
10837 }
10838
10839
10840 /* Get return address without mangling.  */
10841
10842 rtx
10843 aarch64_return_addr_rtx (void)
10844 {
10845   rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
10846   /* Note: aarch64_return_address_signing_enabled only
10847      works after cfun->machine->frame.laid_out is set,
10848      so here we don't know if the return address will
10849      be signed or not.  */
10850   rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
10851   emit_move_insn (lr, val);
10852   emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
10853   return lr;
10854 }
10855
10856
10857 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
10858    previous frame.  */
10859
10860 rtx
10861 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10862 {
10863   if (count != 0)
10864     return const0_rtx;
10865   return aarch64_return_addr_rtx ();
10866 }
10867
10868 static void
10869 aarch64_asm_trampoline_template (FILE *f)
10870 {
10871   /* Even if the current function doesn't have branch protection, some
10872      later function might, so since this template is only generated once
10873      we have to add a BTI just in case. */
10874   asm_fprintf (f, "\thint\t34 // bti c\n");
10875
10876   if (TARGET_ILP32)
10877     {
10878       asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
10879       asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
10880     }
10881   else
10882     {
10883       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
10884       asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
10885     }
10886   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10887
10888   /* We always emit a speculation barrier.
10889      This is because the same trampoline template is used for every nested
10890      function.  Since nested functions are not particularly common or
10891      performant we don't worry too much about the extra instructions to copy
10892      around.
10893      This is not yet a problem, since we have not yet implemented function
10894      specific attributes to choose between hardening against straight line
10895      speculation or not, but such function specific attributes are likely to
10896      happen in the future.  */
10897   asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
10898
10899   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10900   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10901 }
10902
10903 static void
10904 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10905 {
10906   rtx fnaddr, mem, a_tramp;
10907   const int tramp_code_sz = 24;
10908
10909   /* Don't need to copy the trailing D-words, we fill those in below.  */
10910   /* We create our own memory address in Pmode so that `emit_block_move` can
10911      use parts of the backend which expect Pmode addresses.  */
10912   rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
10913   emit_block_move (gen_rtx_MEM (BLKmode, temp),
10914                    assemble_trampoline_template (),
10915                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10916   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10917   fnaddr = XEXP (DECL_RTL (fndecl), 0);
10918   if (GET_MODE (fnaddr) != ptr_mode)
10919     fnaddr = convert_memory_address (ptr_mode, fnaddr);
10920   emit_move_insn (mem, fnaddr);
10921
10922   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10923   emit_move_insn (mem, chain_value);
10924
10925   /* XXX We should really define a "clear_cache" pattern and use
10926      gen_clear_cache().  */
10927   a_tramp = XEXP (m_tramp, 0);
10928   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10929                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10930                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10931                      ptr_mode);
10932 }
10933
10934 static unsigned char
10935 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10936 {
10937   /* ??? Logically we should only need to provide a value when
10938      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10939      can hold MODE, but at the moment we need to handle all modes.
10940      Just ignore any runtime parts for registers that can't store them.  */
10941   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10942   unsigned int nregs, vec_flags;
10943   switch (regclass)
10944     {
10945     case STUB_REGS:
10946     case TAILCALL_ADDR_REGS:
10947     case POINTER_REGS:
10948     case GENERAL_REGS:
10949     case ALL_REGS:
10950     case POINTER_AND_FP_REGS:
10951     case FP_REGS:
10952     case FP_LO_REGS:
10953     case FP_LO8_REGS:
10954       vec_flags = aarch64_classify_vector_mode (mode);
10955       if ((vec_flags & VEC_SVE_DATA)
10956           && constant_multiple_p (GET_MODE_SIZE (mode),
10957                                   aarch64_vl_bytes (mode, vec_flags), &nregs))
10958         return nregs;
10959       return (vec_flags & VEC_ADVSIMD
10960               ? CEIL (lowest_size, UNITS_PER_VREG)
10961               : CEIL (lowest_size, UNITS_PER_WORD));
10962     case STACK_REG:
10963     case PR_REGS:
10964     case PR_LO_REGS:
10965     case PR_HI_REGS:
10966     case FFR_REGS:
10967     case PR_AND_FFR_REGS:
10968       return 1;
10969
10970     case NO_REGS:
10971       return 0;
10972
10973     default:
10974       break;
10975     }
10976   gcc_unreachable ();
10977 }
10978
10979 static reg_class_t
10980 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10981 {
10982   if (regclass == POINTER_REGS)
10983     return GENERAL_REGS;
10984
10985   if (regclass == STACK_REG)
10986     {
10987       if (REG_P(x)
10988           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10989           return regclass;
10990
10991       return NO_REGS;
10992     }
10993
10994   /* Register eliminiation can result in a request for
10995      SP+constant->FP_REGS.  We cannot support such operations which
10996      use SP as source and an FP_REG as destination, so reject out
10997      right now.  */
10998   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10999     {
11000       rtx lhs = XEXP (x, 0);
11001
11002       /* Look through a possible SUBREG introduced by ILP32.  */
11003       if (GET_CODE (lhs) == SUBREG)
11004         lhs = SUBREG_REG (lhs);
11005
11006       gcc_assert (REG_P (lhs));
11007       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11008                                       POINTER_REGS));
11009       return NO_REGS;
11010     }
11011
11012   return regclass;
11013 }
11014
11015 void
11016 aarch64_asm_output_labelref (FILE* f, const char *name)
11017 {
11018   asm_fprintf (f, "%U%s", name);
11019 }
11020
11021 static void
11022 aarch64_elf_asm_constructor (rtx symbol, int priority)
11023 {
11024   if (priority == DEFAULT_INIT_PRIORITY)
11025     default_ctor_section_asm_out_constructor (symbol, priority);
11026   else
11027     {
11028       section *s;
11029       /* While priority is known to be in range [0, 65535], so 18 bytes
11030          would be enough, the compiler might not know that.  To avoid
11031          -Wformat-truncation false positive, use a larger size.  */
11032       char buf[23];
11033       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
11034       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11035       switch_to_section (s);
11036       assemble_align (POINTER_SIZE);
11037       assemble_aligned_integer (POINTER_BYTES, symbol);
11038     }
11039 }
11040
11041 static void
11042 aarch64_elf_asm_destructor (rtx symbol, int priority)
11043 {
11044   if (priority == DEFAULT_INIT_PRIORITY)
11045     default_dtor_section_asm_out_destructor (symbol, priority);
11046   else
11047     {
11048       section *s;
11049       /* While priority is known to be in range [0, 65535], so 18 bytes
11050          would be enough, the compiler might not know that.  To avoid
11051          -Wformat-truncation false positive, use a larger size.  */
11052       char buf[23];
11053       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
11054       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11055       switch_to_section (s);
11056       assemble_align (POINTER_SIZE);
11057       assemble_aligned_integer (POINTER_BYTES, symbol);
11058     }
11059 }
11060
11061 const char*
11062 aarch64_output_casesi (rtx *operands)
11063 {
11064   char buf[100];
11065   char label[100];
11066   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
11067   int index;
11068   static const char *const patterns[4][2] =
11069   {
11070     {
11071       "ldrb\t%w3, [%0,%w1,uxtw]",
11072       "add\t%3, %4, %w3, sxtb #2"
11073     },
11074     {
11075       "ldrh\t%w3, [%0,%w1,uxtw #1]",
11076       "add\t%3, %4, %w3, sxth #2"
11077     },
11078     {
11079       "ldr\t%w3, [%0,%w1,uxtw #2]",
11080       "add\t%3, %4, %w3, sxtw #2"
11081     },
11082     /* We assume that DImode is only generated when not optimizing and
11083        that we don't really need 64-bit address offsets.  That would
11084        imply an object file with 8GB of code in a single function!  */
11085     {
11086       "ldr\t%w3, [%0,%w1,uxtw #2]",
11087       "add\t%3, %4, %w3, sxtw #2"
11088     }
11089   };
11090
11091   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11092
11093   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11094   index = exact_log2 (GET_MODE_SIZE (mode));
11095
11096   gcc_assert (index >= 0 && index <= 3);
11097
11098   /* Need to implement table size reduction, by chaning the code below.  */
11099   output_asm_insn (patterns[index][0], operands);
11100   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11101   snprintf (buf, sizeof (buf),
11102             "adr\t%%4, %s", targetm.strip_name_encoding (label));
11103   output_asm_insn (buf, operands);
11104   output_asm_insn (patterns[index][1], operands);
11105   output_asm_insn ("br\t%3", operands);
11106   output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11107                    operands);
11108   assemble_label (asm_out_file, label);
11109   return "";
11110 }
11111
11112
11113 /* Return size in bits of an arithmetic operand which is shifted/scaled and
11114    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11115    operator.  */
11116
11117 int
11118 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11119 {
11120   if (shift >= 0 && shift <= 3)
11121     {
11122       int size;
11123       for (size = 8; size <= 32; size *= 2)
11124         {
11125           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11126           if (mask == bits << shift)
11127             return size;
11128         }
11129     }
11130   return 0;
11131 }
11132
11133 /* Constant pools are per function only when PC relative
11134    literal loads are true or we are in the large memory
11135    model.  */
11136
11137 static inline bool
11138 aarch64_can_use_per_function_literal_pools_p (void)
11139 {
11140   return (aarch64_pcrelative_literal_loads
11141           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11142 }
11143
11144 static bool
11145 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11146 {
11147   /* We can't use blocks for constants when we're using a per-function
11148      constant pool.  */
11149   return !aarch64_can_use_per_function_literal_pools_p ();
11150 }
11151
11152 /* Select appropriate section for constants depending
11153    on where we place literal pools.  */
11154
11155 static section *
11156 aarch64_select_rtx_section (machine_mode mode,
11157                             rtx x,
11158                             unsigned HOST_WIDE_INT align)
11159 {
11160   if (aarch64_can_use_per_function_literal_pools_p ())
11161     return function_section (current_function_decl);
11162
11163   return default_elf_select_rtx_section (mode, x, align);
11164 }
11165
11166 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
11167 void
11168 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11169                                   HOST_WIDE_INT offset)
11170 {
11171   /* When using per-function literal pools, we must ensure that any code
11172      section is aligned to the minimal instruction length, lest we get
11173      errors from the assembler re "unaligned instructions".  */
11174   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11175     ASM_OUTPUT_ALIGN (f, 2);
11176 }
11177
11178 /* Costs.  */
11179
11180 /* Helper function for rtx cost calculation.  Strip a shift expression
11181    from X.  Returns the inner operand if successful, or the original
11182    expression on failure.  */
11183 static rtx
11184 aarch64_strip_shift (rtx x)
11185 {
11186   rtx op = x;
11187
11188   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11189      we can convert both to ROR during final output.  */
11190   if ((GET_CODE (op) == ASHIFT
11191        || GET_CODE (op) == ASHIFTRT
11192        || GET_CODE (op) == LSHIFTRT
11193        || GET_CODE (op) == ROTATERT
11194        || GET_CODE (op) == ROTATE)
11195       && CONST_INT_P (XEXP (op, 1)))
11196     return XEXP (op, 0);
11197
11198   if (GET_CODE (op) == MULT
11199       && CONST_INT_P (XEXP (op, 1))
11200       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11201     return XEXP (op, 0);
11202
11203   return x;
11204 }
11205
11206 /* Helper function for rtx cost calculation.  Strip an extend
11207    expression from X.  Returns the inner operand if successful, or the
11208    original expression on failure.  We deal with a number of possible
11209    canonicalization variations here. If STRIP_SHIFT is true, then
11210    we can strip off a shift also.  */
11211 static rtx
11212 aarch64_strip_extend (rtx x, bool strip_shift)
11213 {
11214   scalar_int_mode mode;
11215   rtx op = x;
11216
11217   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11218     return op;
11219
11220   /* Zero and sign extraction of a widened value.  */
11221   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
11222       && XEXP (op, 2) == const0_rtx
11223       && GET_CODE (XEXP (op, 0)) == MULT
11224       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
11225                                          XEXP (op, 1)))
11226     return XEXP (XEXP (op, 0), 0);
11227
11228   /* It can also be represented (for zero-extend) as an AND with an
11229      immediate.  */
11230   if (GET_CODE (op) == AND
11231       && GET_CODE (XEXP (op, 0)) == MULT
11232       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11233       && CONST_INT_P (XEXP (op, 1))
11234       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11235                            INTVAL (XEXP (op, 1))) != 0)
11236     return XEXP (XEXP (op, 0), 0);
11237
11238   /* Now handle extended register, as this may also have an optional
11239      left shift by 1..4.  */
11240   if (strip_shift
11241       && GET_CODE (op) == ASHIFT
11242       && CONST_INT_P (XEXP (op, 1))
11243       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11244     op = XEXP (op, 0);
11245
11246   if (GET_CODE (op) == ZERO_EXTEND
11247       || GET_CODE (op) == SIGN_EXTEND)
11248     op = XEXP (op, 0);
11249
11250   if (op != x)
11251     return op;
11252
11253   return x;
11254 }
11255
11256 /* Return true iff CODE is a shift supported in combination
11257    with arithmetic instructions.  */
11258
11259 static bool
11260 aarch64_shift_p (enum rtx_code code)
11261 {
11262   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11263 }
11264
11265
11266 /* Return true iff X is a cheap shift without a sign extend. */
11267
11268 static bool
11269 aarch64_cheap_mult_shift_p (rtx x)
11270 {
11271   rtx op0, op1;
11272
11273   op0 = XEXP (x, 0);
11274   op1 = XEXP (x, 1);
11275
11276   if (!(aarch64_tune_params.extra_tuning_flags
11277                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11278     return false;
11279
11280   if (GET_CODE (op0) == SIGN_EXTEND)
11281     return false;
11282
11283   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11284       && UINTVAL (op1) <= 4)
11285     return true;
11286
11287   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11288     return false;
11289
11290   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11291
11292   if (l2 > 0 && l2 <= 4)
11293     return true;
11294
11295   return false;
11296 }
11297
11298 /* Helper function for rtx cost calculation.  Calculate the cost of
11299    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11300    Return the calculated cost of the expression, recursing manually in to
11301    operands where needed.  */
11302
11303 static int
11304 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11305 {
11306   rtx op0, op1;
11307   const struct cpu_cost_table *extra_cost
11308     = aarch64_tune_params.insn_extra_cost;
11309   int cost = 0;
11310   bool compound_p = (outer == PLUS || outer == MINUS);
11311   machine_mode mode = GET_MODE (x);
11312
11313   gcc_checking_assert (code == MULT);
11314
11315   op0 = XEXP (x, 0);
11316   op1 = XEXP (x, 1);
11317
11318   if (VECTOR_MODE_P (mode))
11319     {
11320       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11321       mode = GET_MODE_INNER (mode);
11322       if (vec_flags & VEC_ADVSIMD)
11323         {
11324           /* The by-element versions of the instruction have the same costs as
11325              the normal 3-vector version.  So don't add the costs of the
11326              duplicate into the costs of the multiply.  We make an assumption
11327              that the input to the VEC_DUPLICATE is already on the FP & SIMD
11328              side.  This means costing of a MUL by element pre RA is a bit
11329              optimistic.  */
11330           if (GET_CODE (op0) == VEC_DUPLICATE)
11331             op0 = XEXP (op0, 0);
11332           else if (GET_CODE (op1) == VEC_DUPLICATE)
11333             op1 = XEXP (op1, 0);
11334         }
11335     }
11336
11337   /* Integer multiply/fma.  */
11338   if (GET_MODE_CLASS (mode) == MODE_INT)
11339     {
11340       /* The multiply will be canonicalized as a shift, cost it as such.  */
11341       if (aarch64_shift_p (GET_CODE (x))
11342           || (CONST_INT_P (op1)
11343               && exact_log2 (INTVAL (op1)) > 0))
11344         {
11345           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11346                            || GET_CODE (op0) == SIGN_EXTEND;
11347           if (speed)
11348             {
11349               if (compound_p)
11350                 {
11351                   /* If the shift is considered cheap,
11352                      then don't add any cost. */
11353                   if (aarch64_cheap_mult_shift_p (x))
11354                     ;
11355                   else if (REG_P (op1))
11356                     /* ARITH + shift-by-register.  */
11357                     cost += extra_cost->alu.arith_shift_reg;
11358                   else if (is_extend)
11359                     /* ARITH + extended register.  We don't have a cost field
11360                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
11361                     cost += extra_cost->alu.extend_arith;
11362                   else
11363                     /* ARITH + shift-by-immediate.  */
11364                     cost += extra_cost->alu.arith_shift;
11365                 }
11366               else
11367                 /* LSL (immediate).  */
11368                 cost += extra_cost->alu.shift;
11369
11370             }
11371           /* Strip extends as we will have costed them in the case above.  */
11372           if (is_extend)
11373             op0 = aarch64_strip_extend (op0, true);
11374
11375           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
11376
11377           return cost;
11378         }
11379
11380       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
11381          compound and let the below cases handle it.  After all, MNEG is a
11382          special-case alias of MSUB.  */
11383       if (GET_CODE (op0) == NEG)
11384         {
11385           op0 = XEXP (op0, 0);
11386           compound_p = true;
11387         }
11388
11389       /* Integer multiplies or FMAs have zero/sign extending variants.  */
11390       if ((GET_CODE (op0) == ZERO_EXTEND
11391            && GET_CODE (op1) == ZERO_EXTEND)
11392           || (GET_CODE (op0) == SIGN_EXTEND
11393               && GET_CODE (op1) == SIGN_EXTEND))
11394         {
11395           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11396           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
11397
11398           if (speed)
11399             {
11400               if (compound_p)
11401                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
11402                 cost += extra_cost->mult[0].extend_add;
11403               else
11404                 /* MUL/SMULL/UMULL.  */
11405                 cost += extra_cost->mult[0].extend;
11406             }
11407
11408           return cost;
11409         }
11410
11411       /* This is either an integer multiply or a MADD.  In both cases
11412          we want to recurse and cost the operands.  */
11413       cost += rtx_cost (op0, mode, MULT, 0, speed);
11414       cost += rtx_cost (op1, mode, MULT, 1, speed);
11415
11416       if (speed)
11417         {
11418           if (compound_p)
11419             /* MADD/MSUB.  */
11420             cost += extra_cost->mult[mode == DImode].add;
11421           else
11422             /* MUL.  */
11423             cost += extra_cost->mult[mode == DImode].simple;
11424         }
11425
11426       return cost;
11427     }
11428   else
11429     {
11430       if (speed)
11431         {
11432           /* Floating-point FMA/FMUL can also support negations of the
11433              operands, unless the rounding mode is upward or downward in
11434              which case FNMUL is different than FMUL with operand negation.  */
11435           bool neg0 = GET_CODE (op0) == NEG;
11436           bool neg1 = GET_CODE (op1) == NEG;
11437           if (compound_p || !flag_rounding_math || (neg0 && neg1))
11438             {
11439               if (neg0)
11440                 op0 = XEXP (op0, 0);
11441               if (neg1)
11442                 op1 = XEXP (op1, 0);
11443             }
11444
11445           if (compound_p)
11446             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
11447             cost += extra_cost->fp[mode == DFmode].fma;
11448           else
11449             /* FMUL/FNMUL.  */
11450             cost += extra_cost->fp[mode == DFmode].mult;
11451         }
11452
11453       cost += rtx_cost (op0, mode, MULT, 0, speed);
11454       cost += rtx_cost (op1, mode, MULT, 1, speed);
11455       return cost;
11456     }
11457 }
11458
11459 static int
11460 aarch64_address_cost (rtx x,
11461                       machine_mode mode,
11462                       addr_space_t as ATTRIBUTE_UNUSED,
11463                       bool speed)
11464 {
11465   enum rtx_code c = GET_CODE (x);
11466   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
11467   struct aarch64_address_info info;
11468   int cost = 0;
11469   info.shift = 0;
11470
11471   if (!aarch64_classify_address (&info, x, mode, false))
11472     {
11473       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
11474         {
11475           /* This is a CONST or SYMBOL ref which will be split
11476              in a different way depending on the code model in use.
11477              Cost it through the generic infrastructure.  */
11478           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
11479           /* Divide through by the cost of one instruction to
11480              bring it to the same units as the address costs.  */
11481           cost_symbol_ref /= COSTS_N_INSNS (1);
11482           /* The cost is then the cost of preparing the address,
11483              followed by an immediate (possibly 0) offset.  */
11484           return cost_symbol_ref + addr_cost->imm_offset;
11485         }
11486       else
11487         {
11488           /* This is most likely a jump table from a case
11489              statement.  */
11490           return addr_cost->register_offset;
11491         }
11492     }
11493
11494   switch (info.type)
11495     {
11496       case ADDRESS_LO_SUM:
11497       case ADDRESS_SYMBOLIC:
11498       case ADDRESS_REG_IMM:
11499         cost += addr_cost->imm_offset;
11500         break;
11501
11502       case ADDRESS_REG_WB:
11503         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11504           cost += addr_cost->pre_modify;
11505         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11506           cost += addr_cost->post_modify;
11507         else
11508           gcc_unreachable ();
11509
11510         break;
11511
11512       case ADDRESS_REG_REG:
11513         cost += addr_cost->register_offset;
11514         break;
11515
11516       case ADDRESS_REG_SXTW:
11517         cost += addr_cost->register_sextend;
11518         break;
11519
11520       case ADDRESS_REG_UXTW:
11521         cost += addr_cost->register_zextend;
11522         break;
11523
11524       default:
11525         gcc_unreachable ();
11526     }
11527
11528
11529   if (info.shift > 0)
11530     {
11531       /* For the sake of calculating the cost of the shifted register
11532          component, we can treat same sized modes in the same way.  */
11533       if (known_eq (GET_MODE_BITSIZE (mode), 16))
11534         cost += addr_cost->addr_scale_costs.hi;
11535       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11536         cost += addr_cost->addr_scale_costs.si;
11537       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11538         cost += addr_cost->addr_scale_costs.di;
11539       else
11540         /* We can't tell, or this is a 128-bit vector.  */
11541         cost += addr_cost->addr_scale_costs.ti;
11542     }
11543
11544   return cost;
11545 }
11546
11547 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
11548    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
11549    to be taken.  */
11550
11551 int
11552 aarch64_branch_cost (bool speed_p, bool predictable_p)
11553 {
11554   /* When optimizing for speed, use the cost of unpredictable branches.  */
11555   const struct cpu_branch_cost *branch_costs =
11556     aarch64_tune_params.branch_costs;
11557
11558   if (!speed_p || predictable_p)
11559     return branch_costs->predictable;
11560   else
11561     return branch_costs->unpredictable;
11562 }
11563
11564 /* Return true if the RTX X in mode MODE is a zero or sign extract
11565    usable in an ADD or SUB (extended register) instruction.  */
11566 static bool
11567 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
11568 {
11569   /* Catch add with a sign extract.
11570      This is add_<optab><mode>_multp2.  */
11571   if (GET_CODE (x) == SIGN_EXTRACT
11572       || GET_CODE (x) == ZERO_EXTRACT)
11573     {
11574       rtx op0 = XEXP (x, 0);
11575       rtx op1 = XEXP (x, 1);
11576       rtx op2 = XEXP (x, 2);
11577
11578       if (GET_CODE (op0) == MULT
11579           && CONST_INT_P (op1)
11580           && op2 == const0_rtx
11581           && CONST_INT_P (XEXP (op0, 1))
11582           && aarch64_is_extend_from_extract (mode,
11583                                              XEXP (op0, 1),
11584                                              op1))
11585         {
11586           return true;
11587         }
11588     }
11589   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11590      No shift.  */
11591   else if (GET_CODE (x) == SIGN_EXTEND
11592            || GET_CODE (x) == ZERO_EXTEND)
11593     return REG_P (XEXP (x, 0));
11594
11595   return false;
11596 }
11597
11598 static bool
11599 aarch64_frint_unspec_p (unsigned int u)
11600 {
11601   switch (u)
11602     {
11603       case UNSPEC_FRINTZ:
11604       case UNSPEC_FRINTP:
11605       case UNSPEC_FRINTM:
11606       case UNSPEC_FRINTA:
11607       case UNSPEC_FRINTN:
11608       case UNSPEC_FRINTX:
11609       case UNSPEC_FRINTI:
11610         return true;
11611
11612       default:
11613         return false;
11614     }
11615 }
11616
11617 /* Return true iff X is an rtx that will match an extr instruction
11618    i.e. as described in the *extr<mode>5_insn family of patterns.
11619    OP0 and OP1 will be set to the operands of the shifts involved
11620    on success and will be NULL_RTX otherwise.  */
11621
11622 static bool
11623 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11624 {
11625   rtx op0, op1;
11626   scalar_int_mode mode;
11627   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11628     return false;
11629
11630   *res_op0 = NULL_RTX;
11631   *res_op1 = NULL_RTX;
11632
11633   if (GET_CODE (x) != IOR)
11634     return false;
11635
11636   op0 = XEXP (x, 0);
11637   op1 = XEXP (x, 1);
11638
11639   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11640       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11641     {
11642      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
11643       if (GET_CODE (op1) == ASHIFT)
11644         std::swap (op0, op1);
11645
11646       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11647         return false;
11648
11649       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11650       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11651
11652       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11653           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11654         {
11655           *res_op0 = XEXP (op0, 0);
11656           *res_op1 = XEXP (op1, 0);
11657           return true;
11658         }
11659     }
11660
11661   return false;
11662 }
11663
11664 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11665    storing it in *COST.  Result is true if the total cost of the operation
11666    has now been calculated.  */
11667 static bool
11668 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11669 {
11670   rtx inner;
11671   rtx comparator;
11672   enum rtx_code cmpcode;
11673   const struct cpu_cost_table *extra_cost
11674     = aarch64_tune_params.insn_extra_cost;
11675
11676   if (COMPARISON_P (op0))
11677     {
11678       inner = XEXP (op0, 0);
11679       comparator = XEXP (op0, 1);
11680       cmpcode = GET_CODE (op0);
11681     }
11682   else
11683     {
11684       inner = op0;
11685       comparator = const0_rtx;
11686       cmpcode = NE;
11687     }
11688
11689   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11690     {
11691       /* Conditional branch.  */
11692       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11693         return true;
11694       else
11695         {
11696           if (cmpcode == NE || cmpcode == EQ)
11697             {
11698               if (comparator == const0_rtx)
11699                 {
11700                   /* TBZ/TBNZ/CBZ/CBNZ.  */
11701                   if (GET_CODE (inner) == ZERO_EXTRACT)
11702                     /* TBZ/TBNZ.  */
11703                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11704                                        ZERO_EXTRACT, 0, speed);
11705                   else
11706                     /* CBZ/CBNZ.  */
11707                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11708
11709                   return true;
11710                 }
11711               if (register_operand (inner, VOIDmode)
11712                   && aarch64_imm24 (comparator, VOIDmode))
11713                 {
11714                   /* SUB and SUBS.  */
11715                   *cost += COSTS_N_INSNS (2);
11716                   if (speed)
11717                     *cost += extra_cost->alu.arith * 2;
11718                   return true;
11719                 }
11720             }
11721           else if (cmpcode == LT || cmpcode == GE)
11722             {
11723               /* TBZ/TBNZ.  */
11724               if (comparator == const0_rtx)
11725                 return true;
11726             }
11727         }
11728     }
11729   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11730     {
11731       /* CCMP.  */
11732       if (GET_CODE (op1) == COMPARE)
11733         {
11734           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
11735           if (XEXP (op1, 1) == const0_rtx)
11736             *cost += 1;
11737           if (speed)
11738             {
11739               machine_mode mode = GET_MODE (XEXP (op1, 0));
11740               const struct cpu_cost_table *extra_cost
11741                 = aarch64_tune_params.insn_extra_cost;
11742
11743               if (GET_MODE_CLASS (mode) == MODE_INT)
11744                 *cost += extra_cost->alu.arith;
11745               else
11746                 *cost += extra_cost->fp[mode == DFmode].compare;
11747             }
11748           return true;
11749         }
11750
11751       /* It's a conditional operation based on the status flags,
11752          so it must be some flavor of CSEL.  */
11753
11754       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
11755       if (GET_CODE (op1) == NEG
11756           || GET_CODE (op1) == NOT
11757           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11758         op1 = XEXP (op1, 0);
11759       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11760         {
11761           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
11762           op1 = XEXP (op1, 0);
11763           op2 = XEXP (op2, 0);
11764         }
11765       else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
11766         {
11767           inner = XEXP (op1, 0);
11768           if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
11769             /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3).  */
11770             op1 = XEXP (inner, 0);
11771         }
11772
11773       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11774       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11775       return true;
11776     }
11777
11778   /* We don't know what this is, cost all operands.  */
11779   return false;
11780 }
11781
11782 /* Check whether X is a bitfield operation of the form shift + extend that
11783    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
11784    operand to which the bitfield operation is applied.  Otherwise return
11785    NULL_RTX.  */
11786
11787 static rtx
11788 aarch64_extend_bitfield_pattern_p (rtx x)
11789 {
11790   rtx_code outer_code = GET_CODE (x);
11791   machine_mode outer_mode = GET_MODE (x);
11792
11793   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11794       && outer_mode != SImode && outer_mode != DImode)
11795     return NULL_RTX;
11796
11797   rtx inner = XEXP (x, 0);
11798   rtx_code inner_code = GET_CODE (inner);
11799   machine_mode inner_mode = GET_MODE (inner);
11800   rtx op = NULL_RTX;
11801
11802   switch (inner_code)
11803     {
11804       case ASHIFT:
11805         if (CONST_INT_P (XEXP (inner, 1))
11806             && (inner_mode == QImode || inner_mode == HImode))
11807           op = XEXP (inner, 0);
11808         break;
11809       case LSHIFTRT:
11810         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11811             && (inner_mode == QImode || inner_mode == HImode))
11812           op = XEXP (inner, 0);
11813         break;
11814       case ASHIFTRT:
11815         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11816             && (inner_mode == QImode || inner_mode == HImode))
11817           op = XEXP (inner, 0);
11818         break;
11819       default:
11820         break;
11821     }
11822
11823   return op;
11824 }
11825
11826 /* Return true if the mask and a shift amount from an RTX of the form
11827    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11828    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
11829
11830 bool
11831 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11832                                     rtx shft_amnt)
11833 {
11834   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11835          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11836          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11837          && (INTVAL (mask)
11838              & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11839 }
11840
11841 /* Return true if the masks and a shift amount from an RTX of the form
11842    ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11843    a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
11844
11845 bool
11846 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11847                                    unsigned HOST_WIDE_INT mask1,
11848                                    unsigned HOST_WIDE_INT shft_amnt,
11849                                    unsigned HOST_WIDE_INT mask2)
11850 {
11851   unsigned HOST_WIDE_INT t;
11852
11853   /* Verify that there is no overlap in what bits are set in the two masks.  */
11854   if (mask1 != ~mask2)
11855     return false;
11856
11857   /* Verify that mask2 is not all zeros or ones.  */
11858   if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11859     return false;
11860
11861   /* The shift amount should always be less than the mode size.  */
11862   gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11863
11864   /* Verify that the mask being shifted is contiguous and would be in the
11865      least significant bits after shifting by shft_amnt.  */
11866   t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11867   return (t == (t & -t));
11868 }
11869
11870 /* Calculate the cost of calculating X, storing it in *COST.  Result
11871    is true if the total cost of the operation has now been calculated.  */
11872 static bool
11873 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11874                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11875 {
11876   rtx op0, op1, op2;
11877   const struct cpu_cost_table *extra_cost
11878     = aarch64_tune_params.insn_extra_cost;
11879   int code = GET_CODE (x);
11880   scalar_int_mode int_mode;
11881
11882   /* By default, assume that everything has equivalent cost to the
11883      cheapest instruction.  Any additional costs are applied as a delta
11884      above this default.  */
11885   *cost = COSTS_N_INSNS (1);
11886
11887   switch (code)
11888     {
11889     case SET:
11890       /* The cost depends entirely on the operands to SET.  */
11891       *cost = 0;
11892       op0 = SET_DEST (x);
11893       op1 = SET_SRC (x);
11894
11895       switch (GET_CODE (op0))
11896         {
11897         case MEM:
11898           if (speed)
11899             {
11900               rtx address = XEXP (op0, 0);
11901               if (VECTOR_MODE_P (mode))
11902                 *cost += extra_cost->ldst.storev;
11903               else if (GET_MODE_CLASS (mode) == MODE_INT)
11904                 *cost += extra_cost->ldst.store;
11905               else if (mode == SFmode)
11906                 *cost += extra_cost->ldst.storef;
11907               else if (mode == DFmode)
11908                 *cost += extra_cost->ldst.stored;
11909
11910               *cost +=
11911                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11912                                                      0, speed));
11913             }
11914
11915           *cost += rtx_cost (op1, mode, SET, 1, speed);
11916           return true;
11917
11918         case SUBREG:
11919           if (! REG_P (SUBREG_REG (op0)))
11920             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11921
11922           /* Fall through.  */
11923         case REG:
11924           /* The cost is one per vector-register copied.  */
11925           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11926             {
11927               int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11928               *cost = COSTS_N_INSNS (nregs);
11929             }
11930           /* const0_rtx is in general free, but we will use an
11931              instruction to set a register to 0.  */
11932           else if (REG_P (op1) || op1 == const0_rtx)
11933             {
11934               /* The cost is 1 per register copied.  */
11935               int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11936               *cost = COSTS_N_INSNS (nregs);
11937             }
11938           else
11939             /* Cost is just the cost of the RHS of the set.  */
11940             *cost += rtx_cost (op1, mode, SET, 1, speed);
11941           return true;
11942
11943         case ZERO_EXTRACT:
11944         case SIGN_EXTRACT:
11945           /* Bit-field insertion.  Strip any redundant widening of
11946              the RHS to meet the width of the target.  */
11947           if (GET_CODE (op1) == SUBREG)
11948             op1 = SUBREG_REG (op1);
11949           if ((GET_CODE (op1) == ZERO_EXTEND
11950                || GET_CODE (op1) == SIGN_EXTEND)
11951               && CONST_INT_P (XEXP (op0, 1))
11952               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11953               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11954             op1 = XEXP (op1, 0);
11955
11956           if (CONST_INT_P (op1))
11957             {
11958               /* MOV immediate is assumed to always be cheap.  */
11959               *cost = COSTS_N_INSNS (1);
11960             }
11961           else
11962             {
11963               /* BFM.  */
11964               if (speed)
11965                 *cost += extra_cost->alu.bfi;
11966               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11967             }
11968
11969           return true;
11970
11971         default:
11972           /* We can't make sense of this, assume default cost.  */
11973           *cost = COSTS_N_INSNS (1);
11974           return false;
11975         }
11976       return false;
11977
11978     case CONST_INT:
11979       /* If an instruction can incorporate a constant within the
11980          instruction, the instruction's expression avoids calling
11981          rtx_cost() on the constant.  If rtx_cost() is called on a
11982          constant, then it is usually because the constant must be
11983          moved into a register by one or more instructions.
11984
11985          The exception is constant 0, which can be expressed
11986          as XZR/WZR and is therefore free.  The exception to this is
11987          if we have (set (reg) (const0_rtx)) in which case we must cost
11988          the move.  However, we can catch that when we cost the SET, so
11989          we don't need to consider that here.  */
11990       if (x == const0_rtx)
11991         *cost = 0;
11992       else
11993         {
11994           /* To an approximation, building any other constant is
11995              proportionally expensive to the number of instructions
11996              required to build that constant.  This is true whether we
11997              are compiling for SPEED or otherwise.  */
11998           if (!is_a <scalar_int_mode> (mode, &int_mode))
11999             int_mode = word_mode;
12000           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
12001                                  (NULL_RTX, x, false, int_mode));
12002         }
12003       return true;
12004
12005     case CONST_DOUBLE:
12006
12007       /* First determine number of instructions to do the move
12008           as an integer constant.  */
12009       if (!aarch64_float_const_representable_p (x)
12010            && !aarch64_can_const_movi_rtx_p (x, mode)
12011            && aarch64_float_const_rtx_p (x))
12012         {
12013           unsigned HOST_WIDE_INT ival;
12014           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12015           gcc_assert (succeed);
12016
12017           scalar_int_mode imode = (mode == HFmode
12018                                    ? SImode
12019                                    : int_mode_for_mode (mode).require ());
12020           int ncost = aarch64_internal_mov_immediate
12021                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
12022           *cost += COSTS_N_INSNS (ncost);
12023           return true;
12024         }
12025
12026       if (speed)
12027         {
12028           /* mov[df,sf]_aarch64.  */
12029           if (aarch64_float_const_representable_p (x))
12030             /* FMOV (scalar immediate).  */
12031             *cost += extra_cost->fp[mode == DFmode].fpconst;
12032           else if (!aarch64_float_const_zero_rtx_p (x))
12033             {
12034               /* This will be a load from memory.  */
12035               if (mode == DFmode)
12036                 *cost += extra_cost->ldst.loadd;
12037               else
12038                 *cost += extra_cost->ldst.loadf;
12039             }
12040           else
12041             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
12042                or MOV v0.s[0], wzr - neither of which are modeled by the
12043                cost tables.  Just use the default cost.  */
12044             {
12045             }
12046         }
12047
12048       return true;
12049
12050     case MEM:
12051       if (speed)
12052         {
12053           /* For loads we want the base cost of a load, plus an
12054              approximation for the additional cost of the addressing
12055              mode.  */
12056           rtx address = XEXP (x, 0);
12057           if (VECTOR_MODE_P (mode))
12058             *cost += extra_cost->ldst.loadv;
12059           else if (GET_MODE_CLASS (mode) == MODE_INT)
12060             *cost += extra_cost->ldst.load;
12061           else if (mode == SFmode)
12062             *cost += extra_cost->ldst.loadf;
12063           else if (mode == DFmode)
12064             *cost += extra_cost->ldst.loadd;
12065
12066           *cost +=
12067                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12068                                                      0, speed));
12069         }
12070
12071       return true;
12072
12073     case NEG:
12074       op0 = XEXP (x, 0);
12075
12076       if (VECTOR_MODE_P (mode))
12077         {
12078           if (speed)
12079             {
12080               /* FNEG.  */
12081               *cost += extra_cost->vect.alu;
12082             }
12083           return false;
12084         }
12085
12086       if (GET_MODE_CLASS (mode) == MODE_INT)
12087         {
12088           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12089               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12090             {
12091               /* CSETM.  */
12092               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
12093               return true;
12094             }
12095
12096           /* Cost this as SUB wzr, X.  */
12097           op0 = CONST0_RTX (mode);
12098           op1 = XEXP (x, 0);
12099           goto cost_minus;
12100         }
12101
12102       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12103         {
12104           /* Support (neg(fma...)) as a single instruction only if
12105              sign of zeros is unimportant.  This matches the decision
12106              making in aarch64.md.  */
12107           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12108             {
12109               /* FNMADD.  */
12110               *cost = rtx_cost (op0, mode, NEG, 0, speed);
12111               return true;
12112             }
12113           if (GET_CODE (op0) == MULT)
12114             {
12115               /* FNMUL.  */
12116               *cost = rtx_cost (op0, mode, NEG, 0, speed);
12117               return true;
12118             }
12119           if (speed)
12120             /* FNEG.  */
12121             *cost += extra_cost->fp[mode == DFmode].neg;
12122           return false;
12123         }
12124
12125       return false;
12126
12127     case CLRSB:
12128     case CLZ:
12129       if (speed)
12130         {
12131           if (VECTOR_MODE_P (mode))
12132             *cost += extra_cost->vect.alu;
12133           else
12134             *cost += extra_cost->alu.clz;
12135         }
12136
12137       return false;
12138
12139     case CTZ:
12140       *cost = COSTS_N_INSNS (2);
12141
12142       if (speed)
12143         *cost += extra_cost->alu.clz + extra_cost->alu.rev;
12144       return false;
12145
12146     case COMPARE:
12147       op0 = XEXP (x, 0);
12148       op1 = XEXP (x, 1);
12149
12150       if (op1 == const0_rtx
12151           && GET_CODE (op0) == AND)
12152         {
12153           x = op0;
12154           mode = GET_MODE (op0);
12155           goto cost_logic;
12156         }
12157
12158       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12159         {
12160           /* TODO: A write to the CC flags possibly costs extra, this
12161              needs encoding in the cost tables.  */
12162
12163           mode = GET_MODE (op0);
12164           /* ANDS.  */
12165           if (GET_CODE (op0) == AND)
12166             {
12167               x = op0;
12168               goto cost_logic;
12169             }
12170
12171           if (GET_CODE (op0) == PLUS)
12172             {
12173               /* ADDS (and CMN alias).  */
12174               x = op0;
12175               goto cost_plus;
12176             }
12177
12178           if (GET_CODE (op0) == MINUS)
12179             {
12180               /* SUBS.  */
12181               x = op0;
12182               goto cost_minus;
12183             }
12184
12185           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12186               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12187               && CONST_INT_P (XEXP (op0, 2)))
12188             {
12189               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12190                  Handle it here directly rather than going to cost_logic
12191                  since we know the immediate generated for the TST is valid
12192                  so we can avoid creating an intermediate rtx for it only
12193                  for costing purposes.  */
12194               if (speed)
12195                 *cost += extra_cost->alu.logical;
12196
12197               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12198                                  ZERO_EXTRACT, 0, speed);
12199               return true;
12200             }
12201
12202           if (GET_CODE (op1) == NEG)
12203             {
12204               /* CMN.  */
12205               if (speed)
12206                 *cost += extra_cost->alu.arith;
12207
12208               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12209               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12210               return true;
12211             }
12212
12213           /* CMP.
12214
12215              Compare can freely swap the order of operands, and
12216              canonicalization puts the more complex operation first.
12217              But the integer MINUS logic expects the shift/extend
12218              operation in op1.  */
12219           if (! (REG_P (op0)
12220                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12221           {
12222             op0 = XEXP (x, 1);
12223             op1 = XEXP (x, 0);
12224           }
12225           goto cost_minus;
12226         }
12227
12228       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12229         {
12230           /* FCMP.  */
12231           if (speed)
12232             *cost += extra_cost->fp[mode == DFmode].compare;
12233
12234           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12235             {
12236               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12237               /* FCMP supports constant 0.0 for no extra cost. */
12238               return true;
12239             }
12240           return false;
12241         }
12242
12243       if (VECTOR_MODE_P (mode))
12244         {
12245           /* Vector compare.  */
12246           if (speed)
12247             *cost += extra_cost->vect.alu;
12248
12249           if (aarch64_float_const_zero_rtx_p (op1))
12250             {
12251               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12252                  cost.  */
12253               return true;
12254             }
12255           return false;
12256         }
12257       return false;
12258
12259     case MINUS:
12260       {
12261         op0 = XEXP (x, 0);
12262         op1 = XEXP (x, 1);
12263
12264 cost_minus:
12265         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
12266
12267         /* Detect valid immediates.  */
12268         if ((GET_MODE_CLASS (mode) == MODE_INT
12269              || (GET_MODE_CLASS (mode) == MODE_CC
12270                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12271             && CONST_INT_P (op1)
12272             && aarch64_uimm12_shift (INTVAL (op1)))
12273           {
12274             if (speed)
12275               /* SUB(S) (immediate).  */
12276               *cost += extra_cost->alu.arith;
12277             return true;
12278           }
12279
12280         /* Look for SUB (extended register).  */
12281         if (is_a <scalar_int_mode> (mode, &int_mode)
12282             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
12283           {
12284             if (speed)
12285               *cost += extra_cost->alu.extend_arith;
12286
12287             op1 = aarch64_strip_extend (op1, true);
12288             *cost += rtx_cost (op1, VOIDmode,
12289                                (enum rtx_code) GET_CODE (op1), 0, speed);
12290             return true;
12291           }
12292
12293         rtx new_op1 = aarch64_strip_extend (op1, false);
12294
12295         /* Cost this as an FMA-alike operation.  */
12296         if ((GET_CODE (new_op1) == MULT
12297              || aarch64_shift_p (GET_CODE (new_op1)))
12298             && code != COMPARE)
12299           {
12300             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12301                                             (enum rtx_code) code,
12302                                             speed);
12303             return true;
12304           }
12305
12306         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12307
12308         if (speed)
12309           {
12310             if (VECTOR_MODE_P (mode))
12311               {
12312                 /* Vector SUB.  */
12313                 *cost += extra_cost->vect.alu;
12314               }
12315             else if (GET_MODE_CLASS (mode) == MODE_INT)
12316               {
12317                 /* SUB(S).  */
12318                 *cost += extra_cost->alu.arith;
12319               }
12320             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12321               {
12322                 /* FSUB.  */
12323                 *cost += extra_cost->fp[mode == DFmode].addsub;
12324               }
12325           }
12326         return true;
12327       }
12328
12329     case PLUS:
12330       {
12331         rtx new_op0;
12332
12333         op0 = XEXP (x, 0);
12334         op1 = XEXP (x, 1);
12335
12336 cost_plus:
12337         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12338             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12339           {
12340             /* CSINC.  */
12341             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12342             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12343             return true;
12344           }
12345
12346         if (GET_MODE_CLASS (mode) == MODE_INT
12347             && (aarch64_plus_immediate (op1, mode)
12348                 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
12349           {
12350             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
12351
12352             if (speed)
12353               /* ADD (immediate).  */
12354               *cost += extra_cost->alu.arith;
12355             return true;
12356           }
12357
12358         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12359
12360         /* Look for ADD (extended register).  */
12361         if (is_a <scalar_int_mode> (mode, &int_mode)
12362             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
12363           {
12364             if (speed)
12365               *cost += extra_cost->alu.extend_arith;
12366
12367             op0 = aarch64_strip_extend (op0, true);
12368             *cost += rtx_cost (op0, VOIDmode,
12369                                (enum rtx_code) GET_CODE (op0), 0, speed);
12370             return true;
12371           }
12372
12373         /* Strip any extend, leave shifts behind as we will
12374            cost them through mult_cost.  */
12375         new_op0 = aarch64_strip_extend (op0, false);
12376
12377         if (GET_CODE (new_op0) == MULT
12378             || aarch64_shift_p (GET_CODE (new_op0)))
12379           {
12380             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12381                                             speed);
12382             return true;
12383           }
12384
12385         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
12386
12387         if (speed)
12388           {
12389             if (VECTOR_MODE_P (mode))
12390               {
12391                 /* Vector ADD.  */
12392                 *cost += extra_cost->vect.alu;
12393               }
12394             else if (GET_MODE_CLASS (mode) == MODE_INT)
12395               {
12396                 /* ADD.  */
12397                 *cost += extra_cost->alu.arith;
12398               }
12399             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12400               {
12401                 /* FADD.  */
12402                 *cost += extra_cost->fp[mode == DFmode].addsub;
12403               }
12404           }
12405         return true;
12406       }
12407
12408     case BSWAP:
12409       *cost = COSTS_N_INSNS (1);
12410
12411       if (speed)
12412         {
12413           if (VECTOR_MODE_P (mode))
12414             *cost += extra_cost->vect.alu;
12415           else
12416             *cost += extra_cost->alu.rev;
12417         }
12418       return false;
12419
12420     case IOR:
12421       if (aarch_rev16_p (x))
12422         {
12423           *cost = COSTS_N_INSNS (1);
12424
12425           if (speed)
12426             {
12427               if (VECTOR_MODE_P (mode))
12428                 *cost += extra_cost->vect.alu;
12429               else
12430                 *cost += extra_cost->alu.rev;
12431             }
12432           return true;
12433         }
12434
12435       if (aarch64_extr_rtx_p (x, &op0, &op1))
12436         {
12437           *cost += rtx_cost (op0, mode, IOR, 0, speed);
12438           *cost += rtx_cost (op1, mode, IOR, 1, speed);
12439           if (speed)
12440             *cost += extra_cost->alu.shift;
12441
12442           return true;
12443         }
12444     /* Fall through.  */
12445     case XOR:
12446     case AND:
12447     cost_logic:
12448       op0 = XEXP (x, 0);
12449       op1 = XEXP (x, 1);
12450
12451       if (VECTOR_MODE_P (mode))
12452         {
12453           if (speed)
12454             *cost += extra_cost->vect.alu;
12455           return true;
12456         }
12457
12458       if (code == AND
12459           && GET_CODE (op0) == MULT
12460           && CONST_INT_P (XEXP (op0, 1))
12461           && CONST_INT_P (op1)
12462           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12463                                INTVAL (op1)) != 0)
12464         {
12465           /* This is a UBFM/SBFM.  */
12466           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
12467           if (speed)
12468             *cost += extra_cost->alu.bfx;
12469           return true;
12470         }
12471
12472       if (is_int_mode (mode, &int_mode))
12473         {
12474           if (CONST_INT_P (op1))
12475             {
12476               /* We have a mask + shift version of a UBFIZ
12477                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
12478               if (GET_CODE (op0) == ASHIFT
12479                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12480                                                          XEXP (op0, 1)))
12481                 {
12482                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
12483                                      (enum rtx_code) code, 0, speed);
12484                   if (speed)
12485                     *cost += extra_cost->alu.bfx;
12486
12487                   return true;
12488                 }
12489               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
12490                 {
12491                 /* We possibly get the immediate for free, this is not
12492                    modelled.  */
12493                   *cost += rtx_cost (op0, int_mode,
12494                                      (enum rtx_code) code, 0, speed);
12495                   if (speed)
12496                     *cost += extra_cost->alu.logical;
12497
12498                   return true;
12499                 }
12500             }
12501           else
12502             {
12503               rtx new_op0 = op0;
12504
12505               /* Handle ORN, EON, or BIC.  */
12506               if (GET_CODE (op0) == NOT)
12507                 op0 = XEXP (op0, 0);
12508
12509               new_op0 = aarch64_strip_shift (op0);
12510
12511               /* If we had a shift on op0 then this is a logical-shift-
12512                  by-register/immediate operation.  Otherwise, this is just
12513                  a logical operation.  */
12514               if (speed)
12515                 {
12516                   if (new_op0 != op0)
12517                     {
12518                       /* Shift by immediate.  */
12519                       if (CONST_INT_P (XEXP (op0, 1)))
12520                         *cost += extra_cost->alu.log_shift;
12521                       else
12522                         *cost += extra_cost->alu.log_shift_reg;
12523                     }
12524                   else
12525                     *cost += extra_cost->alu.logical;
12526                 }
12527
12528               /* In both cases we want to cost both operands.  */
12529               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12530                                  0, speed);
12531               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12532                                  1, speed);
12533
12534               return true;
12535             }
12536         }
12537       return false;
12538
12539     case NOT:
12540       x = XEXP (x, 0);
12541       op0 = aarch64_strip_shift (x);
12542
12543       if (VECTOR_MODE_P (mode))
12544         {
12545           /* Vector NOT.  */
12546           *cost += extra_cost->vect.alu;
12547           return false;
12548         }
12549
12550       /* MVN-shifted-reg.  */
12551       if (op0 != x)
12552         {
12553           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12554
12555           if (speed)
12556             *cost += extra_cost->alu.log_shift;
12557
12558           return true;
12559         }
12560       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12561          Handle the second form here taking care that 'a' in the above can
12562          be a shift.  */
12563       else if (GET_CODE (op0) == XOR)
12564         {
12565           rtx newop0 = XEXP (op0, 0);
12566           rtx newop1 = XEXP (op0, 1);
12567           rtx op0_stripped = aarch64_strip_shift (newop0);
12568
12569           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12570           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
12571
12572           if (speed)
12573             {
12574               if (op0_stripped != newop0)
12575                 *cost += extra_cost->alu.log_shift;
12576               else
12577                 *cost += extra_cost->alu.logical;
12578             }
12579
12580           return true;
12581         }
12582       /* MVN.  */
12583       if (speed)
12584         *cost += extra_cost->alu.logical;
12585
12586       return false;
12587
12588     case ZERO_EXTEND:
12589
12590       op0 = XEXP (x, 0);
12591       /* If a value is written in SI mode, then zero extended to DI
12592          mode, the operation will in general be free as a write to
12593          a 'w' register implicitly zeroes the upper bits of an 'x'
12594          register.  However, if this is
12595
12596            (set (reg) (zero_extend (reg)))
12597
12598          we must cost the explicit register move.  */
12599       if (mode == DImode
12600           && GET_MODE (op0) == SImode
12601           && outer == SET)
12602         {
12603           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
12604
12605         /* If OP_COST is non-zero, then the cost of the zero extend
12606            is effectively the cost of the inner operation.  Otherwise
12607            we have a MOV instruction and we take the cost from the MOV
12608            itself.  This is true independently of whether we are
12609            optimizing for space or time.  */
12610           if (op_cost)
12611             *cost = op_cost;
12612
12613           return true;
12614         }
12615       else if (MEM_P (op0))
12616         {
12617           /* All loads can zero extend to any size for free.  */
12618           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12619           return true;
12620         }
12621
12622       op0 = aarch64_extend_bitfield_pattern_p (x);
12623       if (op0)
12624         {
12625           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12626           if (speed)
12627             *cost += extra_cost->alu.bfx;
12628           return true;
12629         }
12630
12631       if (speed)
12632         {
12633           if (VECTOR_MODE_P (mode))
12634             {
12635               /* UMOV.  */
12636               *cost += extra_cost->vect.alu;
12637             }
12638           else
12639             {
12640               /* We generate an AND instead of UXTB/UXTH.  */
12641               *cost += extra_cost->alu.logical;
12642             }
12643         }
12644       return false;
12645
12646     case SIGN_EXTEND:
12647       if (MEM_P (XEXP (x, 0)))
12648         {
12649           /* LDRSH.  */
12650           if (speed)
12651             {
12652               rtx address = XEXP (XEXP (x, 0), 0);
12653               *cost += extra_cost->ldst.load_sign_extend;
12654
12655               *cost +=
12656                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12657                                                      0, speed));
12658             }
12659           return true;
12660         }
12661
12662       op0 = aarch64_extend_bitfield_pattern_p (x);
12663       if (op0)
12664         {
12665           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12666           if (speed)
12667             *cost += extra_cost->alu.bfx;
12668           return true;
12669         }
12670
12671       if (speed)
12672         {
12673           if (VECTOR_MODE_P (mode))
12674             *cost += extra_cost->vect.alu;
12675           else
12676             *cost += extra_cost->alu.extend;
12677         }
12678       return false;
12679
12680     case ASHIFT:
12681       op0 = XEXP (x, 0);
12682       op1 = XEXP (x, 1);
12683
12684       if (CONST_INT_P (op1))
12685         {
12686           if (speed)
12687             {
12688               if (VECTOR_MODE_P (mode))
12689                 {
12690                   /* Vector shift (immediate).  */
12691                   *cost += extra_cost->vect.alu;
12692                 }
12693               else
12694                 {
12695                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
12696                      aliases.  */
12697                   *cost += extra_cost->alu.shift;
12698                 }
12699             }
12700
12701           /* We can incorporate zero/sign extend for free.  */
12702           if (GET_CODE (op0) == ZERO_EXTEND
12703               || GET_CODE (op0) == SIGN_EXTEND)
12704             op0 = XEXP (op0, 0);
12705
12706           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12707           return true;
12708         }
12709       else
12710         {
12711           if (VECTOR_MODE_P (mode))
12712             {
12713               if (speed)
12714                 /* Vector shift (register).  */
12715                 *cost += extra_cost->vect.alu;
12716             }
12717           else
12718             {
12719               if (speed)
12720                 /* LSLV.  */
12721                 *cost += extra_cost->alu.shift_reg;
12722
12723               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12724                   && CONST_INT_P (XEXP (op1, 1))
12725                   && known_eq (INTVAL (XEXP (op1, 1)),
12726                                GET_MODE_BITSIZE (mode) - 1))
12727                 {
12728                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12729                   /* We already demanded XEXP (op1, 0) to be REG_P, so
12730                      don't recurse into it.  */
12731                   return true;
12732                 }
12733             }
12734           return false;  /* All arguments need to be in registers.  */
12735         }
12736
12737     case ROTATE:
12738     case ROTATERT:
12739     case LSHIFTRT:
12740     case ASHIFTRT:
12741       op0 = XEXP (x, 0);
12742       op1 = XEXP (x, 1);
12743
12744       if (CONST_INT_P (op1))
12745         {
12746           /* ASR (immediate) and friends.  */
12747           if (speed)
12748             {
12749               if (VECTOR_MODE_P (mode))
12750                 *cost += extra_cost->vect.alu;
12751               else
12752                 *cost += extra_cost->alu.shift;
12753             }
12754
12755           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12756           return true;
12757         }
12758       else
12759         {
12760           if (VECTOR_MODE_P (mode))
12761             {
12762               if (speed)
12763                 /* Vector shift (register).  */
12764                 *cost += extra_cost->vect.alu;
12765             }
12766           else
12767             {
12768               if (speed)
12769                 /* ASR (register) and friends.  */
12770                 *cost += extra_cost->alu.shift_reg;
12771
12772               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12773                   && CONST_INT_P (XEXP (op1, 1))
12774                   && known_eq (INTVAL (XEXP (op1, 1)),
12775                                GET_MODE_BITSIZE (mode) - 1))
12776                 {
12777                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12778                   /* We already demanded XEXP (op1, 0) to be REG_P, so
12779                      don't recurse into it.  */
12780                   return true;
12781                 }
12782             }
12783           return false;  /* All arguments need to be in registers.  */
12784         }
12785
12786     case SYMBOL_REF:
12787
12788       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12789           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12790         {
12791           /* LDR.  */
12792           if (speed)
12793             *cost += extra_cost->ldst.load;
12794         }
12795       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12796                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12797         {
12798           /* ADRP, followed by ADD.  */
12799           *cost += COSTS_N_INSNS (1);
12800           if (speed)
12801             *cost += 2 * extra_cost->alu.arith;
12802         }
12803       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12804                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12805         {
12806           /* ADR.  */
12807           if (speed)
12808             *cost += extra_cost->alu.arith;
12809         }
12810
12811       if (flag_pic)
12812         {
12813           /* One extra load instruction, after accessing the GOT.  */
12814           *cost += COSTS_N_INSNS (1);
12815           if (speed)
12816             *cost += extra_cost->ldst.load;
12817         }
12818       return true;
12819
12820     case HIGH:
12821     case LO_SUM:
12822       /* ADRP/ADD (immediate).  */
12823       if (speed)
12824         *cost += extra_cost->alu.arith;
12825       return true;
12826
12827     case ZERO_EXTRACT:
12828     case SIGN_EXTRACT:
12829       /* UBFX/SBFX.  */
12830       if (speed)
12831         {
12832           if (VECTOR_MODE_P (mode))
12833             *cost += extra_cost->vect.alu;
12834           else
12835             *cost += extra_cost->alu.bfx;
12836         }
12837
12838       /* We can trust that the immediates used will be correct (there
12839          are no by-register forms), so we need only cost op0.  */
12840       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12841       return true;
12842
12843     case MULT:
12844       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12845       /* aarch64_rtx_mult_cost always handles recursion to its
12846          operands.  */
12847       return true;
12848
12849     case MOD:
12850     /* We can expand signed mod by power of 2 using a NEGS, two parallel
12851        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
12852        an unconditional negate.  This case should only ever be reached through
12853        the set_smod_pow2_cheap check in expmed.c.  */
12854       if (CONST_INT_P (XEXP (x, 1))
12855           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12856           && (mode == SImode || mode == DImode))
12857         {
12858           /* We expand to 4 instructions.  Reset the baseline.  */
12859           *cost = COSTS_N_INSNS (4);
12860
12861           if (speed)
12862             *cost += 2 * extra_cost->alu.logical
12863                      + 2 * extra_cost->alu.arith;
12864
12865           return true;
12866         }
12867
12868     /* Fall-through.  */
12869     case UMOD:
12870       if (speed)
12871         {
12872           /* Slighly prefer UMOD over SMOD.  */
12873           if (VECTOR_MODE_P (mode))
12874             *cost += extra_cost->vect.alu;
12875           else if (GET_MODE_CLASS (mode) == MODE_INT)
12876             *cost += (extra_cost->mult[mode == DImode].add
12877                       + extra_cost->mult[mode == DImode].idiv
12878                       + (code == MOD ? 1 : 0));
12879         }
12880       return false;  /* All arguments need to be in registers.  */
12881
12882     case DIV:
12883     case UDIV:
12884     case SQRT:
12885       if (speed)
12886         {
12887           if (VECTOR_MODE_P (mode))
12888             *cost += extra_cost->vect.alu;
12889           else if (GET_MODE_CLASS (mode) == MODE_INT)
12890             /* There is no integer SQRT, so only DIV and UDIV can get
12891                here.  */
12892             *cost += (extra_cost->mult[mode == DImode].idiv
12893                      /* Slighly prefer UDIV over SDIV.  */
12894                      + (code == DIV ? 1 : 0));
12895           else
12896             *cost += extra_cost->fp[mode == DFmode].div;
12897         }
12898       return false;  /* All arguments need to be in registers.  */
12899
12900     case IF_THEN_ELSE:
12901       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12902                                          XEXP (x, 2), cost, speed);
12903
12904     case EQ:
12905     case NE:
12906     case GT:
12907     case GTU:
12908     case LT:
12909     case LTU:
12910     case GE:
12911     case GEU:
12912     case LE:
12913     case LEU:
12914
12915       return false; /* All arguments must be in registers.  */
12916
12917     case FMA:
12918       op0 = XEXP (x, 0);
12919       op1 = XEXP (x, 1);
12920       op2 = XEXP (x, 2);
12921
12922       if (speed)
12923         {
12924           if (VECTOR_MODE_P (mode))
12925             *cost += extra_cost->vect.alu;
12926           else
12927             *cost += extra_cost->fp[mode == DFmode].fma;
12928         }
12929
12930       /* FMSUB, FNMADD, and FNMSUB are free.  */
12931       if (GET_CODE (op0) == NEG)
12932         op0 = XEXP (op0, 0);
12933
12934       if (GET_CODE (op2) == NEG)
12935         op2 = XEXP (op2, 0);
12936
12937       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12938          and the by-element operand as operand 0.  */
12939       if (GET_CODE (op1) == NEG)
12940         op1 = XEXP (op1, 0);
12941
12942       /* Catch vector-by-element operations.  The by-element operand can
12943          either be (vec_duplicate (vec_select (x))) or just
12944          (vec_select (x)), depending on whether we are multiplying by
12945          a vector or a scalar.
12946
12947          Canonicalization is not very good in these cases, FMA4 will put the
12948          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
12949       if (GET_CODE (op0) == VEC_DUPLICATE)
12950         op0 = XEXP (op0, 0);
12951       else if (GET_CODE (op1) == VEC_DUPLICATE)
12952         op1 = XEXP (op1, 0);
12953
12954       if (GET_CODE (op0) == VEC_SELECT)
12955         op0 = XEXP (op0, 0);
12956       else if (GET_CODE (op1) == VEC_SELECT)
12957         op1 = XEXP (op1, 0);
12958
12959       /* If the remaining parameters are not registers,
12960          get the cost to put them into registers.  */
12961       *cost += rtx_cost (op0, mode, FMA, 0, speed);
12962       *cost += rtx_cost (op1, mode, FMA, 1, speed);
12963       *cost += rtx_cost (op2, mode, FMA, 2, speed);
12964       return true;
12965
12966     case FLOAT:
12967     case UNSIGNED_FLOAT:
12968       if (speed)
12969         *cost += extra_cost->fp[mode == DFmode].fromint;
12970       return false;
12971
12972     case FLOAT_EXTEND:
12973       if (speed)
12974         {
12975           if (VECTOR_MODE_P (mode))
12976             {
12977               /*Vector truncate.  */
12978               *cost += extra_cost->vect.alu;
12979             }
12980           else
12981             *cost += extra_cost->fp[mode == DFmode].widen;
12982         }
12983       return false;
12984
12985     case FLOAT_TRUNCATE:
12986       if (speed)
12987         {
12988           if (VECTOR_MODE_P (mode))
12989             {
12990               /*Vector conversion.  */
12991               *cost += extra_cost->vect.alu;
12992             }
12993           else
12994             *cost += extra_cost->fp[mode == DFmode].narrow;
12995         }
12996       return false;
12997
12998     case FIX:
12999     case UNSIGNED_FIX:
13000       x = XEXP (x, 0);
13001       /* Strip the rounding part.  They will all be implemented
13002          by the fcvt* family of instructions anyway.  */
13003       if (GET_CODE (x) == UNSPEC)
13004         {
13005           unsigned int uns_code = XINT (x, 1);
13006
13007           if (uns_code == UNSPEC_FRINTA
13008               || uns_code == UNSPEC_FRINTM
13009               || uns_code == UNSPEC_FRINTN
13010               || uns_code == UNSPEC_FRINTP
13011               || uns_code == UNSPEC_FRINTZ)
13012             x = XVECEXP (x, 0, 0);
13013         }
13014
13015       if (speed)
13016         {
13017           if (VECTOR_MODE_P (mode))
13018             *cost += extra_cost->vect.alu;
13019           else
13020             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13021         }
13022
13023       /* We can combine fmul by a power of 2 followed by a fcvt into a single
13024          fixed-point fcvt.  */
13025       if (GET_CODE (x) == MULT
13026           && ((VECTOR_MODE_P (mode)
13027                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13028               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13029         {
13030           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13031                              0, speed);
13032           return true;
13033         }
13034
13035       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
13036       return true;
13037
13038     case ABS:
13039       if (VECTOR_MODE_P (mode))
13040         {
13041           /* ABS (vector).  */
13042           if (speed)
13043             *cost += extra_cost->vect.alu;
13044         }
13045       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13046         {
13047           op0 = XEXP (x, 0);
13048
13049           /* FABD, which is analogous to FADD.  */
13050           if (GET_CODE (op0) == MINUS)
13051             {
13052               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13053               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
13054               if (speed)
13055                 *cost += extra_cost->fp[mode == DFmode].addsub;
13056
13057               return true;
13058             }
13059           /* Simple FABS is analogous to FNEG.  */
13060           if (speed)
13061             *cost += extra_cost->fp[mode == DFmode].neg;
13062         }
13063       else
13064         {
13065           /* Integer ABS will either be split to
13066              two arithmetic instructions, or will be an ABS
13067              (scalar), which we don't model.  */
13068           *cost = COSTS_N_INSNS (2);
13069           if (speed)
13070             *cost += 2 * extra_cost->alu.arith;
13071         }
13072       return false;
13073
13074     case SMAX:
13075     case SMIN:
13076       if (speed)
13077         {
13078           if (VECTOR_MODE_P (mode))
13079             *cost += extra_cost->vect.alu;
13080           else
13081             {
13082               /* FMAXNM/FMINNM/FMAX/FMIN.
13083                  TODO: This may not be accurate for all implementations, but
13084                  we do not model this in the cost tables.  */
13085               *cost += extra_cost->fp[mode == DFmode].addsub;
13086             }
13087         }
13088       return false;
13089
13090     case UNSPEC:
13091       /* The floating point round to integer frint* instructions.  */
13092       if (aarch64_frint_unspec_p (XINT (x, 1)))
13093         {
13094           if (speed)
13095             *cost += extra_cost->fp[mode == DFmode].roundint;
13096
13097           return false;
13098         }
13099
13100       if (XINT (x, 1) == UNSPEC_RBIT)
13101         {
13102           if (speed)
13103             *cost += extra_cost->alu.rev;
13104
13105           return false;
13106         }
13107       break;
13108
13109     case TRUNCATE:
13110
13111       /* Decompose <su>muldi3_highpart.  */
13112       if (/* (truncate:DI  */
13113           mode == DImode
13114           /*   (lshiftrt:TI  */
13115           && GET_MODE (XEXP (x, 0)) == TImode
13116           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13117           /*      (mult:TI  */
13118           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13119           /*        (ANY_EXTEND:TI (reg:DI))
13120                     (ANY_EXTEND:TI (reg:DI)))  */
13121           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13122                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13123               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13124                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13125           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13126           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13127           /*     (const_int 64)  */
13128           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13129           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13130         {
13131           /* UMULH/SMULH.  */
13132           if (speed)
13133             *cost += extra_cost->mult[mode == DImode].extend;
13134           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13135                              mode, MULT, 0, speed);
13136           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13137                              mode, MULT, 1, speed);
13138           return true;
13139         }
13140
13141       /* Fall through.  */
13142     default:
13143       break;
13144     }
13145
13146   if (dump_file
13147       && flag_aarch64_verbose_cost)
13148     fprintf (dump_file,
13149       "\nFailed to cost RTX.  Assuming default cost.\n");
13150
13151   return true;
13152 }
13153
13154 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
13155    calculated for X.  This cost is stored in *COST.  Returns true
13156    if the total cost of X was calculated.  */
13157 static bool
13158 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
13159                    int param, int *cost, bool speed)
13160 {
13161   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
13162
13163   if (dump_file
13164       && flag_aarch64_verbose_cost)
13165     {
13166       print_rtl_single (dump_file, x);
13167       fprintf (dump_file, "\n%s cost: %d (%s)\n",
13168                speed ? "Hot" : "Cold",
13169                *cost, result ? "final" : "partial");
13170     }
13171
13172   return result;
13173 }
13174
13175 static int
13176 aarch64_register_move_cost (machine_mode mode,
13177                             reg_class_t from_i, reg_class_t to_i)
13178 {
13179   enum reg_class from = (enum reg_class) from_i;
13180   enum reg_class to = (enum reg_class) to_i;
13181   const struct cpu_regmove_cost *regmove_cost
13182     = aarch64_tune_params.regmove_cost;
13183
13184   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
13185   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13186       || to == STUB_REGS)
13187     to = GENERAL_REGS;
13188
13189   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13190       || from == STUB_REGS)
13191     from = GENERAL_REGS;
13192
13193   /* Make RDFFR very expensive.  In particular, if we know that the FFR
13194      contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13195      as a way of obtaining a PTRUE.  */
13196   if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13197       && hard_reg_set_subset_p (reg_class_contents[from_i],
13198                                 reg_class_contents[FFR_REGS]))
13199     return 80;
13200
13201   /* Moving between GPR and stack cost is the same as GP2GP.  */
13202   if ((from == GENERAL_REGS && to == STACK_REG)
13203       || (to == GENERAL_REGS && from == STACK_REG))
13204     return regmove_cost->GP2GP;
13205
13206   /* To/From the stack register, we move via the gprs.  */
13207   if (to == STACK_REG || from == STACK_REG)
13208     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13209             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13210
13211   if (known_eq (GET_MODE_SIZE (mode), 16))
13212     {
13213       /* 128-bit operations on general registers require 2 instructions.  */
13214       if (from == GENERAL_REGS && to == GENERAL_REGS)
13215         return regmove_cost->GP2GP * 2;
13216       else if (from == GENERAL_REGS)
13217         return regmove_cost->GP2FP * 2;
13218       else if (to == GENERAL_REGS)
13219         return regmove_cost->FP2GP * 2;
13220
13221       /* When AdvSIMD instructions are disabled it is not possible to move
13222          a 128-bit value directly between Q registers.  This is handled in
13223          secondary reload.  A general register is used as a scratch to move
13224          the upper DI value and the lower DI value is moved directly,
13225          hence the cost is the sum of three moves. */
13226       if (! TARGET_SIMD)
13227         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13228
13229       return regmove_cost->FP2FP;
13230     }
13231
13232   if (from == GENERAL_REGS && to == GENERAL_REGS)
13233     return regmove_cost->GP2GP;
13234   else if (from == GENERAL_REGS)
13235     return regmove_cost->GP2FP;
13236   else if (to == GENERAL_REGS)
13237     return regmove_cost->FP2GP;
13238
13239   return regmove_cost->FP2FP;
13240 }
13241
13242 static int
13243 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13244                           reg_class_t rclass ATTRIBUTE_UNUSED,
13245                           bool in ATTRIBUTE_UNUSED)
13246 {
13247   return aarch64_tune_params.memmov_cost;
13248 }
13249
13250 /* Implement TARGET_INIT_BUILTINS.  */
13251 static void
13252 aarch64_init_builtins ()
13253 {
13254   aarch64_general_init_builtins ();
13255   aarch64_sve::init_builtins ();
13256 }
13257
13258 /* Implement TARGET_FOLD_BUILTIN.  */
13259 static tree
13260 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13261 {
13262   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13263   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13264   tree type = TREE_TYPE (TREE_TYPE (fndecl));
13265   switch (code & AARCH64_BUILTIN_CLASS)
13266     {
13267     case AARCH64_BUILTIN_GENERAL:
13268       return aarch64_general_fold_builtin (subcode, type, nargs, args);
13269
13270     case AARCH64_BUILTIN_SVE:
13271       return NULL_TREE;
13272     }
13273   gcc_unreachable ();
13274 }
13275
13276 /* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
13277 static bool
13278 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13279 {
13280   gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13281   tree fndecl = gimple_call_fndecl (stmt);
13282   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13283   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13284   gimple *new_stmt = NULL;
13285   switch (code & AARCH64_BUILTIN_CLASS)
13286     {
13287     case AARCH64_BUILTIN_GENERAL:
13288       new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13289       break;
13290
13291     case AARCH64_BUILTIN_SVE:
13292       new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13293       break;
13294     }
13295
13296   if (!new_stmt)
13297     return false;
13298
13299   gsi_replace (gsi, new_stmt, true);
13300   return true;
13301 }
13302
13303 /* Implement TARGET_EXPAND_BUILTIN.  */
13304 static rtx
13305 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13306 {
13307   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13308   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13309   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13310   switch (code & AARCH64_BUILTIN_CLASS)
13311     {
13312     case AARCH64_BUILTIN_GENERAL:
13313       return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13314
13315     case AARCH64_BUILTIN_SVE:
13316       return aarch64_sve::expand_builtin (subcode, exp, target);
13317     }
13318   gcc_unreachable ();
13319 }
13320
13321 /* Implement TARGET_BUILTIN_DECL.  */
13322 static tree
13323 aarch64_builtin_decl (unsigned int code, bool initialize_p)
13324 {
13325   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13326   switch (code & AARCH64_BUILTIN_CLASS)
13327     {
13328     case AARCH64_BUILTIN_GENERAL:
13329       return aarch64_general_builtin_decl (subcode, initialize_p);
13330
13331     case AARCH64_BUILTIN_SVE:
13332       return aarch64_sve::builtin_decl (subcode, initialize_p);
13333     }
13334   gcc_unreachable ();
13335 }
13336
13337 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13338    to optimize 1.0/sqrt.  */
13339
13340 static bool
13341 use_rsqrt_p (machine_mode mode)
13342 {
13343   return (!flag_trapping_math
13344           && flag_unsafe_math_optimizations
13345           && ((aarch64_tune_params.approx_modes->recip_sqrt
13346                & AARCH64_APPROX_MODE (mode))
13347               || flag_mrecip_low_precision_sqrt));
13348 }
13349
13350 /* Function to decide when to use the approximate reciprocal square root
13351    builtin.  */
13352
13353 static tree
13354 aarch64_builtin_reciprocal (tree fndecl)
13355 {
13356   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13357
13358   if (!use_rsqrt_p (mode))
13359     return NULL_TREE;
13360   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13361   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13362   switch (code & AARCH64_BUILTIN_CLASS)
13363     {
13364     case AARCH64_BUILTIN_GENERAL:
13365       return aarch64_general_builtin_rsqrt (subcode);
13366
13367     case AARCH64_BUILTIN_SVE:
13368       return NULL_TREE;
13369     }
13370   gcc_unreachable ();
13371 }
13372
13373 /* Emit code to perform the floating-point operation:
13374
13375      DST = SRC1 * SRC2
13376
13377    where all three operands are already known to be registers.
13378    If the operation is an SVE one, PTRUE is a suitable all-true
13379    predicate.  */
13380
13381 static void
13382 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13383 {
13384   if (ptrue)
13385     emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13386                                  dst, ptrue, src1, src2,
13387                                  gen_int_mode (SVE_RELAXED_GP, SImode)));
13388   else
13389     emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13390 }
13391
13392 /* Emit instruction sequence to compute either the approximate square root
13393    or its approximate reciprocal, depending on the flag RECP, and return
13394    whether the sequence was emitted or not.  */
13395
13396 bool
13397 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
13398 {
13399   machine_mode mode = GET_MODE (dst);
13400
13401   if (GET_MODE_INNER (mode) == HFmode)
13402     {
13403       gcc_assert (!recp);
13404       return false;
13405     }
13406
13407   if (!recp)
13408     {
13409       if (!(flag_mlow_precision_sqrt
13410             || (aarch64_tune_params.approx_modes->sqrt
13411                 & AARCH64_APPROX_MODE (mode))))
13412         return false;
13413
13414       if (!flag_finite_math_only
13415           || flag_trapping_math
13416           || !flag_unsafe_math_optimizations
13417           || optimize_function_for_size_p (cfun))
13418         return false;
13419     }
13420   else
13421     /* Caller assumes we cannot fail.  */
13422     gcc_assert (use_rsqrt_p (mode));
13423
13424   rtx pg = NULL_RTX;
13425   if (aarch64_sve_mode_p (mode))
13426     pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13427   machine_mode mmsk = (VECTOR_MODE_P (mode)
13428                        ? related_int_vector_mode (mode).require ()
13429                        : int_mode_for_mode (mode).require ());
13430   rtx xmsk = NULL_RTX;
13431   if (!recp)
13432     {
13433       /* When calculating the approximate square root, compare the
13434          argument with 0.0 and create a mask.  */
13435       rtx zero = CONST0_RTX (mode);
13436       if (pg)
13437         {
13438           xmsk = gen_reg_rtx (GET_MODE (pg));
13439           rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13440           emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13441                                            xmsk, pg, hint, src, zero));
13442         }
13443       else
13444         {
13445           xmsk = gen_reg_rtx (mmsk);
13446           emit_insn (gen_rtx_SET (xmsk,
13447                                   gen_rtx_NEG (mmsk,
13448                                                gen_rtx_EQ (mmsk, src, zero))));
13449         }
13450     }
13451
13452   /* Estimate the approximate reciprocal square root.  */
13453   rtx xdst = gen_reg_rtx (mode);
13454   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
13455
13456   /* Iterate over the series twice for SF and thrice for DF.  */
13457   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13458
13459   /* Optionally iterate over the series once less for faster performance
13460      while sacrificing the accuracy.  */
13461   if ((recp && flag_mrecip_low_precision_sqrt)
13462       || (!recp && flag_mlow_precision_sqrt))
13463     iterations--;
13464
13465   /* Iterate over the series to calculate the approximate reciprocal square
13466      root.  */
13467   rtx x1 = gen_reg_rtx (mode);
13468   while (iterations--)
13469     {
13470       rtx x2 = gen_reg_rtx (mode);
13471       aarch64_emit_mult (x2, pg, xdst, xdst);
13472
13473       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
13474
13475       if (iterations > 0)
13476         aarch64_emit_mult (xdst, pg, xdst, x1);
13477     }
13478
13479   if (!recp)
13480     {
13481       if (pg)
13482         /* Multiply nonzero source values by the corresponding intermediate
13483            result elements, so that the final calculation is the approximate
13484            square root rather than its reciprocal.  Select a zero result for
13485            zero source values, to avoid the Inf * 0 -> NaN that we'd get
13486            otherwise.  */
13487         emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13488                              xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13489       else
13490         {
13491           /* Qualify the approximate reciprocal square root when the
13492              argument is 0.0 by squashing the intermediary result to 0.0.  */
13493           rtx xtmp = gen_reg_rtx (mmsk);
13494           emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13495                                             gen_rtx_SUBREG (mmsk, xdst, 0)));
13496           emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
13497
13498           /* Calculate the approximate square root.  */
13499           aarch64_emit_mult (xdst, pg, xdst, src);
13500         }
13501     }
13502
13503   /* Finalize the approximation.  */
13504   aarch64_emit_mult (dst, pg, xdst, x1);
13505
13506   return true;
13507 }
13508
13509 /* Emit the instruction sequence to compute the approximation for the division
13510    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
13511
13512 bool
13513 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13514 {
13515   machine_mode mode = GET_MODE (quo);
13516
13517   if (GET_MODE_INNER (mode) == HFmode)
13518     return false;
13519
13520   bool use_approx_division_p = (flag_mlow_precision_div
13521                                 || (aarch64_tune_params.approx_modes->division
13522                                     & AARCH64_APPROX_MODE (mode)));
13523
13524   if (!flag_finite_math_only
13525       || flag_trapping_math
13526       || !flag_unsafe_math_optimizations
13527       || optimize_function_for_size_p (cfun)
13528       || !use_approx_division_p)
13529     return false;
13530
13531   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13532     return false;
13533
13534   rtx pg = NULL_RTX;
13535   if (aarch64_sve_mode_p (mode))
13536     pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13537
13538   /* Estimate the approximate reciprocal.  */
13539   rtx xrcp = gen_reg_rtx (mode);
13540   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
13541
13542   /* Iterate over the series twice for SF and thrice for DF.  */
13543   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13544
13545   /* Optionally iterate over the series less for faster performance,
13546      while sacrificing the accuracy.  The default is 2 for DF and 1 for SF.  */
13547   if (flag_mlow_precision_div)
13548     iterations = (GET_MODE_INNER (mode) == DFmode
13549                   ? aarch64_double_recp_precision
13550                   : aarch64_float_recp_precision);
13551
13552   /* Iterate over the series to calculate the approximate reciprocal.  */
13553   rtx xtmp = gen_reg_rtx (mode);
13554   while (iterations--)
13555     {
13556       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
13557
13558       if (iterations > 0)
13559         aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
13560     }
13561
13562   if (num != CONST1_RTX (mode))
13563     {
13564       /* As the approximate reciprocal of DEN is already calculated, only
13565          calculate the approximate division when NUM is not 1.0.  */
13566       rtx xnum = force_reg (mode, num);
13567       aarch64_emit_mult (xrcp, pg, xrcp, xnum);
13568     }
13569
13570   /* Finalize the approximation.  */
13571   aarch64_emit_mult (quo, pg, xrcp, xtmp);
13572   return true;
13573 }
13574
13575 /* Return the number of instructions that can be issued per cycle.  */
13576 static int
13577 aarch64_sched_issue_rate (void)
13578 {
13579   return aarch64_tune_params.issue_rate;
13580 }
13581
13582 /* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
13583 static int
13584 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13585 {
13586   if (DEBUG_INSN_P (insn))
13587     return more;
13588
13589   rtx_code code = GET_CODE (PATTERN (insn));
13590   if (code == USE || code == CLOBBER)
13591     return more;
13592
13593   if (get_attr_type (insn) == TYPE_NO_INSN)
13594     return more;
13595
13596   return more - 1;
13597 }
13598
13599 static int
13600 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13601 {
13602   int issue_rate = aarch64_sched_issue_rate ();
13603
13604   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13605 }
13606
13607
13608 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13609    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
13610    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
13611
13612 static int
13613 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13614                                                     int ready_index)
13615 {
13616   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13617 }
13618
13619
13620 /* Vectorizer cost model target hooks.  */
13621
13622 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
13623 static int
13624 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13625                                     tree vectype,
13626                                     int misalign ATTRIBUTE_UNUSED)
13627 {
13628   unsigned elements;
13629   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13630   bool fp = false;
13631
13632   if (vectype != NULL)
13633     fp = FLOAT_TYPE_P (vectype);
13634
13635   switch (type_of_cost)
13636     {
13637       case scalar_stmt:
13638         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13639
13640       case scalar_load:
13641         return costs->scalar_load_cost;
13642
13643       case scalar_store:
13644         return costs->scalar_store_cost;
13645
13646       case vector_stmt:
13647         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13648
13649       case vector_load:
13650         return costs->vec_align_load_cost;
13651
13652       case vector_store:
13653         return costs->vec_store_cost;
13654
13655       case vec_to_scalar:
13656         return costs->vec_to_scalar_cost;
13657
13658       case scalar_to_vec:
13659         return costs->scalar_to_vec_cost;
13660
13661       case unaligned_load:
13662       case vector_gather_load:
13663         return costs->vec_unalign_load_cost;
13664
13665       case unaligned_store:
13666       case vector_scatter_store:
13667         return costs->vec_unalign_store_cost;
13668
13669       case cond_branch_taken:
13670         return costs->cond_taken_branch_cost;
13671
13672       case cond_branch_not_taken:
13673         return costs->cond_not_taken_branch_cost;
13674
13675       case vec_perm:
13676         return costs->vec_permute_cost;
13677
13678       case vec_promote_demote:
13679         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13680
13681       case vec_construct:
13682         elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
13683         return elements / 2 + 1;
13684
13685       default:
13686         gcc_unreachable ();
13687     }
13688 }
13689
13690 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
13691    vectors would produce a series of LDP or STP operations.  KIND is the
13692    kind of statement that STMT_INFO represents.  */
13693 static bool
13694 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
13695                            stmt_vec_info stmt_info)
13696 {
13697   switch (kind)
13698     {
13699     case vector_load:
13700     case vector_store:
13701     case unaligned_load:
13702     case unaligned_store:
13703       break;
13704
13705     default:
13706       return false;
13707     }
13708
13709   if (aarch64_tune_params.extra_tuning_flags
13710       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
13711     return false;
13712
13713   return is_gimple_assign (stmt_info->stmt);
13714 }
13715
13716 /* Return true if STMT_INFO extends the result of a load.  */
13717 static bool
13718 aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
13719 {
13720   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13721   if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13722     return false;
13723
13724   tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13725   tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13726   tree rhs_type = TREE_TYPE (rhs);
13727   if (!INTEGRAL_TYPE_P (lhs_type)
13728       || !INTEGRAL_TYPE_P (rhs_type)
13729       || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13730     return false;
13731
13732   stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
13733   return (def_stmt_info
13734           && STMT_VINFO_DATA_REF (def_stmt_info)
13735           && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13736 }
13737
13738 /* Return true if STMT_INFO is an integer truncation.  */
13739 static bool
13740 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13741 {
13742   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13743   if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13744     return false;
13745
13746   tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13747   tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13748   return (INTEGRAL_TYPE_P (lhs_type)
13749           && INTEGRAL_TYPE_P (rhs_type)
13750           && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13751 }
13752
13753 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13754    for STMT_INFO, which has cost kind KIND and which when vectorized would
13755    operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
13756    targets.  */
13757 static unsigned int
13758 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
13759                               stmt_vec_info stmt_info, tree vectype,
13760                               unsigned int stmt_cost)
13761 {
13762   /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13763      vector register size or number of units.  Integer promotions of this
13764      type therefore map to SXT[BHW] or UXT[BHW].
13765
13766      Most loads have extending forms that can do the sign or zero extension
13767      on the fly.  Optimistically assume that a load followed by an extension
13768      will fold to this form during combine, and that the extension therefore
13769      comes for free.  */
13770   if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
13771     stmt_cost = 0;
13772
13773   /* For similar reasons, vector_stmt integer truncations are a no-op,
13774      because we can just ignore the unused upper bits of the source.  */
13775   if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13776     stmt_cost = 0;
13777
13778   /* Advanced SIMD can load and store pairs of registers using LDP and STP,
13779      but there are no equivalent instructions for SVE.  This means that
13780      (all other things being equal) 128-bit SVE needs twice as many load
13781      and store instructions as Advanced SIMD in order to process vector pairs.
13782
13783      Also, scalar code can often use LDP and STP to access pairs of values,
13784      so it is too simplistic to say that one SVE load or store replaces
13785      VF scalar loads and stores.
13786
13787      Ideally we would account for this in the scalar and Advanced SIMD
13788      costs by making suitable load/store pairs as cheap as a single
13789      load/store.  However, that would be a very invasive change and in
13790      practice it tends to stress other parts of the cost model too much.
13791      E.g. stores of scalar constants currently count just a store,
13792      whereas stores of vector constants count a store and a vec_init.
13793      This is an artificial distinction for AArch64, where stores of
13794      nonzero scalar constants need the same kind of register invariant
13795      as vector stores.
13796
13797      An alternative would be to double the cost of any SVE loads and stores
13798      that could be paired in Advanced SIMD (and possibly also paired in
13799      scalar code).  But this tends to stress other parts of the cost model
13800      in the same way.  It also means that we can fall back to Advanced SIMD
13801      even if full-loop predication would have been useful.
13802
13803      Here we go for a more conservative version: double the costs of SVE
13804      loads and stores if one iteration of the scalar loop processes enough
13805      elements for it to use a whole number of Advanced SIMD LDP or STP
13806      instructions.  This makes it very likely that the VF would be 1 for
13807      Advanced SIMD, and so no epilogue should be needed.  */
13808   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
13809     {
13810       stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
13811       unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
13812       unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
13813       if (multiple_p (count * elt_bits, 256)
13814           && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
13815         stmt_cost *= 2;
13816     }
13817
13818   return stmt_cost;
13819 }
13820
13821 /* Implement targetm.vectorize.add_stmt_cost.  */
13822 static unsigned
13823 aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
13824                        enum vect_cost_for_stmt kind,
13825                        struct _stmt_vec_info *stmt_info, tree vectype,
13826                        int misalign, enum vect_cost_model_location where)
13827 {
13828   unsigned *cost = (unsigned *) data;
13829   unsigned retval = 0;
13830
13831   if (flag_vect_cost_model)
13832     {
13833       int stmt_cost =
13834             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13835
13836       if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13837         stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
13838                                                   vectype, stmt_cost);
13839
13840       /* Statements in an inner loop relative to the loop being
13841          vectorized are weighted more heavily.  The value here is
13842          arbitrary and could potentially be improved with analysis.  */
13843       if (where == vect_body && stmt_info
13844           && stmt_in_inner_loop_p (vinfo, stmt_info))
13845         count *= 50; /*  FIXME  */
13846
13847       retval = (unsigned) (count * stmt_cost);
13848       cost[where] += retval;
13849     }
13850
13851   return retval;
13852 }
13853
13854 static void initialize_aarch64_code_model (struct gcc_options *);
13855
13856 /* Parse the TO_PARSE string and put the architecture struct that it
13857    selects into RES and the architectural features into ISA_FLAGS.
13858    Return an aarch64_parse_opt_result describing the parse result.
13859    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13860    When the TO_PARSE string contains an invalid extension,
13861    a copy of the string is created and stored to INVALID_EXTENSION.  */
13862
13863 static enum aarch64_parse_opt_result
13864 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13865                     uint64_t *isa_flags, std::string *invalid_extension)
13866 {
13867   const char *ext;
13868   const struct processor *arch;
13869   size_t len;
13870
13871   ext = strchr (to_parse, '+');
13872
13873   if (ext != NULL)
13874     len = ext - to_parse;
13875   else
13876     len = strlen (to_parse);
13877
13878   if (len == 0)
13879     return AARCH64_PARSE_MISSING_ARG;
13880
13881
13882   /* Loop through the list of supported ARCHes to find a match.  */
13883   for (arch = all_architectures; arch->name != NULL; arch++)
13884     {
13885       if (strlen (arch->name) == len
13886           && strncmp (arch->name, to_parse, len) == 0)
13887         {
13888           uint64_t isa_temp = arch->flags;
13889
13890           if (ext != NULL)
13891             {
13892               /* TO_PARSE string contains at least one extension.  */
13893               enum aarch64_parse_opt_result ext_res
13894                 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13895
13896               if (ext_res != AARCH64_PARSE_OK)
13897                 return ext_res;
13898             }
13899           /* Extension parsing was successful.  Confirm the result
13900              arch and ISA flags.  */
13901           *res = arch;
13902           *isa_flags = isa_temp;
13903           return AARCH64_PARSE_OK;
13904         }
13905     }
13906
13907   /* ARCH name not found in list.  */
13908   return AARCH64_PARSE_INVALID_ARG;
13909 }
13910
13911 /* Parse the TO_PARSE string and put the result tuning in RES and the
13912    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
13913    describing the parse result.  If there is an error parsing, RES and
13914    ISA_FLAGS are left unchanged.
13915    When the TO_PARSE string contains an invalid extension,
13916    a copy of the string is created and stored to INVALID_EXTENSION.  */
13917
13918 static enum aarch64_parse_opt_result
13919 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13920                    uint64_t *isa_flags, std::string *invalid_extension)
13921 {
13922   const char *ext;
13923   const struct processor *cpu;
13924   size_t len;
13925
13926   ext = strchr (to_parse, '+');
13927
13928   if (ext != NULL)
13929     len = ext - to_parse;
13930   else
13931     len = strlen (to_parse);
13932
13933   if (len == 0)
13934     return AARCH64_PARSE_MISSING_ARG;
13935
13936
13937   /* Loop through the list of supported CPUs to find a match.  */
13938   for (cpu = all_cores; cpu->name != NULL; cpu++)
13939     {
13940       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13941         {
13942           uint64_t isa_temp = cpu->flags;
13943
13944
13945           if (ext != NULL)
13946             {
13947               /* TO_PARSE string contains at least one extension.  */
13948               enum aarch64_parse_opt_result ext_res
13949                 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13950
13951               if (ext_res != AARCH64_PARSE_OK)
13952                 return ext_res;
13953             }
13954           /* Extension parsing was successfull.  Confirm the result
13955              cpu and ISA flags.  */
13956           *res = cpu;
13957           *isa_flags = isa_temp;
13958           return AARCH64_PARSE_OK;
13959         }
13960     }
13961
13962   /* CPU name not found in list.  */
13963   return AARCH64_PARSE_INVALID_ARG;
13964 }
13965
13966 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13967    Return an aarch64_parse_opt_result describing the parse result.
13968    If the parsing fails the RES does not change.  */
13969
13970 static enum aarch64_parse_opt_result
13971 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13972 {
13973   const struct processor *cpu;
13974
13975   /* Loop through the list of supported CPUs to find a match.  */
13976   for (cpu = all_cores; cpu->name != NULL; cpu++)
13977     {
13978       if (strcmp (cpu->name, to_parse) == 0)
13979         {
13980           *res = cpu;
13981           return AARCH64_PARSE_OK;
13982         }
13983     }
13984
13985   /* CPU name not found in list.  */
13986   return AARCH64_PARSE_INVALID_ARG;
13987 }
13988
13989 /* Parse TOKEN, which has length LENGTH to see if it is an option
13990    described in FLAG.  If it is, return the index bit for that fusion type.
13991    If not, error (printing OPTION_NAME) and return zero.  */
13992
13993 static unsigned int
13994 aarch64_parse_one_option_token (const char *token,
13995                                 size_t length,
13996                                 const struct aarch64_flag_desc *flag,
13997                                 const char *option_name)
13998 {
13999   for (; flag->name != NULL; flag++)
14000     {
14001       if (length == strlen (flag->name)
14002           && !strncmp (flag->name, token, length))
14003         return flag->flag;
14004     }
14005
14006   error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
14007   return 0;
14008 }
14009
14010 /* Parse OPTION which is a comma-separated list of flags to enable.
14011    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
14012    default state we inherit from the CPU tuning structures.  OPTION_NAME
14013    gives the top-level option we are parsing in the -moverride string,
14014    for use in error messages.  */
14015
14016 static unsigned int
14017 aarch64_parse_boolean_options (const char *option,
14018                                const struct aarch64_flag_desc *flags,
14019                                unsigned int initial_state,
14020                                const char *option_name)
14021 {
14022   const char separator = '.';
14023   const char* specs = option;
14024   const char* ntoken = option;
14025   unsigned int found_flags = initial_state;
14026
14027   while ((ntoken = strchr (specs, separator)))
14028     {
14029       size_t token_length = ntoken - specs;
14030       unsigned token_ops = aarch64_parse_one_option_token (specs,
14031                                                            token_length,
14032                                                            flags,
14033                                                            option_name);
14034       /* If we find "none" (or, for simplicity's sake, an error) anywhere
14035          in the token stream, reset the supported operations.  So:
14036
14037            adrp+add.cmp+branch.none.adrp+add
14038
14039            would have the result of turning on only adrp+add fusion.  */
14040       if (!token_ops)
14041         found_flags = 0;
14042
14043       found_flags |= token_ops;
14044       specs = ++ntoken;
14045     }
14046
14047   /* We ended with a comma, print something.  */
14048   if (!(*specs))
14049     {
14050       error ("%s string ill-formed\n", option_name);
14051       return 0;
14052     }
14053
14054   /* We still have one more token to parse.  */
14055   size_t token_length = strlen (specs);
14056   unsigned token_ops = aarch64_parse_one_option_token (specs,
14057                                                        token_length,
14058                                                        flags,
14059                                                        option_name);
14060    if (!token_ops)
14061      found_flags = 0;
14062
14063   found_flags |= token_ops;
14064   return found_flags;
14065 }
14066
14067 /* Support for overriding instruction fusion.  */
14068
14069 static void
14070 aarch64_parse_fuse_string (const char *fuse_string,
14071                             struct tune_params *tune)
14072 {
14073   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
14074                                                      aarch64_fusible_pairs,
14075                                                      tune->fusible_ops,
14076                                                      "fuse=");
14077 }
14078
14079 /* Support for overriding other tuning flags.  */
14080
14081 static void
14082 aarch64_parse_tune_string (const char *tune_string,
14083                             struct tune_params *tune)
14084 {
14085   tune->extra_tuning_flags
14086     = aarch64_parse_boolean_options (tune_string,
14087                                      aarch64_tuning_flags,
14088                                      tune->extra_tuning_flags,
14089                                      "tune=");
14090 }
14091
14092 /* Parse the sve_width tuning moverride string in TUNE_STRING.
14093    Accept the valid SVE vector widths allowed by
14094    aarch64_sve_vector_bits_enum and use it to override sve_width
14095    in TUNE.  */
14096
14097 static void
14098 aarch64_parse_sve_width_string (const char *tune_string,
14099                                 struct tune_params *tune)
14100 {
14101   int width = -1;
14102
14103   int n = sscanf (tune_string, "%d", &width);
14104   if (n == EOF)
14105     {
14106       error ("invalid format for sve_width");
14107       return;
14108     }
14109   switch (width)
14110     {
14111     case SVE_128:
14112     case SVE_256:
14113     case SVE_512:
14114     case SVE_1024:
14115     case SVE_2048:
14116       break;
14117     default:
14118       error ("invalid sve_width value: %d", width);
14119     }
14120   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
14121 }
14122
14123 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
14124    we understand.  If it is, extract the option string and handoff to
14125    the appropriate function.  */
14126
14127 void
14128 aarch64_parse_one_override_token (const char* token,
14129                                   size_t length,
14130                                   struct tune_params *tune)
14131 {
14132   const struct aarch64_tuning_override_function *fn
14133     = aarch64_tuning_override_functions;
14134
14135   const char *option_part = strchr (token, '=');
14136   if (!option_part)
14137     {
14138       error ("tuning string missing in option (%s)", token);
14139       return;
14140     }
14141
14142   /* Get the length of the option name.  */
14143   length = option_part - token;
14144   /* Skip the '=' to get to the option string.  */
14145   option_part++;
14146
14147   for (; fn->name != NULL; fn++)
14148     {
14149       if (!strncmp (fn->name, token, length))
14150         {
14151           fn->parse_override (option_part, tune);
14152           return;
14153         }
14154     }
14155
14156   error ("unknown tuning option (%s)",token);
14157   return;
14158 }
14159
14160 /* A checking mechanism for the implementation of the tls size.  */
14161
14162 static void
14163 initialize_aarch64_tls_size (struct gcc_options *opts)
14164 {
14165   if (aarch64_tls_size == 0)
14166     aarch64_tls_size = 24;
14167
14168   switch (opts->x_aarch64_cmodel_var)
14169     {
14170     case AARCH64_CMODEL_TINY:
14171       /* Both the default and maximum TLS size allowed under tiny is 1M which
14172          needs two instructions to address, so we clamp the size to 24.  */
14173       if (aarch64_tls_size > 24)
14174         aarch64_tls_size = 24;
14175       break;
14176     case AARCH64_CMODEL_SMALL:
14177       /* The maximum TLS size allowed under small is 4G.  */
14178       if (aarch64_tls_size > 32)
14179         aarch64_tls_size = 32;
14180       break;
14181     case AARCH64_CMODEL_LARGE:
14182       /* The maximum TLS size allowed under large is 16E.
14183          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
14184       if (aarch64_tls_size > 48)
14185         aarch64_tls_size = 48;
14186       break;
14187     default:
14188       gcc_unreachable ();
14189     }
14190
14191   return;
14192 }
14193
14194 /* Parse STRING looking for options in the format:
14195      string     :: option:string
14196      option     :: name=substring
14197      name       :: {a-z}
14198      substring  :: defined by option.  */
14199
14200 static void
14201 aarch64_parse_override_string (const char* input_string,
14202                                struct tune_params* tune)
14203 {
14204   const char separator = ':';
14205   size_t string_length = strlen (input_string) + 1;
14206   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
14207   char *string = string_root;
14208   strncpy (string, input_string, string_length);
14209   string[string_length - 1] = '\0';
14210
14211   char* ntoken = string;
14212
14213   while ((ntoken = strchr (string, separator)))
14214     {
14215       size_t token_length = ntoken - string;
14216       /* Make this substring look like a string.  */
14217       *ntoken = '\0';
14218       aarch64_parse_one_override_token (string, token_length, tune);
14219       string = ++ntoken;
14220     }
14221
14222   /* One last option to parse.  */
14223   aarch64_parse_one_override_token (string, strlen (string), tune);
14224   free (string_root);
14225 }
14226
14227
14228 static void
14229 aarch64_override_options_after_change_1 (struct gcc_options *opts)
14230 {
14231   if (accepted_branch_protection_string)
14232     {
14233       opts->x_aarch64_branch_protection_string
14234         = xstrdup (accepted_branch_protection_string);
14235     }
14236
14237   /* PR 70044: We have to be careful about being called multiple times for the
14238      same function.  This means all changes should be repeatable.  */
14239
14240   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14241      Disable the frame pointer flag so the mid-end will not use a frame
14242      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14243      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14244      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
14245   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
14246   if (opts->x_flag_omit_frame_pointer == 0)
14247     opts->x_flag_omit_frame_pointer = 2;
14248
14249   /* If not optimizing for size, set the default
14250      alignment to what the target wants.  */
14251   if (!opts->x_optimize_size)
14252     {
14253       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14254         opts->x_str_align_loops = aarch64_tune_params.loop_align;
14255       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14256         opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14257       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14258         opts->x_str_align_functions = aarch64_tune_params.function_align;
14259     }
14260
14261   /* We default to no pc-relative literal loads.  */
14262
14263   aarch64_pcrelative_literal_loads = false;
14264
14265   /* If -mpc-relative-literal-loads is set on the command line, this
14266      implies that the user asked for PC relative literal loads.  */
14267   if (opts->x_pcrelative_literal_loads == 1)
14268     aarch64_pcrelative_literal_loads = true;
14269
14270   /* In the tiny memory model it makes no sense to disallow PC relative
14271      literal pool loads.  */
14272   if (aarch64_cmodel == AARCH64_CMODEL_TINY
14273       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14274     aarch64_pcrelative_literal_loads = true;
14275
14276   /* When enabling the lower precision Newton series for the square root, also
14277      enable it for the reciprocal square root, since the latter is an
14278      intermediary step for the former.  */
14279   if (flag_mlow_precision_sqrt)
14280     flag_mrecip_low_precision_sqrt = true;
14281 }
14282
14283 /* 'Unpack' up the internal tuning structs and update the options
14284     in OPTS.  The caller must have set up selected_tune and selected_arch
14285     as all the other target-specific codegen decisions are
14286     derived from them.  */
14287
14288 void
14289 aarch64_override_options_internal (struct gcc_options *opts)
14290 {
14291   aarch64_tune_flags = selected_tune->flags;
14292   aarch64_tune = selected_tune->sched_core;
14293   /* Make a copy of the tuning parameters attached to the core, which
14294      we may later overwrite.  */
14295   aarch64_tune_params = *(selected_tune->tune);
14296   aarch64_architecture_version = selected_arch->architecture_version;
14297
14298   if (opts->x_aarch64_override_tune_string)
14299     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14300                                   &aarch64_tune_params);
14301
14302   /* This target defaults to strict volatile bitfields.  */
14303   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14304     opts->x_flag_strict_volatile_bitfields = 1;
14305
14306   if (aarch64_stack_protector_guard == SSP_GLOBAL
14307       && opts->x_aarch64_stack_protector_guard_offset_str)
14308     {
14309       error ("incompatible options %<-mstack-protector-guard=global%> and "
14310              "%<-mstack-protector-guard-offset=%s%>",
14311              aarch64_stack_protector_guard_offset_str);
14312     }
14313
14314   if (aarch64_stack_protector_guard == SSP_SYSREG
14315       && !(opts->x_aarch64_stack_protector_guard_offset_str
14316            && opts->x_aarch64_stack_protector_guard_reg_str))
14317     {
14318       error ("both %<-mstack-protector-guard-offset%> and "
14319              "%<-mstack-protector-guard-reg%> must be used "
14320              "with %<-mstack-protector-guard=sysreg%>");
14321     }
14322
14323   if (opts->x_aarch64_stack_protector_guard_reg_str)
14324     {
14325       if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14326           error ("specify a system register with a small string length.");
14327     }
14328
14329   if (opts->x_aarch64_stack_protector_guard_offset_str)
14330     {
14331       char *end;
14332       const char *str = aarch64_stack_protector_guard_offset_str;
14333       errno = 0;
14334       long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14335       if (!*str || *end || errno)
14336         error ("%qs is not a valid offset in %qs", str,
14337                "-mstack-protector-guard-offset=");
14338       aarch64_stack_protector_guard_offset = offs;
14339     }
14340
14341   initialize_aarch64_code_model (opts);
14342   initialize_aarch64_tls_size (opts);
14343
14344   int queue_depth = 0;
14345   switch (aarch64_tune_params.autoprefetcher_model)
14346     {
14347       case tune_params::AUTOPREFETCHER_OFF:
14348         queue_depth = -1;
14349         break;
14350       case tune_params::AUTOPREFETCHER_WEAK:
14351         queue_depth = 0;
14352         break;
14353       case tune_params::AUTOPREFETCHER_STRONG:
14354         queue_depth = max_insn_queue_index + 1;
14355         break;
14356       default:
14357         gcc_unreachable ();
14358     }
14359
14360   /* We don't mind passing in global_options_set here as we don't use
14361      the *options_set structs anyway.  */
14362   SET_OPTION_IF_UNSET (opts, &global_options_set,
14363                        param_sched_autopref_queue_depth, queue_depth);
14364
14365   /* Set up parameters to be used in prefetching algorithm.  Do not
14366      override the defaults unless we are tuning for a core we have
14367      researched values for.  */
14368   if (aarch64_tune_params.prefetch->num_slots > 0)
14369     SET_OPTION_IF_UNSET (opts, &global_options_set,
14370                          param_simultaneous_prefetches,
14371                          aarch64_tune_params.prefetch->num_slots);
14372   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
14373     SET_OPTION_IF_UNSET (opts, &global_options_set,
14374                          param_l1_cache_size,
14375                          aarch64_tune_params.prefetch->l1_cache_size);
14376   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
14377     SET_OPTION_IF_UNSET (opts, &global_options_set,
14378                          param_l1_cache_line_size,
14379                          aarch64_tune_params.prefetch->l1_cache_line_size);
14380   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
14381     SET_OPTION_IF_UNSET (opts, &global_options_set,
14382                          param_l2_cache_size,
14383                          aarch64_tune_params.prefetch->l2_cache_size);
14384   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
14385     SET_OPTION_IF_UNSET (opts, &global_options_set,
14386                          param_prefetch_dynamic_strides, 0);
14387   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
14388     SET_OPTION_IF_UNSET (opts, &global_options_set,
14389                          param_prefetch_minimum_stride,
14390                          aarch64_tune_params.prefetch->minimum_stride);
14391
14392   /* Use the alternative scheduling-pressure algorithm by default.  */
14393   SET_OPTION_IF_UNSET (opts, &global_options_set,
14394                        param_sched_pressure_algorithm,
14395                        SCHED_PRESSURE_MODEL);
14396
14397   /* Validate the guard size.  */
14398   int guard_size = param_stack_clash_protection_guard_size;
14399
14400   if (guard_size != 12 && guard_size != 16)
14401     error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14402            "size.  Given value %d (%llu KB) is out of range",
14403            guard_size, (1ULL << guard_size) / 1024ULL);
14404
14405   /* Enforce that interval is the same size as size so the mid-end does the
14406      right thing.  */
14407   SET_OPTION_IF_UNSET (opts, &global_options_set,
14408                        param_stack_clash_protection_probe_interval,
14409                        guard_size);
14410
14411   /* The maybe_set calls won't update the value if the user has explicitly set
14412      one.  Which means we need to validate that probing interval and guard size
14413      are equal.  */
14414   int probe_interval
14415     = param_stack_clash_protection_probe_interval;
14416   if (guard_size != probe_interval)
14417     error ("stack clash guard size %<%d%> must be equal to probing interval "
14418            "%<%d%>", guard_size, probe_interval);
14419
14420   /* Enable sw prefetching at specified optimization level for
14421      CPUS that have prefetch.  Lower optimization level threshold by 1
14422      when profiling is enabled.  */
14423   if (opts->x_flag_prefetch_loop_arrays < 0
14424       && !opts->x_optimize_size
14425       && aarch64_tune_params.prefetch->default_opt_level >= 0
14426       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14427     opts->x_flag_prefetch_loop_arrays = 1;
14428
14429   if (opts->x_aarch64_arch_string == NULL)
14430     opts->x_aarch64_arch_string = selected_arch->name;
14431   if (opts->x_aarch64_cpu_string == NULL)
14432     opts->x_aarch64_cpu_string = selected_cpu->name;
14433   if (opts->x_aarch64_tune_string == NULL)
14434     opts->x_aarch64_tune_string = selected_tune->name;
14435
14436   aarch64_override_options_after_change_1 (opts);
14437 }
14438
14439 /* Print a hint with a suggestion for a core or architecture name that
14440    most closely resembles what the user passed in STR.  ARCH is true if
14441    the user is asking for an architecture name.  ARCH is false if the user
14442    is asking for a core name.  */
14443
14444 static void
14445 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14446 {
14447   auto_vec<const char *> candidates;
14448   const struct processor *entry = arch ? all_architectures : all_cores;
14449   for (; entry->name != NULL; entry++)
14450     candidates.safe_push (entry->name);
14451
14452 #ifdef HAVE_LOCAL_CPU_DETECT
14453   /* Add also "native" as possible value.  */
14454   if (arch)
14455     candidates.safe_push ("native");
14456 #endif
14457
14458   char *s;
14459   const char *hint = candidates_list_and_hint (str, s, candidates);
14460   if (hint)
14461     inform (input_location, "valid arguments are: %s;"
14462                              " did you mean %qs?", s, hint);
14463   else
14464     inform (input_location, "valid arguments are: %s", s);
14465
14466   XDELETEVEC (s);
14467 }
14468
14469 /* Print a hint with a suggestion for a core name that most closely resembles
14470    what the user passed in STR.  */
14471
14472 inline static void
14473 aarch64_print_hint_for_core (const char *str)
14474 {
14475   aarch64_print_hint_for_core_or_arch (str, false);
14476 }
14477
14478 /* Print a hint with a suggestion for an architecture name that most closely
14479    resembles what the user passed in STR.  */
14480
14481 inline static void
14482 aarch64_print_hint_for_arch (const char *str)
14483 {
14484   aarch64_print_hint_for_core_or_arch (str, true);
14485 }
14486
14487
14488 /* Print a hint with a suggestion for an extension name
14489    that most closely resembles what the user passed in STR.  */
14490
14491 void
14492 aarch64_print_hint_for_extensions (const std::string &str)
14493 {
14494   auto_vec<const char *> candidates;
14495   aarch64_get_all_extension_candidates (&candidates);
14496   char *s;
14497   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14498   if (hint)
14499     inform (input_location, "valid arguments are: %s;"
14500                              " did you mean %qs?", s, hint);
14501   else
14502     inform (input_location, "valid arguments are: %s;", s);
14503
14504   XDELETEVEC (s);
14505 }
14506
14507 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
14508    specified in STR and throw errors if appropriate.  Put the results if
14509    they are valid in RES and ISA_FLAGS.  Return whether the option is
14510    valid.  */
14511
14512 static bool
14513 aarch64_validate_mcpu (const char *str, const struct processor **res,
14514                        uint64_t *isa_flags)
14515 {
14516   std::string invalid_extension;
14517   enum aarch64_parse_opt_result parse_res
14518     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
14519
14520   if (parse_res == AARCH64_PARSE_OK)
14521     return true;
14522
14523   switch (parse_res)
14524     {
14525       case AARCH64_PARSE_MISSING_ARG:
14526         error ("missing cpu name in %<-mcpu=%s%>", str);
14527         break;
14528       case AARCH64_PARSE_INVALID_ARG:
14529         error ("unknown value %qs for %<-mcpu%>", str);
14530         aarch64_print_hint_for_core (str);
14531         break;
14532       case AARCH64_PARSE_INVALID_FEATURE:
14533         error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14534                invalid_extension.c_str (), str);
14535         aarch64_print_hint_for_extensions (invalid_extension);
14536         break;
14537       default:
14538         gcc_unreachable ();
14539     }
14540
14541   return false;
14542 }
14543
14544 /* Straight line speculation indicators.  */
14545 enum aarch64_sls_hardening_type
14546 {
14547   SLS_NONE = 0,
14548   SLS_RETBR = 1,
14549   SLS_BLR = 2,
14550   SLS_ALL = 3,
14551 };
14552 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
14553
14554 /* Return whether we should mitigatate Straight Line Speculation for the RET
14555    and BR instructions.  */
14556 bool
14557 aarch64_harden_sls_retbr_p (void)
14558 {
14559   return aarch64_sls_hardening & SLS_RETBR;
14560 }
14561
14562 /* Return whether we should mitigatate Straight Line Speculation for the BLR
14563    instruction.  */
14564 bool
14565 aarch64_harden_sls_blr_p (void)
14566 {
14567   return aarch64_sls_hardening & SLS_BLR;
14568 }
14569
14570 /* As of yet we only allow setting these options globally, in the future we may
14571    allow setting them per function.  */
14572 static void
14573 aarch64_validate_sls_mitigation (const char *const_str)
14574 {
14575   char *token_save = NULL;
14576   char *str = NULL;
14577
14578   if (strcmp (const_str, "none") == 0)
14579     {
14580       aarch64_sls_hardening = SLS_NONE;
14581       return;
14582     }
14583   if (strcmp (const_str, "all") == 0)
14584     {
14585       aarch64_sls_hardening = SLS_ALL;
14586       return;
14587     }
14588
14589   char *str_root = xstrdup (const_str);
14590   str = strtok_r (str_root, ",", &token_save);
14591   if (!str)
14592     error ("invalid argument given to %<-mharden-sls=%>");
14593
14594   int temp = SLS_NONE;
14595   while (str)
14596     {
14597       if (strcmp (str, "blr") == 0)
14598         temp |= SLS_BLR;
14599       else if (strcmp (str, "retbr") == 0)
14600         temp |= SLS_RETBR;
14601       else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
14602         {
14603           error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
14604           break;
14605         }
14606       else
14607         {
14608           error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
14609           break;
14610         }
14611       str = strtok_r (NULL, ",", &token_save);
14612     }
14613   aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
14614   free (str_root);
14615 }
14616
14617 /* Parses CONST_STR for branch protection features specified in
14618    aarch64_branch_protect_types, and set any global variables required.  Returns
14619    the parsing result and assigns LAST_STR to the last processed token from
14620    CONST_STR so that it can be used for error reporting.  */
14621
14622 static enum
14623 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
14624                                                           char** last_str)
14625 {
14626   char *str_root = xstrdup (const_str);
14627   char* token_save = NULL;
14628   char *str = strtok_r (str_root, "+", &token_save);
14629   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14630   if (!str)
14631     res = AARCH64_PARSE_MISSING_ARG;
14632   else
14633     {
14634       char *next_str = strtok_r (NULL, "+", &token_save);
14635       /* Reset the branch protection features to their defaults.  */
14636       aarch64_handle_no_branch_protection (NULL, NULL);
14637
14638       while (str && res == AARCH64_PARSE_OK)
14639         {
14640           const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14641           bool found = false;
14642           /* Search for this type.  */
14643           while (type && type->name && !found && res == AARCH64_PARSE_OK)
14644             {
14645               if (strcmp (str, type->name) == 0)
14646                 {
14647                   found = true;
14648                   res = type->handler (str, next_str);
14649                   str = next_str;
14650                   next_str = strtok_r (NULL, "+", &token_save);
14651                 }
14652               else
14653                 type++;
14654             }
14655           if (found && res == AARCH64_PARSE_OK)
14656             {
14657               bool found_subtype = true;
14658               /* Loop through each token until we find one that isn't a
14659                  subtype.  */
14660               while (found_subtype)
14661                 {
14662                   found_subtype = false;
14663                   const aarch64_branch_protect_type *subtype = type->subtypes;
14664                   /* Search for the subtype.  */
14665                   while (str && subtype && subtype->name && !found_subtype
14666                           && res == AARCH64_PARSE_OK)
14667                     {
14668                       if (strcmp (str, subtype->name) == 0)
14669                         {
14670                           found_subtype = true;
14671                           res = subtype->handler (str, next_str);
14672                           str = next_str;
14673                           next_str = strtok_r (NULL, "+", &token_save);
14674                         }
14675                       else
14676                         subtype++;
14677                     }
14678                 }
14679             }
14680           else if (!found)
14681             res = AARCH64_PARSE_INVALID_ARG;
14682         }
14683     }
14684   /* Copy the last processed token into the argument to pass it back.
14685     Used by option and attribute validation to print the offending token.  */
14686   if (last_str)
14687     {
14688       if (str) strcpy (*last_str, str);
14689       else *last_str = NULL;
14690     }
14691   if (res == AARCH64_PARSE_OK)
14692     {
14693       /* If needed, alloc the accepted string then copy in const_str.
14694         Used by override_option_after_change_1.  */
14695       if (!accepted_branch_protection_string)
14696         accepted_branch_protection_string = (char *) xmalloc (
14697                                                       BRANCH_PROTECT_STR_MAX
14698                                                         + 1);
14699       strncpy (accepted_branch_protection_string, const_str,
14700                 BRANCH_PROTECT_STR_MAX + 1);
14701       /* Forcibly null-terminate.  */
14702       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14703     }
14704   return res;
14705 }
14706
14707 static bool
14708 aarch64_validate_mbranch_protection (const char *const_str)
14709 {
14710   char *str = (char *) xmalloc (strlen (const_str));
14711   enum aarch64_parse_opt_result res =
14712     aarch64_parse_branch_protection (const_str, &str);
14713   if (res == AARCH64_PARSE_INVALID_ARG)
14714     error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
14715   else if (res == AARCH64_PARSE_MISSING_ARG)
14716     error ("missing argument for %<-mbranch-protection=%>");
14717   free (str);
14718   return res == AARCH64_PARSE_OK;
14719 }
14720
14721 /* Validate a command-line -march option.  Parse the arch and extensions
14722    (if any) specified in STR and throw errors if appropriate.  Put the
14723    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
14724    option is valid.  */
14725
14726 static bool
14727 aarch64_validate_march (const char *str, const struct processor **res,
14728                          uint64_t *isa_flags)
14729 {
14730   std::string invalid_extension;
14731   enum aarch64_parse_opt_result parse_res
14732     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
14733
14734   if (parse_res == AARCH64_PARSE_OK)
14735     return true;
14736
14737   switch (parse_res)
14738     {
14739       case AARCH64_PARSE_MISSING_ARG:
14740         error ("missing arch name in %<-march=%s%>", str);
14741         break;
14742       case AARCH64_PARSE_INVALID_ARG:
14743         error ("unknown value %qs for %<-march%>", str);
14744         aarch64_print_hint_for_arch (str);
14745         break;
14746       case AARCH64_PARSE_INVALID_FEATURE:
14747         error ("invalid feature modifier %qs in %<-march=%s%>",
14748                invalid_extension.c_str (), str);
14749         aarch64_print_hint_for_extensions (invalid_extension);
14750         break;
14751       default:
14752         gcc_unreachable ();
14753     }
14754
14755   return false;
14756 }
14757
14758 /* Validate a command-line -mtune option.  Parse the cpu
14759    specified in STR and throw errors if appropriate.  Put the
14760    result, if it is valid, in RES.  Return whether the option is
14761    valid.  */
14762
14763 static bool
14764 aarch64_validate_mtune (const char *str, const struct processor **res)
14765 {
14766   enum aarch64_parse_opt_result parse_res
14767     = aarch64_parse_tune (str, res);
14768
14769   if (parse_res == AARCH64_PARSE_OK)
14770     return true;
14771
14772   switch (parse_res)
14773     {
14774       case AARCH64_PARSE_MISSING_ARG:
14775         error ("missing cpu name in %<-mtune=%s%>", str);
14776         break;
14777       case AARCH64_PARSE_INVALID_ARG:
14778         error ("unknown value %qs for %<-mtune%>", str);
14779         aarch64_print_hint_for_core (str);
14780         break;
14781       default:
14782         gcc_unreachable ();
14783     }
14784   return false;
14785 }
14786
14787 /* Return the CPU corresponding to the enum CPU.
14788    If it doesn't specify a cpu, return the default.  */
14789
14790 static const struct processor *
14791 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14792 {
14793   if (cpu != aarch64_none)
14794     return &all_cores[cpu];
14795
14796   /* The & 0x3f is to extract the bottom 6 bits that encode the
14797      default cpu as selected by the --with-cpu GCC configure option
14798      in config.gcc.
14799      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14800      flags mechanism should be reworked to make it more sane.  */
14801   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14802 }
14803
14804 /* Return the architecture corresponding to the enum ARCH.
14805    If it doesn't specify a valid architecture, return the default.  */
14806
14807 static const struct processor *
14808 aarch64_get_arch (enum aarch64_arch arch)
14809 {
14810   if (arch != aarch64_no_arch)
14811     return &all_architectures[arch];
14812
14813   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14814
14815   return &all_architectures[cpu->arch];
14816 }
14817
14818 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
14819
14820 static poly_uint16
14821 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14822 {
14823   /* 128-bit SVE and Advanced SIMD modes use different register layouts
14824      on big-endian targets, so we would need to forbid subregs that convert
14825      from one to the other.  By default a reinterpret sequence would then
14826      involve a store to memory in one mode and a load back in the other.
14827      Even if we optimize that sequence using reverse instructions,
14828      it would still be a significant potential overhead.
14829
14830      For now, it seems better to generate length-agnostic code for that
14831      case instead.  */
14832   if (value == SVE_SCALABLE
14833       || (value == SVE_128 && BYTES_BIG_ENDIAN))
14834     return poly_uint16 (2, 2);
14835   else
14836     return (int) value / 64;
14837 }
14838
14839 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
14840    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
14841    tuning structs.  In particular it must set selected_tune and
14842    aarch64_isa_flags that define the available ISA features and tuning
14843    decisions.  It must also set selected_arch as this will be used to
14844    output the .arch asm tags for each function.  */
14845
14846 static void
14847 aarch64_override_options (void)
14848 {
14849   uint64_t cpu_isa = 0;
14850   uint64_t arch_isa = 0;
14851   aarch64_isa_flags = 0;
14852
14853   bool valid_cpu = true;
14854   bool valid_tune = true;
14855   bool valid_arch = true;
14856
14857   selected_cpu = NULL;
14858   selected_arch = NULL;
14859   selected_tune = NULL;
14860
14861   if (aarch64_harden_sls_string)
14862     aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
14863
14864   if (aarch64_branch_protection_string)
14865     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
14866
14867   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14868      If either of -march or -mtune is given, they override their
14869      respective component of -mcpu.  */
14870   if (aarch64_cpu_string)
14871     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
14872                                         &cpu_isa);
14873
14874   if (aarch64_arch_string)
14875     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
14876                                           &arch_isa);
14877
14878   if (aarch64_tune_string)
14879     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14880
14881 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14882   SUBTARGET_OVERRIDE_OPTIONS;
14883 #endif
14884
14885   /* If the user did not specify a processor, choose the default
14886      one for them.  This will be the CPU set during configuration using
14887      --with-cpu, otherwise it is "generic".  */
14888   if (!selected_cpu)
14889     {
14890       if (selected_arch)
14891         {
14892           selected_cpu = &all_cores[selected_arch->ident];
14893           aarch64_isa_flags = arch_isa;
14894           explicit_arch = selected_arch->arch;
14895         }
14896       else
14897         {
14898           /* Get default configure-time CPU.  */
14899           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14900           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14901         }
14902
14903       if (selected_tune)
14904         explicit_tune_core = selected_tune->ident;
14905     }
14906   /* If both -mcpu and -march are specified check that they are architecturally
14907      compatible, warn if they're not and prefer the -march ISA flags.  */
14908   else if (selected_arch)
14909     {
14910       if (selected_arch->arch != selected_cpu->arch)
14911         {
14912           warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14913                        aarch64_cpu_string,
14914                        aarch64_arch_string);
14915         }
14916       aarch64_isa_flags = arch_isa;
14917       explicit_arch = selected_arch->arch;
14918       explicit_tune_core = selected_tune ? selected_tune->ident
14919                                           : selected_cpu->ident;
14920     }
14921   else
14922     {
14923       /* -mcpu but no -march.  */
14924       aarch64_isa_flags = cpu_isa;
14925       explicit_tune_core = selected_tune ? selected_tune->ident
14926                                           : selected_cpu->ident;
14927       gcc_assert (selected_cpu);
14928       selected_arch = &all_architectures[selected_cpu->arch];
14929       explicit_arch = selected_arch->arch;
14930     }
14931
14932   /* Set the arch as well as we will need it when outputing
14933      the .arch directive in assembly.  */
14934   if (!selected_arch)
14935     {
14936       gcc_assert (selected_cpu);
14937       selected_arch = &all_architectures[selected_cpu->arch];
14938     }
14939
14940   if (!selected_tune)
14941     selected_tune = selected_cpu;
14942
14943   if (aarch64_enable_bti == 2)
14944     {
14945 #ifdef TARGET_ENABLE_BTI
14946       aarch64_enable_bti = 1;
14947 #else
14948       aarch64_enable_bti = 0;
14949 #endif
14950     }
14951
14952   /* Return address signing is currently not supported for ILP32 targets.  For
14953      LP64 targets use the configured option in the absence of a command-line
14954      option for -mbranch-protection.  */
14955   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14956     {
14957 #ifdef TARGET_ENABLE_PAC_RET
14958       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14959 #else
14960       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14961 #endif
14962     }
14963
14964 #ifndef HAVE_AS_MABI_OPTION
14965   /* The compiler may have been configured with 2.23.* binutils, which does
14966      not have support for ILP32.  */
14967   if (TARGET_ILP32)
14968     error ("assembler does not support %<-mabi=ilp32%>");
14969 #endif
14970
14971   /* Convert -msve-vector-bits to a VG count.  */
14972   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14973
14974   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14975     sorry ("return address signing is only supported for %<-mabi=lp64%>");
14976
14977   /* Make sure we properly set up the explicit options.  */
14978   if ((aarch64_cpu_string && valid_cpu)
14979        || (aarch64_tune_string && valid_tune))
14980     gcc_assert (explicit_tune_core != aarch64_none);
14981
14982   if ((aarch64_cpu_string && valid_cpu)
14983        || (aarch64_arch_string && valid_arch))
14984     gcc_assert (explicit_arch != aarch64_no_arch);
14985
14986   /* The pass to insert speculation tracking runs before
14987      shrink-wrapping and the latter does not know how to update the
14988      tracking status.  So disable it in this case.  */
14989   if (aarch64_track_speculation)
14990     flag_shrink_wrap = 0;
14991
14992   aarch64_override_options_internal (&global_options);
14993
14994   /* Save these options as the default ones in case we push and pop them later
14995      while processing functions with potential target attributes.  */
14996   target_option_default_node = target_option_current_node
14997       = build_target_option_node (&global_options);
14998 }
14999
15000 /* Implement targetm.override_options_after_change.  */
15001
15002 static void
15003 aarch64_override_options_after_change (void)
15004 {
15005   aarch64_override_options_after_change_1 (&global_options);
15006 }
15007
15008 /* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
15009 static char *
15010 aarch64_offload_options (void)
15011 {
15012   if (TARGET_ILP32)
15013     return xstrdup ("-foffload-abi=ilp32");
15014   else
15015     return xstrdup ("-foffload-abi=lp64");
15016 }
15017
15018 static struct machine_function *
15019 aarch64_init_machine_status (void)
15020 {
15021   struct machine_function *machine;
15022   machine = ggc_cleared_alloc<machine_function> ();
15023   return machine;
15024 }
15025
15026 void
15027 aarch64_init_expanders (void)
15028 {
15029   init_machine_status = aarch64_init_machine_status;
15030 }
15031
15032 /* A checking mechanism for the implementation of the various code models.  */
15033 static void
15034 initialize_aarch64_code_model (struct gcc_options *opts)
15035 {
15036   aarch64_cmodel = opts->x_aarch64_cmodel_var;
15037   switch (opts->x_aarch64_cmodel_var)
15038     {
15039     case AARCH64_CMODEL_TINY:
15040       if (opts->x_flag_pic)
15041         aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
15042       break;
15043     case AARCH64_CMODEL_SMALL:
15044       if (opts->x_flag_pic)
15045         {
15046 #ifdef HAVE_AS_SMALL_PIC_RELOCS
15047           aarch64_cmodel = (flag_pic == 2
15048                             ? AARCH64_CMODEL_SMALL_PIC
15049                             : AARCH64_CMODEL_SMALL_SPIC);
15050 #else
15051           aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
15052 #endif
15053         }
15054       break;
15055     case AARCH64_CMODEL_LARGE:
15056       if (opts->x_flag_pic)
15057         sorry ("code model %qs with %<-f%s%>", "large",
15058                opts->x_flag_pic > 1 ? "PIC" : "pic");
15059       if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
15060         sorry ("code model %qs not supported in ilp32 mode", "large");
15061       break;
15062     case AARCH64_CMODEL_TINY_PIC:
15063     case AARCH64_CMODEL_SMALL_PIC:
15064     case AARCH64_CMODEL_SMALL_SPIC:
15065       gcc_unreachable ();
15066     }
15067 }
15068
15069 /* Implement TARGET_OPTION_SAVE.  */
15070
15071 static void
15072 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
15073 {
15074   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
15075   ptr->x_aarch64_branch_protection_string
15076     = opts->x_aarch64_branch_protection_string;
15077 }
15078
15079 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
15080    using the information saved in PTR.  */
15081
15082 static void
15083 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
15084 {
15085   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
15086   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15087   opts->x_explicit_arch = ptr->x_explicit_arch;
15088   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
15089   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
15090   opts->x_aarch64_branch_protection_string
15091     = ptr->x_aarch64_branch_protection_string;
15092   if (opts->x_aarch64_branch_protection_string)
15093     {
15094       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
15095                                         NULL);
15096     }
15097
15098   aarch64_override_options_internal (opts);
15099 }
15100
15101 /* Implement TARGET_OPTION_PRINT.  */
15102
15103 static void
15104 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
15105 {
15106   const struct processor *cpu
15107     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15108   uint64_t isa_flags = ptr->x_aarch64_isa_flags;
15109   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
15110   std::string extension
15111     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
15112
15113   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
15114   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
15115            arch->name, extension.c_str ());
15116 }
15117
15118 static GTY(()) tree aarch64_previous_fndecl;
15119
15120 void
15121 aarch64_reset_previous_fndecl (void)
15122 {
15123   aarch64_previous_fndecl = NULL;
15124 }
15125
15126 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
15127    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
15128    make sure optab availability predicates are recomputed when necessary.  */
15129
15130 void
15131 aarch64_save_restore_target_globals (tree new_tree)
15132 {
15133   if (TREE_TARGET_GLOBALS (new_tree))
15134     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
15135   else if (new_tree == target_option_default_node)
15136     restore_target_globals (&default_target_globals);
15137   else
15138     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
15139 }
15140
15141 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
15142    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
15143    of the function, if such exists.  This function may be called multiple
15144    times on a single function so use aarch64_previous_fndecl to avoid
15145    setting up identical state.  */
15146
15147 static void
15148 aarch64_set_current_function (tree fndecl)
15149 {
15150   if (!fndecl || fndecl == aarch64_previous_fndecl)
15151     return;
15152
15153   tree old_tree = (aarch64_previous_fndecl
15154                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
15155                    : NULL_TREE);
15156
15157   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15158
15159   /* If current function has no attributes but the previous one did,
15160      use the default node.  */
15161   if (!new_tree && old_tree)
15162     new_tree = target_option_default_node;
15163
15164   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
15165      the default have been handled by aarch64_save_restore_target_globals from
15166      aarch64_pragma_target_parse.  */
15167   if (old_tree == new_tree)
15168     return;
15169
15170   aarch64_previous_fndecl = fndecl;
15171
15172   /* First set the target options.  */
15173   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
15174
15175   aarch64_save_restore_target_globals (new_tree);
15176 }
15177
15178 /* Enum describing the various ways we can handle attributes.
15179    In many cases we can reuse the generic option handling machinery.  */
15180
15181 enum aarch64_attr_opt_type
15182 {
15183   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
15184   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
15185   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
15186   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
15187 };
15188
15189 /* All the information needed to handle a target attribute.
15190    NAME is the name of the attribute.
15191    ATTR_TYPE specifies the type of behavior of the attribute as described
15192    in the definition of enum aarch64_attr_opt_type.
15193    ALLOW_NEG is true if the attribute supports a "no-" form.
15194    HANDLER is the function that takes the attribute string as an argument
15195    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
15196    OPT_NUM is the enum specifying the option that the attribute modifies.
15197    This is needed for attributes that mirror the behavior of a command-line
15198    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
15199    aarch64_attr_enum.  */
15200
15201 struct aarch64_attribute_info
15202 {
15203   const char *name;
15204   enum aarch64_attr_opt_type attr_type;
15205   bool allow_neg;
15206   bool (*handler) (const char *);
15207   enum opt_code opt_num;
15208 };
15209
15210 /* Handle the ARCH_STR argument to the arch= target attribute.  */
15211
15212 static bool
15213 aarch64_handle_attr_arch (const char *str)
15214 {
15215   const struct processor *tmp_arch = NULL;
15216   std::string invalid_extension;
15217   enum aarch64_parse_opt_result parse_res
15218     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
15219
15220   if (parse_res == AARCH64_PARSE_OK)
15221     {
15222       gcc_assert (tmp_arch);
15223       selected_arch = tmp_arch;
15224       explicit_arch = selected_arch->arch;
15225       return true;
15226     }
15227
15228   switch (parse_res)
15229     {
15230       case AARCH64_PARSE_MISSING_ARG:
15231         error ("missing name in %<target(\"arch=\")%> pragma or attribute");
15232         break;
15233       case AARCH64_PARSE_INVALID_ARG:
15234         error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
15235         aarch64_print_hint_for_arch (str);
15236         break;
15237       case AARCH64_PARSE_INVALID_FEATURE:
15238         error ("invalid feature modifier %s of value (\"%s\") in "
15239                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15240         aarch64_print_hint_for_extensions (invalid_extension);
15241         break;
15242       default:
15243         gcc_unreachable ();
15244     }
15245
15246   return false;
15247 }
15248
15249 /* Handle the argument CPU_STR to the cpu= target attribute.  */
15250
15251 static bool
15252 aarch64_handle_attr_cpu (const char *str)
15253 {
15254   const struct processor *tmp_cpu = NULL;
15255   std::string invalid_extension;
15256   enum aarch64_parse_opt_result parse_res
15257     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
15258
15259   if (parse_res == AARCH64_PARSE_OK)
15260     {
15261       gcc_assert (tmp_cpu);
15262       selected_tune = tmp_cpu;
15263       explicit_tune_core = selected_tune->ident;
15264
15265       selected_arch = &all_architectures[tmp_cpu->arch];
15266       explicit_arch = selected_arch->arch;
15267       return true;
15268     }
15269
15270   switch (parse_res)
15271     {
15272       case AARCH64_PARSE_MISSING_ARG:
15273         error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
15274         break;
15275       case AARCH64_PARSE_INVALID_ARG:
15276         error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
15277         aarch64_print_hint_for_core (str);
15278         break;
15279       case AARCH64_PARSE_INVALID_FEATURE:
15280         error ("invalid feature modifier %s of value (\"%s\") in "
15281                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15282         aarch64_print_hint_for_extensions (invalid_extension);
15283         break;
15284       default:
15285         gcc_unreachable ();
15286     }
15287
15288   return false;
15289 }
15290
15291 /* Handle the argument STR to the branch-protection= attribute.  */
15292
15293  static bool
15294  aarch64_handle_attr_branch_protection (const char* str)
15295  {
15296   char *err_str = (char *) xmalloc (strlen (str) + 1);
15297   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
15298                                                                       &err_str);
15299   bool success = false;
15300   switch (res)
15301     {
15302      case AARCH64_PARSE_MISSING_ARG:
15303        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
15304               " attribute");
15305        break;
15306      case AARCH64_PARSE_INVALID_ARG:
15307        error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
15308               "=\")%> pragma or attribute", err_str);
15309        break;
15310      case AARCH64_PARSE_OK:
15311        success = true;
15312       /* Fall through.  */
15313      case AARCH64_PARSE_INVALID_FEATURE:
15314        break;
15315      default:
15316        gcc_unreachable ();
15317     }
15318   free (err_str);
15319   return success;
15320  }
15321
15322 /* Handle the argument STR to the tune= target attribute.  */
15323
15324 static bool
15325 aarch64_handle_attr_tune (const char *str)
15326 {
15327   const struct processor *tmp_tune = NULL;
15328   enum aarch64_parse_opt_result parse_res
15329     = aarch64_parse_tune (str, &tmp_tune);
15330
15331   if (parse_res == AARCH64_PARSE_OK)
15332     {
15333       gcc_assert (tmp_tune);
15334       selected_tune = tmp_tune;
15335       explicit_tune_core = selected_tune->ident;
15336       return true;
15337     }
15338
15339   switch (parse_res)
15340     {
15341       case AARCH64_PARSE_INVALID_ARG:
15342         error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
15343         aarch64_print_hint_for_core (str);
15344         break;
15345       default:
15346         gcc_unreachable ();
15347     }
15348
15349   return false;
15350 }
15351
15352 /* Parse an architecture extensions target attribute string specified in STR.
15353    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
15354    if successful.  Update aarch64_isa_flags to reflect the ISA features
15355    modified.  */
15356
15357 static bool
15358 aarch64_handle_attr_isa_flags (char *str)
15359 {
15360   enum aarch64_parse_opt_result parse_res;
15361   uint64_t isa_flags = aarch64_isa_flags;
15362
15363   /* We allow "+nothing" in the beginning to clear out all architectural
15364      features if the user wants to handpick specific features.  */
15365   if (strncmp ("+nothing", str, 8) == 0)
15366     {
15367       isa_flags = 0;
15368       str += 8;
15369     }
15370
15371   std::string invalid_extension;
15372   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
15373
15374   if (parse_res == AARCH64_PARSE_OK)
15375     {
15376       aarch64_isa_flags = isa_flags;
15377       return true;
15378     }
15379
15380   switch (parse_res)
15381     {
15382       case AARCH64_PARSE_MISSING_ARG:
15383         error ("missing value in %<target()%> pragma or attribute");
15384         break;
15385
15386       case AARCH64_PARSE_INVALID_FEATURE:
15387         error ("invalid feature modifier %s of value (\"%s\") in "
15388                "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15389         break;
15390
15391       default:
15392         gcc_unreachable ();
15393     }
15394
15395  return false;
15396 }
15397
15398 /* The target attributes that we support.  On top of these we also support just
15399    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
15400    handled explicitly in aarch64_process_one_target_attr.  */
15401
15402 static const struct aarch64_attribute_info aarch64_attributes[] =
15403 {
15404   { "general-regs-only", aarch64_attr_mask, false, NULL,
15405      OPT_mgeneral_regs_only },
15406   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15407      OPT_mfix_cortex_a53_835769 },
15408   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15409      OPT_mfix_cortex_a53_843419 },
15410   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
15411   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
15412   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15413      OPT_momit_leaf_frame_pointer },
15414   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15415   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15416      OPT_march_ },
15417   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15418   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15419      OPT_mtune_ },
15420   { "branch-protection", aarch64_attr_custom, false,
15421      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
15422   { "sign-return-address", aarch64_attr_enum, false, NULL,
15423      OPT_msign_return_address_ },
15424   { "outline-atomics", aarch64_attr_bool, true, NULL,
15425      OPT_moutline_atomics},
15426   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15427 };
15428
15429 /* Parse ARG_STR which contains the definition of one target attribute.
15430    Show appropriate errors if any or return true if the attribute is valid.  */
15431
15432 static bool
15433 aarch64_process_one_target_attr (char *arg_str)
15434 {
15435   bool invert = false;
15436
15437   size_t len = strlen (arg_str);
15438
15439   if (len == 0)
15440     {
15441       error ("malformed %<target()%> pragma or attribute");
15442       return false;
15443     }
15444
15445   char *str_to_check = (char *) alloca (len + 1);
15446   strcpy (str_to_check, arg_str);
15447
15448   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15449      It is easier to detect and handle it explicitly here rather than going
15450      through the machinery for the rest of the target attributes in this
15451      function.  */
15452   if (*str_to_check == '+')
15453     return aarch64_handle_attr_isa_flags (str_to_check);
15454
15455   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15456     {
15457       invert = true;
15458       str_to_check += 3;
15459     }
15460   char *arg = strchr (str_to_check, '=');
15461
15462   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15463      and point ARG to "foo".  */
15464   if (arg)
15465     {
15466       *arg = '\0';
15467       arg++;
15468     }
15469   const struct aarch64_attribute_info *p_attr;
15470   bool found = false;
15471   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15472     {
15473       /* If the names don't match up, or the user has given an argument
15474          to an attribute that doesn't accept one, or didn't give an argument
15475          to an attribute that expects one, fail to match.  */
15476       if (strcmp (str_to_check, p_attr->name) != 0)
15477         continue;
15478
15479       found = true;
15480       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15481                               || p_attr->attr_type == aarch64_attr_enum;
15482
15483       if (attr_need_arg_p ^ (arg != NULL))
15484         {
15485           error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
15486           return false;
15487         }
15488
15489       /* If the name matches but the attribute does not allow "no-" versions
15490          then we can't match.  */
15491       if (invert && !p_attr->allow_neg)
15492         {
15493           error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
15494           return false;
15495         }
15496
15497       switch (p_attr->attr_type)
15498         {
15499         /* Has a custom handler registered.
15500            For example, cpu=, arch=, tune=.  */
15501           case aarch64_attr_custom:
15502             gcc_assert (p_attr->handler);
15503             if (!p_attr->handler (arg))
15504               return false;
15505             break;
15506
15507           /* Either set or unset a boolean option.  */
15508           case aarch64_attr_bool:
15509             {
15510               struct cl_decoded_option decoded;
15511
15512               generate_option (p_attr->opt_num, NULL, !invert,
15513                                CL_TARGET, &decoded);
15514               aarch64_handle_option (&global_options, &global_options_set,
15515                                       &decoded, input_location);
15516               break;
15517             }
15518           /* Set or unset a bit in the target_flags.  aarch64_handle_option
15519              should know what mask to apply given the option number.  */
15520           case aarch64_attr_mask:
15521             {
15522               struct cl_decoded_option decoded;
15523               /* We only need to specify the option number.
15524                  aarch64_handle_option will know which mask to apply.  */
15525               decoded.opt_index = p_attr->opt_num;
15526               decoded.value = !invert;
15527               aarch64_handle_option (&global_options, &global_options_set,
15528                                       &decoded, input_location);
15529               break;
15530             }
15531           /* Use the option setting machinery to set an option to an enum.  */
15532           case aarch64_attr_enum:
15533             {
15534               gcc_assert (arg);
15535               bool valid;
15536               int value;
15537               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15538                                               &value, CL_TARGET);
15539               if (valid)
15540                 {
15541                   set_option (&global_options, NULL, p_attr->opt_num, value,
15542                               NULL, DK_UNSPECIFIED, input_location,
15543                               global_dc);
15544                 }
15545               else
15546                 {
15547                   error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
15548                 }
15549               break;
15550             }
15551           default:
15552             gcc_unreachable ();
15553         }
15554     }
15555
15556   /* If we reached here we either have found an attribute and validated
15557      it or didn't match any.  If we matched an attribute but its arguments
15558      were malformed we will have returned false already.  */
15559   return found;
15560 }
15561
15562 /* Count how many times the character C appears in
15563    NULL-terminated string STR.  */
15564
15565 static unsigned int
15566 num_occurences_in_str (char c, char *str)
15567 {
15568   unsigned int res = 0;
15569   while (*str != '\0')
15570     {
15571       if (*str == c)
15572         res++;
15573
15574       str++;
15575     }
15576
15577   return res;
15578 }
15579
15580 /* Parse the tree in ARGS that contains the target attribute information
15581    and update the global target options space.  */
15582
15583 bool
15584 aarch64_process_target_attr (tree args)
15585 {
15586   if (TREE_CODE (args) == TREE_LIST)
15587     {
15588       do
15589         {
15590           tree head = TREE_VALUE (args);
15591           if (head)
15592             {
15593               if (!aarch64_process_target_attr (head))
15594                 return false;
15595             }
15596           args = TREE_CHAIN (args);
15597         } while (args);
15598
15599       return true;
15600     }
15601
15602   if (TREE_CODE (args) != STRING_CST)
15603     {
15604       error ("attribute %<target%> argument not a string");
15605       return false;
15606     }
15607
15608   size_t len = strlen (TREE_STRING_POINTER (args));
15609   char *str_to_check = (char *) alloca (len + 1);
15610   strcpy (str_to_check, TREE_STRING_POINTER (args));
15611
15612   if (len == 0)
15613     {
15614       error ("malformed %<target()%> pragma or attribute");
15615       return false;
15616     }
15617
15618   /* Used to catch empty spaces between commas i.e.
15619      attribute ((target ("attr1,,attr2"))).  */
15620   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15621
15622   /* Handle multiple target attributes separated by ','.  */
15623   char *token = strtok_r (str_to_check, ",", &str_to_check);
15624
15625   unsigned int num_attrs = 0;
15626   while (token)
15627     {
15628       num_attrs++;
15629       if (!aarch64_process_one_target_attr (token))
15630         {
15631           error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
15632           return false;
15633         }
15634
15635       token = strtok_r (NULL, ",", &str_to_check);
15636     }
15637
15638   if (num_attrs != num_commas + 1)
15639     {
15640       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
15641       return false;
15642     }
15643
15644   return true;
15645 }
15646
15647 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
15648    process attribute ((target ("..."))).  */
15649
15650 static bool
15651 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15652 {
15653   struct cl_target_option cur_target;
15654   bool ret;
15655   tree old_optimize;
15656   tree new_target, new_optimize;
15657   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15658
15659   /* If what we're processing is the current pragma string then the
15660      target option node is already stored in target_option_current_node
15661      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
15662      having to re-parse the string.  This is especially useful to keep
15663      arm_neon.h compile times down since that header contains a lot
15664      of intrinsics enclosed in pragmas.  */
15665   if (!existing_target && args == current_target_pragma)
15666     {
15667       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15668       return true;
15669     }
15670   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15671
15672   old_optimize = build_optimization_node (&global_options);
15673   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15674
15675   /* If the function changed the optimization levels as well as setting
15676      target options, start with the optimizations specified.  */
15677   if (func_optimize && func_optimize != old_optimize)
15678     cl_optimization_restore (&global_options,
15679                              TREE_OPTIMIZATION (func_optimize));
15680
15681   /* Save the current target options to restore at the end.  */
15682   cl_target_option_save (&cur_target, &global_options);
15683
15684   /* If fndecl already has some target attributes applied to it, unpack
15685      them so that we add this attribute on top of them, rather than
15686      overwriting them.  */
15687   if (existing_target)
15688     {
15689       struct cl_target_option *existing_options
15690         = TREE_TARGET_OPTION (existing_target);
15691
15692       if (existing_options)
15693         cl_target_option_restore (&global_options, existing_options);
15694     }
15695   else
15696     cl_target_option_restore (&global_options,
15697                         TREE_TARGET_OPTION (target_option_current_node));
15698
15699   ret = aarch64_process_target_attr (args);
15700
15701   /* Set up any additional state.  */
15702   if (ret)
15703     {
15704       aarch64_override_options_internal (&global_options);
15705       /* Initialize SIMD builtins if we haven't already.
15706          Set current_target_pragma to NULL for the duration so that
15707          the builtin initialization code doesn't try to tag the functions
15708          being built with the attributes specified by any current pragma, thus
15709          going into an infinite recursion.  */
15710       if (TARGET_SIMD)
15711         {
15712           tree saved_current_target_pragma = current_target_pragma;
15713           current_target_pragma = NULL;
15714           aarch64_init_simd_builtins ();
15715           current_target_pragma = saved_current_target_pragma;
15716         }
15717       new_target = build_target_option_node (&global_options);
15718     }
15719   else
15720     new_target = NULL;
15721
15722   new_optimize = build_optimization_node (&global_options);
15723
15724   if (fndecl && ret)
15725     {
15726       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15727
15728       if (old_optimize != new_optimize)
15729         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15730     }
15731
15732   cl_target_option_restore (&global_options, &cur_target);
15733
15734   if (old_optimize != new_optimize)
15735     cl_optimization_restore (&global_options,
15736                              TREE_OPTIMIZATION (old_optimize));
15737   return ret;
15738 }
15739
15740 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
15741    tri-bool options (yes, no, don't care) and the default value is
15742    DEF, determine whether to reject inlining.  */
15743
15744 static bool
15745 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15746                                      int dont_care, int def)
15747 {
15748   /* If the callee doesn't care, always allow inlining.  */
15749   if (callee == dont_care)
15750     return true;
15751
15752   /* If the caller doesn't care, always allow inlining.  */
15753   if (caller == dont_care)
15754     return true;
15755
15756   /* Otherwise, allow inlining if either the callee and caller values
15757      agree, or if the callee is using the default value.  */
15758   return (callee == caller || callee == def);
15759 }
15760
15761 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
15762    to inline CALLEE into CALLER based on target-specific info.
15763    Make sure that the caller and callee have compatible architectural
15764    features.  Then go through the other possible target attributes
15765    and see if they can block inlining.  Try not to reject always_inline
15766    callees unless they are incompatible architecturally.  */
15767
15768 static bool
15769 aarch64_can_inline_p (tree caller, tree callee)
15770 {
15771   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15772   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15773
15774   struct cl_target_option *caller_opts
15775         = TREE_TARGET_OPTION (caller_tree ? caller_tree
15776                                            : target_option_default_node);
15777
15778   struct cl_target_option *callee_opts
15779         = TREE_TARGET_OPTION (callee_tree ? callee_tree
15780                                            : target_option_default_node);
15781
15782   /* Callee's ISA flags should be a subset of the caller's.  */
15783   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15784        != callee_opts->x_aarch64_isa_flags)
15785     return false;
15786
15787   /* Allow non-strict aligned functions inlining into strict
15788      aligned ones.  */
15789   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15790        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15791       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15792            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15793     return false;
15794
15795   bool always_inline = lookup_attribute ("always_inline",
15796                                           DECL_ATTRIBUTES (callee));
15797
15798   /* If the architectural features match up and the callee is always_inline
15799      then the other attributes don't matter.  */
15800   if (always_inline)
15801     return true;
15802
15803   if (caller_opts->x_aarch64_cmodel_var
15804       != callee_opts->x_aarch64_cmodel_var)
15805     return false;
15806
15807   if (caller_opts->x_aarch64_tls_dialect
15808       != callee_opts->x_aarch64_tls_dialect)
15809     return false;
15810
15811   /* Honour explicit requests to workaround errata.  */
15812   if (!aarch64_tribools_ok_for_inlining_p (
15813           caller_opts->x_aarch64_fix_a53_err835769,
15814           callee_opts->x_aarch64_fix_a53_err835769,
15815           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15816     return false;
15817
15818   if (!aarch64_tribools_ok_for_inlining_p (
15819           caller_opts->x_aarch64_fix_a53_err843419,
15820           callee_opts->x_aarch64_fix_a53_err843419,
15821           2, TARGET_FIX_ERR_A53_843419))
15822     return false;
15823
15824   /* If the user explicitly specified -momit-leaf-frame-pointer for the
15825      caller and calle and they don't match up, reject inlining.  */
15826   if (!aarch64_tribools_ok_for_inlining_p (
15827           caller_opts->x_flag_omit_leaf_frame_pointer,
15828           callee_opts->x_flag_omit_leaf_frame_pointer,
15829           2, 1))
15830     return false;
15831
15832   /* If the callee has specific tuning overrides, respect them.  */
15833   if (callee_opts->x_aarch64_override_tune_string != NULL
15834       && caller_opts->x_aarch64_override_tune_string == NULL)
15835     return false;
15836
15837   /* If the user specified tuning override strings for the
15838      caller and callee and they don't match up, reject inlining.
15839      We just do a string compare here, we don't analyze the meaning
15840      of the string, as it would be too costly for little gain.  */
15841   if (callee_opts->x_aarch64_override_tune_string
15842       && caller_opts->x_aarch64_override_tune_string
15843       && (strcmp (callee_opts->x_aarch64_override_tune_string,
15844                   caller_opts->x_aarch64_override_tune_string) != 0))
15845     return false;
15846
15847   return true;
15848 }
15849
15850 /* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't
15851    been already.  */
15852
15853 unsigned int
15854 aarch64_tlsdesc_abi_id ()
15855 {
15856   predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
15857   if (!tlsdesc_abi.initialized_p ())
15858     {
15859       HARD_REG_SET full_reg_clobbers;
15860       CLEAR_HARD_REG_SET (full_reg_clobbers);
15861       SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
15862       SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
15863       for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
15864         SET_HARD_REG_BIT (full_reg_clobbers, regno);
15865       tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
15866     }
15867   return tlsdesc_abi.id ();
15868 }
15869
15870 /* Return true if SYMBOL_REF X binds locally.  */
15871
15872 static bool
15873 aarch64_symbol_binds_local_p (const_rtx x)
15874 {
15875   return (SYMBOL_REF_DECL (x)
15876           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
15877           : SYMBOL_REF_LOCAL_P (x));
15878 }
15879
15880 /* Return true if SYMBOL_REF X is thread local */
15881 static bool
15882 aarch64_tls_symbol_p (rtx x)
15883 {
15884   if (! TARGET_HAVE_TLS)
15885     return false;
15886
15887   if (GET_CODE (x) != SYMBOL_REF)
15888     return false;
15889
15890   return SYMBOL_REF_TLS_MODEL (x) != 0;
15891 }
15892
15893 /* Classify a TLS symbol into one of the TLS kinds.  */
15894 enum aarch64_symbol_type
15895 aarch64_classify_tls_symbol (rtx x)
15896 {
15897   enum tls_model tls_kind = tls_symbolic_operand_type (x);
15898
15899   switch (tls_kind)
15900     {
15901     case TLS_MODEL_GLOBAL_DYNAMIC:
15902     case TLS_MODEL_LOCAL_DYNAMIC:
15903       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15904
15905     case TLS_MODEL_INITIAL_EXEC:
15906       switch (aarch64_cmodel)
15907         {
15908         case AARCH64_CMODEL_TINY:
15909         case AARCH64_CMODEL_TINY_PIC:
15910           return SYMBOL_TINY_TLSIE;
15911         default:
15912           return SYMBOL_SMALL_TLSIE;
15913         }
15914
15915     case TLS_MODEL_LOCAL_EXEC:
15916       if (aarch64_tls_size == 12)
15917         return SYMBOL_TLSLE12;
15918       else if (aarch64_tls_size == 24)
15919         return SYMBOL_TLSLE24;
15920       else if (aarch64_tls_size == 32)
15921         return SYMBOL_TLSLE32;
15922       else if (aarch64_tls_size == 48)
15923         return SYMBOL_TLSLE48;
15924       else
15925         gcc_unreachable ();
15926
15927     case TLS_MODEL_EMULATED:
15928     case TLS_MODEL_NONE:
15929       return SYMBOL_FORCE_TO_MEM;
15930
15931     default:
15932       gcc_unreachable ();
15933     }
15934 }
15935
15936 /* Return the correct method for accessing X + OFFSET, where X is either
15937    a SYMBOL_REF or LABEL_REF.  */
15938
15939 enum aarch64_symbol_type
15940 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15941 {
15942   if (GET_CODE (x) == LABEL_REF)
15943     {
15944       switch (aarch64_cmodel)
15945         {
15946         case AARCH64_CMODEL_LARGE:
15947           return SYMBOL_FORCE_TO_MEM;
15948
15949         case AARCH64_CMODEL_TINY_PIC:
15950         case AARCH64_CMODEL_TINY:
15951           return SYMBOL_TINY_ABSOLUTE;
15952
15953         case AARCH64_CMODEL_SMALL_SPIC:
15954         case AARCH64_CMODEL_SMALL_PIC:
15955         case AARCH64_CMODEL_SMALL:
15956           return SYMBOL_SMALL_ABSOLUTE;
15957
15958         default:
15959           gcc_unreachable ();
15960         }
15961     }
15962
15963   if (GET_CODE (x) == SYMBOL_REF)
15964     {
15965       if (aarch64_tls_symbol_p (x))
15966         return aarch64_classify_tls_symbol (x);
15967
15968       switch (aarch64_cmodel)
15969         {
15970         case AARCH64_CMODEL_TINY:
15971           /* When we retrieve symbol + offset address, we have to make sure
15972              the offset does not cause overflow of the final address.  But
15973              we have no way of knowing the address of symbol at compile time
15974              so we can't accurately say if the distance between the PC and
15975              symbol + offset is outside the addressible range of +/-1MB in the
15976              TINY code model.  So we limit the maximum offset to +/-64KB and
15977              assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15978              If offset_within_block_p is true we allow larger offsets.
15979              Furthermore force to memory if the symbol is a weak reference to
15980              something that doesn't resolve to a symbol in this module.  */
15981
15982           if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15983             return SYMBOL_FORCE_TO_MEM;
15984           if (!(IN_RANGE (offset, -0x10000, 0x10000)
15985                 || offset_within_block_p (x, offset)))
15986             return SYMBOL_FORCE_TO_MEM;
15987
15988           return SYMBOL_TINY_ABSOLUTE;
15989
15990         case AARCH64_CMODEL_SMALL:
15991           /* Same reasoning as the tiny code model, but the offset cap here is
15992              1MB, allowing +/-3.9GB for the offset to the symbol.  */
15993
15994           if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15995             return SYMBOL_FORCE_TO_MEM;
15996           if (!(IN_RANGE (offset, -0x100000, 0x100000)
15997                 || offset_within_block_p (x, offset)))
15998             return SYMBOL_FORCE_TO_MEM;
15999
16000           return SYMBOL_SMALL_ABSOLUTE;
16001
16002         case AARCH64_CMODEL_TINY_PIC:
16003           if (!aarch64_symbol_binds_local_p (x))
16004             return SYMBOL_TINY_GOT;
16005           return SYMBOL_TINY_ABSOLUTE;
16006
16007         case AARCH64_CMODEL_SMALL_SPIC:
16008         case AARCH64_CMODEL_SMALL_PIC:
16009           if (!aarch64_symbol_binds_local_p (x))
16010             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
16011                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
16012           return SYMBOL_SMALL_ABSOLUTE;
16013
16014         case AARCH64_CMODEL_LARGE:
16015           /* This is alright even in PIC code as the constant
16016              pool reference is always PC relative and within
16017              the same translation unit.  */
16018           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
16019             return SYMBOL_SMALL_ABSOLUTE;
16020           else
16021             return SYMBOL_FORCE_TO_MEM;
16022
16023         default:
16024           gcc_unreachable ();
16025         }
16026     }
16027
16028   /* By default push everything into the constant pool.  */
16029   return SYMBOL_FORCE_TO_MEM;
16030 }
16031
16032 bool
16033 aarch64_constant_address_p (rtx x)
16034 {
16035   return (CONSTANT_P (x) && memory_address_p (DImode, x));
16036 }
16037
16038 bool
16039 aarch64_legitimate_pic_operand_p (rtx x)
16040 {
16041   if (GET_CODE (x) == SYMBOL_REF
16042       || (GET_CODE (x) == CONST
16043           && GET_CODE (XEXP (x, 0)) == PLUS
16044           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
16045      return false;
16046
16047   return true;
16048 }
16049
16050 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
16051    that should be rematerialized rather than spilled.  */
16052
16053 static bool
16054 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
16055 {
16056   /* Support CSE and rematerialization of common constants.  */
16057   if (CONST_INT_P (x)
16058       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
16059       || GET_CODE (x) == CONST_VECTOR)
16060     return true;
16061
16062   /* Do not allow vector struct mode constants for Advanced SIMD.
16063      We could support 0 and -1 easily, but they need support in
16064      aarch64-simd.md.  */
16065   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16066   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16067     return false;
16068
16069   /* Only accept variable-length vector constants if they can be
16070      handled directly.
16071
16072      ??? It would be possible to handle rematerialization of other
16073      constants via secondary reloads.  */
16074   if (vec_flags & VEC_ANY_SVE)
16075     return aarch64_simd_valid_immediate (x, NULL);
16076
16077   if (GET_CODE (x) == HIGH)
16078     x = XEXP (x, 0);
16079
16080   /* Accept polynomial constants that can be calculated by using the
16081      destination of a move as the sole temporary.  Constants that
16082      require a second temporary cannot be rematerialized (they can't be
16083      forced to memory and also aren't legitimate constants).  */
16084   poly_int64 offset;
16085   if (poly_int_rtx_p (x, &offset))
16086     return aarch64_offset_temporaries (false, offset) <= 1;
16087
16088   /* If an offset is being added to something else, we need to allow the
16089      base to be moved into the destination register, meaning that there
16090      are no free temporaries for the offset.  */
16091   x = strip_offset (x, &offset);
16092   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
16093     return false;
16094
16095   /* Do not allow const (plus (anchor_symbol, const_int)).  */
16096   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
16097     return false;
16098
16099   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
16100      so spilling them is better than rematerialization.  */
16101   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
16102     return true;
16103
16104   /* Label references are always constant.  */
16105   if (GET_CODE (x) == LABEL_REF)
16106     return true;
16107
16108   return false;
16109 }
16110
16111 rtx
16112 aarch64_load_tp (rtx target)
16113 {
16114   if (!target
16115       || GET_MODE (target) != Pmode
16116       || !register_operand (target, Pmode))
16117     target = gen_reg_rtx (Pmode);
16118
16119   /* Can return in any reg.  */
16120   emit_insn (gen_aarch64_load_tp_hard (target));
16121   return target;
16122 }
16123
16124 /* On AAPCS systems, this is the "struct __va_list".  */
16125 static GTY(()) tree va_list_type;
16126
16127 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
16128    Return the type to use as __builtin_va_list.
16129
16130    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
16131
16132    struct __va_list
16133    {
16134      void *__stack;
16135      void *__gr_top;
16136      void *__vr_top;
16137      int   __gr_offs;
16138      int   __vr_offs;
16139    };  */
16140
16141 static tree
16142 aarch64_build_builtin_va_list (void)
16143 {
16144   tree va_list_name;
16145   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16146
16147   /* Create the type.  */
16148   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
16149   /* Give it the required name.  */
16150   va_list_name = build_decl (BUILTINS_LOCATION,
16151                              TYPE_DECL,
16152                              get_identifier ("__va_list"),
16153                              va_list_type);
16154   DECL_ARTIFICIAL (va_list_name) = 1;
16155   TYPE_NAME (va_list_type) = va_list_name;
16156   TYPE_STUB_DECL (va_list_type) = va_list_name;
16157
16158   /* Create the fields.  */
16159   f_stack = build_decl (BUILTINS_LOCATION,
16160                         FIELD_DECL, get_identifier ("__stack"),
16161                         ptr_type_node);
16162   f_grtop = build_decl (BUILTINS_LOCATION,
16163                         FIELD_DECL, get_identifier ("__gr_top"),
16164                         ptr_type_node);
16165   f_vrtop = build_decl (BUILTINS_LOCATION,
16166                         FIELD_DECL, get_identifier ("__vr_top"),
16167                         ptr_type_node);
16168   f_groff = build_decl (BUILTINS_LOCATION,
16169                         FIELD_DECL, get_identifier ("__gr_offs"),
16170                         integer_type_node);
16171   f_vroff = build_decl (BUILTINS_LOCATION,
16172                         FIELD_DECL, get_identifier ("__vr_offs"),
16173                         integer_type_node);
16174
16175   /* Tell tree-stdarg pass about our internal offset fields.
16176      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
16177      purpose to identify whether the code is updating va_list internal
16178      offset fields through irregular way.  */
16179   va_list_gpr_counter_field = f_groff;
16180   va_list_fpr_counter_field = f_vroff;
16181
16182   DECL_ARTIFICIAL (f_stack) = 1;
16183   DECL_ARTIFICIAL (f_grtop) = 1;
16184   DECL_ARTIFICIAL (f_vrtop) = 1;
16185   DECL_ARTIFICIAL (f_groff) = 1;
16186   DECL_ARTIFICIAL (f_vroff) = 1;
16187
16188   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
16189   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
16190   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
16191   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
16192   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
16193
16194   TYPE_FIELDS (va_list_type) = f_stack;
16195   DECL_CHAIN (f_stack) = f_grtop;
16196   DECL_CHAIN (f_grtop) = f_vrtop;
16197   DECL_CHAIN (f_vrtop) = f_groff;
16198   DECL_CHAIN (f_groff) = f_vroff;
16199
16200   /* Compute its layout.  */
16201   layout_type (va_list_type);
16202
16203   return va_list_type;
16204 }
16205
16206 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
16207 static void
16208 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
16209 {
16210   const CUMULATIVE_ARGS *cum;
16211   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16212   tree stack, grtop, vrtop, groff, vroff;
16213   tree t;
16214   int gr_save_area_size = cfun->va_list_gpr_size;
16215   int vr_save_area_size = cfun->va_list_fpr_size;
16216   int vr_offset;
16217
16218   cum = &crtl->args.info;
16219   if (cfun->va_list_gpr_size)
16220     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
16221                              cfun->va_list_gpr_size);
16222   if (cfun->va_list_fpr_size)
16223     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
16224                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
16225
16226   if (!TARGET_FLOAT)
16227     {
16228       gcc_assert (cum->aapcs_nvrn == 0);
16229       vr_save_area_size = 0;
16230     }
16231
16232   f_stack = TYPE_FIELDS (va_list_type_node);
16233   f_grtop = DECL_CHAIN (f_stack);
16234   f_vrtop = DECL_CHAIN (f_grtop);
16235   f_groff = DECL_CHAIN (f_vrtop);
16236   f_vroff = DECL_CHAIN (f_groff);
16237
16238   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
16239                   NULL_TREE);
16240   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
16241                   NULL_TREE);
16242   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
16243                   NULL_TREE);
16244   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
16245                   NULL_TREE);
16246   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
16247                   NULL_TREE);
16248
16249   /* Emit code to initialize STACK, which points to the next varargs stack
16250      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
16251      by named arguments.  STACK is 8-byte aligned.  */
16252   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
16253   if (cum->aapcs_stack_size > 0)
16254     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
16255   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
16256   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16257
16258   /* Emit code to initialize GRTOP, the top of the GR save area.
16259      virtual_incoming_args_rtx should have been 16 byte aligned.  */
16260   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
16261   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
16262   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16263
16264   /* Emit code to initialize VRTOP, the top of the VR save area.
16265      This address is gr_save_area_bytes below GRTOP, rounded
16266      down to the next 16-byte boundary.  */
16267   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
16268   vr_offset = ROUND_UP (gr_save_area_size,
16269                         STACK_BOUNDARY / BITS_PER_UNIT);
16270
16271   if (vr_offset)
16272     t = fold_build_pointer_plus_hwi (t, -vr_offset);
16273   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
16274   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16275
16276   /* Emit code to initialize GROFF, the offset from GRTOP of the
16277      next GPR argument.  */
16278   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
16279               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
16280   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16281
16282   /* Likewise emit code to initialize VROFF, the offset from FTOP
16283      of the next VR argument.  */
16284   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
16285               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
16286   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16287 }
16288
16289 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
16290
16291 static tree
16292 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
16293                               gimple_seq *post_p ATTRIBUTE_UNUSED)
16294 {
16295   tree addr;
16296   bool indirect_p;
16297   bool is_ha;           /* is HFA or HVA.  */
16298   bool dw_align;        /* double-word align.  */
16299   machine_mode ag_mode = VOIDmode;
16300   int nregs;
16301   machine_mode mode;
16302
16303   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16304   tree stack, f_top, f_off, off, arg, roundup, on_stack;
16305   HOST_WIDE_INT size, rsize, adjust, align;
16306   tree t, u, cond1, cond2;
16307
16308   indirect_p = pass_va_arg_by_reference (type);
16309   if (indirect_p)
16310     type = build_pointer_type (type);
16311
16312   mode = TYPE_MODE (type);
16313
16314   f_stack = TYPE_FIELDS (va_list_type_node);
16315   f_grtop = DECL_CHAIN (f_stack);
16316   f_vrtop = DECL_CHAIN (f_grtop);
16317   f_groff = DECL_CHAIN (f_vrtop);
16318   f_vroff = DECL_CHAIN (f_groff);
16319
16320   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
16321                   f_stack, NULL_TREE);
16322   size = int_size_in_bytes (type);
16323
16324   bool abi_break;
16325   align
16326     = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
16327
16328   dw_align = false;
16329   adjust = 0;
16330   if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
16331                                                &is_ha, false))
16332     {
16333       /* No frontends can create types with variable-sized modes, so we
16334          shouldn't be asked to pass or return them.  */
16335       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16336
16337       /* TYPE passed in fp/simd registers.  */
16338       if (!TARGET_FLOAT)
16339         aarch64_err_no_fpadvsimd (mode);
16340
16341       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16342                       unshare_expr (valist), f_vrtop, NULL_TREE);
16343       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16344                       unshare_expr (valist), f_vroff, NULL_TREE);
16345
16346       rsize = nregs * UNITS_PER_VREG;
16347
16348       if (is_ha)
16349         {
16350           if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16351             adjust = UNITS_PER_VREG - ag_size;
16352         }
16353       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16354                && size < UNITS_PER_VREG)
16355         {
16356           adjust = UNITS_PER_VREG - size;
16357         }
16358     }
16359   else
16360     {
16361       /* TYPE passed in general registers.  */
16362       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16363                       unshare_expr (valist), f_grtop, NULL_TREE);
16364       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16365                       unshare_expr (valist), f_groff, NULL_TREE);
16366       rsize = ROUND_UP (size, UNITS_PER_WORD);
16367       nregs = rsize / UNITS_PER_WORD;
16368
16369       if (align > 8)
16370         {
16371           if (abi_break && warn_psabi)
16372             inform (input_location, "parameter passing for argument of type "
16373                     "%qT changed in GCC 9.1", type);
16374           dw_align = true;
16375         }
16376
16377       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16378           && size < UNITS_PER_WORD)
16379         {
16380           adjust = UNITS_PER_WORD  - size;
16381         }
16382     }
16383
16384   /* Get a local temporary for the field value.  */
16385   off = get_initialized_tmp_var (f_off, pre_p, NULL);
16386
16387   /* Emit code to branch if off >= 0.  */
16388   t = build2 (GE_EXPR, boolean_type_node, off,
16389               build_int_cst (TREE_TYPE (off), 0));
16390   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16391
16392   if (dw_align)
16393     {
16394       /* Emit: offs = (offs + 15) & -16.  */
16395       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16396                   build_int_cst (TREE_TYPE (off), 15));
16397       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16398                   build_int_cst (TREE_TYPE (off), -16));
16399       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16400     }
16401   else
16402     roundup = NULL;
16403
16404   /* Update ap.__[g|v]r_offs  */
16405   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16406               build_int_cst (TREE_TYPE (off), rsize));
16407   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16408
16409   /* String up.  */
16410   if (roundup)
16411     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16412
16413   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
16414   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16415               build_int_cst (TREE_TYPE (f_off), 0));
16416   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16417
16418   /* String up: make sure the assignment happens before the use.  */
16419   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16420   COND_EXPR_ELSE (cond1) = t;
16421
16422   /* Prepare the trees handling the argument that is passed on the stack;
16423      the top level node will store in ON_STACK.  */
16424   arg = get_initialized_tmp_var (stack, pre_p, NULL);
16425   if (align > 8)
16426     {
16427       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
16428       t = fold_build_pointer_plus_hwi (arg, 15);
16429       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16430                   build_int_cst (TREE_TYPE (t), -16));
16431       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16432     }
16433   else
16434     roundup = NULL;
16435   /* Advance ap.__stack  */
16436   t = fold_build_pointer_plus_hwi (arg, size + 7);
16437   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16438               build_int_cst (TREE_TYPE (t), -8));
16439   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16440   /* String up roundup and advance.  */
16441   if (roundup)
16442     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16443   /* String up with arg */
16444   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16445   /* Big-endianness related address adjustment.  */
16446   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16447       && size < UNITS_PER_WORD)
16448   {
16449     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16450                 size_int (UNITS_PER_WORD - size));
16451     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16452   }
16453
16454   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16455   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16456
16457   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
16458   t = off;
16459   if (adjust)
16460     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16461                 build_int_cst (TREE_TYPE (off), adjust));
16462
16463   t = fold_convert (sizetype, t);
16464   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16465
16466   if (is_ha)
16467     {
16468       /* type ha; // treat as "struct {ftype field[n];}"
16469          ... [computing offs]
16470          for (i = 0; i <nregs; ++i, offs += 16)
16471            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16472          return ha;  */
16473       int i;
16474       tree tmp_ha, field_t, field_ptr_t;
16475
16476       /* Declare a local variable.  */
16477       tmp_ha = create_tmp_var_raw (type, "ha");
16478       gimple_add_tmp_var (tmp_ha);
16479
16480       /* Establish the base type.  */
16481       switch (ag_mode)
16482         {
16483         case E_SFmode:
16484           field_t = float_type_node;
16485           field_ptr_t = float_ptr_type_node;
16486           break;
16487         case E_DFmode:
16488           field_t = double_type_node;
16489           field_ptr_t = double_ptr_type_node;
16490           break;
16491         case E_TFmode:
16492           field_t = long_double_type_node;
16493           field_ptr_t = long_double_ptr_type_node;
16494           break;
16495         case E_HFmode:
16496           field_t = aarch64_fp16_type_node;
16497           field_ptr_t = aarch64_fp16_ptr_type_node;
16498           break;
16499         case E_BFmode:
16500           field_t = aarch64_bf16_type_node;
16501           field_ptr_t = aarch64_bf16_ptr_type_node;
16502           break;
16503         case E_V2SImode:
16504         case E_V4SImode:
16505             {
16506               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16507               field_t = build_vector_type_for_mode (innertype, ag_mode);
16508               field_ptr_t = build_pointer_type (field_t);
16509             }
16510           break;
16511         default:
16512           gcc_assert (0);
16513         }
16514
16515       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
16516       TREE_ADDRESSABLE (tmp_ha) = 1;
16517       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16518       addr = t;
16519       t = fold_convert (field_ptr_t, addr);
16520       t = build2 (MODIFY_EXPR, field_t,
16521                   build1 (INDIRECT_REF, field_t, tmp_ha),
16522                   build1 (INDIRECT_REF, field_t, t));
16523
16524       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
16525       for (i = 1; i < nregs; ++i)
16526         {
16527           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16528           u = fold_convert (field_ptr_t, addr);
16529           u = build2 (MODIFY_EXPR, field_t,
16530                       build2 (MEM_REF, field_t, tmp_ha,
16531                               build_int_cst (field_ptr_t,
16532                                              (i *
16533                                               int_size_in_bytes (field_t)))),
16534                       build1 (INDIRECT_REF, field_t, u));
16535           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16536         }
16537
16538       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16539       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16540     }
16541
16542   COND_EXPR_ELSE (cond2) = t;
16543   addr = fold_convert (build_pointer_type (type), cond1);
16544   addr = build_va_arg_indirect_ref (addr);
16545
16546   if (indirect_p)
16547     addr = build_va_arg_indirect_ref (addr);
16548
16549   return addr;
16550 }
16551
16552 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
16553
16554 static void
16555 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16556                                 const function_arg_info &arg,
16557                                 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
16558 {
16559   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16560   CUMULATIVE_ARGS local_cum;
16561   int gr_saved = cfun->va_list_gpr_size;
16562   int vr_saved = cfun->va_list_fpr_size;
16563
16564   /* The caller has advanced CUM up to, but not beyond, the last named
16565      argument.  Advance a local copy of CUM past the last "real" named
16566      argument, to find out how many registers are left over.  */
16567   local_cum = *cum;
16568   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
16569
16570   /* Found out how many registers we need to save.
16571      Honor tree-stdvar analysis results.  */
16572   if (cfun->va_list_gpr_size)
16573     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16574                     cfun->va_list_gpr_size / UNITS_PER_WORD);
16575   if (cfun->va_list_fpr_size)
16576     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16577                     cfun->va_list_fpr_size / UNITS_PER_VREG);
16578
16579   if (!TARGET_FLOAT)
16580     {
16581       gcc_assert (local_cum.aapcs_nvrn == 0);
16582       vr_saved = 0;
16583     }
16584
16585   if (!no_rtl)
16586     {
16587       if (gr_saved > 0)
16588         {
16589           rtx ptr, mem;
16590
16591           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
16592           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16593                                - gr_saved * UNITS_PER_WORD);
16594           mem = gen_frame_mem (BLKmode, ptr);
16595           set_mem_alias_set (mem, get_varargs_alias_set ());
16596
16597           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16598                                mem, gr_saved);
16599         }
16600       if (vr_saved > 0)
16601         {
16602           /* We can't use move_block_from_reg, because it will use
16603              the wrong mode, storing D regs only.  */
16604           machine_mode mode = TImode;
16605           int off, i, vr_start;
16606
16607           /* Set OFF to the offset from virtual_incoming_args_rtx of
16608              the first vector register.  The VR save area lies below
16609              the GR one, and is aligned to 16 bytes.  */
16610           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16611                            STACK_BOUNDARY / BITS_PER_UNIT);
16612           off -= vr_saved * UNITS_PER_VREG;
16613
16614           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16615           for (i = 0; i < vr_saved; ++i)
16616             {
16617               rtx ptr, mem;
16618
16619               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16620               mem = gen_frame_mem (mode, ptr);
16621               set_mem_alias_set (mem, get_varargs_alias_set ());
16622               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
16623               off += UNITS_PER_VREG;
16624             }
16625         }
16626     }
16627
16628   /* We don't save the size into *PRETEND_SIZE because we want to avoid
16629      any complication of having crtl->args.pretend_args_size changed.  */
16630   cfun->machine->frame.saved_varargs_size
16631     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16632                  STACK_BOUNDARY / BITS_PER_UNIT)
16633        + vr_saved * UNITS_PER_VREG);
16634 }
16635
16636 static void
16637 aarch64_conditional_register_usage (void)
16638 {
16639   int i;
16640   if (!TARGET_FLOAT)
16641     {
16642       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16643         {
16644           fixed_regs[i] = 1;
16645           call_used_regs[i] = 1;
16646         }
16647     }
16648   if (!TARGET_SVE)
16649     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16650       {
16651         fixed_regs[i] = 1;
16652         call_used_regs[i] = 1;
16653       }
16654
16655   /* Only allow the FFR and FFRT to be accessed via special patterns.  */
16656   CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16657   CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16658
16659   /* When tracking speculation, we need a couple of call-clobbered registers
16660      to track the speculation state.  It would be nice to just use
16661      IP0 and IP1, but currently there are numerous places that just
16662      assume these registers are free for other uses (eg pointer
16663      authentication).  */
16664   if (aarch64_track_speculation)
16665     {
16666       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16667       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16668       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16669       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16670     }
16671 }
16672
16673 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK.  */
16674
16675 bool
16676 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16677 {
16678   /* For records we're passed a FIELD_DECL, for arrays we're passed
16679      an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
16680   const_tree type = TREE_TYPE (field_or_array);
16681
16682   /* Assign BLKmode to anything that contains multiple SVE predicates.
16683      For structures, the "multiple" case is indicated by MODE being
16684      VOIDmode.  */
16685   unsigned int num_zr, num_pr;
16686   if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16687     {
16688       if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16689         return !simple_cst_equal (TYPE_SIZE (field_or_array),
16690                                   TYPE_SIZE (type));
16691       return mode == VOIDmode;
16692     }
16693
16694   return default_member_type_forces_blk (field_or_array, mode);
16695 }
16696
16697 /* Bitmasks that indicate whether earlier versions of GCC would have
16698    taken a different path through the ABI logic.  This should result in
16699    a -Wpsabi warning if the earlier path led to a different ABI decision.
16700
16701    WARN_PSABI_EMPTY_CXX17_BASE
16702       Indicates that the type includes an artificial empty C++17 base field
16703       that, prior to GCC 10.1, would prevent the type from being treated as
16704       a HFA or HVA.  See PR94383 for details.
16705
16706    WARN_PSABI_NO_UNIQUE_ADDRESS
16707       Indicates that the type includes an empty [[no_unique_address]] field
16708       that, prior to GCC 10.1, would prevent the type from being treated as
16709       a HFA or HVA.  */
16710 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
16711 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
16712
16713 /* Walk down the type tree of TYPE counting consecutive base elements.
16714    If *MODEP is VOIDmode, then set it to the first valid floating point
16715    type.  If a non-floating point type is found, or if a floating point
16716    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
16717    otherwise return the count in the sub-tree.
16718
16719    The WARN_PSABI_FLAGS argument allows the caller to check whether this
16720    function has changed its behavior relative to earlier versions of GCC.
16721    Normally the argument should be nonnull and point to a zero-initialized
16722    variable.  The function then records whether the ABI decision might
16723    be affected by a known fix to the ABI logic, setting the associated
16724    WARN_PSABI_* bits if so.
16725
16726    When the argument is instead a null pointer, the function tries to
16727    simulate the behavior of GCC before all such ABI fixes were made.
16728    This is useful to check whether the function returns something
16729    different after the ABI fixes.  */
16730 static int
16731 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
16732                          unsigned int *warn_psabi_flags)
16733 {
16734   machine_mode mode;
16735   HOST_WIDE_INT size;
16736
16737   if (aarch64_sve::builtin_type_p (type))
16738     return -1;
16739
16740   switch (TREE_CODE (type))
16741     {
16742     case REAL_TYPE:
16743       mode = TYPE_MODE (type);
16744       if (mode != DFmode && mode != SFmode
16745           && mode != TFmode && mode != HFmode)
16746         return -1;
16747
16748       if (*modep == VOIDmode)
16749         *modep = mode;
16750
16751       if (*modep == mode)
16752         return 1;
16753
16754       break;
16755
16756     case COMPLEX_TYPE:
16757       mode = TYPE_MODE (TREE_TYPE (type));
16758       if (mode != DFmode && mode != SFmode
16759           && mode != TFmode && mode != HFmode)
16760         return -1;
16761
16762       if (*modep == VOIDmode)
16763         *modep = mode;
16764
16765       if (*modep == mode)
16766         return 2;
16767
16768       break;
16769
16770     case VECTOR_TYPE:
16771       /* Use V2SImode and V4SImode as representatives of all 64-bit
16772          and 128-bit vector types.  */
16773       size = int_size_in_bytes (type);
16774       switch (size)
16775         {
16776         case 8:
16777           mode = V2SImode;
16778           break;
16779         case 16:
16780           mode = V4SImode;
16781           break;
16782         default:
16783           return -1;
16784         }
16785
16786       if (*modep == VOIDmode)
16787         *modep = mode;
16788
16789       /* Vector modes are considered to be opaque: two vectors are
16790          equivalent for the purposes of being homogeneous aggregates
16791          if they are the same size.  */
16792       if (*modep == mode)
16793         return 1;
16794
16795       break;
16796
16797     case ARRAY_TYPE:
16798       {
16799         int count;
16800         tree index = TYPE_DOMAIN (type);
16801
16802         /* Can't handle incomplete types nor sizes that are not
16803            fixed.  */
16804         if (!COMPLETE_TYPE_P (type)
16805             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16806           return -1;
16807
16808         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
16809                                          warn_psabi_flags);
16810         if (count == -1
16811             || !index
16812             || !TYPE_MAX_VALUE (index)
16813             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
16814             || !TYPE_MIN_VALUE (index)
16815             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
16816             || count < 0)
16817           return -1;
16818
16819         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16820                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
16821
16822         /* There must be no padding.  */
16823         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16824                       count * GET_MODE_BITSIZE (*modep)))
16825           return -1;
16826
16827         return count;
16828       }
16829
16830     case RECORD_TYPE:
16831       {
16832         int count = 0;
16833         int sub_count;
16834         tree field;
16835
16836         /* Can't handle incomplete types nor sizes that are not
16837            fixed.  */
16838         if (!COMPLETE_TYPE_P (type)
16839             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16840           return -1;
16841
16842         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16843           {
16844             if (TREE_CODE (field) != FIELD_DECL)
16845               continue;
16846
16847             if (DECL_FIELD_ABI_IGNORED (field))
16848               {
16849                 /* See whether this is something that earlier versions of
16850                    GCC failed to ignore.  */
16851                 unsigned int flag;
16852                 if (lookup_attribute ("no_unique_address",
16853                                       DECL_ATTRIBUTES (field)))
16854                   flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
16855                 else if (cxx17_empty_base_field_p (field))
16856                   flag = WARN_PSABI_EMPTY_CXX17_BASE;
16857                 else
16858                   /* No compatibility problem.  */
16859                   continue;
16860
16861                 /* Simulate the old behavior when WARN_PSABI_FLAGS is null.  */
16862                 if (warn_psabi_flags)
16863                   {
16864                     *warn_psabi_flags |= flag;
16865                     continue;
16866                   }
16867               }
16868
16869             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
16870                                                  warn_psabi_flags);
16871             if (sub_count < 0)
16872               return -1;
16873             count += sub_count;
16874           }
16875
16876         /* There must be no padding.  */
16877         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16878                       count * GET_MODE_BITSIZE (*modep)))
16879           return -1;
16880
16881         return count;
16882       }
16883
16884     case UNION_TYPE:
16885     case QUAL_UNION_TYPE:
16886       {
16887         /* These aren't very interesting except in a degenerate case.  */
16888         int count = 0;
16889         int sub_count;
16890         tree field;
16891
16892         /* Can't handle incomplete types nor sizes that are not
16893            fixed.  */
16894         if (!COMPLETE_TYPE_P (type)
16895             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16896           return -1;
16897
16898         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16899           {
16900             if (TREE_CODE (field) != FIELD_DECL)
16901               continue;
16902
16903             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
16904                                                  warn_psabi_flags);
16905             if (sub_count < 0)
16906               return -1;
16907             count = count > sub_count ? count : sub_count;
16908           }
16909
16910         /* There must be no padding.  */
16911         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16912                       count * GET_MODE_BITSIZE (*modep)))
16913           return -1;
16914
16915         return count;
16916       }
16917
16918     default:
16919       break;
16920     }
16921
16922   return -1;
16923 }
16924
16925 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16926    type as described in AAPCS64 \S 4.1.2.
16927
16928    See the comment above aarch64_composite_type_p for the notes on MODE.  */
16929
16930 static bool
16931 aarch64_short_vector_p (const_tree type,
16932                         machine_mode mode)
16933 {
16934   poly_int64 size = -1;
16935
16936   if (type && TREE_CODE (type) == VECTOR_TYPE)
16937     {
16938       if (aarch64_sve::builtin_type_p (type))
16939         return false;
16940       size = int_size_in_bytes (type);
16941     }
16942   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16943            || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16944     {
16945       /* Rely only on the type, not the mode, when processing SVE types.  */
16946       if (type && aarch64_some_values_include_pst_objects_p (type))
16947         /* Leave later code to report an error if SVE is disabled.  */
16948         gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
16949       else
16950         size = GET_MODE_SIZE (mode);
16951     }
16952   if (known_eq (size, 8) || known_eq (size, 16))
16953     {
16954       /* 64-bit and 128-bit vectors should only acquire an SVE mode if
16955          they are being treated as scalable AAPCS64 types.  */
16956       gcc_assert (!aarch64_sve_mode_p (mode));
16957       return true;
16958     }
16959   return false;
16960 }
16961
16962 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16963    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
16964    array types.  The C99 floating-point complex types are also considered
16965    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
16966    types, which are GCC extensions and out of the scope of AAPCS64, are
16967    treated as composite types here as well.
16968
16969    Note that MODE itself is not sufficient in determining whether a type
16970    is such a composite type or not.  This is because
16971    stor-layout.c:compute_record_mode may have already changed the MODE
16972    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
16973    structure with only one field may have its MODE set to the mode of the
16974    field.  Also an integer mode whose size matches the size of the
16975    RECORD_TYPE type may be used to substitute the original mode
16976    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
16977    solely relied on.  */
16978
16979 static bool
16980 aarch64_composite_type_p (const_tree type,
16981                           machine_mode mode)
16982 {
16983   if (aarch64_short_vector_p (type, mode))
16984     return false;
16985
16986   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
16987     return true;
16988
16989   if (mode == BLKmode
16990       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16991       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16992     return true;
16993
16994   return false;
16995 }
16996
16997 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16998    shall be passed or returned in simd/fp register(s) (providing these
16999    parameter passing registers are available).
17000
17001    Upon successful return, *COUNT returns the number of needed registers,
17002    *BASE_MODE returns the mode of the individual register and when IS_HAF
17003    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
17004    floating-point aggregate or a homogeneous short-vector aggregate.
17005
17006    SILENT_P is true if the function should refrain from reporting any
17007    diagnostics.  This should only be used if the caller is certain that
17008    any ABI decisions would eventually come through this function with
17009    SILENT_P set to false.  */
17010
17011 static bool
17012 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
17013                                          const_tree type,
17014                                          machine_mode *base_mode,
17015                                          int *count,
17016                                          bool *is_ha,
17017                                          bool silent_p)
17018 {
17019   if (is_ha != NULL) *is_ha = false;
17020
17021   machine_mode new_mode = VOIDmode;
17022   bool composite_p = aarch64_composite_type_p (type, mode);
17023
17024   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
17025       || aarch64_short_vector_p (type, mode))
17026     {
17027       *count = 1;
17028       new_mode = mode;
17029     }
17030   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
17031     {
17032       if (is_ha != NULL) *is_ha = true;
17033       *count = 2;
17034       new_mode = GET_MODE_INNER (mode);
17035     }
17036   else if (type && composite_p)
17037     {
17038       unsigned int warn_psabi_flags = 0;
17039       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
17040                                               &warn_psabi_flags);
17041       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
17042         {
17043           static unsigned last_reported_type_uid;
17044           unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
17045           int alt;
17046           if (!silent_p
17047               && warn_psabi
17048               && warn_psabi_flags
17049               && uid != last_reported_type_uid
17050               && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
17051                   != ag_count))
17052             {
17053               const char *url
17054                 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
17055               gcc_assert (alt == -1);
17056               last_reported_type_uid = uid;
17057               /* Use TYPE_MAIN_VARIANT to strip any redundant const
17058                  qualification.  */
17059               if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
17060                 inform (input_location, "parameter passing for argument of "
17061                         "type %qT with %<[[no_unique_address]]%> members "
17062                         "changed %{in GCC 10.1%}",
17063                         TYPE_MAIN_VARIANT (type), url);
17064               else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
17065                 inform (input_location, "parameter passing for argument of "
17066                         "type %qT when C++17 is enabled changed to match "
17067                         "C++14 %{in GCC 10.1%}",
17068                         TYPE_MAIN_VARIANT (type), url);
17069             }
17070
17071           if (is_ha != NULL) *is_ha = true;
17072           *count = ag_count;
17073         }
17074       else
17075         return false;
17076     }
17077   else
17078     return false;
17079
17080   gcc_assert (!aarch64_sve_mode_p (new_mode));
17081   *base_mode = new_mode;
17082   return true;
17083 }
17084
17085 /* Implement TARGET_STRUCT_VALUE_RTX.  */
17086
17087 static rtx
17088 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
17089                           int incoming ATTRIBUTE_UNUSED)
17090 {
17091   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
17092 }
17093
17094 /* Implements target hook vector_mode_supported_p.  */
17095 static bool
17096 aarch64_vector_mode_supported_p (machine_mode mode)
17097 {
17098   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17099   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
17100 }
17101
17102 /* Return the full-width SVE vector mode for element mode MODE, if one
17103    exists.  */
17104 opt_machine_mode
17105 aarch64_full_sve_mode (scalar_mode mode)
17106 {
17107   switch (mode)
17108     {
17109     case E_DFmode:
17110       return VNx2DFmode;
17111     case E_SFmode:
17112       return VNx4SFmode;
17113     case E_HFmode:
17114       return VNx8HFmode;
17115     case E_BFmode:
17116       return VNx8BFmode;
17117     case E_DImode:
17118       return VNx2DImode;
17119     case E_SImode:
17120       return VNx4SImode;
17121     case E_HImode:
17122       return VNx8HImode;
17123     case E_QImode:
17124       return VNx16QImode;
17125     default:
17126       return opt_machine_mode ();
17127     }
17128 }
17129
17130 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
17131    if it exists.  */
17132 opt_machine_mode
17133 aarch64_vq_mode (scalar_mode mode)
17134 {
17135   switch (mode)
17136     {
17137     case E_DFmode:
17138       return V2DFmode;
17139     case E_SFmode:
17140       return V4SFmode;
17141     case E_HFmode:
17142       return V8HFmode;
17143     case E_BFmode:
17144       return V8BFmode;
17145     case E_SImode:
17146       return V4SImode;
17147     case E_HImode:
17148       return V8HImode;
17149     case E_QImode:
17150       return V16QImode;
17151     case E_DImode:
17152       return V2DImode;
17153     default:
17154       return opt_machine_mode ();
17155     }
17156 }
17157
17158 /* Return appropriate SIMD container
17159    for MODE within a vector of WIDTH bits.  */
17160 static machine_mode
17161 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
17162 {
17163   if (TARGET_SVE
17164       && maybe_ne (width, 128)
17165       && known_eq (width, BITS_PER_SVE_VECTOR))
17166     return aarch64_full_sve_mode (mode).else_mode (word_mode);
17167
17168   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
17169   if (TARGET_SIMD)
17170     {
17171       if (known_eq (width, 128))
17172         return aarch64_vq_mode (mode).else_mode (word_mode);
17173       else
17174         switch (mode)
17175           {
17176           case E_SFmode:
17177             return V2SFmode;
17178           case E_HFmode:
17179             return V4HFmode;
17180           case E_BFmode:
17181             return V4BFmode;
17182           case E_SImode:
17183             return V2SImode;
17184           case E_HImode:
17185             return V4HImode;
17186           case E_QImode:
17187             return V8QImode;
17188           default:
17189             break;
17190           }
17191     }
17192   return word_mode;
17193 }
17194
17195 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
17196 static machine_mode
17197 aarch64_preferred_simd_mode (scalar_mode mode)
17198 {
17199   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
17200   return aarch64_simd_container_mode (mode, bits);
17201 }
17202
17203 /* Return a list of possible vector sizes for the vectorizer
17204    to iterate over.  */
17205 static unsigned int
17206 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
17207 {
17208   static const machine_mode sve_modes[] = {
17209     /* Try using full vectors for all element types.  */
17210     VNx16QImode,
17211
17212     /* Try using 16-bit containers for 8-bit elements and full vectors
17213        for wider elements.  */
17214     VNx8QImode,
17215
17216     /* Try using 32-bit containers for 8-bit and 16-bit elements and
17217        full vectors for wider elements.  */
17218     VNx4QImode,
17219
17220     /* Try using 64-bit containers for all element types.  */
17221     VNx2QImode
17222   };
17223
17224   static const machine_mode advsimd_modes[] = {
17225     /* Try using 128-bit vectors for all element types.  */
17226     V16QImode,
17227
17228     /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
17229        for wider elements.  */
17230     V8QImode,
17231
17232     /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
17233        for wider elements.
17234
17235        TODO: We could support a limited form of V4QImode too, so that
17236        we use 32-bit vectors for 8-bit elements.  */
17237     V4HImode,
17238
17239     /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
17240        for 64-bit elements.
17241
17242        TODO: We could similarly support limited forms of V2QImode and V2HImode
17243        for this case.  */
17244     V2SImode
17245   };
17246
17247   /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
17248      This is because:
17249
17250      - If we can't use N-byte Advanced SIMD vectors then the placement
17251        doesn't matter; we'll just continue as though the Advanced SIMD
17252        entry didn't exist.
17253
17254      - If an SVE main loop with N bytes ends up being cheaper than an
17255        Advanced SIMD main loop with N bytes then by default we'll replace
17256        the Advanced SIMD version with the SVE one.
17257
17258      - If an Advanced SIMD main loop with N bytes ends up being cheaper
17259        than an SVE main loop with N bytes then by default we'll try to
17260        use the SVE loop to vectorize the epilogue instead.  */
17261   unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
17262   unsigned int advsimd_i = 0;
17263   while (advsimd_i < ARRAY_SIZE (advsimd_modes))
17264     {
17265       if (sve_i < ARRAY_SIZE (sve_modes)
17266           && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
17267                        GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
17268         modes->safe_push (sve_modes[sve_i++]);
17269       else
17270         modes->safe_push (advsimd_modes[advsimd_i++]);
17271     }
17272   while (sve_i < ARRAY_SIZE (sve_modes))
17273     modes->safe_push (sve_modes[sve_i++]);
17274
17275   unsigned int flags = 0;
17276   /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
17277      can compare SVE against Advanced SIMD and so that we can compare
17278      multiple SVE vectorization approaches against each other.  There's
17279      not really any point doing this for Advanced SIMD only, since the
17280      first mode that works should always be the best.  */
17281   if (TARGET_SVE && aarch64_sve_compare_costs)
17282     flags |= VECT_COMPARE_COSTS;
17283   return flags;
17284 }
17285
17286 /* Implement TARGET_MANGLE_TYPE.  */
17287
17288 static const char *
17289 aarch64_mangle_type (const_tree type)
17290 {
17291   /* The AArch64 ABI documents say that "__va_list" has to be
17292      mangled as if it is in the "std" namespace.  */
17293   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
17294     return "St9__va_list";
17295
17296   /* Half-precision floating point types.  */
17297   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
17298     {
17299       if (TYPE_MODE (type) == BFmode)
17300         return "u6__bf16";
17301       else
17302         return "Dh";
17303     }
17304
17305   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
17306      builtin types.  */
17307   if (TYPE_NAME (type) != NULL)
17308     {
17309       const char *res;
17310       if ((res = aarch64_general_mangle_builtin_type (type))
17311           || (res = aarch64_sve::mangle_builtin_type (type)))
17312         return res;
17313     }
17314
17315   /* Use the default mangling.  */
17316   return NULL;
17317 }
17318
17319 /* Implement TARGET_VERIFY_TYPE_CONTEXT.  */
17320
17321 static bool
17322 aarch64_verify_type_context (location_t loc, type_context_kind context,
17323                              const_tree type, bool silent_p)
17324 {
17325   return aarch64_sve::verify_type_context (loc, context, type, silent_p);
17326 }
17327
17328 /* Find the first rtx_insn before insn that will generate an assembly
17329    instruction.  */
17330
17331 static rtx_insn *
17332 aarch64_prev_real_insn (rtx_insn *insn)
17333 {
17334   if (!insn)
17335     return NULL;
17336
17337   do
17338     {
17339       insn = prev_real_insn (insn);
17340     }
17341   while (insn && recog_memoized (insn) < 0);
17342
17343   return insn;
17344 }
17345
17346 static bool
17347 is_madd_op (enum attr_type t1)
17348 {
17349   unsigned int i;
17350   /* A number of these may be AArch32 only.  */
17351   enum attr_type mlatypes[] = {
17352     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
17353     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
17354     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
17355   };
17356
17357   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
17358     {
17359       if (t1 == mlatypes[i])
17360         return true;
17361     }
17362
17363   return false;
17364 }
17365
17366 /* Check if there is a register dependency between a load and the insn
17367    for which we hold recog_data.  */
17368
17369 static bool
17370 dep_between_memop_and_curr (rtx memop)
17371 {
17372   rtx load_reg;
17373   int opno;
17374
17375   gcc_assert (GET_CODE (memop) == SET);
17376
17377   if (!REG_P (SET_DEST (memop)))
17378     return false;
17379
17380   load_reg = SET_DEST (memop);
17381   for (opno = 1; opno < recog_data.n_operands; opno++)
17382     {
17383       rtx operand = recog_data.operand[opno];
17384       if (REG_P (operand)
17385           && reg_overlap_mentioned_p (load_reg, operand))
17386         return true;
17387
17388     }
17389   return false;
17390 }
17391
17392
17393 /* When working around the Cortex-A53 erratum 835769,
17394    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
17395    instruction and has a preceding memory instruction such that a NOP
17396    should be inserted between them.  */
17397
17398 bool
17399 aarch64_madd_needs_nop (rtx_insn* insn)
17400 {
17401   enum attr_type attr_type;
17402   rtx_insn *prev;
17403   rtx body;
17404
17405   if (!TARGET_FIX_ERR_A53_835769)
17406     return false;
17407
17408   if (!INSN_P (insn) || recog_memoized (insn) < 0)
17409     return false;
17410
17411   attr_type = get_attr_type (insn);
17412   if (!is_madd_op (attr_type))
17413     return false;
17414
17415   prev = aarch64_prev_real_insn (insn);
17416   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
17417      Restore recog state to INSN to avoid state corruption.  */
17418   extract_constrain_insn_cached (insn);
17419
17420   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
17421     return false;
17422
17423   body = single_set (prev);
17424
17425   /* If the previous insn is a memory op and there is no dependency between
17426      it and the DImode madd, emit a NOP between them.  If body is NULL then we
17427      have a complex memory operation, probably a load/store pair.
17428      Be conservative for now and emit a NOP.  */
17429   if (GET_MODE (recog_data.operand[0]) == DImode
17430       && (!body || !dep_between_memop_and_curr (body)))
17431     return true;
17432
17433   return false;
17434
17435 }
17436
17437
17438 /* Implement FINAL_PRESCAN_INSN.  */
17439
17440 void
17441 aarch64_final_prescan_insn (rtx_insn *insn)
17442 {
17443   if (aarch64_madd_needs_nop (insn))
17444     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17445 }
17446
17447
17448 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17449    instruction.  */
17450
17451 bool
17452 aarch64_sve_index_immediate_p (rtx base_or_step)
17453 {
17454   return (CONST_INT_P (base_or_step)
17455           && IN_RANGE (INTVAL (base_or_step), -16, 15));
17456 }
17457
17458 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17459    when applied to mode MODE.  Negate X first if NEGATE_P is true.  */
17460
17461 bool
17462 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
17463 {
17464   rtx elt = unwrap_const_vec_duplicate (x);
17465   if (!CONST_INT_P (elt))
17466     return false;
17467
17468   HOST_WIDE_INT val = INTVAL (elt);
17469   if (negate_p)
17470     val = -val;
17471   val &= GET_MODE_MASK (GET_MODE_INNER (mode));
17472
17473   if (val & 0xff)
17474     return IN_RANGE (val, 0, 0xff);
17475   return IN_RANGE (val, 0, 0xff00);
17476 }
17477
17478 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
17479    instructions when applied to mode MODE.  Negate X first if NEGATE_P
17480    is true.  */
17481
17482 bool
17483 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
17484 {
17485   if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
17486     return false;
17487
17488   /* After the optional negation, the immediate must be nonnegative.
17489      E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17490      instead of SQADD Zn.B, Zn.B, #129.  */
17491   rtx elt = unwrap_const_vec_duplicate (x);
17492   return negate_p == (INTVAL (elt) < 0);
17493 }
17494
17495 /* Return true if X is a valid immediate operand for an SVE logical
17496    instruction such as AND.  */
17497
17498 bool
17499 aarch64_sve_bitmask_immediate_p (rtx x)
17500 {
17501   rtx elt;
17502
17503   return (const_vec_duplicate_p (x, &elt)
17504           && CONST_INT_P (elt)
17505           && aarch64_bitmask_imm (INTVAL (elt),
17506                                   GET_MODE_INNER (GET_MODE (x))));
17507 }
17508
17509 /* Return true if X is a valid immediate for the SVE DUP and CPY
17510    instructions.  */
17511
17512 bool
17513 aarch64_sve_dup_immediate_p (rtx x)
17514 {
17515   x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17516   if (!CONST_INT_P (x))
17517     return false;
17518
17519   HOST_WIDE_INT val = INTVAL (x);
17520   if (val & 0xff)
17521     return IN_RANGE (val, -0x80, 0x7f);
17522   return IN_RANGE (val, -0x8000, 0x7f00);
17523 }
17524
17525 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
17526    SIGNED_P says whether the operand is signed rather than unsigned.  */
17527
17528 bool
17529 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17530 {
17531   x = unwrap_const_vec_duplicate (x);
17532   return (CONST_INT_P (x)
17533           && (signed_p
17534               ? IN_RANGE (INTVAL (x), -16, 15)
17535               : IN_RANGE (INTVAL (x), 0, 127)));
17536 }
17537
17538 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17539    instruction.  Negate X first if NEGATE_P is true.  */
17540
17541 bool
17542 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17543 {
17544   rtx elt;
17545   REAL_VALUE_TYPE r;
17546
17547   if (!const_vec_duplicate_p (x, &elt)
17548       || GET_CODE (elt) != CONST_DOUBLE)
17549     return false;
17550
17551   r = *CONST_DOUBLE_REAL_VALUE (elt);
17552
17553   if (negate_p)
17554     r = real_value_negate (&r);
17555
17556   if (real_equal (&r, &dconst1))
17557     return true;
17558   if (real_equal (&r, &dconsthalf))
17559     return true;
17560   return false;
17561 }
17562
17563 /* Return true if X is a valid immediate operand for an SVE FMUL
17564    instruction.  */
17565
17566 bool
17567 aarch64_sve_float_mul_immediate_p (rtx x)
17568 {
17569   rtx elt;
17570
17571   return (const_vec_duplicate_p (x, &elt)
17572           && GET_CODE (elt) == CONST_DOUBLE
17573           && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17574               || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
17575 }
17576
17577 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17578    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
17579    is nonnull, use it to describe valid immediates.  */
17580 static bool
17581 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17582                                     simd_immediate_info *info,
17583                                     enum simd_immediate_check which,
17584                                     simd_immediate_info::insn_type insn)
17585 {
17586   /* Try a 4-byte immediate with LSL.  */
17587   for (unsigned int shift = 0; shift < 32; shift += 8)
17588     if ((val32 & (0xff << shift)) == val32)
17589       {
17590         if (info)
17591           *info = simd_immediate_info (SImode, val32 >> shift, insn,
17592                                        simd_immediate_info::LSL, shift);
17593         return true;
17594       }
17595
17596   /* Try a 2-byte immediate with LSL.  */
17597   unsigned int imm16 = val32 & 0xffff;
17598   if (imm16 == (val32 >> 16))
17599     for (unsigned int shift = 0; shift < 16; shift += 8)
17600       if ((imm16 & (0xff << shift)) == imm16)
17601         {
17602           if (info)
17603             *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17604                                          simd_immediate_info::LSL, shift);
17605           return true;
17606         }
17607
17608   /* Try a 4-byte immediate with MSL, except for cases that MVN
17609      can handle.  */
17610   if (which == AARCH64_CHECK_MOV)
17611     for (unsigned int shift = 8; shift < 24; shift += 8)
17612       {
17613         unsigned int low = (1 << shift) - 1;
17614         if (((val32 & (0xff << shift)) | low) == val32)
17615           {
17616             if (info)
17617               *info = simd_immediate_info (SImode, val32 >> shift, insn,
17618                                            simd_immediate_info::MSL, shift);
17619             return true;
17620           }
17621       }
17622
17623   return false;
17624 }
17625
17626 /* Return true if replicating VAL64 is a valid immediate for the
17627    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
17628    use it to describe valid immediates.  */
17629 static bool
17630 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17631                                  simd_immediate_info *info,
17632                                  enum simd_immediate_check which)
17633 {
17634   unsigned int val32 = val64 & 0xffffffff;
17635   unsigned int val16 = val64 & 0xffff;
17636   unsigned int val8 = val64 & 0xff;
17637
17638   if (val32 == (val64 >> 32))
17639     {
17640       if ((which & AARCH64_CHECK_ORR) != 0
17641           && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17642                                                  simd_immediate_info::MOV))
17643         return true;
17644
17645       if ((which & AARCH64_CHECK_BIC) != 0
17646           && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17647                                                  simd_immediate_info::MVN))
17648         return true;
17649
17650       /* Try using a replicated byte.  */
17651       if (which == AARCH64_CHECK_MOV
17652           && val16 == (val32 >> 16)
17653           && val8 == (val16 >> 8))
17654         {
17655           if (info)
17656             *info = simd_immediate_info (QImode, val8);
17657           return true;
17658         }
17659     }
17660
17661   /* Try using a bit-to-bytemask.  */
17662   if (which == AARCH64_CHECK_MOV)
17663     {
17664       unsigned int i;
17665       for (i = 0; i < 64; i += 8)
17666         {
17667           unsigned char byte = (val64 >> i) & 0xff;
17668           if (byte != 0 && byte != 0xff)
17669             break;
17670         }
17671       if (i == 64)
17672         {
17673           if (info)
17674             *info = simd_immediate_info (DImode, val64);
17675           return true;
17676         }
17677     }
17678   return false;
17679 }
17680
17681 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17682    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
17683
17684 static bool
17685 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17686                              simd_immediate_info *info)
17687 {
17688   scalar_int_mode mode = DImode;
17689   unsigned int val32 = val64 & 0xffffffff;
17690   if (val32 == (val64 >> 32))
17691     {
17692       mode = SImode;
17693       unsigned int val16 = val32 & 0xffff;
17694       if (val16 == (val32 >> 16))
17695         {
17696           mode = HImode;
17697           unsigned int val8 = val16 & 0xff;
17698           if (val8 == (val16 >> 8))
17699             mode = QImode;
17700         }
17701     }
17702   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17703   if (IN_RANGE (val, -0x80, 0x7f))
17704     {
17705       /* DUP with no shift.  */
17706       if (info)
17707         *info = simd_immediate_info (mode, val);
17708       return true;
17709     }
17710   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17711     {
17712       /* DUP with LSL #8.  */
17713       if (info)
17714         *info = simd_immediate_info (mode, val);
17715       return true;
17716     }
17717   if (aarch64_bitmask_imm (val64, mode))
17718     {
17719       /* DUPM.  */
17720       if (info)
17721         *info = simd_immediate_info (mode, val);
17722       return true;
17723     }
17724   return false;
17725 }
17726
17727 /* Return true if X is an UNSPEC_PTRUE constant of the form:
17728
17729        (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17730
17731    where PATTERN is the svpattern as a CONST_INT and where ZERO
17732    is a zero constant of the required PTRUE mode (which can have
17733    fewer elements than X's mode, if zero bits are significant).
17734
17735    If so, and if INFO is nonnull, describe the immediate in INFO.  */
17736 bool
17737 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17738 {
17739   if (GET_CODE (x) != CONST)
17740     return false;
17741
17742   x = XEXP (x, 0);
17743   if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17744     return false;
17745
17746   if (info)
17747     {
17748       aarch64_svpattern pattern
17749         = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17750       machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17751       scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17752       *info = simd_immediate_info (int_mode, pattern);
17753     }
17754   return true;
17755 }
17756
17757 /* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
17758    it to describe valid immediates.  */
17759
17760 static bool
17761 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17762 {
17763   if (aarch64_sve_ptrue_svpattern_p (x, info))
17764     return true;
17765
17766   if (x == CONST0_RTX (GET_MODE (x)))
17767     {
17768       if (info)
17769         *info = simd_immediate_info (DImode, 0);
17770       return true;
17771     }
17772
17773   /* Analyze the value as a VNx16BImode.  This should be relatively
17774      efficient, since rtx_vector_builder has enough built-in capacity
17775      to store all VLA predicate constants without needing the heap.  */
17776   rtx_vector_builder builder;
17777   if (!aarch64_get_sve_pred_bits (builder, x))
17778     return false;
17779
17780   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
17781   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
17782     {
17783       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
17784       aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
17785       if (pattern != AARCH64_NUM_SVPATTERNS)
17786         {
17787           if (info)
17788             {
17789               scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
17790               *info = simd_immediate_info (int_mode, pattern);
17791             }
17792           return true;
17793         }
17794     }
17795   return false;
17796 }
17797
17798 /* Return true if OP is a valid SIMD immediate for the operation
17799    described by WHICH.  If INFO is nonnull, use it to describe valid
17800    immediates.  */
17801 bool
17802 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
17803                               enum simd_immediate_check which)
17804 {
17805   machine_mode mode = GET_MODE (op);
17806   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17807   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
17808     return false;
17809
17810   if (vec_flags & VEC_SVE_PRED)
17811     return aarch64_sve_pred_valid_immediate (op, info);
17812
17813   scalar_mode elt_mode = GET_MODE_INNER (mode);
17814   rtx base, step;
17815   unsigned int n_elts;
17816   if (GET_CODE (op) == CONST_VECTOR
17817       && CONST_VECTOR_DUPLICATE_P (op))
17818     n_elts = CONST_VECTOR_NPATTERNS (op);
17819   else if ((vec_flags & VEC_SVE_DATA)
17820            && const_vec_series_p (op, &base, &step))
17821     {
17822       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
17823       if (!aarch64_sve_index_immediate_p (base)
17824           || !aarch64_sve_index_immediate_p (step))
17825         return false;
17826
17827       if (info)
17828         {
17829           /* Get the corresponding container mode.  E.g. an INDEX on V2SI
17830              should yield two integer values per 128-bit block, meaning
17831              that we need to treat it in the same way as V2DI and then
17832              ignore the upper 32 bits of each element.  */
17833           elt_mode = aarch64_sve_container_int_mode (mode);
17834           *info = simd_immediate_info (elt_mode, base, step);
17835         }
17836       return true;
17837     }
17838   else if (GET_CODE (op) == CONST_VECTOR
17839            && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
17840     /* N_ELTS set above.  */;
17841   else
17842     return false;
17843
17844   scalar_float_mode elt_float_mode;
17845   if (n_elts == 1
17846       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
17847     {
17848       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
17849       if (aarch64_float_const_zero_rtx_p (elt)
17850           || aarch64_float_const_representable_p (elt))
17851         {
17852           if (info)
17853             *info = simd_immediate_info (elt_float_mode, elt);
17854           return true;
17855         }
17856     }
17857
17858   /* If all elements in an SVE vector have the same value, we have a free
17859      choice between using the element mode and using the container mode.
17860      Using the element mode means that unused parts of the vector are
17861      duplicates of the used elements, while using the container mode means
17862      that the unused parts are an extension of the used elements.  Using the
17863      element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
17864      for its container mode VNx4SI while 0x00000101 isn't.
17865
17866      If not all elements in an SVE vector have the same value, we need the
17867      transition from one element to the next to occur at container boundaries.
17868      E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
17869      in the same way as a VNx4SI containing { 1, 2, 3, 4 }.  */
17870   scalar_int_mode elt_int_mode;
17871   if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
17872     elt_int_mode = aarch64_sve_container_int_mode (mode);
17873   else
17874     elt_int_mode = int_mode_for_mode (elt_mode).require ();
17875
17876   unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
17877   if (elt_size > 8)
17878     return false;
17879
17880   /* Expand the vector constant out into a byte vector, with the least
17881      significant byte of the register first.  */
17882   auto_vec<unsigned char, 16> bytes;
17883   bytes.reserve (n_elts * elt_size);
17884   for (unsigned int i = 0; i < n_elts; i++)
17885     {
17886       /* The vector is provided in gcc endian-neutral fashion.
17887          For aarch64_be Advanced SIMD, it must be laid out in the vector
17888          register in reverse order.  */
17889       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
17890       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
17891
17892       if (elt_mode != elt_int_mode)
17893         elt = gen_lowpart (elt_int_mode, elt);
17894
17895       if (!CONST_INT_P (elt))
17896         return false;
17897
17898       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
17899       for (unsigned int byte = 0; byte < elt_size; byte++)
17900         {
17901           bytes.quick_push (elt_val & 0xff);
17902           elt_val >>= BITS_PER_UNIT;
17903         }
17904     }
17905
17906   /* The immediate must repeat every eight bytes.  */
17907   unsigned int nbytes = bytes.length ();
17908   for (unsigned i = 8; i < nbytes; ++i)
17909     if (bytes[i] != bytes[i - 8])
17910       return false;
17911
17912   /* Get the repeating 8-byte value as an integer.  No endian correction
17913      is needed here because bytes is already in lsb-first order.  */
17914   unsigned HOST_WIDE_INT val64 = 0;
17915   for (unsigned int i = 0; i < 8; i++)
17916     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
17917               << (i * BITS_PER_UNIT));
17918
17919   if (vec_flags & VEC_SVE_DATA)
17920     return aarch64_sve_valid_immediate (val64, info);
17921   else
17922     return aarch64_advsimd_valid_immediate (val64, info, which);
17923 }
17924
17925 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17926    has a step in the range of INDEX.  Return the index expression if so,
17927    otherwise return null.  */
17928 rtx
17929 aarch64_check_zero_based_sve_index_immediate (rtx x)
17930 {
17931   rtx base, step;
17932   if (const_vec_series_p (x, &base, &step)
17933       && base == const0_rtx
17934       && aarch64_sve_index_immediate_p (step))
17935     return step;
17936   return NULL_RTX;
17937 }
17938
17939 /* Check of immediate shift constants are within range.  */
17940 bool
17941 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
17942 {
17943   x = unwrap_const_vec_duplicate (x);
17944   if (!CONST_INT_P (x))
17945     return false;
17946   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
17947   if (left)
17948     return IN_RANGE (INTVAL (x), 0, bit_width - 1);
17949   else
17950     return IN_RANGE (INTVAL (x), 1, bit_width);
17951 }
17952
17953 /* Return the bitmask CONST_INT to select the bits required by a zero extract
17954    operation of width WIDTH at bit position POS.  */
17955
17956 rtx
17957 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
17958 {
17959   gcc_assert (CONST_INT_P (width));
17960   gcc_assert (CONST_INT_P (pos));
17961
17962   unsigned HOST_WIDE_INT mask
17963     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
17964   return GEN_INT (mask << UINTVAL (pos));
17965 }
17966
17967 bool
17968 aarch64_mov_operand_p (rtx x, machine_mode mode)
17969 {
17970   if (GET_CODE (x) == HIGH
17971       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
17972     return true;
17973
17974   if (CONST_INT_P (x))
17975     return true;
17976
17977   if (VECTOR_MODE_P (GET_MODE (x)))
17978     {
17979       /* Require predicate constants to be VNx16BI before RA, so that we
17980          force everything to have a canonical form.  */
17981       if (!lra_in_progress
17982           && !reload_completed
17983           && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
17984           && GET_MODE (x) != VNx16BImode)
17985         return false;
17986
17987       return aarch64_simd_valid_immediate (x, NULL);
17988     }
17989
17990   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
17991     return true;
17992
17993   if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
17994     return true;
17995
17996   return aarch64_classify_symbolic_expression (x)
17997     == SYMBOL_TINY_ABSOLUTE;
17998 }
17999
18000 /* Return a const_int vector of VAL.  */
18001 rtx
18002 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
18003 {
18004   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
18005   return gen_const_vec_duplicate (mode, c);
18006 }
18007
18008 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
18009
18010 bool
18011 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
18012 {
18013   machine_mode vmode;
18014
18015   vmode = aarch64_simd_container_mode (mode, 64);
18016   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
18017   return aarch64_simd_valid_immediate (op_v, NULL);
18018 }
18019
18020 /* Construct and return a PARALLEL RTX vector with elements numbering the
18021    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
18022    the vector - from the perspective of the architecture.  This does not
18023    line up with GCC's perspective on lane numbers, so we end up with
18024    different masks depending on our target endian-ness.  The diagram
18025    below may help.  We must draw the distinction when building masks
18026    which select one half of the vector.  An instruction selecting
18027    architectural low-lanes for a big-endian target, must be described using
18028    a mask selecting GCC high-lanes.
18029
18030                  Big-Endian             Little-Endian
18031
18032 GCC             0   1   2   3           3   2   1   0
18033               | x | x | x | x |       | x | x | x | x |
18034 Architecture    3   2   1   0           3   2   1   0
18035
18036 Low Mask:         { 2, 3 }                { 0, 1 }
18037 High Mask:        { 0, 1 }                { 2, 3 }
18038
18039    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
18040
18041 rtx
18042 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
18043 {
18044   rtvec v = rtvec_alloc (nunits / 2);
18045   int high_base = nunits / 2;
18046   int low_base = 0;
18047   int base;
18048   rtx t1;
18049   int i;
18050
18051   if (BYTES_BIG_ENDIAN)
18052     base = high ? low_base : high_base;
18053   else
18054     base = high ? high_base : low_base;
18055
18056   for (i = 0; i < nunits / 2; i++)
18057     RTVEC_ELT (v, i) = GEN_INT (base + i);
18058
18059   t1 = gen_rtx_PARALLEL (mode, v);
18060   return t1;
18061 }
18062
18063 /* Check OP for validity as a PARALLEL RTX vector with elements
18064    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
18065    from the perspective of the architecture.  See the diagram above
18066    aarch64_simd_vect_par_cnst_half for more details.  */
18067
18068 bool
18069 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
18070                                        bool high)
18071 {
18072   int nelts;
18073   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
18074     return false;
18075
18076   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
18077   HOST_WIDE_INT count_op = XVECLEN (op, 0);
18078   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
18079   int i = 0;
18080
18081   if (count_op != count_ideal)
18082     return false;
18083
18084   for (i = 0; i < count_ideal; i++)
18085     {
18086       rtx elt_op = XVECEXP (op, 0, i);
18087       rtx elt_ideal = XVECEXP (ideal, 0, i);
18088
18089       if (!CONST_INT_P (elt_op)
18090           || INTVAL (elt_ideal) != INTVAL (elt_op))
18091         return false;
18092     }
18093   return true;
18094 }
18095
18096 /* Return a PARALLEL containing NELTS elements, with element I equal
18097    to BASE + I * STEP.  */
18098
18099 rtx
18100 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
18101 {
18102   rtvec vec = rtvec_alloc (nelts);
18103   for (unsigned int i = 0; i < nelts; ++i)
18104     RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
18105   return gen_rtx_PARALLEL (VOIDmode, vec);
18106 }
18107
18108 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
18109    series with step STEP.  */
18110
18111 bool
18112 aarch64_stepped_int_parallel_p (rtx op, int step)
18113 {
18114   if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
18115     return false;
18116
18117   unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
18118   for (int i = 1; i < XVECLEN (op, 0); ++i)
18119     if (!CONST_INT_P (XVECEXP (op, 0, i))
18120         || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
18121       return false;
18122
18123   return true;
18124 }
18125
18126 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
18127    HIGH (exclusive).  */
18128 void
18129 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
18130                           const_tree exp)
18131 {
18132   HOST_WIDE_INT lane;
18133   gcc_assert (CONST_INT_P (operand));
18134   lane = INTVAL (operand);
18135
18136   if (lane < low || lane >= high)
18137   {
18138     if (exp)
18139       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
18140     else
18141       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
18142   }
18143 }
18144
18145 /* Peform endian correction on lane number N, which indexes a vector
18146    of mode MODE, and return the result as an SImode rtx.  */
18147
18148 rtx
18149 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
18150 {
18151   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
18152 }
18153
18154 /* Return TRUE if OP is a valid vector addressing mode.  */
18155
18156 bool
18157 aarch64_simd_mem_operand_p (rtx op)
18158 {
18159   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
18160                         || REG_P (XEXP (op, 0)));
18161 }
18162
18163 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
18164
18165 bool
18166 aarch64_sve_ld1r_operand_p (rtx op)
18167 {
18168   struct aarch64_address_info addr;
18169   scalar_mode mode;
18170
18171   return (MEM_P (op)
18172           && is_a <scalar_mode> (GET_MODE (op), &mode)
18173           && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
18174           && addr.type == ADDRESS_REG_IMM
18175           && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
18176 }
18177
18178 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
18179    where the size of the read data is specified by `mode` and the size of the
18180    vector elements are specified by `elem_mode`.   */
18181 bool
18182 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
18183                                    scalar_mode elem_mode)
18184 {
18185   struct aarch64_address_info addr;
18186   if (!MEM_P (op)
18187       || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
18188     return false;
18189
18190   if (addr.type == ADDRESS_REG_IMM)
18191     return offset_4bit_signed_scaled_p (mode, addr.const_offset);
18192
18193   if (addr.type == ADDRESS_REG_REG)
18194     return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
18195
18196   return false;
18197 }
18198
18199 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
18200 bool
18201 aarch64_sve_ld1rq_operand_p (rtx op)
18202 {
18203   return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
18204                                             GET_MODE_INNER (GET_MODE (op)));
18205 }
18206
18207 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
18208    accessing a vector where the element size is specified by `elem_mode`.  */
18209 bool
18210 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
18211 {
18212   return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
18213 }
18214
18215 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction.  */
18216 bool
18217 aarch64_sve_ldff1_operand_p (rtx op)
18218 {
18219   if (!MEM_P (op))
18220     return false;
18221
18222   struct aarch64_address_info addr;
18223   if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
18224     return false;
18225
18226   if (addr.type == ADDRESS_REG_IMM)
18227     return known_eq (addr.const_offset, 0);
18228
18229   return addr.type == ADDRESS_REG_REG;
18230 }
18231
18232 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction.  */
18233 bool
18234 aarch64_sve_ldnf1_operand_p (rtx op)
18235 {
18236   struct aarch64_address_info addr;
18237
18238   return (MEM_P (op)
18239           && aarch64_classify_address (&addr, XEXP (op, 0),
18240                                        GET_MODE (op), false)
18241           && addr.type == ADDRESS_REG_IMM);
18242 }
18243
18244 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
18245    The conditions for STR are the same.  */
18246 bool
18247 aarch64_sve_ldr_operand_p (rtx op)
18248 {
18249   struct aarch64_address_info addr;
18250
18251   return (MEM_P (op)
18252           && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
18253                                        false, ADDR_QUERY_ANY)
18254           && addr.type == ADDRESS_REG_IMM);
18255 }
18256
18257 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
18258    addressing memory of mode MODE.  */
18259 bool
18260 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
18261 {
18262   struct aarch64_address_info addr;
18263   if (!aarch64_classify_address (&addr, op, mode, false))
18264     return false;
18265
18266   if (addr.type == ADDRESS_REG_IMM)
18267     return known_eq (addr.const_offset, 0);
18268
18269   return addr.type == ADDRESS_REG_REG;
18270 }
18271
18272 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
18273    We need to be able to access the individual pieces, so the range
18274    is different from LD[234] and ST[234].  */
18275 bool
18276 aarch64_sve_struct_memory_operand_p (rtx op)
18277 {
18278   if (!MEM_P (op))
18279     return false;
18280
18281   machine_mode mode = GET_MODE (op);
18282   struct aarch64_address_info addr;
18283   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
18284                                  ADDR_QUERY_ANY)
18285       || addr.type != ADDRESS_REG_IMM)
18286     return false;
18287
18288   poly_int64 first = addr.const_offset;
18289   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
18290   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
18291           && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
18292 }
18293
18294 /* Emit a register copy from operand to operand, taking care not to
18295    early-clobber source registers in the process.
18296
18297    COUNT is the number of components into which the copy needs to be
18298    decomposed.  */
18299 void
18300 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
18301                                 unsigned int count)
18302 {
18303   unsigned int i;
18304   int rdest = REGNO (operands[0]);
18305   int rsrc = REGNO (operands[1]);
18306
18307   if (!reg_overlap_mentioned_p (operands[0], operands[1])
18308       || rdest < rsrc)
18309     for (i = 0; i < count; i++)
18310       emit_move_insn (gen_rtx_REG (mode, rdest + i),
18311                       gen_rtx_REG (mode, rsrc + i));
18312   else
18313     for (i = 0; i < count; i++)
18314       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
18315                       gen_rtx_REG (mode, rsrc + count - i - 1));
18316 }
18317
18318 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
18319    one of VSTRUCT modes: OI, CI, or XI.  */
18320 int
18321 aarch64_simd_attr_length_rglist (machine_mode mode)
18322 {
18323   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
18324   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
18325 }
18326
18327 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
18328    alignment of a vector to 128 bits.  SVE predicates have an alignment of
18329    16 bits.  */
18330 static HOST_WIDE_INT
18331 aarch64_simd_vector_alignment (const_tree type)
18332 {
18333   /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
18334      be set for non-predicate vectors of booleans.  Modes are the most
18335      direct way we have of identifying real SVE predicate types.  */
18336   if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
18337     return 16;
18338   widest_int min_size
18339     = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
18340   return wi::umin (min_size, 128).to_uhwi ();
18341 }
18342
18343 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
18344 static poly_uint64
18345 aarch64_vectorize_preferred_vector_alignment (const_tree type)
18346 {
18347   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
18348     {
18349       /* If the length of the vector is fixed, try to align to that length,
18350          otherwise don't try to align at all.  */
18351       HOST_WIDE_INT result;
18352       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
18353         result = TYPE_ALIGN (TREE_TYPE (type));
18354       return result;
18355     }
18356   return TYPE_ALIGN (type);
18357 }
18358
18359 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
18360 static bool
18361 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
18362 {
18363   if (is_packed)
18364     return false;
18365
18366   /* For fixed-length vectors, check that the vectorizer will aim for
18367      full-vector alignment.  This isn't true for generic GCC vectors
18368      that are wider than the ABI maximum of 128 bits.  */
18369   poly_uint64 preferred_alignment =
18370     aarch64_vectorize_preferred_vector_alignment (type);
18371   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
18372       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
18373                    preferred_alignment))
18374     return false;
18375
18376   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
18377   return true;
18378 }
18379
18380 /* Return true if the vector misalignment factor is supported by the
18381    target.  */
18382 static bool
18383 aarch64_builtin_support_vector_misalignment (machine_mode mode,
18384                                              const_tree type, int misalignment,
18385                                              bool is_packed)
18386 {
18387   if (TARGET_SIMD && STRICT_ALIGNMENT)
18388     {
18389       /* Return if movmisalign pattern is not supported for this mode.  */
18390       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
18391         return false;
18392
18393       /* Misalignment factor is unknown at compile time.  */
18394       if (misalignment == -1)
18395         return false;
18396     }
18397   return default_builtin_support_vector_misalignment (mode, type, misalignment,
18398                                                       is_packed);
18399 }
18400
18401 /* If VALS is a vector constant that can be loaded into a register
18402    using DUP, generate instructions to do so and return an RTX to
18403    assign to the register.  Otherwise return NULL_RTX.  */
18404 static rtx
18405 aarch64_simd_dup_constant (rtx vals)
18406 {
18407   machine_mode mode = GET_MODE (vals);
18408   machine_mode inner_mode = GET_MODE_INNER (mode);
18409   rtx x;
18410
18411   if (!const_vec_duplicate_p (vals, &x))
18412     return NULL_RTX;
18413
18414   /* We can load this constant by using DUP and a constant in a
18415      single ARM register.  This will be cheaper than a vector
18416      load.  */
18417   x = copy_to_mode_reg (inner_mode, x);
18418   return gen_vec_duplicate (mode, x);
18419 }
18420
18421
18422 /* Generate code to load VALS, which is a PARALLEL containing only
18423    constants (for vec_init) or CONST_VECTOR, efficiently into a
18424    register.  Returns an RTX to copy into the register, or NULL_RTX
18425    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
18426 static rtx
18427 aarch64_simd_make_constant (rtx vals)
18428 {
18429   machine_mode mode = GET_MODE (vals);
18430   rtx const_dup;
18431   rtx const_vec = NULL_RTX;
18432   int n_const = 0;
18433   int i;
18434
18435   if (GET_CODE (vals) == CONST_VECTOR)
18436     const_vec = vals;
18437   else if (GET_CODE (vals) == PARALLEL)
18438     {
18439       /* A CONST_VECTOR must contain only CONST_INTs and
18440          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18441          Only store valid constants in a CONST_VECTOR.  */
18442       int n_elts = XVECLEN (vals, 0);
18443       for (i = 0; i < n_elts; ++i)
18444         {
18445           rtx x = XVECEXP (vals, 0, i);
18446           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18447             n_const++;
18448         }
18449       if (n_const == n_elts)
18450         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18451     }
18452   else
18453     gcc_unreachable ();
18454
18455   if (const_vec != NULL_RTX
18456       && aarch64_simd_valid_immediate (const_vec, NULL))
18457     /* Load using MOVI/MVNI.  */
18458     return const_vec;
18459   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18460     /* Loaded using DUP.  */
18461     return const_dup;
18462   else if (const_vec != NULL_RTX)
18463     /* Load from constant pool. We cannot take advantage of single-cycle
18464        LD1 because we need a PC-relative addressing mode.  */
18465     return const_vec;
18466   else
18467     /* A PARALLEL containing something not valid inside CONST_VECTOR.
18468        We cannot construct an initializer.  */
18469     return NULL_RTX;
18470 }
18471
18472 /* Expand a vector initialisation sequence, such that TARGET is
18473    initialised to contain VALS.  */
18474
18475 void
18476 aarch64_expand_vector_init (rtx target, rtx vals)
18477 {
18478   machine_mode mode = GET_MODE (target);
18479   scalar_mode inner_mode = GET_MODE_INNER (mode);
18480   /* The number of vector elements.  */
18481   int n_elts = XVECLEN (vals, 0);
18482   /* The number of vector elements which are not constant.  */
18483   int n_var = 0;
18484   rtx any_const = NULL_RTX;
18485   /* The first element of vals.  */
18486   rtx v0 = XVECEXP (vals, 0, 0);
18487   bool all_same = true;
18488
18489   /* This is a special vec_init<M><N> where N is not an element mode but a
18490      vector mode with half the elements of M.  We expect to find two entries
18491      of mode N in VALS and we must put their concatentation into TARGET.  */
18492   if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18493     {
18494       gcc_assert (known_eq (GET_MODE_SIZE (mode),
18495                   2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18496       rtx lo = XVECEXP (vals, 0, 0);
18497       rtx hi = XVECEXP (vals, 0, 1);
18498       machine_mode narrow_mode = GET_MODE (lo);
18499       gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18500       gcc_assert (narrow_mode == GET_MODE (hi));
18501
18502       /* When we want to concatenate a half-width vector with zeroes we can
18503          use the aarch64_combinez[_be] patterns.  Just make sure that the
18504          zeroes are in the right half.  */
18505       if (BYTES_BIG_ENDIAN
18506           && aarch64_simd_imm_zero (lo, narrow_mode)
18507           && general_operand (hi, narrow_mode))
18508         emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18509       else if (!BYTES_BIG_ENDIAN
18510                && aarch64_simd_imm_zero (hi, narrow_mode)
18511                && general_operand (lo, narrow_mode))
18512         emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18513       else
18514         {
18515           /* Else create the two half-width registers and combine them.  */
18516           if (!REG_P (lo))
18517             lo = force_reg (GET_MODE (lo), lo);
18518           if (!REG_P (hi))
18519             hi = force_reg (GET_MODE (hi), hi);
18520
18521           if (BYTES_BIG_ENDIAN)
18522             std::swap (lo, hi);
18523           emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18524         }
18525      return;
18526    }
18527
18528   /* Count the number of variable elements to initialise.  */
18529   for (int i = 0; i < n_elts; ++i)
18530     {
18531       rtx x = XVECEXP (vals, 0, i);
18532       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
18533         ++n_var;
18534       else
18535         any_const = x;
18536
18537       all_same &= rtx_equal_p (x, v0);
18538     }
18539
18540   /* No variable elements, hand off to aarch64_simd_make_constant which knows
18541      how best to handle this.  */
18542   if (n_var == 0)
18543     {
18544       rtx constant = aarch64_simd_make_constant (vals);
18545       if (constant != NULL_RTX)
18546         {
18547           emit_move_insn (target, constant);
18548           return;
18549         }
18550     }
18551
18552   /* Splat a single non-constant element if we can.  */
18553   if (all_same)
18554     {
18555       rtx x = copy_to_mode_reg (inner_mode, v0);
18556       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18557       return;
18558     }
18559
18560   enum insn_code icode = optab_handler (vec_set_optab, mode);
18561   gcc_assert (icode != CODE_FOR_nothing);
18562
18563   /* If there are only variable elements, try to optimize
18564      the insertion using dup for the most common element
18565      followed by insertions.  */
18566
18567   /* The algorithm will fill matches[*][0] with the earliest matching element,
18568      and matches[X][1] with the count of duplicate elements (if X is the
18569      earliest element which has duplicates).  */
18570
18571   if (n_var == n_elts && n_elts <= 16)
18572     {
18573       int matches[16][2] = {0};
18574       for (int i = 0; i < n_elts; i++)
18575         {
18576           for (int j = 0; j <= i; j++)
18577             {
18578               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18579                 {
18580                   matches[i][0] = j;
18581                   matches[j][1]++;
18582                   break;
18583                 }
18584             }
18585         }
18586       int maxelement = 0;
18587       int maxv = 0;
18588       for (int i = 0; i < n_elts; i++)
18589         if (matches[i][1] > maxv)
18590           {
18591             maxelement = i;
18592             maxv = matches[i][1];
18593           }
18594
18595       /* Create a duplicate of the most common element, unless all elements
18596          are equally useless to us, in which case just immediately set the
18597          vector register using the first element.  */
18598
18599       if (maxv == 1)
18600         {
18601           /* For vectors of two 64-bit elements, we can do even better.  */
18602           if (n_elts == 2
18603               && (inner_mode == E_DImode
18604                   || inner_mode == E_DFmode))
18605
18606             {
18607               rtx x0 = XVECEXP (vals, 0, 0);
18608               rtx x1 = XVECEXP (vals, 0, 1);
18609               /* Combine can pick up this case, but handling it directly
18610                  here leaves clearer RTL.
18611
18612                  This is load_pair_lanes<mode>, and also gives us a clean-up
18613                  for store_pair_lanes<mode>.  */
18614               if (memory_operand (x0, inner_mode)
18615                   && memory_operand (x1, inner_mode)
18616                   && !STRICT_ALIGNMENT
18617                   && rtx_equal_p (XEXP (x1, 0),
18618                                   plus_constant (Pmode,
18619                                                  XEXP (x0, 0),
18620                                                  GET_MODE_SIZE (inner_mode))))
18621                 {
18622                   rtx t;
18623                   if (inner_mode == DFmode)
18624                     t = gen_load_pair_lanesdf (target, x0, x1);
18625                   else
18626                     t = gen_load_pair_lanesdi (target, x0, x1);
18627                   emit_insn (t);
18628                   return;
18629                 }
18630             }
18631           /* The subreg-move sequence below will move into lane zero of the
18632              vector register.  For big-endian we want that position to hold
18633              the last element of VALS.  */
18634           maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18635           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18636           aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18637         }
18638       else
18639         {
18640           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18641           aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18642         }
18643
18644       /* Insert the rest.  */
18645       for (int i = 0; i < n_elts; i++)
18646         {
18647           rtx x = XVECEXP (vals, 0, i);
18648           if (matches[i][0] == maxelement)
18649             continue;
18650           x = copy_to_mode_reg (inner_mode, x);
18651           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18652         }
18653       return;
18654     }
18655
18656   /* Initialise a vector which is part-variable.  We want to first try
18657      to build those lanes which are constant in the most efficient way we
18658      can.  */
18659   if (n_var != n_elts)
18660     {
18661       rtx copy = copy_rtx (vals);
18662
18663       /* Load constant part of vector.  We really don't care what goes into the
18664          parts we will overwrite, but we're more likely to be able to load the
18665          constant efficiently if it has fewer, larger, repeating parts
18666          (see aarch64_simd_valid_immediate).  */
18667       for (int i = 0; i < n_elts; i++)
18668         {
18669           rtx x = XVECEXP (vals, 0, i);
18670           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18671             continue;
18672           rtx subst = any_const;
18673           for (int bit = n_elts / 2; bit > 0; bit /= 2)
18674             {
18675               /* Look in the copied vector, as more elements are const.  */
18676               rtx test = XVECEXP (copy, 0, i ^ bit);
18677               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18678                 {
18679                   subst = test;
18680                   break;
18681                 }
18682             }
18683           XVECEXP (copy, 0, i) = subst;
18684         }
18685       aarch64_expand_vector_init (target, copy);
18686     }
18687
18688   /* Insert the variable lanes directly.  */
18689   for (int i = 0; i < n_elts; i++)
18690     {
18691       rtx x = XVECEXP (vals, 0, i);
18692       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18693         continue;
18694       x = copy_to_mode_reg (inner_mode, x);
18695       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18696     }
18697 }
18698
18699 /* Emit RTL corresponding to:
18700    insr TARGET, ELEM.  */
18701
18702 static void
18703 emit_insr (rtx target, rtx elem)
18704 {
18705   machine_mode mode = GET_MODE (target);
18706   scalar_mode elem_mode = GET_MODE_INNER (mode);
18707   elem = force_reg (elem_mode, elem);
18708
18709   insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18710   gcc_assert (icode != CODE_FOR_nothing);
18711   emit_insn (GEN_FCN (icode) (target, target, elem));
18712 }
18713
18714 /* Subroutine of aarch64_sve_expand_vector_init for handling
18715    trailing constants.
18716    This function works as follows:
18717    (a) Create a new vector consisting of trailing constants.
18718    (b) Initialize TARGET with the constant vector using emit_move_insn.
18719    (c) Insert remaining elements in TARGET using insr.
18720    NELTS is the total number of elements in original vector while
18721    while NELTS_REQD is the number of elements that are actually
18722    significant.
18723
18724    ??? The heuristic used is to do above only if number of constants
18725    is at least half the total number of elements.  May need fine tuning.  */
18726
18727 static bool
18728 aarch64_sve_expand_vector_init_handle_trailing_constants
18729  (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18730 {
18731   machine_mode mode = GET_MODE (target);
18732   scalar_mode elem_mode = GET_MODE_INNER (mode);
18733   int n_trailing_constants = 0;
18734
18735   for (int i = nelts_reqd - 1;
18736        i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
18737        i--)
18738     n_trailing_constants++;
18739
18740   if (n_trailing_constants >= nelts_reqd / 2)
18741     {
18742       /* Try to use the natural pattern of BUILDER to extend the trailing
18743          constant elements to a full vector.  Replace any variables in the
18744          extra elements with zeros.
18745
18746          ??? It would be better if the builders supported "don't care"
18747              elements, with the builder filling in whichever elements
18748              give the most compact encoding.  */
18749       rtx_vector_builder v (mode, nelts, 1);
18750       for (int i = 0; i < nelts; i++)
18751         {
18752           rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
18753           if (!valid_for_const_vector_p (elem_mode, x))
18754             x = const0_rtx;
18755           v.quick_push (x);
18756         }
18757       rtx const_vec = v.build ();
18758       emit_move_insn (target, const_vec);
18759
18760       for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18761         emit_insr (target, builder.elt (i));
18762
18763       return true;
18764     }
18765
18766   return false;
18767 }
18768
18769 /* Subroutine of aarch64_sve_expand_vector_init.
18770    Works as follows:
18771    (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
18772    (b) Skip trailing elements from BUILDER, which are the same as
18773        element NELTS_REQD - 1.
18774    (c) Insert earlier elements in reverse order in TARGET using insr.  */
18775
18776 static void
18777 aarch64_sve_expand_vector_init_insert_elems (rtx target,
18778                                              const rtx_vector_builder &builder,
18779                                              int nelts_reqd)
18780 {
18781   machine_mode mode = GET_MODE (target);
18782   scalar_mode elem_mode = GET_MODE_INNER (mode);
18783
18784   struct expand_operand ops[2];
18785   enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
18786   gcc_assert (icode != CODE_FOR_nothing);
18787
18788   create_output_operand (&ops[0], target, mode);
18789   create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
18790   expand_insn (icode, 2, ops);
18791
18792   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18793   for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
18794     emit_insr (target, builder.elt (i));
18795 }
18796
18797 /* Subroutine of aarch64_sve_expand_vector_init to handle case
18798    when all trailing elements of builder are same.
18799    This works as follows:
18800    (a) Use expand_insn interface to broadcast last vector element in TARGET.
18801    (b) Insert remaining elements in TARGET using insr.
18802
18803    ??? The heuristic used is to do above if number of same trailing elements
18804    is at least 3/4 of total number of elements, loosely based on
18805    heuristic from mostly_zeros_p.  May need fine-tuning.  */
18806
18807 static bool
18808 aarch64_sve_expand_vector_init_handle_trailing_same_elem
18809  (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
18810 {
18811   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18812   if (ndups >= (3 * nelts_reqd) / 4)
18813     {
18814       aarch64_sve_expand_vector_init_insert_elems (target, builder,
18815                                                    nelts_reqd - ndups + 1);
18816       return true;
18817     }
18818
18819   return false;
18820 }
18821
18822 /* Initialize register TARGET from BUILDER. NELTS is the constant number
18823    of elements in BUILDER.
18824
18825    The function tries to initialize TARGET from BUILDER if it fits one
18826    of the special cases outlined below.
18827
18828    Failing that, the function divides BUILDER into two sub-vectors:
18829    v_even = even elements of BUILDER;
18830    v_odd = odd elements of BUILDER;
18831
18832    and recursively calls itself with v_even and v_odd.
18833
18834    if (recursive call succeeded for v_even or v_odd)
18835      TARGET = zip (v_even, v_odd)
18836
18837    The function returns true if it managed to build TARGET from BUILDER
18838    with one of the special cases, false otherwise.
18839
18840    Example: {a, 1, b, 2, c, 3, d, 4}
18841
18842    The vector gets divided into:
18843    v_even = {a, b, c, d}
18844    v_odd = {1, 2, 3, 4}
18845
18846    aarch64_sve_expand_vector_init(v_odd) hits case 1 and
18847    initialize tmp2 from constant vector v_odd using emit_move_insn.
18848
18849    aarch64_sve_expand_vector_init(v_even) fails since v_even contains
18850    4 elements, so we construct tmp1 from v_even using insr:
18851    tmp1 = dup(d)
18852    insr tmp1, c
18853    insr tmp1, b
18854    insr tmp1, a
18855
18856    And finally:
18857    TARGET = zip (tmp1, tmp2)
18858    which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */
18859
18860 static bool
18861 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
18862                                 int nelts, int nelts_reqd)
18863 {
18864   machine_mode mode = GET_MODE (target);
18865
18866   /* Case 1: Vector contains trailing constants.  */
18867
18868   if (aarch64_sve_expand_vector_init_handle_trailing_constants
18869        (target, builder, nelts, nelts_reqd))
18870     return true;
18871
18872   /* Case 2: Vector contains leading constants.  */
18873
18874   rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
18875   for (int i = 0; i < nelts_reqd; i++)
18876     rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
18877   rev_builder.finalize ();
18878
18879   if (aarch64_sve_expand_vector_init_handle_trailing_constants
18880        (target, rev_builder, nelts, nelts_reqd))
18881     {
18882       emit_insn (gen_aarch64_sve_rev (mode, target, target));
18883       return true;
18884     }
18885
18886   /* Case 3: Vector contains trailing same element.  */
18887
18888   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18889        (target, builder, nelts_reqd))
18890     return true;
18891
18892   /* Case 4: Vector contains leading same element.  */
18893
18894   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18895        (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
18896     {
18897       emit_insn (gen_aarch64_sve_rev (mode, target, target));
18898       return true;
18899     }
18900
18901   /* Avoid recursing below 4-elements.
18902      ??? The threshold 4 may need fine-tuning.  */
18903
18904   if (nelts_reqd <= 4)
18905     return false;
18906
18907   rtx_vector_builder v_even (mode, nelts, 1);
18908   rtx_vector_builder v_odd (mode, nelts, 1);
18909
18910   for (int i = 0; i < nelts * 2; i += 2)
18911     {
18912       v_even.quick_push (builder.elt (i));
18913       v_odd.quick_push (builder.elt (i + 1));
18914     }
18915
18916   v_even.finalize ();
18917   v_odd.finalize ();
18918
18919   rtx tmp1 = gen_reg_rtx (mode);
18920   bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
18921                                                     nelts, nelts_reqd / 2);
18922
18923   rtx tmp2 = gen_reg_rtx (mode);
18924   bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
18925                                                    nelts, nelts_reqd / 2);
18926
18927   if (!did_even_p && !did_odd_p)
18928     return false;
18929
18930   /* Initialize v_even and v_odd using INSR if it didn't match any of the
18931      special cases and zip v_even, v_odd.  */
18932
18933   if (!did_even_p)
18934     aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
18935
18936   if (!did_odd_p)
18937     aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
18938
18939   rtvec v = gen_rtvec (2, tmp1, tmp2);
18940   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
18941   return true;
18942 }
18943
18944 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
18945
18946 void
18947 aarch64_sve_expand_vector_init (rtx target, rtx vals)
18948 {
18949   machine_mode mode = GET_MODE (target);
18950   int nelts = XVECLEN (vals, 0);
18951
18952   rtx_vector_builder v (mode, nelts, 1);
18953   for (int i = 0; i < nelts; i++)
18954     v.quick_push (XVECEXP (vals, 0, i));
18955   v.finalize ();
18956
18957   /* If neither sub-vectors of v could be initialized specially,
18958      then use INSR to insert all elements from v into TARGET.
18959      ??? This might not be optimal for vectors with large
18960      initializers like 16-element or above.
18961      For nelts < 4, it probably isn't useful to handle specially.  */
18962
18963   if (nelts < 4
18964       || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
18965     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
18966 }
18967
18968 /* Check whether VALUE is a vector constant in which every element
18969    is either a power of 2 or a negated power of 2.  If so, return
18970    a constant vector of log2s, and flip CODE between PLUS and MINUS
18971    if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
18972
18973 static rtx
18974 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
18975 {
18976   if (GET_CODE (value) != CONST_VECTOR)
18977     return NULL_RTX;
18978
18979   rtx_vector_builder builder;
18980   if (!builder.new_unary_operation (GET_MODE (value), value, false))
18981     return NULL_RTX;
18982
18983   scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
18984   /* 1 if the result of the multiplication must be negated,
18985      0 if it mustn't, or -1 if we don't yet care.  */
18986   int negate = -1;
18987   unsigned int encoded_nelts = const_vector_encoded_nelts (value);
18988   for (unsigned int i = 0; i < encoded_nelts; ++i)
18989     {
18990       rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
18991       if (!CONST_SCALAR_INT_P (elt))
18992         return NULL_RTX;
18993       rtx_mode_t val (elt, int_mode);
18994       wide_int pow2 = wi::neg (val);
18995       if (val != pow2)
18996         {
18997           /* It matters whether we negate or not.  Make that choice,
18998              and make sure that it's consistent with previous elements.  */
18999           if (negate == !wi::neg_p (val))
19000             return NULL_RTX;
19001           negate = wi::neg_p (val);
19002           if (!negate)
19003             pow2 = val;
19004         }
19005       /* POW2 is now the value that we want to be a power of 2.  */
19006       int shift = wi::exact_log2 (pow2);
19007       if (shift < 0)
19008         return NULL_RTX;
19009       builder.quick_push (gen_int_mode (shift, int_mode));
19010     }
19011   if (negate == -1)
19012     /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
19013     code = PLUS;
19014   else if (negate == 1)
19015     code = code == PLUS ? MINUS : PLUS;
19016   return builder.build ();
19017 }
19018
19019 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
19020    CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
19021    operands array, in the same order as for fma_optab.  Return true if
19022    the function emitted all the necessary instructions, false if the caller
19023    should generate the pattern normally with the new OPERANDS array.  */
19024
19025 bool
19026 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
19027 {
19028   machine_mode mode = GET_MODE (operands[0]);
19029   if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
19030     {
19031       rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
19032                                   NULL_RTX, true, OPTAB_DIRECT);
19033       force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
19034                           operands[3], product, operands[0], true,
19035                           OPTAB_DIRECT);
19036       return true;
19037     }
19038   operands[2] = force_reg (mode, operands[2]);
19039   return false;
19040 }
19041
19042 /* Likewise, but for a conditional pattern.  */
19043
19044 bool
19045 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
19046 {
19047   machine_mode mode = GET_MODE (operands[0]);
19048   if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
19049     {
19050       rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
19051                                   NULL_RTX, true, OPTAB_DIRECT);
19052       emit_insn (gen_cond (code, mode, operands[0], operands[1],
19053                            operands[4], product, operands[5]));
19054       return true;
19055     }
19056   operands[3] = force_reg (mode, operands[3]);
19057   return false;
19058 }
19059
19060 static unsigned HOST_WIDE_INT
19061 aarch64_shift_truncation_mask (machine_mode mode)
19062 {
19063   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
19064     return 0;
19065   return GET_MODE_UNIT_BITSIZE (mode) - 1;
19066 }
19067
19068 /* Select a format to encode pointers in exception handling data.  */
19069 int
19070 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
19071 {
19072    int type;
19073    switch (aarch64_cmodel)
19074      {
19075      case AARCH64_CMODEL_TINY:
19076      case AARCH64_CMODEL_TINY_PIC:
19077      case AARCH64_CMODEL_SMALL:
19078      case AARCH64_CMODEL_SMALL_PIC:
19079      case AARCH64_CMODEL_SMALL_SPIC:
19080        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
19081           for everything.  */
19082        type = DW_EH_PE_sdata4;
19083        break;
19084      default:
19085        /* No assumptions here.  8-byte relocs required.  */
19086        type = DW_EH_PE_sdata8;
19087        break;
19088      }
19089    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
19090 }
19091
19092 /* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
19093
19094 static void
19095 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
19096 {
19097   if (TREE_CODE (decl) == FUNCTION_DECL)
19098     {
19099       arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
19100       if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
19101         {
19102           fprintf (stream, "\t.variant_pcs\t");
19103           assemble_name (stream, name);
19104           fprintf (stream, "\n");
19105         }
19106     }
19107 }
19108
19109 /* The last .arch and .tune assembly strings that we printed.  */
19110 static std::string aarch64_last_printed_arch_string;
19111 static std::string aarch64_last_printed_tune_string;
19112
19113 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
19114    by the function fndecl.  */
19115
19116 void
19117 aarch64_declare_function_name (FILE *stream, const char* name,
19118                                 tree fndecl)
19119 {
19120   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19121
19122   struct cl_target_option *targ_options;
19123   if (target_parts)
19124     targ_options = TREE_TARGET_OPTION (target_parts);
19125   else
19126     targ_options = TREE_TARGET_OPTION (target_option_current_node);
19127   gcc_assert (targ_options);
19128
19129   const struct processor *this_arch
19130     = aarch64_get_arch (targ_options->x_explicit_arch);
19131
19132   uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
19133   std::string extension
19134     = aarch64_get_extension_string_for_isa_flags (isa_flags,
19135                                                   this_arch->flags);
19136   /* Only update the assembler .arch string if it is distinct from the last
19137      such string we printed.  */
19138   std::string to_print = this_arch->name + extension;
19139   if (to_print != aarch64_last_printed_arch_string)
19140     {
19141       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
19142       aarch64_last_printed_arch_string = to_print;
19143     }
19144
19145   /* Print the cpu name we're tuning for in the comments, might be
19146      useful to readers of the generated asm.  Do it only when it changes
19147      from function to function and verbose assembly is requested.  */
19148   const struct processor *this_tune
19149     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
19150
19151   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
19152     {
19153       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
19154                    this_tune->name);
19155       aarch64_last_printed_tune_string = this_tune->name;
19156     }
19157
19158   aarch64_asm_output_variant_pcs (stream, fndecl, name);
19159
19160   /* Don't forget the type directive for ELF.  */
19161   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
19162   ASM_OUTPUT_LABEL (stream, name);
19163
19164   cfun->machine->label_is_assembled = true;
19165 }
19166
19167 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY.  Check if the patch area is after
19168    the function label and emit a BTI if necessary.  */
19169
19170 void
19171 aarch64_print_patchable_function_entry (FILE *file,
19172                                         unsigned HOST_WIDE_INT patch_area_size,
19173                                         bool record_p)
19174 {
19175   if (cfun->machine->label_is_assembled
19176       && aarch64_bti_enabled ()
19177       && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
19178     {
19179       /* Remove the BTI that follows the patch area and insert a new BTI
19180          before the patch area right after the function label.  */
19181       rtx_insn *insn = next_real_nondebug_insn (get_insns ());
19182       if (insn
19183           && INSN_P (insn)
19184           && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19185           && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
19186         delete_insn (insn);
19187       asm_fprintf (file, "\thint\t34 // bti c\n");
19188     }
19189
19190   default_print_patchable_function_entry (file, patch_area_size, record_p);
19191 }
19192
19193 /* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */
19194
19195 void
19196 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
19197 {
19198   const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
19199   const char *value = IDENTIFIER_POINTER (target);
19200   aarch64_asm_output_variant_pcs (stream, decl, name);
19201   ASM_OUTPUT_DEF (stream, name, value);
19202 }
19203
19204 /* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
19205    function symbol references.  */
19206
19207 void
19208 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
19209 {
19210   default_elf_asm_output_external (stream, decl, name);
19211   aarch64_asm_output_variant_pcs (stream, decl, name);
19212 }
19213
19214 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
19215    Used to output the .cfi_b_key_frame directive when signing the current
19216    function with the B key.  */
19217
19218 void
19219 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
19220 {
19221   if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
19222       && aarch64_ra_sign_key == AARCH64_KEY_B)
19223         asm_fprintf (f, "\t.cfi_b_key_frame\n");
19224 }
19225
19226 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
19227
19228 static void
19229 aarch64_start_file (void)
19230 {
19231   struct cl_target_option *default_options
19232     = TREE_TARGET_OPTION (target_option_default_node);
19233
19234   const struct processor *default_arch
19235     = aarch64_get_arch (default_options->x_explicit_arch);
19236   uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
19237   std::string extension
19238     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
19239                                                   default_arch->flags);
19240
19241    aarch64_last_printed_arch_string = default_arch->name + extension;
19242    aarch64_last_printed_tune_string = "";
19243    asm_fprintf (asm_out_file, "\t.arch %s\n",
19244                 aarch64_last_printed_arch_string.c_str ());
19245
19246    default_file_start ();
19247 }
19248
19249 /* Emit load exclusive.  */
19250
19251 static void
19252 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
19253                              rtx mem, rtx model_rtx)
19254 {
19255   if (mode == TImode)
19256     emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
19257                                                 gen_highpart (DImode, rval),
19258                                                 mem, model_rtx));
19259   else
19260     emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
19261 }
19262
19263 /* Emit store exclusive.  */
19264
19265 static void
19266 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
19267                               rtx mem, rtx rval, rtx model_rtx)
19268 {
19269   if (mode == TImode)
19270     emit_insn (gen_aarch64_store_exclusive_pair
19271                (bval, mem, operand_subword (rval, 0, 0, TImode),
19272                 operand_subword (rval, 1, 0, TImode), model_rtx));
19273   else
19274     emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
19275 }
19276
19277 /* Mark the previous jump instruction as unlikely.  */
19278
19279 static void
19280 aarch64_emit_unlikely_jump (rtx insn)
19281 {
19282   rtx_insn *jump = emit_jump_insn (insn);
19283   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
19284 }
19285
19286 /* We store the names of the various atomic helpers in a 5x4 array.
19287    Return the libcall function given MODE, MODEL and NAMES.  */
19288
19289 rtx
19290 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
19291                         const atomic_ool_names *names)
19292 {
19293   memmodel model = memmodel_base (INTVAL (model_rtx));
19294   int mode_idx, model_idx;
19295
19296   switch (mode)
19297     {
19298     case E_QImode:
19299       mode_idx = 0;
19300       break;
19301     case E_HImode:
19302       mode_idx = 1;
19303       break;
19304     case E_SImode:
19305       mode_idx = 2;
19306       break;
19307     case E_DImode:
19308       mode_idx = 3;
19309       break;
19310     case E_TImode:
19311       mode_idx = 4;
19312       break;
19313     default:
19314       gcc_unreachable ();
19315     }
19316
19317   switch (model)
19318     {
19319     case MEMMODEL_RELAXED:
19320       model_idx = 0;
19321       break;
19322     case MEMMODEL_CONSUME:
19323     case MEMMODEL_ACQUIRE:
19324       model_idx = 1;
19325       break;
19326     case MEMMODEL_RELEASE:
19327       model_idx = 2;
19328       break;
19329     case MEMMODEL_ACQ_REL:
19330     case MEMMODEL_SEQ_CST:
19331       model_idx = 3;
19332       break;
19333     default:
19334       gcc_unreachable ();
19335     }
19336
19337   return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
19338                                       VISIBILITY_HIDDEN);
19339 }
19340
19341 #define DEF0(B, N) \
19342   { "__aarch64_" #B #N "_relax", \
19343     "__aarch64_" #B #N "_acq", \
19344     "__aarch64_" #B #N "_rel", \
19345     "__aarch64_" #B #N "_acq_rel" }
19346
19347 #define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
19348                  { NULL, NULL, NULL, NULL }
19349 #define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
19350
19351 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
19352 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
19353 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
19354 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
19355 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
19356 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
19357
19358 #undef DEF0
19359 #undef DEF4
19360 #undef DEF5
19361
19362 /* Expand a compare and swap pattern.  */
19363
19364 void
19365 aarch64_expand_compare_and_swap (rtx operands[])
19366 {
19367   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
19368   machine_mode mode, r_mode;
19369
19370   bval = operands[0];
19371   rval = operands[1];
19372   mem = operands[2];
19373   oldval = operands[3];
19374   newval = operands[4];
19375   is_weak = operands[5];
19376   mod_s = operands[6];
19377   mod_f = operands[7];
19378   mode = GET_MODE (mem);
19379
19380   /* Normally the succ memory model must be stronger than fail, but in the
19381      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
19382      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
19383   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
19384       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
19385     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
19386
19387   r_mode = mode;
19388   if (mode == QImode || mode == HImode)
19389     {
19390       r_mode = SImode;
19391       rval = gen_reg_rtx (r_mode);
19392     }
19393
19394   if (TARGET_LSE)
19395     {
19396       /* The CAS insn requires oldval and rval overlap, but we need to
19397          have a copy of oldval saved across the operation to tell if
19398          the operation is successful.  */
19399       if (reg_overlap_mentioned_p (rval, oldval))
19400         rval = copy_to_mode_reg (r_mode, oldval);
19401       else
19402         emit_move_insn (rval, gen_lowpart (r_mode, oldval));
19403
19404       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
19405                                                    newval, mod_s));
19406       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19407     }
19408   else if (TARGET_OUTLINE_ATOMICS)
19409     {
19410       /* Oldval must satisfy compare afterward.  */
19411       if (!aarch64_plus_operand (oldval, mode))
19412         oldval = force_reg (mode, oldval);
19413       rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
19414       rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
19415                                       oldval, mode, newval, mode,
19416                                       XEXP (mem, 0), Pmode);
19417       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19418     }
19419   else
19420     {
19421       /* The oldval predicate varies by mode.  Test it and force to reg.  */
19422       insn_code code = code_for_aarch64_compare_and_swap (mode);
19423       if (!insn_data[code].operand[2].predicate (oldval, mode))
19424         oldval = force_reg (mode, oldval);
19425
19426       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
19427                                  is_weak, mod_s, mod_f));
19428       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
19429     }
19430
19431   if (r_mode != mode)
19432     rval = gen_lowpart (mode, rval);
19433   emit_move_insn (operands[1], rval);
19434
19435   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
19436   emit_insn (gen_rtx_SET (bval, x));
19437 }
19438
19439 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
19440    sequence implementing an atomic operation.  */
19441
19442 static void
19443 aarch64_emit_post_barrier (enum memmodel model)
19444 {
19445   const enum memmodel base_model = memmodel_base (model);
19446
19447   if (is_mm_sync (model)
19448       && (base_model == MEMMODEL_ACQUIRE
19449           || base_model == MEMMODEL_ACQ_REL
19450           || base_model == MEMMODEL_SEQ_CST))
19451     {
19452       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19453     }
19454 }
19455
19456 /* Split a compare and swap pattern.  */
19457
19458 void
19459 aarch64_split_compare_and_swap (rtx operands[])
19460 {
19461   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
19462   gcc_assert (epilogue_completed);
19463
19464   rtx rval, mem, oldval, newval, scratch, x, model_rtx;
19465   machine_mode mode;
19466   bool is_weak;
19467   rtx_code_label *label1, *label2;
19468   enum memmodel model;
19469
19470   rval = operands[0];
19471   mem = operands[1];
19472   oldval = operands[2];
19473   newval = operands[3];
19474   is_weak = (operands[4] != const0_rtx);
19475   model_rtx = operands[5];
19476   scratch = operands[7];
19477   mode = GET_MODE (mem);
19478   model = memmodel_from_int (INTVAL (model_rtx));
19479
19480   /* When OLDVAL is zero and we want the strong version we can emit a tighter
19481     loop:
19482     .label1:
19483         LD[A]XR rval, [mem]
19484         CBNZ    rval, .label2
19485         ST[L]XR scratch, newval, [mem]
19486         CBNZ    scratch, .label1
19487     .label2:
19488         CMP     rval, 0.  */
19489   bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
19490                         oldval == const0_rtx && mode != TImode);
19491
19492   label1 = NULL;
19493   if (!is_weak)
19494     {
19495       label1 = gen_label_rtx ();
19496       emit_label (label1);
19497     }
19498   label2 = gen_label_rtx ();
19499
19500   /* The initial load can be relaxed for a __sync operation since a final
19501      barrier will be emitted to stop code hoisting.  */
19502   if (is_mm_sync (model))
19503     aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
19504   else
19505     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
19506
19507   if (strong_zero_p)
19508     x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
19509   else
19510     {
19511       rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19512       x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
19513     }
19514   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19515                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19516   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19517
19518   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
19519
19520   if (!is_weak)
19521     {
19522       if (aarch64_track_speculation)
19523         {
19524           /* Emit an explicit compare instruction, so that we can correctly
19525              track the condition codes.  */
19526           rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19527           x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19528         }
19529       else
19530         x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19531
19532       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19533                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
19534       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19535     }
19536   else
19537     aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19538
19539   emit_label (label2);
19540
19541   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
19542      to set the condition flags.  If this is not used it will be removed by
19543      later passes.  */
19544   if (strong_zero_p)
19545     aarch64_gen_compare_reg (NE, rval, const0_rtx);
19546
19547   /* Emit any final barrier needed for a __sync operation.  */
19548   if (is_mm_sync (model))
19549     aarch64_emit_post_barrier (model);
19550 }
19551
19552 /* Split an atomic operation.  */
19553
19554 void
19555 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
19556                          rtx value, rtx model_rtx, rtx cond)
19557 {
19558   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
19559   gcc_assert (epilogue_completed);
19560
19561   machine_mode mode = GET_MODE (mem);
19562   machine_mode wmode = (mode == DImode ? DImode : SImode);
19563   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19564   const bool is_sync = is_mm_sync (model);
19565   rtx_code_label *label;
19566   rtx x;
19567
19568   /* Split the atomic operation into a sequence.  */
19569   label = gen_label_rtx ();
19570   emit_label (label);
19571
19572   if (new_out)
19573     new_out = gen_lowpart (wmode, new_out);
19574   if (old_out)
19575     old_out = gen_lowpart (wmode, old_out);
19576   else
19577     old_out = new_out;
19578   value = simplify_gen_subreg (wmode, value, mode, 0);
19579
19580   /* The initial load can be relaxed for a __sync operation since a final
19581      barrier will be emitted to stop code hoisting.  */
19582  if (is_sync)
19583     aarch64_emit_load_exclusive (mode, old_out, mem,
19584                                  GEN_INT (MEMMODEL_RELAXED));
19585   else
19586     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
19587
19588   switch (code)
19589     {
19590     case SET:
19591       new_out = value;
19592       break;
19593
19594     case NOT:
19595       x = gen_rtx_AND (wmode, old_out, value);
19596       emit_insn (gen_rtx_SET (new_out, x));
19597       x = gen_rtx_NOT (wmode, new_out);
19598       emit_insn (gen_rtx_SET (new_out, x));
19599       break;
19600
19601     case MINUS:
19602       if (CONST_INT_P (value))
19603         {
19604           value = GEN_INT (-INTVAL (value));
19605           code = PLUS;
19606         }
19607       /* Fall through.  */
19608
19609     default:
19610       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
19611       emit_insn (gen_rtx_SET (new_out, x));
19612       break;
19613     }
19614
19615   aarch64_emit_store_exclusive (mode, cond, mem,
19616                                 gen_lowpart (mode, new_out), model_rtx);
19617
19618   if (aarch64_track_speculation)
19619     {
19620       /* Emit an explicit compare instruction, so that we can correctly
19621          track the condition codes.  */
19622       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19623       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19624     }
19625   else
19626     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19627
19628   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19629                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
19630   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19631
19632   /* Emit any final barrier needed for a __sync operation.  */
19633   if (is_sync)
19634     aarch64_emit_post_barrier (model);
19635 }
19636
19637 static void
19638 aarch64_init_libfuncs (void)
19639 {
19640    /* Half-precision float operations.  The compiler handles all operations
19641      with NULL libfuncs by converting to SFmode.  */
19642
19643   /* Conversions.  */
19644   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19645   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19646
19647   /* Arithmetic.  */
19648   set_optab_libfunc (add_optab, HFmode, NULL);
19649   set_optab_libfunc (sdiv_optab, HFmode, NULL);
19650   set_optab_libfunc (smul_optab, HFmode, NULL);
19651   set_optab_libfunc (neg_optab, HFmode, NULL);
19652   set_optab_libfunc (sub_optab, HFmode, NULL);
19653
19654   /* Comparisons.  */
19655   set_optab_libfunc (eq_optab, HFmode, NULL);
19656   set_optab_libfunc (ne_optab, HFmode, NULL);
19657   set_optab_libfunc (lt_optab, HFmode, NULL);
19658   set_optab_libfunc (le_optab, HFmode, NULL);
19659   set_optab_libfunc (ge_optab, HFmode, NULL);
19660   set_optab_libfunc (gt_optab, HFmode, NULL);
19661   set_optab_libfunc (unord_optab, HFmode, NULL);
19662 }
19663
19664 /* Target hook for c_mode_for_suffix.  */
19665 static machine_mode
19666 aarch64_c_mode_for_suffix (char suffix)
19667 {
19668   if (suffix == 'q')
19669     return TFmode;
19670
19671   return VOIDmode;
19672 }
19673
19674 /* We can only represent floating point constants which will fit in
19675    "quarter-precision" values.  These values are characterised by
19676    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
19677    by:
19678
19679    (-1)^s * (n/16) * 2^r
19680
19681    Where:
19682      's' is the sign bit.
19683      'n' is an integer in the range 16 <= n <= 31.
19684      'r' is an integer in the range -3 <= r <= 4.  */
19685
19686 /* Return true iff X can be represented by a quarter-precision
19687    floating point immediate operand X.  Note, we cannot represent 0.0.  */
19688 bool
19689 aarch64_float_const_representable_p (rtx x)
19690 {
19691   /* This represents our current view of how many bits
19692      make up the mantissa.  */
19693   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
19694   int exponent;
19695   unsigned HOST_WIDE_INT mantissa, mask;
19696   REAL_VALUE_TYPE r, m;
19697   bool fail;
19698
19699   x = unwrap_const_vec_duplicate (x);
19700   if (!CONST_DOUBLE_P (x))
19701     return false;
19702
19703   if (GET_MODE (x) == VOIDmode
19704       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
19705     return false;
19706
19707   r = *CONST_DOUBLE_REAL_VALUE (x);
19708
19709   /* We cannot represent infinities, NaNs or +/-zero.  We won't
19710      know if we have +zero until we analyse the mantissa, but we
19711      can reject the other invalid values.  */
19712   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19713       || REAL_VALUE_MINUS_ZERO (r))
19714     return false;
19715
19716   /* Extract exponent.  */
19717   r = real_value_abs (&r);
19718   exponent = REAL_EXP (&r);
19719
19720   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
19721      highest (sign) bit, with a fixed binary point at bit point_pos.
19722      m1 holds the low part of the mantissa, m2 the high part.
19723      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
19724      bits for the mantissa, this can fail (low bits will be lost).  */
19725   real_ldexp (&m, &r, point_pos - exponent);
19726   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
19727
19728   /* If the low part of the mantissa has bits set we cannot represent
19729      the value.  */
19730   if (w.ulow () != 0)
19731     return false;
19732   /* We have rejected the lower HOST_WIDE_INT, so update our
19733      understanding of how many bits lie in the mantissa and
19734      look only at the high HOST_WIDE_INT.  */
19735   mantissa = w.elt (1);
19736   point_pos -= HOST_BITS_PER_WIDE_INT;
19737
19738   /* We can only represent values with a mantissa of the form 1.xxxx.  */
19739   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19740   if ((mantissa & mask) != 0)
19741     return false;
19742
19743   /* Having filtered unrepresentable values, we may now remove all
19744      but the highest 5 bits.  */
19745   mantissa >>= point_pos - 5;
19746
19747   /* We cannot represent the value 0.0, so reject it.  This is handled
19748      elsewhere.  */
19749   if (mantissa == 0)
19750     return false;
19751
19752   /* Then, as bit 4 is always set, we can mask it off, leaving
19753      the mantissa in the range [0, 15].  */
19754   mantissa &= ~(1 << 4);
19755   gcc_assert (mantissa <= 15);
19756
19757   /* GCC internally does not use IEEE754-like encoding (where normalized
19758      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
19759      Our mantissa values are shifted 4 places to the left relative to
19760      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19761      by 5 places to correct for GCC's representation.  */
19762   exponent = 5 - exponent;
19763
19764   return (exponent >= 0 && exponent <= 7);
19765 }
19766
19767 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
19768    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
19769    output MOVI/MVNI, ORR or BIC immediate.  */
19770 char*
19771 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
19772                                    enum simd_immediate_check which)
19773 {
19774   bool is_valid;
19775   static char templ[40];
19776   const char *mnemonic;
19777   const char *shift_op;
19778   unsigned int lane_count = 0;
19779   char element_char;
19780
19781   struct simd_immediate_info info;
19782
19783   /* This will return true to show const_vector is legal for use as either
19784      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
19785      It will also update INFO to show how the immediate should be generated.
19786      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
19787   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
19788   gcc_assert (is_valid);
19789
19790   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19791   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
19792
19793   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19794     {
19795       gcc_assert (info.insn == simd_immediate_info::MOV
19796                   && info.u.mov.shift == 0);
19797       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
19798          move immediate path.  */
19799       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19800         info.u.mov.value = GEN_INT (0);
19801       else
19802         {
19803           const unsigned int buf_size = 20;
19804           char float_buf[buf_size] = {'\0'};
19805           real_to_decimal_for_mode (float_buf,
19806                                     CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19807                                     buf_size, buf_size, 1, info.elt_mode);
19808
19809           if (lane_count == 1)
19810             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
19811           else
19812             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
19813                       lane_count, element_char, float_buf);
19814           return templ;
19815         }
19816     }
19817
19818   gcc_assert (CONST_INT_P (info.u.mov.value));
19819
19820   if (which == AARCH64_CHECK_MOV)
19821     {
19822       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
19823       shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
19824                   ? "msl" : "lsl");
19825       if (lane_count == 1)
19826         snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
19827                   mnemonic, UINTVAL (info.u.mov.value));
19828       else if (info.u.mov.shift)
19829         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19830                   HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
19831                   element_char, UINTVAL (info.u.mov.value), shift_op,
19832                   info.u.mov.shift);
19833       else
19834         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19835                   HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
19836                   element_char, UINTVAL (info.u.mov.value));
19837     }
19838   else
19839     {
19840       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
19841       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
19842       if (info.u.mov.shift)
19843         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19844                   HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
19845                   element_char, UINTVAL (info.u.mov.value), "lsl",
19846                   info.u.mov.shift);
19847       else
19848         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19849                   HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
19850                   element_char, UINTVAL (info.u.mov.value));
19851     }
19852   return templ;
19853 }
19854
19855 char*
19856 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
19857 {
19858
19859   /* If a floating point number was passed and we desire to use it in an
19860      integer mode do the conversion to integer.  */
19861   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
19862     {
19863       unsigned HOST_WIDE_INT ival;
19864       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
19865           gcc_unreachable ();
19866       immediate = gen_int_mode (ival, mode);
19867     }
19868
19869   machine_mode vmode;
19870   /* use a 64 bit mode for everything except for DI/DF mode, where we use
19871      a 128 bit vector mode.  */
19872   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
19873
19874   vmode = aarch64_simd_container_mode (mode, width);
19875   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
19876   return aarch64_output_simd_mov_immediate (v_op, width);
19877 }
19878
19879 /* Return the output string to use for moving immediate CONST_VECTOR
19880    into an SVE register.  */
19881
19882 char *
19883 aarch64_output_sve_mov_immediate (rtx const_vector)
19884 {
19885   static char templ[40];
19886   struct simd_immediate_info info;
19887   char element_char;
19888
19889   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
19890   gcc_assert (is_valid);
19891
19892   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19893
19894   machine_mode vec_mode = GET_MODE (const_vector);
19895   if (aarch64_sve_pred_mode_p (vec_mode))
19896     {
19897       static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
19898       if (info.insn == simd_immediate_info::MOV)
19899         {
19900           gcc_assert (info.u.mov.value == const0_rtx);
19901           snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
19902         }
19903       else
19904         {
19905           gcc_assert (info.insn == simd_immediate_info::PTRUE);
19906           unsigned int total_bytes;
19907           if (info.u.pattern == AARCH64_SV_ALL
19908               && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
19909             snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
19910                       total_bytes / GET_MODE_SIZE (info.elt_mode));
19911           else
19912             snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
19913                       svpattern_token (info.u.pattern));
19914         }
19915       return buf;
19916     }
19917
19918   if (info.insn == simd_immediate_info::INDEX)
19919     {
19920       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
19921                 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
19922                 element_char, INTVAL (info.u.index.base),
19923                 INTVAL (info.u.index.step));
19924       return templ;
19925     }
19926
19927   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19928     {
19929       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19930         info.u.mov.value = GEN_INT (0);
19931       else
19932         {
19933           const int buf_size = 20;
19934           char float_buf[buf_size] = {};
19935           real_to_decimal_for_mode (float_buf,
19936                                     CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19937                                     buf_size, buf_size, 1, info.elt_mode);
19938
19939           snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
19940                     element_char, float_buf);
19941           return templ;
19942         }
19943     }
19944
19945   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
19946             element_char, INTVAL (info.u.mov.value));
19947   return templ;
19948 }
19949
19950 /* Return the asm template for a PTRUES.  CONST_UNSPEC is the
19951    aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19952    pattern.  */
19953
19954 char *
19955 aarch64_output_sve_ptrues (rtx const_unspec)
19956 {
19957   static char templ[40];
19958
19959   struct simd_immediate_info info;
19960   bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
19961   gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
19962
19963   char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19964   snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
19965             svpattern_token (info.u.pattern));
19966   return templ;
19967 }
19968
19969 /* Split operands into moves from op[1] + op[2] into op[0].  */
19970
19971 void
19972 aarch64_split_combinev16qi (rtx operands[3])
19973 {
19974   unsigned int dest = REGNO (operands[0]);
19975   unsigned int src1 = REGNO (operands[1]);
19976   unsigned int src2 = REGNO (operands[2]);
19977   machine_mode halfmode = GET_MODE (operands[1]);
19978   unsigned int halfregs = REG_NREGS (operands[1]);
19979   rtx destlo, desthi;
19980
19981   gcc_assert (halfmode == V16QImode);
19982
19983   if (src1 == dest && src2 == dest + halfregs)
19984     {
19985       /* No-op move.  Can't split to nothing; emit something.  */
19986       emit_note (NOTE_INSN_DELETED);
19987       return;
19988     }
19989
19990   /* Preserve register attributes for variable tracking.  */
19991   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
19992   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
19993                                GET_MODE_SIZE (halfmode));
19994
19995   /* Special case of reversed high/low parts.  */
19996   if (reg_overlap_mentioned_p (operands[2], destlo)
19997       && reg_overlap_mentioned_p (operands[1], desthi))
19998     {
19999       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20000       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
20001       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20002     }
20003   else if (!reg_overlap_mentioned_p (operands[2], destlo))
20004     {
20005       /* Try to avoid unnecessary moves if part of the result
20006          is in the right place already.  */
20007       if (src1 != dest)
20008         emit_move_insn (destlo, operands[1]);
20009       if (src2 != dest + halfregs)
20010         emit_move_insn (desthi, operands[2]);
20011     }
20012   else
20013     {
20014       if (src2 != dest + halfregs)
20015         emit_move_insn (desthi, operands[2]);
20016       if (src1 != dest)
20017         emit_move_insn (destlo, operands[1]);
20018     }
20019 }
20020
20021 /* vec_perm support.  */
20022
20023 struct expand_vec_perm_d
20024 {
20025   rtx target, op0, op1;
20026   vec_perm_indices perm;
20027   machine_mode vmode;
20028   unsigned int vec_flags;
20029   bool one_vector_p;
20030   bool testing_p;
20031 };
20032
20033 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
20034
20035 /* Generate a variable permutation.  */
20036
20037 static void
20038 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
20039 {
20040   machine_mode vmode = GET_MODE (target);
20041   bool one_vector_p = rtx_equal_p (op0, op1);
20042
20043   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
20044   gcc_checking_assert (GET_MODE (op0) == vmode);
20045   gcc_checking_assert (GET_MODE (op1) == vmode);
20046   gcc_checking_assert (GET_MODE (sel) == vmode);
20047   gcc_checking_assert (TARGET_SIMD);
20048
20049   if (one_vector_p)
20050     {
20051       if (vmode == V8QImode)
20052         {
20053           /* Expand the argument to a V16QI mode by duplicating it.  */
20054           rtx pair = gen_reg_rtx (V16QImode);
20055           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
20056           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20057         }
20058       else
20059         {
20060           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
20061         }
20062     }
20063   else
20064     {
20065       rtx pair;
20066
20067       if (vmode == V8QImode)
20068         {
20069           pair = gen_reg_rtx (V16QImode);
20070           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
20071           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20072         }
20073       else
20074         {
20075           pair = gen_reg_rtx (OImode);
20076           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
20077           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
20078         }
20079     }
20080 }
20081
20082 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
20083    NELT is the number of elements in the vector.  */
20084
20085 void
20086 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
20087                          unsigned int nelt)
20088 {
20089   machine_mode vmode = GET_MODE (target);
20090   bool one_vector_p = rtx_equal_p (op0, op1);
20091   rtx mask;
20092
20093   /* The TBL instruction does not use a modulo index, so we must take care
20094      of that ourselves.  */
20095   mask = aarch64_simd_gen_const_vector_dup (vmode,
20096       one_vector_p ? nelt - 1 : 2 * nelt - 1);
20097   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
20098
20099   /* For big-endian, we also need to reverse the index within the vector
20100      (but not which vector).  */
20101   if (BYTES_BIG_ENDIAN)
20102     {
20103       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
20104       if (!one_vector_p)
20105         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
20106       sel = expand_simple_binop (vmode, XOR, sel, mask,
20107                                  NULL, 0, OPTAB_LIB_WIDEN);
20108     }
20109   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
20110 }
20111
20112 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
20113
20114 static void
20115 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
20116 {
20117   emit_insn (gen_rtx_SET (target,
20118                           gen_rtx_UNSPEC (GET_MODE (target),
20119                                           gen_rtvec (2, op0, op1), code)));
20120 }
20121
20122 /* Expand an SVE vec_perm with the given operands.  */
20123
20124 void
20125 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
20126 {
20127   machine_mode data_mode = GET_MODE (target);
20128   machine_mode sel_mode = GET_MODE (sel);
20129   /* Enforced by the pattern condition.  */
20130   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
20131
20132   /* Note: vec_perm indices are supposed to wrap when they go beyond the
20133      size of the two value vectors, i.e. the upper bits of the indices
20134      are effectively ignored.  SVE TBL instead produces 0 for any
20135      out-of-range indices, so we need to modulo all the vec_perm indices
20136      to ensure they are all in range.  */
20137   rtx sel_reg = force_reg (sel_mode, sel);
20138
20139   /* Check if the sel only references the first values vector.  */
20140   if (GET_CODE (sel) == CONST_VECTOR
20141       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
20142     {
20143       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
20144       return;
20145     }
20146
20147   /* Check if the two values vectors are the same.  */
20148   if (rtx_equal_p (op0, op1))
20149     {
20150       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
20151       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20152                                          NULL, 0, OPTAB_DIRECT);
20153       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
20154       return;
20155     }
20156
20157   /* Run TBL on for each value vector and combine the results.  */
20158
20159   rtx res0 = gen_reg_rtx (data_mode);
20160   rtx res1 = gen_reg_rtx (data_mode);
20161   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
20162   if (GET_CODE (sel) != CONST_VECTOR
20163       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
20164     {
20165       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
20166                                                        2 * nunits - 1);
20167       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20168                                      NULL, 0, OPTAB_DIRECT);
20169     }
20170   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
20171   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
20172                                      NULL, 0, OPTAB_DIRECT);
20173   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
20174   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
20175     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
20176   else
20177     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
20178 }
20179
20180 /* Recognize patterns suitable for the TRN instructions.  */
20181 static bool
20182 aarch64_evpc_trn (struct expand_vec_perm_d *d)
20183 {
20184   HOST_WIDE_INT odd;
20185   poly_uint64 nelt = d->perm.length ();
20186   rtx out, in0, in1, x;
20187   machine_mode vmode = d->vmode;
20188
20189   if (GET_MODE_UNIT_SIZE (vmode) > 8)
20190     return false;
20191
20192   /* Note that these are little-endian tests.
20193      We correct for big-endian later.  */
20194   if (!d->perm[0].is_constant (&odd)
20195       || (odd != 0 && odd != 1)
20196       || !d->perm.series_p (0, 2, odd, 2)
20197       || !d->perm.series_p (1, 2, nelt + odd, 2))
20198     return false;
20199
20200   /* Success!  */
20201   if (d->testing_p)
20202     return true;
20203
20204   in0 = d->op0;
20205   in1 = d->op1;
20206   /* We don't need a big-endian lane correction for SVE; see the comment
20207      at the head of aarch64-sve.md for details.  */
20208   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20209     {
20210       x = in0, in0 = in1, in1 = x;
20211       odd = !odd;
20212     }
20213   out = d->target;
20214
20215   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20216                                       odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
20217   return true;
20218 }
20219
20220 /* Try to re-encode the PERM constant so it combines odd and even elements.
20221    This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
20222    We retry with this new constant with the full suite of patterns.  */
20223 static bool
20224 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
20225 {
20226   expand_vec_perm_d newd;
20227   unsigned HOST_WIDE_INT nelt;
20228
20229   if (d->vec_flags != VEC_ADVSIMD)
20230     return false;
20231
20232   /* Get the new mode.  Always twice the size of the inner
20233      and half the elements.  */
20234   poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
20235   unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
20236   auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
20237   machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
20238
20239   if (new_mode == word_mode)
20240     return false;
20241
20242   /* to_constant is safe since this routine is specific to Advanced SIMD
20243      vectors.  */
20244   nelt = d->perm.length ().to_constant ();
20245
20246   vec_perm_builder newpermconst;
20247   newpermconst.new_vector (nelt / 2, nelt / 2, 1);
20248
20249   /* Convert the perm constant if we can.  Require even, odd as the pairs.  */
20250   for (unsigned int i = 0; i < nelt; i += 2)
20251     {
20252       poly_int64 elt0 = d->perm[i];
20253       poly_int64 elt1 = d->perm[i + 1];
20254       poly_int64 newelt;
20255       if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
20256         return false;
20257       newpermconst.quick_push (newelt.to_constant ());
20258     }
20259   newpermconst.finalize ();
20260
20261   newd.vmode = new_mode;
20262   newd.vec_flags = VEC_ADVSIMD;
20263   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
20264   newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
20265   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
20266   newd.testing_p = d->testing_p;
20267   newd.one_vector_p = d->one_vector_p;
20268
20269   newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
20270   return aarch64_expand_vec_perm_const_1 (&newd);
20271 }
20272
20273 /* Recognize patterns suitable for the UZP instructions.  */
20274 static bool
20275 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
20276 {
20277   HOST_WIDE_INT odd;
20278   rtx out, in0, in1, x;
20279   machine_mode vmode = d->vmode;
20280
20281   if (GET_MODE_UNIT_SIZE (vmode) > 8)
20282     return false;
20283
20284   /* Note that these are little-endian tests.
20285      We correct for big-endian later.  */
20286   if (!d->perm[0].is_constant (&odd)
20287       || (odd != 0 && odd != 1)
20288       || !d->perm.series_p (0, 1, odd, 2))
20289     return false;
20290
20291   /* Success!  */
20292   if (d->testing_p)
20293     return true;
20294
20295   in0 = d->op0;
20296   in1 = d->op1;
20297   /* We don't need a big-endian lane correction for SVE; see the comment
20298      at the head of aarch64-sve.md for details.  */
20299   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20300     {
20301       x = in0, in0 = in1, in1 = x;
20302       odd = !odd;
20303     }
20304   out = d->target;
20305
20306   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20307                                       odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
20308   return true;
20309 }
20310
20311 /* Recognize patterns suitable for the ZIP instructions.  */
20312 static bool
20313 aarch64_evpc_zip (struct expand_vec_perm_d *d)
20314 {
20315   unsigned int high;
20316   poly_uint64 nelt = d->perm.length ();
20317   rtx out, in0, in1, x;
20318   machine_mode vmode = d->vmode;
20319
20320   if (GET_MODE_UNIT_SIZE (vmode) > 8)
20321     return false;
20322
20323   /* Note that these are little-endian tests.
20324      We correct for big-endian later.  */
20325   poly_uint64 first = d->perm[0];
20326   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
20327       || !d->perm.series_p (0, 2, first, 1)
20328       || !d->perm.series_p (1, 2, first + nelt, 1))
20329     return false;
20330   high = maybe_ne (first, 0U);
20331
20332   /* Success!  */
20333   if (d->testing_p)
20334     return true;
20335
20336   in0 = d->op0;
20337   in1 = d->op1;
20338   /* We don't need a big-endian lane correction for SVE; see the comment
20339      at the head of aarch64-sve.md for details.  */
20340   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20341     {
20342       x = in0, in0 = in1, in1 = x;
20343       high = !high;
20344     }
20345   out = d->target;
20346
20347   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20348                                       high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
20349   return true;
20350 }
20351
20352 /* Recognize patterns for the EXT insn.  */
20353
20354 static bool
20355 aarch64_evpc_ext (struct expand_vec_perm_d *d)
20356 {
20357   HOST_WIDE_INT location;
20358   rtx offset;
20359
20360   /* The first element always refers to the first vector.
20361      Check if the extracted indices are increasing by one.  */
20362   if (d->vec_flags == VEC_SVE_PRED
20363       || !d->perm[0].is_constant (&location)
20364       || !d->perm.series_p (0, 1, location, 1))
20365     return false;
20366
20367   /* Success! */
20368   if (d->testing_p)
20369     return true;
20370
20371   /* The case where (location == 0) is a no-op for both big- and little-endian,
20372      and is removed by the mid-end at optimization levels -O1 and higher.
20373
20374      We don't need a big-endian lane correction for SVE; see the comment
20375      at the head of aarch64-sve.md for details.  */
20376   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
20377     {
20378       /* After setup, we want the high elements of the first vector (stored
20379          at the LSB end of the register), and the low elements of the second
20380          vector (stored at the MSB end of the register). So swap.  */
20381       std::swap (d->op0, d->op1);
20382       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
20383          to_constant () is safe since this is restricted to Advanced SIMD
20384          vectors.  */
20385       location = d->perm.length ().to_constant () - location;
20386     }
20387
20388   offset = GEN_INT (location);
20389   emit_set_insn (d->target,
20390                  gen_rtx_UNSPEC (d->vmode,
20391                                  gen_rtvec (3, d->op0, d->op1, offset),
20392                                  UNSPEC_EXT));
20393   return true;
20394 }
20395
20396 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
20397    within each 64-bit, 32-bit or 16-bit granule.  */
20398
20399 static bool
20400 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
20401 {
20402   HOST_WIDE_INT diff;
20403   unsigned int i, size, unspec;
20404   machine_mode pred_mode;
20405
20406   if (d->vec_flags == VEC_SVE_PRED
20407       || !d->one_vector_p
20408       || !d->perm[0].is_constant (&diff)
20409       || !diff)
20410     return false;
20411
20412   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
20413   if (size == 8)
20414     {
20415       unspec = UNSPEC_REV64;
20416       pred_mode = VNx2BImode;
20417     }
20418   else if (size == 4)
20419     {
20420       unspec = UNSPEC_REV32;
20421       pred_mode = VNx4BImode;
20422     }
20423   else if (size == 2)
20424     {
20425       unspec = UNSPEC_REV16;
20426       pred_mode = VNx8BImode;
20427     }
20428   else
20429     return false;
20430
20431   unsigned int step = diff + 1;
20432   for (i = 0; i < step; ++i)
20433     if (!d->perm.series_p (i, step, diff - i, step))
20434       return false;
20435
20436   /* Success! */
20437   if (d->testing_p)
20438     return true;
20439
20440   if (d->vec_flags == VEC_SVE_DATA)
20441     {
20442       machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
20443       rtx target = gen_reg_rtx (int_mode);
20444       if (BYTES_BIG_ENDIAN)
20445         /* The act of taking a subreg between INT_MODE and d->vmode
20446            is itself a reversing operation on big-endian targets;
20447            see the comment at the head of aarch64-sve.md for details.
20448            First reinterpret OP0 as INT_MODE without using a subreg
20449            and without changing the contents.  */
20450         emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
20451       else
20452         {
20453           /* For SVE we use REV[BHW] unspecs derived from the element size
20454              of v->mode and vector modes whose elements have SIZE bytes.
20455              This ensures that the vector modes match the predicate modes.  */
20456           int unspec = aarch64_sve_rev_unspec (d->vmode);
20457           rtx pred = aarch64_ptrue_reg (pred_mode);
20458           emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
20459                                        gen_lowpart (int_mode, d->op0)));
20460         }
20461       emit_move_insn (d->target, gen_lowpart (d->vmode, target));
20462       return true;
20463     }
20464   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
20465   emit_set_insn (d->target, src);
20466   return true;
20467 }
20468
20469 /* Recognize patterns for the REV insn, which reverses elements within
20470    a full vector.  */
20471
20472 static bool
20473 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
20474 {
20475   poly_uint64 nelt = d->perm.length ();
20476
20477   if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
20478     return false;
20479
20480   if (!d->perm.series_p (0, 1, nelt - 1, -1))
20481     return false;
20482
20483   /* Success! */
20484   if (d->testing_p)
20485     return true;
20486
20487   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
20488   emit_set_insn (d->target, src);
20489   return true;
20490 }
20491
20492 static bool
20493 aarch64_evpc_dup (struct expand_vec_perm_d *d)
20494 {
20495   rtx out = d->target;
20496   rtx in0;
20497   HOST_WIDE_INT elt;
20498   machine_mode vmode = d->vmode;
20499   rtx lane;
20500
20501   if (d->vec_flags == VEC_SVE_PRED
20502       || d->perm.encoding ().encoded_nelts () != 1
20503       || !d->perm[0].is_constant (&elt))
20504     return false;
20505
20506   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
20507     return false;
20508
20509   /* Success! */
20510   if (d->testing_p)
20511     return true;
20512
20513   /* The generic preparation in aarch64_expand_vec_perm_const_1
20514      swaps the operand order and the permute indices if it finds
20515      d->perm[0] to be in the second operand.  Thus, we can always
20516      use d->op0 and need not do any extra arithmetic to get the
20517      correct lane number.  */
20518   in0 = d->op0;
20519   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
20520
20521   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20522   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20523   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
20524   return true;
20525 }
20526
20527 static bool
20528 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20529 {
20530   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
20531   machine_mode vmode = d->vmode;
20532
20533   /* Make sure that the indices are constant.  */
20534   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20535   for (unsigned int i = 0; i < encoded_nelts; ++i)
20536     if (!d->perm[i].is_constant ())
20537       return false;
20538
20539   if (d->testing_p)
20540     return true;
20541
20542   /* Generic code will try constant permutation twice.  Once with the
20543      original mode and again with the elements lowered to QImode.
20544      So wait and don't do the selector expansion ourselves.  */
20545   if (vmode != V8QImode && vmode != V16QImode)
20546     return false;
20547
20548   /* to_constant is safe since this routine is specific to Advanced SIMD
20549      vectors.  */
20550   unsigned int nelt = d->perm.length ().to_constant ();
20551   for (unsigned int i = 0; i < nelt; ++i)
20552     /* If big-endian and two vectors we end up with a weird mixed-endian
20553        mode on NEON.  Reverse the index within each word but not the word
20554        itself.  to_constant is safe because we checked is_constant above.  */
20555     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20556                         ? d->perm[i].to_constant () ^ (nelt - 1)
20557                         : d->perm[i].to_constant ());
20558
20559   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20560   sel = force_reg (vmode, sel);
20561
20562   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20563   return true;
20564 }
20565
20566 /* Try to implement D using an SVE TBL instruction.  */
20567
20568 static bool
20569 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20570 {
20571   unsigned HOST_WIDE_INT nelt;
20572
20573   /* Permuting two variable-length vectors could overflow the
20574      index range.  */
20575   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20576     return false;
20577
20578   if (d->testing_p)
20579     return true;
20580
20581   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
20582   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
20583   if (d->one_vector_p)
20584     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20585   else
20586     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
20587   return true;
20588 }
20589
20590 /* Try to implement D using SVE SEL instruction.  */
20591
20592 static bool
20593 aarch64_evpc_sel (struct expand_vec_perm_d *d)
20594 {
20595   machine_mode vmode = d->vmode;
20596   int unit_size = GET_MODE_UNIT_SIZE (vmode);
20597
20598   if (d->vec_flags != VEC_SVE_DATA
20599       || unit_size > 8)
20600     return false;
20601
20602   int n_patterns = d->perm.encoding ().npatterns ();
20603   poly_int64 vec_len = d->perm.length ();
20604
20605   for (int i = 0; i < n_patterns; ++i)
20606     if (!known_eq (d->perm[i], i)
20607         && !known_eq (d->perm[i], vec_len + i))
20608       return false;
20609
20610   for (int i = n_patterns; i < n_patterns * 2; i++)
20611     if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20612         && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20613       return false;
20614
20615   if (d->testing_p)
20616     return true;
20617
20618   machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
20619
20620   /* Build a predicate that is true when op0 elements should be used.  */
20621   rtx_vector_builder builder (pred_mode, n_patterns, 2);
20622   for (int i = 0; i < n_patterns * 2; i++)
20623     {
20624       rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20625                                           : CONST0_RTX (BImode);
20626       builder.quick_push (elem);
20627     }
20628
20629   rtx const_vec = builder.build ();
20630   rtx pred = force_reg (pred_mode, const_vec);
20631   /* TARGET = PRED ? OP0 : OP1.  */
20632   emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
20633   return true;
20634 }
20635
20636 /* Recognize patterns suitable for the INS instructions.  */
20637 static bool
20638 aarch64_evpc_ins (struct expand_vec_perm_d *d)
20639 {
20640   machine_mode mode = d->vmode;
20641   unsigned HOST_WIDE_INT nelt;
20642
20643   if (d->vec_flags != VEC_ADVSIMD)
20644     return false;
20645
20646   /* to_constant is safe since this routine is specific to Advanced SIMD
20647      vectors.  */
20648   nelt = d->perm.length ().to_constant ();
20649   rtx insv = d->op0;
20650
20651   HOST_WIDE_INT idx = -1;
20652
20653   for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20654     {
20655       HOST_WIDE_INT elt;
20656       if (!d->perm[i].is_constant (&elt))
20657         return false;
20658       if (elt == (HOST_WIDE_INT) i)
20659         continue;
20660       if (idx != -1)
20661         {
20662           idx = -1;
20663           break;
20664         }
20665       idx = i;
20666     }
20667
20668   if (idx == -1)
20669     {
20670       insv = d->op1;
20671       for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20672         {
20673           if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
20674             continue;
20675           if (idx != -1)
20676             return false;
20677           idx = i;
20678         }
20679
20680       if (idx == -1)
20681         return false;
20682     }
20683
20684   if (d->testing_p)
20685     return true;
20686
20687   gcc_assert (idx != -1);
20688
20689   unsigned extractindex = d->perm[idx].to_constant ();
20690   rtx extractv = d->op0;
20691   if (extractindex >= nelt)
20692     {
20693       extractv = d->op1;
20694       extractindex -= nelt;
20695     }
20696   gcc_assert (extractindex < nelt);
20697
20698   emit_move_insn (d->target, insv);
20699   insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
20700   expand_operand ops[5];
20701   create_output_operand (&ops[0], d->target, mode);
20702   create_input_operand (&ops[1], d->target, mode);
20703   create_integer_operand (&ops[2], 1 << idx);
20704   create_input_operand (&ops[3], extractv, mode);
20705   create_integer_operand (&ops[4], extractindex);
20706   expand_insn (icode, 5, ops);
20707
20708   return true;
20709 }
20710
20711 static bool
20712 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20713 {
20714   /* The pattern matching functions above are written to look for a small
20715      number to begin the sequence (0, 1, N/2).  If we begin with an index
20716      from the second operand, we can swap the operands.  */
20717   poly_int64 nelt = d->perm.length ();
20718   if (known_ge (d->perm[0], nelt))
20719     {
20720       d->perm.rotate_inputs (1);
20721       std::swap (d->op0, d->op1);
20722     }
20723
20724   if ((d->vec_flags == VEC_ADVSIMD
20725        || d->vec_flags == VEC_SVE_DATA
20726        || d->vec_flags == VEC_SVE_PRED)
20727       && known_gt (nelt, 1))
20728     {
20729       if (aarch64_evpc_rev_local (d))
20730         return true;
20731       else if (aarch64_evpc_rev_global (d))
20732         return true;
20733       else if (aarch64_evpc_ext (d))
20734         return true;
20735       else if (aarch64_evpc_dup (d))
20736         return true;
20737       else if (aarch64_evpc_zip (d))
20738         return true;
20739       else if (aarch64_evpc_uzp (d))
20740         return true;
20741       else if (aarch64_evpc_trn (d))
20742         return true;
20743       else if (aarch64_evpc_sel (d))
20744         return true;
20745       else if (aarch64_evpc_ins (d))
20746         return true;
20747       else if (aarch64_evpc_reencode (d))
20748         return true;
20749       if (d->vec_flags == VEC_SVE_DATA)
20750         return aarch64_evpc_sve_tbl (d);
20751       else if (d->vec_flags == VEC_ADVSIMD)
20752         return aarch64_evpc_tbl (d);
20753     }
20754   return false;
20755 }
20756
20757 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
20758
20759 static bool
20760 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20761                                   rtx op1, const vec_perm_indices &sel)
20762 {
20763   struct expand_vec_perm_d d;
20764
20765   /* Check whether the mask can be applied to a single vector.  */
20766   if (sel.ninputs () == 1
20767       || (op0 && rtx_equal_p (op0, op1)))
20768     d.one_vector_p = true;
20769   else if (sel.all_from_input_p (0))
20770     {
20771       d.one_vector_p = true;
20772       op1 = op0;
20773     }
20774   else if (sel.all_from_input_p (1))
20775     {
20776       d.one_vector_p = true;
20777       op0 = op1;
20778     }
20779   else
20780     d.one_vector_p = false;
20781
20782   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
20783                      sel.nelts_per_input ());
20784   d.vmode = vmode;
20785   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
20786   d.target = target;
20787   d.op0 = op0;
20788   d.op1 = op1;
20789   d.testing_p = !target;
20790
20791   if (!d.testing_p)
20792     return aarch64_expand_vec_perm_const_1 (&d);
20793
20794   rtx_insn *last = get_last_insn ();
20795   bool ret = aarch64_expand_vec_perm_const_1 (&d);
20796   gcc_assert (last == get_last_insn ());
20797
20798   return ret;
20799 }
20800
20801 /* Generate a byte permute mask for a register of mode MODE,
20802    which has NUNITS units.  */
20803
20804 rtx
20805 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
20806 {
20807   /* We have to reverse each vector because we dont have
20808      a permuted load that can reverse-load according to ABI rules.  */
20809   rtx mask;
20810   rtvec v = rtvec_alloc (16);
20811   unsigned int i, j;
20812   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
20813
20814   gcc_assert (BYTES_BIG_ENDIAN);
20815   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
20816
20817   for (i = 0; i < nunits; i++)
20818     for (j = 0; j < usize; j++)
20819       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
20820   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
20821   return force_reg (V16QImode, mask);
20822 }
20823
20824 /* Expand an SVE integer comparison using the SVE equivalent of:
20825
20826      (set TARGET (CODE OP0 OP1)).  */
20827
20828 void
20829 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
20830 {
20831   machine_mode pred_mode = GET_MODE (target);
20832   machine_mode data_mode = GET_MODE (op0);
20833   rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
20834                                       op0, op1);
20835   if (!rtx_equal_p (target, res))
20836     emit_move_insn (target, res);
20837 }
20838
20839 /* Return the UNSPEC_COND_* code for comparison CODE.  */
20840
20841 static unsigned int
20842 aarch64_unspec_cond_code (rtx_code code)
20843 {
20844   switch (code)
20845     {
20846     case NE:
20847       return UNSPEC_COND_FCMNE;
20848     case EQ:
20849       return UNSPEC_COND_FCMEQ;
20850     case LT:
20851       return UNSPEC_COND_FCMLT;
20852     case GT:
20853       return UNSPEC_COND_FCMGT;
20854     case LE:
20855       return UNSPEC_COND_FCMLE;
20856     case GE:
20857       return UNSPEC_COND_FCMGE;
20858     case UNORDERED:
20859       return UNSPEC_COND_FCMUO;
20860     default:
20861       gcc_unreachable ();
20862     }
20863 }
20864
20865 /* Emit:
20866
20867       (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20868
20869    where <X> is the operation associated with comparison CODE.
20870    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
20871
20872 static void
20873 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
20874                           bool known_ptrue_p, rtx op0, rtx op1)
20875 {
20876   rtx flag = gen_int_mode (known_ptrue_p, SImode);
20877   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
20878                                gen_rtvec (4, pred, flag, op0, op1),
20879                                aarch64_unspec_cond_code (code));
20880   emit_set_insn (target, unspec);
20881 }
20882
20883 /* Emit the SVE equivalent of:
20884
20885       (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
20886       (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
20887       (set TARGET (ior:PRED_MODE TMP1 TMP2))
20888
20889    where <Xi> is the operation associated with comparison CODEi.
20890    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
20891
20892 static void
20893 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
20894                               rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
20895 {
20896   machine_mode pred_mode = GET_MODE (pred);
20897   rtx tmp1 = gen_reg_rtx (pred_mode);
20898   aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
20899   rtx tmp2 = gen_reg_rtx (pred_mode);
20900   aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
20901   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
20902 }
20903
20904 /* Emit the SVE equivalent of:
20905
20906       (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20907       (set TARGET (not TMP))
20908
20909    where <X> is the operation associated with comparison CODE.
20910    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
20911
20912 static void
20913 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
20914                                  bool known_ptrue_p, rtx op0, rtx op1)
20915 {
20916   machine_mode pred_mode = GET_MODE (pred);
20917   rtx tmp = gen_reg_rtx (pred_mode);
20918   aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
20919   aarch64_emit_unop (target, one_cmpl_optab, tmp);
20920 }
20921
20922 /* Expand an SVE floating-point comparison using the SVE equivalent of:
20923
20924      (set TARGET (CODE OP0 OP1))
20925
20926    If CAN_INVERT_P is true, the caller can also handle inverted results;
20927    return true if the result is in fact inverted.  */
20928
20929 bool
20930 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
20931                                   rtx op0, rtx op1, bool can_invert_p)
20932 {
20933   machine_mode pred_mode = GET_MODE (target);
20934   machine_mode data_mode = GET_MODE (op0);
20935
20936   rtx ptrue = aarch64_ptrue_reg (pred_mode);
20937   switch (code)
20938     {
20939     case UNORDERED:
20940       /* UNORDERED has no immediate form.  */
20941       op1 = force_reg (data_mode, op1);
20942       /* fall through */
20943     case LT:
20944     case LE:
20945     case GT:
20946     case GE:
20947     case EQ:
20948     case NE:
20949       {
20950         /* There is native support for the comparison.  */
20951         aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
20952         return false;
20953       }
20954
20955     case LTGT:
20956       /* This is a trapping operation (LT or GT).  */
20957       aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
20958       return false;
20959
20960     case UNEQ:
20961       if (!flag_trapping_math)
20962         {
20963           /* This would trap for signaling NaNs.  */
20964           op1 = force_reg (data_mode, op1);
20965           aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
20966                                         ptrue, true, op0, op1);
20967           return false;
20968         }
20969       /* fall through */
20970     case UNLT:
20971     case UNLE:
20972     case UNGT:
20973     case UNGE:
20974       if (flag_trapping_math)
20975         {
20976           /* Work out which elements are ordered.  */
20977           rtx ordered = gen_reg_rtx (pred_mode);
20978           op1 = force_reg (data_mode, op1);
20979           aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
20980                                            ptrue, true, op0, op1);
20981
20982           /* Test the opposite condition for the ordered elements,
20983              then invert the result.  */
20984           if (code == UNEQ)
20985             code = NE;
20986           else
20987             code = reverse_condition_maybe_unordered (code);
20988           if (can_invert_p)
20989             {
20990               aarch64_emit_sve_fp_cond (target, code,
20991                                         ordered, false, op0, op1);
20992               return true;
20993             }
20994           aarch64_emit_sve_invert_fp_cond (target, code,
20995                                            ordered, false, op0, op1);
20996           return false;
20997         }
20998       break;
20999
21000     case ORDERED:
21001       /* ORDERED has no immediate form.  */
21002       op1 = force_reg (data_mode, op1);
21003       break;
21004
21005     default:
21006       gcc_unreachable ();
21007     }
21008
21009   /* There is native support for the inverse comparison.  */
21010   code = reverse_condition_maybe_unordered (code);
21011   if (can_invert_p)
21012     {
21013       aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
21014       return true;
21015     }
21016   aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
21017   return false;
21018 }
21019
21020 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
21021    of the data being selected and CMP_MODE is the mode of the values being
21022    compared.  */
21023
21024 void
21025 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
21026                           rtx *ops)
21027 {
21028   machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
21029   rtx pred = gen_reg_rtx (pred_mode);
21030   if (FLOAT_MODE_P (cmp_mode))
21031     {
21032       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
21033                                             ops[4], ops[5], true))
21034         std::swap (ops[1], ops[2]);
21035     }
21036   else
21037     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
21038
21039   if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
21040     ops[1] = force_reg (data_mode, ops[1]);
21041   /* The "false" value can only be zero if the "true" value is a constant.  */
21042   if (register_operand (ops[1], data_mode)
21043       || !aarch64_simd_reg_or_zero (ops[2], data_mode))
21044     ops[2] = force_reg (data_mode, ops[2]);
21045
21046   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
21047   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
21048 }
21049
21050 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
21051    true.  However due to issues with register allocation it is preferable
21052    to avoid tieing integer scalar and FP scalar modes.  Executing integer
21053    operations in general registers is better than treating them as scalar
21054    vector operations.  This reduces latency and avoids redundant int<->FP
21055    moves.  So tie modes if they are either the same class, or vector modes
21056    with other vector modes, vector structs or any scalar mode.  */
21057
21058 static bool
21059 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
21060 {
21061   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
21062     return true;
21063
21064   /* We specifically want to allow elements of "structure" modes to
21065      be tieable to the structure.  This more general condition allows
21066      other rarer situations too.  The reason we don't extend this to
21067      predicate modes is that there are no predicate structure modes
21068      nor any specific instructions for extracting part of a predicate
21069      register.  */
21070   if (aarch64_vector_data_mode_p (mode1)
21071       && aarch64_vector_data_mode_p (mode2))
21072     return true;
21073
21074   /* Also allow any scalar modes with vectors.  */
21075   if (aarch64_vector_mode_supported_p (mode1)
21076       || aarch64_vector_mode_supported_p (mode2))
21077     return true;
21078
21079   return false;
21080 }
21081
21082 /* Return a new RTX holding the result of moving POINTER forward by
21083    AMOUNT bytes.  */
21084
21085 static rtx
21086 aarch64_move_pointer (rtx pointer, poly_int64 amount)
21087 {
21088   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
21089
21090   return adjust_automodify_address (pointer, GET_MODE (pointer),
21091                                     next, amount);
21092 }
21093
21094 /* Return a new RTX holding the result of moving POINTER forward by the
21095    size of the mode it points to.  */
21096
21097 static rtx
21098 aarch64_progress_pointer (rtx pointer)
21099 {
21100   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
21101 }
21102
21103 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
21104    MODE bytes.  */
21105
21106 static void
21107 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
21108                                               machine_mode mode)
21109 {
21110   rtx reg = gen_reg_rtx (mode);
21111
21112   /* "Cast" the pointers to the correct mode.  */
21113   *src = adjust_address (*src, mode, 0);
21114   *dst = adjust_address (*dst, mode, 0);
21115   /* Emit the memcpy.  */
21116   emit_move_insn (reg, *src);
21117   emit_move_insn (*dst, reg);
21118   /* Move the pointers forward.  */
21119   *src = aarch64_progress_pointer (*src);
21120   *dst = aarch64_progress_pointer (*dst);
21121 }
21122
21123 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
21124    we succeed, otherwise return false.  */
21125
21126 bool
21127 aarch64_expand_cpymem (rtx *operands)
21128 {
21129   int n, mode_bits;
21130   rtx dst = operands[0];
21131   rtx src = operands[1];
21132   rtx base;
21133   machine_mode cur_mode = BLKmode, next_mode;
21134   bool speed_p = !optimize_function_for_size_p (cfun);
21135
21136   /* When optimizing for size, give a better estimate of the length of a
21137      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
21138      will always require an even number of instructions to do now.  And each
21139      operation requires both a load+store, so devide the max number by 2.  */
21140   int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
21141
21142   /* We can't do anything smart if the amount to copy is not constant.  */
21143   if (!CONST_INT_P (operands[2]))
21144     return false;
21145
21146   n = INTVAL (operands[2]);
21147
21148   /* Try to keep the number of instructions low.  For all cases we will do at
21149      most two moves for the residual amount, since we'll always overlap the
21150      remainder.  */
21151   if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
21152     return false;
21153
21154   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21155   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21156
21157   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
21158   src = adjust_automodify_address (src, VOIDmode, base, 0);
21159
21160   /* Convert n to bits to make the rest of the code simpler.  */
21161   n = n * BITS_PER_UNIT;
21162
21163   /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
21164      larger than TImode, but we should not use them for loads/stores here.  */
21165   const int copy_limit = GET_MODE_BITSIZE (TImode);
21166
21167   while (n > 0)
21168     {
21169       /* Find the largest mode in which to do the copy in without over reading
21170          or writing.  */
21171       opt_scalar_int_mode mode_iter;
21172       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
21173         if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
21174           cur_mode = mode_iter.require ();
21175
21176       gcc_assert (cur_mode != BLKmode);
21177
21178       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21179       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
21180
21181       n -= mode_bits;
21182
21183       /* Do certain trailing copies as overlapping if it's going to be
21184          cheaper.  i.e. less instructions to do so.  For instance doing a 15
21185          byte copy it's more efficient to do two overlapping 8 byte copies than
21186          8 + 6 + 1.  */
21187       if (n > 0 && n <= 8 * BITS_PER_UNIT)
21188         {
21189           next_mode = smallest_mode_for_size (n, MODE_INT);
21190           int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
21191           src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
21192           dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21193           n = n_bits;
21194         }
21195     }
21196
21197   return true;
21198 }
21199
21200 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
21201    SImode stores.  Handle the case when the constant has identical
21202    bottom and top halves.  This is beneficial when the two stores can be
21203    merged into an STP and we avoid synthesising potentially expensive
21204    immediates twice.  Return true if such a split is possible.  */
21205
21206 bool
21207 aarch64_split_dimode_const_store (rtx dst, rtx src)
21208 {
21209   rtx lo = gen_lowpart (SImode, src);
21210   rtx hi = gen_highpart_mode (SImode, DImode, src);
21211
21212   bool size_p = optimize_function_for_size_p (cfun);
21213
21214   if (!rtx_equal_p (lo, hi))
21215     return false;
21216
21217   unsigned int orig_cost
21218     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
21219   unsigned int lo_cost
21220     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
21221
21222   /* We want to transform:
21223      MOV        x1, 49370
21224      MOVK       x1, 0x140, lsl 16
21225      MOVK       x1, 0xc0da, lsl 32
21226      MOVK       x1, 0x140, lsl 48
21227      STR        x1, [x0]
21228    into:
21229      MOV        w1, 49370
21230      MOVK       w1, 0x140, lsl 16
21231      STP        w1, w1, [x0]
21232    So we want to perform this only when we save two instructions
21233    or more.  When optimizing for size, however, accept any code size
21234    savings we can.  */
21235   if (size_p && orig_cost <= lo_cost)
21236     return false;
21237
21238   if (!size_p
21239       && (orig_cost <= lo_cost + 1))
21240     return false;
21241
21242   rtx mem_lo = adjust_address (dst, SImode, 0);
21243   if (!aarch64_mem_pair_operand (mem_lo, SImode))
21244     return false;
21245
21246   rtx tmp_reg = gen_reg_rtx (SImode);
21247   aarch64_expand_mov_immediate (tmp_reg, lo);
21248   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
21249   /* Don't emit an explicit store pair as this may not be always profitable.
21250      Let the sched-fusion logic decide whether to merge them.  */
21251   emit_move_insn (mem_lo, tmp_reg);
21252   emit_move_insn (mem_hi, tmp_reg);
21253
21254   return true;
21255 }
21256
21257 /* Generate RTL for a conditional branch with rtx comparison CODE in
21258    mode CC_MODE.  The destination of the unlikely conditional branch
21259    is LABEL_REF.  */
21260
21261 void
21262 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
21263                               rtx label_ref)
21264 {
21265   rtx x;
21266   x = gen_rtx_fmt_ee (code, VOIDmode,
21267                       gen_rtx_REG (cc_mode, CC_REGNUM),
21268                       const0_rtx);
21269
21270   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21271                             gen_rtx_LABEL_REF (VOIDmode, label_ref),
21272                             pc_rtx);
21273   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21274 }
21275
21276 /* Generate DImode scratch registers for 128-bit (TImode) addition.
21277
21278    OP1 represents the TImode destination operand 1
21279    OP2 represents the TImode destination operand 2
21280    LOW_DEST represents the low half (DImode) of TImode operand 0
21281    LOW_IN1 represents the low half (DImode) of TImode operand 1
21282    LOW_IN2 represents the low half (DImode) of TImode operand 2
21283    HIGH_DEST represents the high half (DImode) of TImode operand 0
21284    HIGH_IN1 represents the high half (DImode) of TImode operand 1
21285    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
21286
21287 void
21288 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21289                             rtx *low_in1, rtx *low_in2,
21290                             rtx *high_dest, rtx *high_in1,
21291                             rtx *high_in2)
21292 {
21293   *low_dest = gen_reg_rtx (DImode);
21294   *low_in1 = gen_lowpart (DImode, op1);
21295   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21296                                   subreg_lowpart_offset (DImode, TImode));
21297   *high_dest = gen_reg_rtx (DImode);
21298   *high_in1 = gen_highpart (DImode, op1);
21299   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21300                                    subreg_highpart_offset (DImode, TImode));
21301 }
21302
21303 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
21304
21305    This function differs from 'arch64_addti_scratch_regs' in that
21306    OP1 can be an immediate constant (zero). We must call
21307    subreg_highpart_offset with DImode and TImode arguments, otherwise
21308    VOIDmode will be used for the const_int which generates an internal
21309    error from subreg_size_highpart_offset which does not expect a size of zero.
21310
21311    OP1 represents the TImode destination operand 1
21312    OP2 represents the TImode destination operand 2
21313    LOW_DEST represents the low half (DImode) of TImode operand 0
21314    LOW_IN1 represents the low half (DImode) of TImode operand 1
21315    LOW_IN2 represents the low half (DImode) of TImode operand 2
21316    HIGH_DEST represents the high half (DImode) of TImode operand 0
21317    HIGH_IN1 represents the high half (DImode) of TImode operand 1
21318    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
21319
21320
21321 void
21322 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21323                              rtx *low_in1, rtx *low_in2,
21324                              rtx *high_dest, rtx *high_in1,
21325                              rtx *high_in2)
21326 {
21327   *low_dest = gen_reg_rtx (DImode);
21328   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
21329                                   subreg_lowpart_offset (DImode, TImode));
21330
21331   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21332                                   subreg_lowpart_offset (DImode, TImode));
21333   *high_dest = gen_reg_rtx (DImode);
21334
21335   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
21336                                    subreg_highpart_offset (DImode, TImode));
21337   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21338                                    subreg_highpart_offset (DImode, TImode));
21339 }
21340
21341 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
21342
21343    OP0 represents the TImode destination operand 0
21344    LOW_DEST represents the low half (DImode) of TImode operand 0
21345    LOW_IN1 represents the low half (DImode) of TImode operand 1
21346    LOW_IN2 represents the low half (DImode) of TImode operand 2
21347    HIGH_DEST represents the high half (DImode) of TImode operand 0
21348    HIGH_IN1 represents the high half (DImode) of TImode operand 1
21349    HIGH_IN2 represents the high half (DImode) of TImode operand 2
21350    UNSIGNED_P is true if the operation is being performed on unsigned
21351    values.  */
21352 void
21353 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
21354                        rtx low_in2, rtx high_dest, rtx high_in1,
21355                        rtx high_in2, bool unsigned_p)
21356 {
21357   if (low_in2 == const0_rtx)
21358     {
21359       low_dest = low_in1;
21360       high_in2 = force_reg (DImode, high_in2);
21361       if (unsigned_p)
21362         emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
21363       else
21364         emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
21365     }
21366   else
21367     {
21368       if (aarch64_plus_immediate (low_in2, DImode))
21369         emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
21370                                             GEN_INT (-INTVAL (low_in2))));
21371       else
21372         {
21373           low_in2 = force_reg (DImode, low_in2);
21374           emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
21375         }
21376       high_in2 = force_reg (DImode, high_in2);
21377
21378       if (unsigned_p)
21379         emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
21380       else
21381         emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
21382     }
21383
21384   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
21385   emit_move_insn (gen_highpart (DImode, op0), high_dest);
21386
21387 }
21388
21389 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
21390
21391 static unsigned HOST_WIDE_INT
21392 aarch64_asan_shadow_offset (void)
21393 {
21394   if (TARGET_ILP32)
21395     return (HOST_WIDE_INT_1 << 29);
21396   else
21397     return (HOST_WIDE_INT_1 << 36);
21398 }
21399
21400 static rtx
21401 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
21402                         int code, tree treeop0, tree treeop1)
21403 {
21404   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21405   rtx op0, op1;
21406   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21407   insn_code icode;
21408   struct expand_operand ops[4];
21409
21410   start_sequence ();
21411   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21412
21413   op_mode = GET_MODE (op0);
21414   if (op_mode == VOIDmode)
21415     op_mode = GET_MODE (op1);
21416
21417   switch (op_mode)
21418     {
21419     case E_QImode:
21420     case E_HImode:
21421     case E_SImode:
21422       cmp_mode = SImode;
21423       icode = CODE_FOR_cmpsi;
21424       break;
21425
21426     case E_DImode:
21427       cmp_mode = DImode;
21428       icode = CODE_FOR_cmpdi;
21429       break;
21430
21431     case E_SFmode:
21432       cmp_mode = SFmode;
21433       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21434       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
21435       break;
21436
21437     case E_DFmode:
21438       cmp_mode = DFmode;
21439       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21440       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
21441       break;
21442
21443     default:
21444       end_sequence ();
21445       return NULL_RTX;
21446     }
21447
21448   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
21449   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
21450   if (!op0 || !op1)
21451     {
21452       end_sequence ();
21453       return NULL_RTX;
21454     }
21455   *prep_seq = get_insns ();
21456   end_sequence ();
21457
21458   create_fixed_operand (&ops[0], op0);
21459   create_fixed_operand (&ops[1], op1);
21460
21461   start_sequence ();
21462   if (!maybe_expand_insn (icode, 2, ops))
21463     {
21464       end_sequence ();
21465       return NULL_RTX;
21466     }
21467   *gen_seq = get_insns ();
21468   end_sequence ();
21469
21470   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
21471                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
21472 }
21473
21474 static rtx
21475 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
21476                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
21477 {
21478   rtx op0, op1, target;
21479   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21480   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21481   insn_code icode;
21482   struct expand_operand ops[6];
21483   int aarch64_cond;
21484
21485   push_to_sequence (*prep_seq);
21486   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21487
21488   op_mode = GET_MODE (op0);
21489   if (op_mode == VOIDmode)
21490     op_mode = GET_MODE (op1);
21491
21492   switch (op_mode)
21493     {
21494     case E_QImode:
21495     case E_HImode:
21496     case E_SImode:
21497       cmp_mode = SImode;
21498       break;
21499
21500     case E_DImode:
21501       cmp_mode = DImode;
21502       break;
21503
21504     case E_SFmode:
21505       cmp_mode = SFmode;
21506       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21507       break;
21508
21509     case E_DFmode:
21510       cmp_mode = DFmode;
21511       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21512       break;
21513
21514     default:
21515       end_sequence ();
21516       return NULL_RTX;
21517     }
21518
21519   icode = code_for_ccmp (cc_mode, cmp_mode);
21520
21521   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
21522   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
21523   if (!op0 || !op1)
21524     {
21525       end_sequence ();
21526       return NULL_RTX;
21527     }
21528   *prep_seq = get_insns ();
21529   end_sequence ();
21530
21531   target = gen_rtx_REG (cc_mode, CC_REGNUM);
21532   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
21533
21534   if (bit_code != AND)
21535     {
21536       /* Treat the ccmp patterns as canonical and use them where possible,
21537          but fall back to ccmp_rev patterns if there's no other option.  */
21538       rtx_code prev_code = GET_CODE (prev);
21539       machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
21540       if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
21541           && !(prev_code == EQ
21542                || prev_code == NE
21543                || prev_code == ORDERED
21544                || prev_code == UNORDERED))
21545         icode = code_for_ccmp_rev (cc_mode, cmp_mode);
21546       else
21547         {
21548           rtx_code code = reverse_condition (prev_code);
21549           prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
21550         }
21551       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
21552     }
21553
21554   create_fixed_operand (&ops[0], XEXP (prev, 0));
21555   create_fixed_operand (&ops[1], target);
21556   create_fixed_operand (&ops[2], op0);
21557   create_fixed_operand (&ops[3], op1);
21558   create_fixed_operand (&ops[4], prev);
21559   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
21560
21561   push_to_sequence (*gen_seq);
21562   if (!maybe_expand_insn (icode, 6, ops))
21563     {
21564       end_sequence ();
21565       return NULL_RTX;
21566     }
21567
21568   *gen_seq = get_insns ();
21569   end_sequence ();
21570
21571   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
21572 }
21573
21574 #undef TARGET_GEN_CCMP_FIRST
21575 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21576
21577 #undef TARGET_GEN_CCMP_NEXT
21578 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21579
21580 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
21581    instruction fusion of some sort.  */
21582
21583 static bool
21584 aarch64_macro_fusion_p (void)
21585 {
21586   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
21587 }
21588
21589
21590 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
21591    should be kept together during scheduling.  */
21592
21593 static bool
21594 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21595 {
21596   rtx set_dest;
21597   rtx prev_set = single_set (prev);
21598   rtx curr_set = single_set (curr);
21599   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
21600   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21601
21602   if (!aarch64_macro_fusion_p ())
21603     return false;
21604
21605   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
21606     {
21607       /* We are trying to match:
21608          prev (mov)  == (set (reg r0) (const_int imm16))
21609          curr (movk) == (set (zero_extract (reg r0)
21610                                            (const_int 16)
21611                                            (const_int 16))
21612                              (const_int imm16_1))  */
21613
21614       set_dest = SET_DEST (curr_set);
21615
21616       if (GET_CODE (set_dest) == ZERO_EXTRACT
21617           && CONST_INT_P (SET_SRC (curr_set))
21618           && CONST_INT_P (SET_SRC (prev_set))
21619           && CONST_INT_P (XEXP (set_dest, 2))
21620           && INTVAL (XEXP (set_dest, 2)) == 16
21621           && REG_P (XEXP (set_dest, 0))
21622           && REG_P (SET_DEST (prev_set))
21623           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
21624         {
21625           return true;
21626         }
21627     }
21628
21629   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
21630     {
21631
21632       /*  We're trying to match:
21633           prev (adrp) == (set (reg r1)
21634                               (high (symbol_ref ("SYM"))))
21635           curr (add) == (set (reg r0)
21636                              (lo_sum (reg r1)
21637                                      (symbol_ref ("SYM"))))
21638           Note that r0 need not necessarily be the same as r1, especially
21639           during pre-regalloc scheduling.  */
21640
21641       if (satisfies_constraint_Ush (SET_SRC (prev_set))
21642           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21643         {
21644           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
21645               && REG_P (XEXP (SET_SRC (curr_set), 0))
21646               && REGNO (XEXP (SET_SRC (curr_set), 0))
21647                  == REGNO (SET_DEST (prev_set))
21648               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
21649                               XEXP (SET_SRC (curr_set), 1)))
21650             return true;
21651         }
21652     }
21653
21654   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
21655     {
21656
21657       /* We're trying to match:
21658          prev (movk) == (set (zero_extract (reg r0)
21659                                            (const_int 16)
21660                                            (const_int 32))
21661                              (const_int imm16_1))
21662          curr (movk) == (set (zero_extract (reg r0)
21663                                            (const_int 16)
21664                                            (const_int 48))
21665                              (const_int imm16_2))  */
21666
21667       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
21668           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
21669           && REG_P (XEXP (SET_DEST (prev_set), 0))
21670           && REG_P (XEXP (SET_DEST (curr_set), 0))
21671           && REGNO (XEXP (SET_DEST (prev_set), 0))
21672              == REGNO (XEXP (SET_DEST (curr_set), 0))
21673           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
21674           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
21675           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
21676           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
21677           && CONST_INT_P (SET_SRC (prev_set))
21678           && CONST_INT_P (SET_SRC (curr_set)))
21679         return true;
21680
21681     }
21682   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
21683     {
21684       /* We're trying to match:
21685           prev (adrp) == (set (reg r0)
21686                               (high (symbol_ref ("SYM"))))
21687           curr (ldr) == (set (reg r1)
21688                              (mem (lo_sum (reg r0)
21689                                              (symbol_ref ("SYM")))))
21690                  or
21691           curr (ldr) == (set (reg r1)
21692                              (zero_extend (mem
21693                                            (lo_sum (reg r0)
21694                                                    (symbol_ref ("SYM"))))))  */
21695       if (satisfies_constraint_Ush (SET_SRC (prev_set))
21696           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21697         {
21698           rtx curr_src = SET_SRC (curr_set);
21699
21700           if (GET_CODE (curr_src) == ZERO_EXTEND)
21701             curr_src = XEXP (curr_src, 0);
21702
21703           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
21704               && REG_P (XEXP (XEXP (curr_src, 0), 0))
21705               && REGNO (XEXP (XEXP (curr_src, 0), 0))
21706                  == REGNO (SET_DEST (prev_set))
21707               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
21708                               XEXP (SET_SRC (prev_set), 0)))
21709               return true;
21710         }
21711     }
21712
21713   /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch.  */
21714   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
21715       && prev_set && curr_set && any_condjump_p (curr)
21716       && GET_CODE (SET_SRC (prev_set)) == COMPARE
21717       && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
21718       && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
21719     return true;
21720
21721   /* Fuse flag-setting ALU instructions and conditional branch.  */
21722   if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
21723       && any_condjump_p (curr))
21724     {
21725       unsigned int condreg1, condreg2;
21726       rtx cc_reg_1;
21727       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
21728       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
21729
21730       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
21731           && prev
21732           && modified_in_p (cc_reg_1, prev))
21733         {
21734           enum attr_type prev_type = get_attr_type (prev);
21735
21736           /* FIXME: this misses some which is considered simple arthematic
21737              instructions for ThunderX.  Simple shifts are missed here.  */
21738           if (prev_type == TYPE_ALUS_SREG
21739               || prev_type == TYPE_ALUS_IMM
21740               || prev_type == TYPE_LOGICS_REG
21741               || prev_type == TYPE_LOGICS_IMM)
21742             return true;
21743         }
21744     }
21745
21746   /* Fuse ALU instructions and CBZ/CBNZ.  */
21747   if (prev_set
21748       && curr_set
21749       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
21750       && any_condjump_p (curr))
21751     {
21752       /* We're trying to match:
21753           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
21754           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
21755                                                          (const_int 0))
21756                                                  (label_ref ("SYM"))
21757                                                  (pc))  */
21758       if (SET_DEST (curr_set) == (pc_rtx)
21759           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
21760           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
21761           && REG_P (SET_DEST (prev_set))
21762           && REGNO (SET_DEST (prev_set))
21763              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
21764         {
21765           /* Fuse ALU operations followed by conditional branch instruction.  */
21766           switch (get_attr_type (prev))
21767             {
21768             case TYPE_ALU_IMM:
21769             case TYPE_ALU_SREG:
21770             case TYPE_ADC_REG:
21771             case TYPE_ADC_IMM:
21772             case TYPE_ADCS_REG:
21773             case TYPE_ADCS_IMM:
21774             case TYPE_LOGIC_REG:
21775             case TYPE_LOGIC_IMM:
21776             case TYPE_CSEL:
21777             case TYPE_ADR:
21778             case TYPE_MOV_IMM:
21779             case TYPE_SHIFT_REG:
21780             case TYPE_SHIFT_IMM:
21781             case TYPE_BFM:
21782             case TYPE_RBIT:
21783             case TYPE_REV:
21784             case TYPE_EXTEND:
21785               return true;
21786
21787             default:;
21788             }
21789         }
21790     }
21791
21792   return false;
21793 }
21794
21795 /* Return true iff the instruction fusion described by OP is enabled.  */
21796
21797 bool
21798 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
21799 {
21800   return (aarch64_tune_params.fusible_ops & op) != 0;
21801 }
21802
21803 /* If MEM is in the form of [base+offset], extract the two parts
21804    of address and set to BASE and OFFSET, otherwise return false
21805    after clearing BASE and OFFSET.  */
21806
21807 bool
21808 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
21809 {
21810   rtx addr;
21811
21812   gcc_assert (MEM_P (mem));
21813
21814   addr = XEXP (mem, 0);
21815
21816   if (REG_P (addr))
21817     {
21818       *base = addr;
21819       *offset = const0_rtx;
21820       return true;
21821     }
21822
21823   if (GET_CODE (addr) == PLUS
21824       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
21825     {
21826       *base = XEXP (addr, 0);
21827       *offset = XEXP (addr, 1);
21828       return true;
21829     }
21830
21831   *base = NULL_RTX;
21832   *offset = NULL_RTX;
21833
21834   return false;
21835 }
21836
21837 /* Types for scheduling fusion.  */
21838 enum sched_fusion_type
21839 {
21840   SCHED_FUSION_NONE = 0,
21841   SCHED_FUSION_LD_SIGN_EXTEND,
21842   SCHED_FUSION_LD_ZERO_EXTEND,
21843   SCHED_FUSION_LD,
21844   SCHED_FUSION_ST,
21845   SCHED_FUSION_NUM
21846 };
21847
21848 /* If INSN is a load or store of address in the form of [base+offset],
21849    extract the two parts and set to BASE and OFFSET.  Return scheduling
21850    fusion type this INSN is.  */
21851
21852 static enum sched_fusion_type
21853 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
21854 {
21855   rtx x, dest, src;
21856   enum sched_fusion_type fusion = SCHED_FUSION_LD;
21857
21858   gcc_assert (INSN_P (insn));
21859   x = PATTERN (insn);
21860   if (GET_CODE (x) != SET)
21861     return SCHED_FUSION_NONE;
21862
21863   src = SET_SRC (x);
21864   dest = SET_DEST (x);
21865
21866   machine_mode dest_mode = GET_MODE (dest);
21867
21868   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
21869     return SCHED_FUSION_NONE;
21870
21871   if (GET_CODE (src) == SIGN_EXTEND)
21872     {
21873       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
21874       src = XEXP (src, 0);
21875       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21876         return SCHED_FUSION_NONE;
21877     }
21878   else if (GET_CODE (src) == ZERO_EXTEND)
21879     {
21880       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
21881       src = XEXP (src, 0);
21882       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21883         return SCHED_FUSION_NONE;
21884     }
21885
21886   if (GET_CODE (src) == MEM && REG_P (dest))
21887     extract_base_offset_in_addr (src, base, offset);
21888   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
21889     {
21890       fusion = SCHED_FUSION_ST;
21891       extract_base_offset_in_addr (dest, base, offset);
21892     }
21893   else
21894     return SCHED_FUSION_NONE;
21895
21896   if (*base == NULL_RTX || *offset == NULL_RTX)
21897     fusion = SCHED_FUSION_NONE;
21898
21899   return fusion;
21900 }
21901
21902 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
21903
21904    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
21905    and PRI are only calculated for these instructions.  For other instruction,
21906    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
21907    type instruction fusion can be added by returning different priorities.
21908
21909    It's important that irrelevant instructions get the largest FUSION_PRI.  */
21910
21911 static void
21912 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
21913                                int *fusion_pri, int *pri)
21914 {
21915   int tmp, off_val;
21916   rtx base, offset;
21917   enum sched_fusion_type fusion;
21918
21919   gcc_assert (INSN_P (insn));
21920
21921   tmp = max_pri - 1;
21922   fusion = fusion_load_store (insn, &base, &offset);
21923   if (fusion == SCHED_FUSION_NONE)
21924     {
21925       *pri = tmp;
21926       *fusion_pri = tmp;
21927       return;
21928     }
21929
21930   /* Set FUSION_PRI according to fusion type and base register.  */
21931   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
21932
21933   /* Calculate PRI.  */
21934   tmp /= 2;
21935
21936   /* INSN with smaller offset goes first.  */
21937   off_val = (int)(INTVAL (offset));
21938   if (off_val >= 0)
21939     tmp -= (off_val & 0xfffff);
21940   else
21941     tmp += ((- off_val) & 0xfffff);
21942
21943   *pri = tmp;
21944   return;
21945 }
21946
21947 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
21948    Adjust priority of sha1h instructions so they are scheduled before
21949    other SHA1 instructions.  */
21950
21951 static int
21952 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
21953 {
21954   rtx x = PATTERN (insn);
21955
21956   if (GET_CODE (x) == SET)
21957     {
21958       x = SET_SRC (x);
21959
21960       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
21961         return priority + 10;
21962     }
21963
21964   return priority;
21965 }
21966
21967 /* Given OPERANDS of consecutive load/store, check if we can merge
21968    them into ldp/stp.  LOAD is true if they are load instructions.
21969    MODE is the mode of memory operands.  */
21970
21971 bool
21972 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
21973                                 machine_mode mode)
21974 {
21975   HOST_WIDE_INT offval_1, offval_2, msize;
21976   enum reg_class rclass_1, rclass_2;
21977   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
21978
21979   if (load)
21980     {
21981       mem_1 = operands[1];
21982       mem_2 = operands[3];
21983       reg_1 = operands[0];
21984       reg_2 = operands[2];
21985       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
21986       if (REGNO (reg_1) == REGNO (reg_2))
21987         return false;
21988     }
21989   else
21990     {
21991       mem_1 = operands[0];
21992       mem_2 = operands[2];
21993       reg_1 = operands[1];
21994       reg_2 = operands[3];
21995     }
21996
21997   /* The mems cannot be volatile.  */
21998   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
21999     return false;
22000
22001   /* If we have SImode and slow unaligned ldp,
22002      check the alignment to be at least 8 byte. */
22003   if (mode == SImode
22004       && (aarch64_tune_params.extra_tuning_flags
22005           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22006       && !optimize_size
22007       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
22008     return false;
22009
22010   /* Check if the addresses are in the form of [base+offset].  */
22011   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22012   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
22013     return false;
22014   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22015   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
22016     return false;
22017
22018   /* Check if the bases are same.  */
22019   if (!rtx_equal_p (base_1, base_2))
22020     return false;
22021
22022   /* The operands must be of the same size.  */
22023   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
22024                          GET_MODE_SIZE (GET_MODE (mem_2))));
22025
22026   offval_1 = INTVAL (offset_1);
22027   offval_2 = INTVAL (offset_2);
22028   /* We should only be trying this for fixed-sized modes.  There is no
22029      SVE LDP/STP instruction.  */
22030   msize = GET_MODE_SIZE (mode).to_constant ();
22031   /* Check if the offsets are consecutive.  */
22032   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
22033     return false;
22034
22035   /* Check if the addresses are clobbered by load.  */
22036   if (load)
22037     {
22038       if (reg_mentioned_p (reg_1, mem_1))
22039         return false;
22040
22041       /* In increasing order, the last load can clobber the address.  */
22042       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
22043         return false;
22044     }
22045
22046   /* One of the memory accesses must be a mempair operand.
22047      If it is not the first one, they need to be swapped by the
22048      peephole.  */
22049   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
22050        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
22051     return false;
22052
22053   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
22054     rclass_1 = FP_REGS;
22055   else
22056     rclass_1 = GENERAL_REGS;
22057
22058   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
22059     rclass_2 = FP_REGS;
22060   else
22061     rclass_2 = GENERAL_REGS;
22062
22063   /* Check if the registers are of same class.  */
22064   if (rclass_1 != rclass_2)
22065     return false;
22066
22067   return true;
22068 }
22069
22070 /* Given OPERANDS of consecutive load/store that can be merged,
22071    swap them if they are not in ascending order.  */
22072 void
22073 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
22074 {
22075   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
22076   HOST_WIDE_INT offval_1, offval_2;
22077
22078   if (load)
22079     {
22080       mem_1 = operands[1];
22081       mem_2 = operands[3];
22082     }
22083   else
22084     {
22085       mem_1 = operands[0];
22086       mem_2 = operands[2];
22087     }
22088
22089   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22090   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22091
22092   offval_1 = INTVAL (offset_1);
22093   offval_2 = INTVAL (offset_2);
22094
22095   if (offval_1 > offval_2)
22096     {
22097       /* Irrespective of whether this is a load or a store,
22098          we do the same swap.  */
22099       std::swap (operands[0], operands[2]);
22100       std::swap (operands[1], operands[3]);
22101     }
22102 }
22103
22104 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
22105    comparison between the two.  */
22106 int
22107 aarch64_host_wide_int_compare (const void *x, const void *y)
22108 {
22109   return wi::cmps (* ((const HOST_WIDE_INT *) x),
22110                    * ((const HOST_WIDE_INT *) y));
22111 }
22112
22113 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
22114    other pointing to a REG rtx containing an offset, compare the offsets
22115    of the two pairs.
22116
22117    Return:
22118
22119         1 iff offset (X) > offset (Y)
22120         0 iff offset (X) == offset (Y)
22121         -1 iff offset (X) < offset (Y)  */
22122 int
22123 aarch64_ldrstr_offset_compare (const void *x, const void *y)
22124 {
22125   const rtx * operands_1 = (const rtx *) x;
22126   const rtx * operands_2 = (const rtx *) y;
22127   rtx mem_1, mem_2, base, offset_1, offset_2;
22128
22129   if (MEM_P (operands_1[0]))
22130     mem_1 = operands_1[0];
22131   else
22132     mem_1 = operands_1[1];
22133
22134   if (MEM_P (operands_2[0]))
22135     mem_2 = operands_2[0];
22136   else
22137     mem_2 = operands_2[1];
22138
22139   /* Extract the offsets.  */
22140   extract_base_offset_in_addr (mem_1, &base, &offset_1);
22141   extract_base_offset_in_addr (mem_2, &base, &offset_2);
22142
22143   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
22144
22145   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
22146 }
22147
22148 /* Given OPERANDS of consecutive load/store, check if we can merge
22149    them into ldp/stp by adjusting the offset.  LOAD is true if they
22150    are load instructions.  MODE is the mode of memory operands.
22151
22152    Given below consecutive stores:
22153
22154      str  w1, [xb, 0x100]
22155      str  w1, [xb, 0x104]
22156      str  w1, [xb, 0x108]
22157      str  w1, [xb, 0x10c]
22158
22159    Though the offsets are out of the range supported by stp, we can
22160    still pair them after adjusting the offset, like:
22161
22162      add  scratch, xb, 0x100
22163      stp  w1, w1, [scratch]
22164      stp  w1, w1, [scratch, 0x8]
22165
22166    The peephole patterns detecting this opportunity should guarantee
22167    the scratch register is avaliable.  */
22168
22169 bool
22170 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
22171                                        machine_mode mode)
22172 {
22173   const int num_insns = 4;
22174   enum reg_class rclass;
22175   HOST_WIDE_INT offvals[num_insns], msize;
22176   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
22177
22178   if (load)
22179     {
22180       for (int i = 0; i < num_insns; i++)
22181         {
22182           reg[i] = operands[2 * i];
22183           mem[i] = operands[2 * i + 1];
22184
22185           gcc_assert (REG_P (reg[i]));
22186         }
22187
22188       /* Do not attempt to merge the loads if the loads clobber each other.  */
22189       for (int i = 0; i < 8; i += 2)
22190         for (int j = i + 2; j < 8; j += 2)
22191           if (reg_overlap_mentioned_p (operands[i], operands[j]))
22192             return false;
22193     }
22194   else
22195     for (int i = 0; i < num_insns; i++)
22196       {
22197         mem[i] = operands[2 * i];
22198         reg[i] = operands[2 * i + 1];
22199       }
22200
22201   /* Skip if memory operand is by itself valid for ldp/stp.  */
22202   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
22203     return false;
22204
22205   for (int i = 0; i < num_insns; i++)
22206     {
22207       /* The mems cannot be volatile.  */
22208       if (MEM_VOLATILE_P (mem[i]))
22209         return false;
22210
22211       /* Check if the addresses are in the form of [base+offset].  */
22212       extract_base_offset_in_addr (mem[i], base + i, offset + i);
22213       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
22214         return false;
22215     }
22216
22217   /* Check if the registers are of same class.  */
22218   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
22219     ? FP_REGS : GENERAL_REGS;
22220
22221   for (int i = 1; i < num_insns; i++)
22222     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
22223       {
22224         if (rclass != FP_REGS)
22225           return false;
22226       }
22227     else
22228       {
22229         if (rclass != GENERAL_REGS)
22230           return false;
22231       }
22232
22233   /* Only the last register in the order in which they occur
22234      may be clobbered by the load.  */
22235   if (rclass == GENERAL_REGS && load)
22236     for (int i = 0; i < num_insns - 1; i++)
22237       if (reg_mentioned_p (reg[i], mem[i]))
22238         return false;
22239
22240   /* Check if the bases are same.  */
22241   for (int i = 0; i < num_insns - 1; i++)
22242     if (!rtx_equal_p (base[i], base[i + 1]))
22243       return false;
22244
22245   for (int i = 0; i < num_insns; i++)
22246     offvals[i] = INTVAL (offset[i]);
22247
22248   msize = GET_MODE_SIZE (mode).to_constant ();
22249
22250   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
22251   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
22252          aarch64_host_wide_int_compare);
22253
22254   if (!(offvals[1] == offvals[0] + msize
22255         && offvals[3] == offvals[2] + msize))
22256     return false;
22257
22258   /* Check that offsets are within range of each other.  The ldp/stp
22259      instructions have 7 bit immediate offsets, so use 0x80.  */
22260   if (offvals[2] - offvals[0] >= msize * 0x80)
22261     return false;
22262
22263   /* The offsets must be aligned with respect to each other.  */
22264   if (offvals[0] % msize != offvals[2] % msize)
22265     return false;
22266
22267   /* If we have SImode and slow unaligned ldp,
22268      check the alignment to be at least 8 byte. */
22269   if (mode == SImode
22270       && (aarch64_tune_params.extra_tuning_flags
22271           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22272       && !optimize_size
22273       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
22274     return false;
22275
22276   return true;
22277 }
22278
22279 /* Given OPERANDS of consecutive load/store, this function pairs them
22280    into LDP/STP after adjusting the offset.  It depends on the fact
22281    that the operands can be sorted so the offsets are correct for STP.
22282    MODE is the mode of memory operands.  CODE is the rtl operator
22283    which should be applied to all memory operands, it's SIGN_EXTEND,
22284    ZERO_EXTEND or UNKNOWN.  */
22285
22286 bool
22287 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
22288                              machine_mode mode, RTX_CODE code)
22289 {
22290   rtx base, offset_1, offset_3, t1, t2;
22291   rtx mem_1, mem_2, mem_3, mem_4;
22292   rtx temp_operands[8];
22293   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
22294                 stp_off_upper_limit, stp_off_lower_limit, msize;
22295
22296   /* We make changes on a copy as we may still bail out.  */
22297   for (int i = 0; i < 8; i ++)
22298     temp_operands[i] = operands[i];
22299
22300   /* Sort the operands.  */
22301   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
22302
22303   /* Copy the memory operands so that if we have to bail for some
22304      reason the original addresses are unchanged.  */
22305   if (load)
22306     {
22307       mem_1 = copy_rtx (temp_operands[1]);
22308       mem_2 = copy_rtx (temp_operands[3]);
22309       mem_3 = copy_rtx (temp_operands[5]);
22310       mem_4 = copy_rtx (temp_operands[7]);
22311     }
22312   else
22313     {
22314       mem_1 = copy_rtx (temp_operands[0]);
22315       mem_2 = copy_rtx (temp_operands[2]);
22316       mem_3 = copy_rtx (temp_operands[4]);
22317       mem_4 = copy_rtx (temp_operands[6]);
22318       gcc_assert (code == UNKNOWN);
22319     }
22320
22321   extract_base_offset_in_addr (mem_1, &base, &offset_1);
22322   extract_base_offset_in_addr (mem_3, &base, &offset_3);
22323   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
22324               && offset_3 != NULL_RTX);
22325
22326   /* Adjust offset so it can fit in LDP/STP instruction.  */
22327   msize = GET_MODE_SIZE (mode).to_constant();
22328   stp_off_upper_limit = msize * (0x40 - 1);
22329   stp_off_lower_limit = - msize * 0x40;
22330
22331   off_val_1 = INTVAL (offset_1);
22332   off_val_3 = INTVAL (offset_3);
22333
22334   /* The base offset is optimally half way between the two STP/LDP offsets.  */
22335   if (msize <= 4)
22336     base_off = (off_val_1 + off_val_3) / 2;
22337   else
22338     /* However, due to issues with negative LDP/STP offset generation for
22339        larger modes, for DF, DI and vector modes. we must not use negative
22340        addresses smaller than 9 signed unadjusted bits can store.  This
22341        provides the most range in this case.  */
22342     base_off = off_val_1;
22343
22344   /* Adjust the base so that it is aligned with the addresses but still
22345      optimal.  */
22346   if (base_off % msize != off_val_1 % msize)
22347     /* Fix the offset, bearing in mind we want to make it bigger not
22348        smaller.  */
22349     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22350   else if (msize <= 4)
22351     /* The negative range of LDP/STP is one larger than the positive range.  */
22352     base_off += msize;
22353
22354   /* Check if base offset is too big or too small.  We can attempt to resolve
22355      this issue by setting it to the maximum value and seeing if the offsets
22356      still fit.  */
22357   if (base_off >= 0x1000)
22358     {
22359       base_off = 0x1000 - 1;
22360       /* We must still make sure that the base offset is aligned with respect
22361          to the address.  But it may not be made any bigger.  */
22362       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22363     }
22364
22365   /* Likewise for the case where the base is too small.  */
22366   if (base_off <= -0x1000)
22367     {
22368       base_off = -0x1000 + 1;
22369       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22370     }
22371
22372   /* Offset of the first STP/LDP.  */
22373   new_off_1 = off_val_1 - base_off;
22374
22375   /* Offset of the second STP/LDP.  */
22376   new_off_3 = off_val_3 - base_off;
22377
22378   /* The offsets must be within the range of the LDP/STP instructions.  */
22379   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
22380       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
22381     return false;
22382
22383   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
22384                                                   new_off_1), true);
22385   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
22386                                                   new_off_1 + msize), true);
22387   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
22388                                                   new_off_3), true);
22389   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
22390                                                   new_off_3 + msize), true);
22391
22392   if (!aarch64_mem_pair_operand (mem_1, mode)
22393       || !aarch64_mem_pair_operand (mem_3, mode))
22394     return false;
22395
22396   if (code == ZERO_EXTEND)
22397     {
22398       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
22399       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
22400       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
22401       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
22402     }
22403   else if (code == SIGN_EXTEND)
22404     {
22405       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
22406       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
22407       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
22408       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
22409     }
22410
22411   if (load)
22412     {
22413       operands[0] = temp_operands[0];
22414       operands[1] = mem_1;
22415       operands[2] = temp_operands[2];
22416       operands[3] = mem_2;
22417       operands[4] = temp_operands[4];
22418       operands[5] = mem_3;
22419       operands[6] = temp_operands[6];
22420       operands[7] = mem_4;
22421     }
22422   else
22423     {
22424       operands[0] = mem_1;
22425       operands[1] = temp_operands[1];
22426       operands[2] = mem_2;
22427       operands[3] = temp_operands[3];
22428       operands[4] = mem_3;
22429       operands[5] = temp_operands[5];
22430       operands[6] = mem_4;
22431       operands[7] = temp_operands[7];
22432     }
22433
22434   /* Emit adjusting instruction.  */
22435   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
22436   /* Emit ldp/stp instructions.  */
22437   t1 = gen_rtx_SET (operands[0], operands[1]);
22438   t2 = gen_rtx_SET (operands[2], operands[3]);
22439   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22440   t1 = gen_rtx_SET (operands[4], operands[5]);
22441   t2 = gen_rtx_SET (operands[6], operands[7]);
22442   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22443   return true;
22444 }
22445
22446 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
22447    it isn't worth branching around empty masked ops (including masked
22448    stores).  */
22449
22450 static bool
22451 aarch64_empty_mask_is_expensive (unsigned)
22452 {
22453   return false;
22454 }
22455
22456 /* Return 1 if pseudo register should be created and used to hold
22457    GOT address for PIC code.  */
22458
22459 bool
22460 aarch64_use_pseudo_pic_reg (void)
22461 {
22462   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
22463 }
22464
22465 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
22466
22467 static int
22468 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
22469 {
22470   switch (XINT (x, 1))
22471     {
22472     case UNSPEC_GOTSMALLPIC:
22473     case UNSPEC_GOTSMALLPIC28K:
22474     case UNSPEC_GOTTINYPIC:
22475       return 0;
22476     default:
22477       break;
22478     }
22479
22480   return default_unspec_may_trap_p (x, flags);
22481 }
22482
22483
22484 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
22485    return the log2 of that value.  Otherwise return -1.  */
22486
22487 int
22488 aarch64_fpconst_pow_of_2 (rtx x)
22489 {
22490   const REAL_VALUE_TYPE *r;
22491
22492   if (!CONST_DOUBLE_P (x))
22493     return -1;
22494
22495   r = CONST_DOUBLE_REAL_VALUE (x);
22496
22497   if (REAL_VALUE_NEGATIVE (*r)
22498       || REAL_VALUE_ISNAN (*r)
22499       || REAL_VALUE_ISINF (*r)
22500       || !real_isinteger (r, DFmode))
22501     return -1;
22502
22503   return exact_log2 (real_to_integer (r));
22504 }
22505
22506 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
22507    power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
22508    return n. Otherwise return -1.  */
22509
22510 int
22511 aarch64_fpconst_pow2_recip (rtx x)
22512 {
22513   REAL_VALUE_TYPE r0;
22514
22515   if (!CONST_DOUBLE_P (x))
22516     return -1;
22517
22518   r0 = *CONST_DOUBLE_REAL_VALUE (x);
22519   if (exact_real_inverse (DFmode, &r0)
22520       && !REAL_VALUE_NEGATIVE (r0))
22521     {
22522         int ret = exact_log2 (real_to_integer (&r0));
22523         if (ret >= 1 && ret <= 32)
22524             return ret;
22525     }
22526   return -1;
22527 }
22528
22529 /* If X is a vector of equal CONST_DOUBLE values and that value is
22530    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
22531
22532 int
22533 aarch64_vec_fpconst_pow_of_2 (rtx x)
22534 {
22535   int nelts;
22536   if (GET_CODE (x) != CONST_VECTOR
22537       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
22538     return -1;
22539
22540   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
22541     return -1;
22542
22543   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
22544   if (firstval <= 0)
22545     return -1;
22546
22547   for (int i = 1; i < nelts; i++)
22548     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
22549       return -1;
22550
22551   return firstval;
22552 }
22553
22554 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
22555    to float.
22556
22557    __fp16 always promotes through this hook.
22558    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
22559    through the generic excess precision logic rather than here.  */
22560
22561 static tree
22562 aarch64_promoted_type (const_tree t)
22563 {
22564   if (SCALAR_FLOAT_TYPE_P (t)
22565       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
22566     return float_type_node;
22567
22568   return NULL_TREE;
22569 }
22570
22571 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
22572
22573 static bool
22574 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
22575                            optimization_type opt_type)
22576 {
22577   switch (op)
22578     {
22579     case rsqrt_optab:
22580       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
22581
22582     default:
22583       return true;
22584     }
22585 }
22586
22587 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
22588
22589 static unsigned int
22590 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22591                                         int *offset)
22592 {
22593   /* Polynomial invariant 1 == (VG / 2) - 1.  */
22594   gcc_assert (i == 1);
22595   *factor = 2;
22596   *offset = 1;
22597   return AARCH64_DWARF_VG;
22598 }
22599
22600 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22601    if MODE is HFmode, and punt to the generic implementation otherwise.  */
22602
22603 static bool
22604 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
22605 {
22606   return (mode == HFmode
22607           ? true
22608           : default_libgcc_floating_mode_supported_p (mode));
22609 }
22610
22611 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22612    if MODE is HFmode, and punt to the generic implementation otherwise.  */
22613
22614 static bool
22615 aarch64_scalar_mode_supported_p (scalar_mode mode)
22616 {
22617   return (mode == HFmode
22618           ? true
22619           : default_scalar_mode_supported_p (mode));
22620 }
22621
22622 /* Set the value of FLT_EVAL_METHOD.
22623    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
22624
22625     0: evaluate all operations and constants, whose semantic type has at
22626        most the range and precision of type float, to the range and
22627        precision of float; evaluate all other operations and constants to
22628        the range and precision of the semantic type;
22629
22630     N, where _FloatN is a supported interchange floating type
22631        evaluate all operations and constants, whose semantic type has at
22632        most the range and precision of _FloatN type, to the range and
22633        precision of the _FloatN type; evaluate all other operations and
22634        constants to the range and precision of the semantic type;
22635
22636    If we have the ARMv8.2-A extensions then we support _Float16 in native
22637    precision, so we should set this to 16.  Otherwise, we support the type,
22638    but want to evaluate expressions in float precision, so set this to
22639    0.  */
22640
22641 static enum flt_eval_method
22642 aarch64_excess_precision (enum excess_precision_type type)
22643 {
22644   switch (type)
22645     {
22646       case EXCESS_PRECISION_TYPE_FAST:
22647       case EXCESS_PRECISION_TYPE_STANDARD:
22648         /* We can calculate either in 16-bit range and precision or
22649            32-bit range and precision.  Make that decision based on whether
22650            we have native support for the ARMv8.2-A 16-bit floating-point
22651            instructions or not.  */
22652         return (TARGET_FP_F16INST
22653                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
22654                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
22655       case EXCESS_PRECISION_TYPE_IMPLICIT:
22656         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
22657       default:
22658         gcc_unreachable ();
22659     }
22660   return FLT_EVAL_METHOD_UNPREDICTABLE;
22661 }
22662
22663 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
22664    scheduled for speculative execution.  Reject the long-running division
22665    and square-root instructions.  */
22666
22667 static bool
22668 aarch64_sched_can_speculate_insn (rtx_insn *insn)
22669 {
22670   switch (get_attr_type (insn))
22671     {
22672       case TYPE_SDIV:
22673       case TYPE_UDIV:
22674       case TYPE_FDIVS:
22675       case TYPE_FDIVD:
22676       case TYPE_FSQRTS:
22677       case TYPE_FSQRTD:
22678       case TYPE_NEON_FP_SQRT_S:
22679       case TYPE_NEON_FP_SQRT_D:
22680       case TYPE_NEON_FP_SQRT_S_Q:
22681       case TYPE_NEON_FP_SQRT_D_Q:
22682       case TYPE_NEON_FP_DIV_S:
22683       case TYPE_NEON_FP_DIV_D:
22684       case TYPE_NEON_FP_DIV_S_Q:
22685       case TYPE_NEON_FP_DIV_D_Q:
22686         return false;
22687       default:
22688         return true;
22689     }
22690 }
22691
22692 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
22693
22694 static int
22695 aarch64_compute_pressure_classes (reg_class *classes)
22696 {
22697   int i = 0;
22698   classes[i++] = GENERAL_REGS;
22699   classes[i++] = FP_REGS;
22700   /* PR_REGS isn't a useful pressure class because many predicate pseudo
22701      registers need to go in PR_LO_REGS at some point during their
22702      lifetime.  Splitting it into two halves has the effect of making
22703      all predicates count against PR_LO_REGS, so that we try whenever
22704      possible to restrict the number of live predicates to 8.  This
22705      greatly reduces the amount of spilling in certain loops.  */
22706   classes[i++] = PR_LO_REGS;
22707   classes[i++] = PR_HI_REGS;
22708   return i;
22709 }
22710
22711 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
22712
22713 static bool
22714 aarch64_can_change_mode_class (machine_mode from,
22715                                machine_mode to, reg_class_t)
22716 {
22717   unsigned int from_flags = aarch64_classify_vector_mode (from);
22718   unsigned int to_flags = aarch64_classify_vector_mode (to);
22719
22720   bool from_sve_p = (from_flags & VEC_ANY_SVE);
22721   bool to_sve_p = (to_flags & VEC_ANY_SVE);
22722
22723   bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
22724   bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
22725
22726   bool from_pred_p = (from_flags & VEC_SVE_PRED);
22727   bool to_pred_p = (to_flags & VEC_SVE_PRED);
22728
22729   /* Don't allow changes between predicate modes and other modes.
22730      Only predicate registers can hold predicate modes and only
22731      non-predicate registers can hold non-predicate modes, so any
22732      attempt to mix them would require a round trip through memory.  */
22733   if (from_pred_p != to_pred_p)
22734     return false;
22735
22736   /* Don't allow changes between partial SVE modes and other modes.
22737      The contents of partial SVE modes are distributed evenly across
22738      the register, whereas GCC expects them to be clustered together.  */
22739   if (from_partial_sve_p != to_partial_sve_p)
22740     return false;
22741
22742   /* Similarly reject changes between partial SVE modes that have
22743      different patterns of significant and insignificant bits.  */
22744   if (from_partial_sve_p
22745       && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
22746           || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
22747     return false;
22748
22749   if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
22750     {
22751       /* Don't allow changes between SVE modes and other modes that might
22752          be bigger than 128 bits.  In particular, OImode, CImode and XImode
22753          divide into 128-bit quantities while SVE modes divide into
22754          BITS_PER_SVE_VECTOR quantities.  */
22755       if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
22756         return false;
22757       if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
22758         return false;
22759     }
22760
22761   if (BYTES_BIG_ENDIAN)
22762     {
22763       /* Don't allow changes between SVE data modes and non-SVE modes.
22764          See the comment at the head of aarch64-sve.md for details.  */
22765       if (from_sve_p != to_sve_p)
22766         return false;
22767
22768       /* Don't allow changes in element size: lane 0 of the new vector
22769          would not then be lane 0 of the old vector.  See the comment
22770          above aarch64_maybe_expand_sve_subreg_move for a more detailed
22771          description.
22772
22773          In the worst case, this forces a register to be spilled in
22774          one mode and reloaded in the other, which handles the
22775          endianness correctly.  */
22776       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
22777         return false;
22778     }
22779   return true;
22780 }
22781
22782 /* Implement TARGET_EARLY_REMAT_MODES.  */
22783
22784 static void
22785 aarch64_select_early_remat_modes (sbitmap modes)
22786 {
22787   /* SVE values are not normally live across a call, so it should be
22788      worth doing early rematerialization even in VL-specific mode.  */
22789   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
22790     if (aarch64_sve_mode_p ((machine_mode) i))
22791       bitmap_set_bit (modes, i);
22792 }
22793
22794 /* Override the default target speculation_safe_value.  */
22795 static rtx
22796 aarch64_speculation_safe_value (machine_mode mode,
22797                                 rtx result, rtx val, rtx failval)
22798 {
22799   /* Maybe we should warn if falling back to hard barriers.  They are
22800      likely to be noticably more expensive than the alternative below.  */
22801   if (!aarch64_track_speculation)
22802     return default_speculation_safe_value (mode, result, val, failval);
22803
22804   if (!REG_P (val))
22805     val = copy_to_mode_reg (mode, val);
22806
22807   if (!aarch64_reg_or_zero (failval, mode))
22808     failval = copy_to_mode_reg (mode, failval);
22809
22810   emit_insn (gen_despeculate_copy (mode, result, val, failval));
22811   return result;
22812 }
22813
22814 /* Implement TARGET_ESTIMATED_POLY_VALUE.
22815    Look into the tuning structure for an estimate.
22816    VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
22817    Advanced SIMD 128 bits.  */
22818
22819 static HOST_WIDE_INT
22820 aarch64_estimated_poly_value (poly_int64 val)
22821 {
22822   enum aarch64_sve_vector_bits_enum width_source
22823     = aarch64_tune_params.sve_width;
22824
22825   /* If we still don't have an estimate, use the default.  */
22826   if (width_source == SVE_SCALABLE)
22827     return default_estimated_poly_value (val);
22828
22829   HOST_WIDE_INT over_128 = width_source - 128;
22830   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
22831 }
22832
22833
22834 /* Return true for types that could be supported as SIMD return or
22835    argument types.  */
22836
22837 static bool
22838 supported_simd_type (tree t)
22839 {
22840   if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
22841     {
22842       HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
22843       return s == 1 || s == 2 || s == 4 || s == 8;
22844     }
22845   return false;
22846 }
22847
22848 /* Return true for types that currently are supported as SIMD return
22849    or argument types.  */
22850
22851 static bool
22852 currently_supported_simd_type (tree t, tree b)
22853 {
22854   if (COMPLEX_FLOAT_TYPE_P (t))
22855     return false;
22856
22857   if (TYPE_SIZE (t) != TYPE_SIZE (b))
22858     return false;
22859
22860   return supported_simd_type (t);
22861 }
22862
22863 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
22864
22865 static int
22866 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
22867                                         struct cgraph_simd_clone *clonei,
22868                                         tree base_type, int num)
22869 {
22870   tree t, ret_type, arg_type;
22871   unsigned int elt_bits, vec_bits, count;
22872
22873   if (!TARGET_SIMD)
22874     return 0;
22875
22876   if (clonei->simdlen
22877       && (clonei->simdlen < 2
22878           || clonei->simdlen > 1024
22879           || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
22880     {
22881       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22882                   "unsupported simdlen %d", clonei->simdlen);
22883       return 0;
22884     }
22885
22886   ret_type = TREE_TYPE (TREE_TYPE (node->decl));
22887   if (TREE_CODE (ret_type) != VOID_TYPE
22888       && !currently_supported_simd_type (ret_type, base_type))
22889     {
22890       if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
22891         warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22892                     "GCC does not currently support mixed size types "
22893                     "for %<simd%> functions");
22894       else if (supported_simd_type (ret_type))
22895         warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22896                     "GCC does not currently support return type %qT "
22897                     "for %<simd%> functions", ret_type);
22898       else
22899         warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22900                     "unsupported return type %qT for %<simd%> functions",
22901                     ret_type);
22902       return 0;
22903     }
22904
22905   for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
22906     {
22907       arg_type = TREE_TYPE (t);
22908
22909       if (!currently_supported_simd_type (arg_type, base_type))
22910         {
22911           if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
22912             warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22913                         "GCC does not currently support mixed size types "
22914                         "for %<simd%> functions");
22915           else
22916             warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22917                         "GCC does not currently support argument type %qT "
22918                         "for %<simd%> functions", arg_type);
22919           return 0;
22920         }
22921     }
22922
22923   clonei->vecsize_mangle = 'n';
22924   clonei->mask_mode = VOIDmode;
22925   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
22926   if (clonei->simdlen == 0)
22927     {
22928       count = 2;
22929       vec_bits = (num == 0 ? 64 : 128);
22930       clonei->simdlen = vec_bits / elt_bits;
22931     }
22932   else
22933     {
22934       count = 1;
22935       vec_bits = clonei->simdlen * elt_bits;
22936       if (vec_bits != 64 && vec_bits != 128)
22937         {
22938           warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22939                       "GCC does not currently support simdlen %d for type %qT",
22940                       clonei->simdlen, base_type);
22941           return 0;
22942         }
22943     }
22944   clonei->vecsize_int = vec_bits;
22945   clonei->vecsize_float = vec_bits;
22946   return count;
22947 }
22948
22949 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
22950
22951 static void
22952 aarch64_simd_clone_adjust (struct cgraph_node *node)
22953 {
22954   /* Add aarch64_vector_pcs target attribute to SIMD clones so they
22955      use the correct ABI.  */
22956
22957   tree t = TREE_TYPE (node->decl);
22958   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
22959                                         TYPE_ATTRIBUTES (t));
22960 }
22961
22962 /* Implement TARGET_SIMD_CLONE_USABLE.  */
22963
22964 static int
22965 aarch64_simd_clone_usable (struct cgraph_node *node)
22966 {
22967   switch (node->simdclone->vecsize_mangle)
22968     {
22969     case 'n':
22970       if (!TARGET_SIMD)
22971         return -1;
22972       return 0;
22973     default:
22974       gcc_unreachable ();
22975     }
22976 }
22977
22978 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
22979
22980 static int
22981 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
22982 {
22983   auto check_attr = [&](const char *name) {
22984     tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
22985     tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
22986     if (!attr1 && !attr2)
22987       return true;
22988
22989     return attr1 && attr2 && attribute_value_equal (attr1, attr2);
22990   };
22991
22992   if (!check_attr ("aarch64_vector_pcs"))
22993     return 0;
22994   if (!check_attr ("Advanced SIMD type"))
22995     return 0;
22996   return 1;
22997 }
22998
22999 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
23000
23001 static const char *
23002 aarch64_get_multilib_abi_name (void)
23003 {
23004   if (TARGET_BIG_END)
23005     return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
23006   return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
23007 }
23008
23009 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
23010    global variable based guard use the default else
23011    return a null tree.  */
23012 static tree
23013 aarch64_stack_protect_guard (void)
23014 {
23015   if (aarch64_stack_protector_guard == SSP_GLOBAL)
23016     return default_stack_protect_guard ();
23017
23018   return NULL_TREE;
23019 }
23020
23021 /* Return the diagnostic message string if conversion from FROMTYPE to
23022    TOTYPE is not allowed, NULL otherwise.  */
23023
23024 static const char *
23025 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
23026 {
23027   if (element_mode (fromtype) != element_mode (totype))
23028     {
23029       /* Do no allow conversions to/from BFmode scalar types.  */
23030       if (TYPE_MODE (fromtype) == BFmode)
23031         return N_("invalid conversion from type %<bfloat16_t%>");
23032       if (TYPE_MODE (totype) == BFmode)
23033         return N_("invalid conversion to type %<bfloat16_t%>");
23034     }
23035
23036   /* Conversion allowed.  */
23037   return NULL;
23038 }
23039
23040 /* Return the diagnostic message string if the unary operation OP is
23041    not permitted on TYPE, NULL otherwise.  */
23042
23043 static const char *
23044 aarch64_invalid_unary_op (int op, const_tree type)
23045 {
23046   /* Reject all single-operand operations on BFmode except for &.  */
23047   if (element_mode (type) == BFmode && op != ADDR_EXPR)
23048     return N_("operation not permitted on type %<bfloat16_t%>");
23049
23050   /* Operation allowed.  */
23051   return NULL;
23052 }
23053
23054 /* Return the diagnostic message string if the binary operation OP is
23055    not permitted on TYPE1 and TYPE2, NULL otherwise.  */
23056
23057 static const char *
23058 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
23059                            const_tree type2)
23060 {
23061   /* Reject all 2-operand operations on BFmode.  */
23062   if (element_mode (type1) == BFmode
23063       || element_mode (type2) == BFmode)
23064     return N_("operation not permitted on type %<bfloat16_t%>");
23065
23066   if (VECTOR_TYPE_P (type1)
23067       && VECTOR_TYPE_P (type2)
23068       && !TYPE_INDIVISIBLE_P (type1)
23069       && !TYPE_INDIVISIBLE_P (type2)
23070       && (aarch64_sve::builtin_type_p (type1)
23071           != aarch64_sve::builtin_type_p (type2)))
23072     return N_("cannot combine GNU and SVE vectors in a binary operation");
23073
23074   /* Operation allowed.  */
23075   return NULL;
23076 }
23077
23078 /* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
23079    section at the end if needed.  */
23080 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND      0xc0000000
23081 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI      (1U << 0)
23082 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC      (1U << 1)
23083 void
23084 aarch64_file_end_indicate_exec_stack ()
23085 {
23086   file_end_indicate_exec_stack ();
23087
23088   unsigned feature_1_and = 0;
23089   if (aarch64_bti_enabled ())
23090     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
23091
23092   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
23093     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
23094
23095   if (feature_1_and)
23096     {
23097       /* Generate .note.gnu.property section.  */
23098       switch_to_section (get_section (".note.gnu.property",
23099                                       SECTION_NOTYPE, NULL));
23100
23101       /* PT_NOTE header: namesz, descsz, type.
23102          namesz = 4 ("GNU\0")
23103          descsz = 16 (Size of the program property array)
23104                   [(12 + padding) * Number of array elements]
23105          type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
23106       assemble_align (POINTER_SIZE);
23107       assemble_integer (GEN_INT (4), 4, 32, 1);
23108       assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
23109       assemble_integer (GEN_INT (5), 4, 32, 1);
23110
23111       /* PT_NOTE name.  */
23112       assemble_string ("GNU", 4);
23113
23114       /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
23115          type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
23116          datasz = 4
23117          data   = feature_1_and.  */
23118       assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
23119       assemble_integer (GEN_INT (4), 4, 32, 1);
23120       assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
23121
23122       /* Pad the size of the note to the required alignment.  */
23123       assemble_align (POINTER_SIZE);
23124     }
23125 }
23126 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
23127 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
23128 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
23129
23130 /* Helper function for straight line speculation.
23131    Return what barrier should be emitted for straight line speculation
23132    mitigation.
23133    When not mitigating against straight line speculation this function returns
23134    an empty string.
23135    When mitigating against straight line speculation, use:
23136    * SB when the v8.5-A SB extension is enabled.
23137    * DSB+ISB otherwise.  */
23138 const char *
23139 aarch64_sls_barrier (int mitigation_required)
23140 {
23141   return mitigation_required
23142     ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
23143     : "";
23144 }
23145
23146 static GTY (()) tree aarch64_sls_shared_thunks[30];
23147 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
23148 const char *indirect_symbol_names[30] = {
23149     "__call_indirect_x0",
23150     "__call_indirect_x1",
23151     "__call_indirect_x2",
23152     "__call_indirect_x3",
23153     "__call_indirect_x4",
23154     "__call_indirect_x5",
23155     "__call_indirect_x6",
23156     "__call_indirect_x7",
23157     "__call_indirect_x8",
23158     "__call_indirect_x9",
23159     "__call_indirect_x10",
23160     "__call_indirect_x11",
23161     "__call_indirect_x12",
23162     "__call_indirect_x13",
23163     "__call_indirect_x14",
23164     "__call_indirect_x15",
23165     "", /* "__call_indirect_x16",  */
23166     "", /* "__call_indirect_x17",  */
23167     "__call_indirect_x18",
23168     "__call_indirect_x19",
23169     "__call_indirect_x20",
23170     "__call_indirect_x21",
23171     "__call_indirect_x22",
23172     "__call_indirect_x23",
23173     "__call_indirect_x24",
23174     "__call_indirect_x25",
23175     "__call_indirect_x26",
23176     "__call_indirect_x27",
23177     "__call_indirect_x28",
23178     "__call_indirect_x29",
23179 };
23180
23181 /* Function to create a BLR thunk.  This thunk is used to mitigate straight
23182    line speculation.  Instead of a simple BLR that can be speculated past,
23183    we emit a BL to this thunk, and this thunk contains a BR to the relevant
23184    register.  These thunks have the relevant speculation barries put after
23185    their indirect branch so that speculation is blocked.
23186
23187    We use such a thunk so the speculation barriers are kept off the
23188    architecturally executed path in order to reduce the performance overhead.
23189
23190    When optimizing for size we use stubs shared by the linked object.
23191    When optimizing for performance we emit stubs for each function in the hope
23192    that the branch predictor can better train on jumps specific for a given
23193    function.  */
23194 rtx
23195 aarch64_sls_create_blr_label (int regnum)
23196 {
23197   gcc_assert (STUB_REGNUM_P (regnum));
23198   if (optimize_function_for_size_p (cfun))
23199     {
23200       /* For the thunks shared between different functions in this compilation
23201          unit we use a named symbol -- this is just for users to more easily
23202          understand the generated assembly.  */
23203       aarch64_sls_shared_thunks_needed = true;
23204       const char *thunk_name = indirect_symbol_names[regnum];
23205       if (aarch64_sls_shared_thunks[regnum] == NULL)
23206         {
23207           /* Build a decl representing this function stub and record it for
23208              later.  We build a decl here so we can use the GCC machinery for
23209              handling sections automatically (through `get_named_section` and
23210              `make_decl_one_only`).  That saves us a lot of trouble handling
23211              the specifics of different output file formats.  */
23212           tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
23213                                   get_identifier (thunk_name),
23214                                   build_function_type_list (void_type_node,
23215                                                             NULL_TREE));
23216           DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
23217                                            NULL_TREE, void_type_node);
23218           TREE_PUBLIC (decl) = 1;
23219           TREE_STATIC (decl) = 1;
23220           DECL_IGNORED_P (decl) = 1;
23221           DECL_ARTIFICIAL (decl) = 1;
23222           make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
23223           resolve_unique_section (decl, 0, false);
23224           aarch64_sls_shared_thunks[regnum] = decl;
23225         }
23226
23227       return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
23228     }
23229
23230   if (cfun->machine->call_via[regnum] == NULL)
23231     cfun->machine->call_via[regnum]
23232       = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
23233   return cfun->machine->call_via[regnum];
23234 }
23235
23236 /* Helper function for aarch64_sls_emit_blr_function_thunks and
23237    aarch64_sls_emit_shared_blr_thunks below.  */
23238 static void
23239 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
23240 {
23241   /* Save in x16 and branch to that function so this transformation does
23242      not prevent jumping to `BTI c` instructions.  */
23243   asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
23244   asm_fprintf (out_file, "\tbr\tx16\n");
23245 }
23246
23247 /* Emit all BLR stubs for this particular function.
23248    Here we emit all the BLR stubs needed for the current function.  Since we
23249    emit these stubs in a consecutive block we know there will be no speculation
23250    gadgets between each stub, and hence we only emit a speculation barrier at
23251    the end of the stub sequences.
23252
23253    This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook.  */
23254 void
23255 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
23256 {
23257   if (! aarch64_harden_sls_blr_p ())
23258     return;
23259
23260   bool any_functions_emitted = false;
23261   /* We must save and restore the current function section since this assembly
23262      is emitted at the end of the function.  This means it can be emitted *just
23263      after* the cold section of a function.  That cold part would be emitted in
23264      a different section.  That switch would trigger a `.cfi_endproc` directive
23265      to be emitted in the original section and a `.cfi_startproc` directive to
23266      be emitted in the new section.  Switching to the original section without
23267      restoring would mean that the `.cfi_endproc` emitted as a function ends
23268      would happen in a different section -- leaving an unmatched
23269      `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
23270      in the standard text section.  */
23271   section *save_text_section = in_section;
23272   switch_to_section (function_section (current_function_decl));
23273   for (int regnum = 0; regnum < 30; ++regnum)
23274     {
23275       rtx specu_label = cfun->machine->call_via[regnum];
23276       if (specu_label == NULL)
23277         continue;
23278
23279       targetm.asm_out.print_operand (out_file, specu_label, 0);
23280       asm_fprintf (out_file, ":\n");
23281       aarch64_sls_emit_function_stub (out_file, regnum);
23282       any_functions_emitted = true;
23283     }
23284   if (any_functions_emitted)
23285     /* Can use the SB if needs be here, since this stub will only be used
23286       by the current function, and hence for the current target.  */
23287     asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
23288   switch_to_section (save_text_section);
23289 }
23290
23291 /* Emit shared BLR stubs for the current compilation unit.
23292    Over the course of compiling this unit we may have converted some BLR
23293    instructions to a BL to a shared stub function.  This is where we emit those
23294    stub functions.
23295    This function is for the stubs shared between different functions in this
23296    compilation unit.  We share when optimizing for size instead of speed.
23297
23298    This function is called through the TARGET_ASM_FILE_END hook.  */
23299 void
23300 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
23301 {
23302   if (! aarch64_sls_shared_thunks_needed)
23303     return;
23304
23305   for (int regnum = 0; regnum < 30; ++regnum)
23306     {
23307       tree decl = aarch64_sls_shared_thunks[regnum];
23308       if (!decl)
23309         continue;
23310
23311       const char *name = indirect_symbol_names[regnum];
23312       switch_to_section (get_named_section (decl, NULL, 0));
23313       ASM_OUTPUT_ALIGN (out_file, 2);
23314       targetm.asm_out.globalize_label (out_file, name);
23315       /* Only emits if the compiler is configured for an assembler that can
23316          handle visibility directives.  */
23317       targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
23318       ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
23319       ASM_OUTPUT_LABEL (out_file, name);
23320       aarch64_sls_emit_function_stub (out_file, regnum);
23321       /* Use the most conservative target to ensure it can always be used by any
23322          function in the translation unit.  */
23323       asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
23324       ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
23325     }
23326 }
23327
23328 /* Implement TARGET_ASM_FILE_END.  */
23329 void
23330 aarch64_asm_file_end ()
23331 {
23332   aarch64_sls_emit_shared_blr_thunks (asm_out_file);
23333   /* Since this function will be called for the ASM_FILE_END hook, we ensure
23334      that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
23335      for FreeBSD) still gets called.  */
23336 #ifdef TARGET_ASM_FILE_END
23337   TARGET_ASM_FILE_END ();
23338 #endif
23339 }
23340
23341 const char *
23342 aarch64_indirect_call_asm (rtx addr)
23343 {
23344   gcc_assert (REG_P (addr));
23345   if (aarch64_harden_sls_blr_p ())
23346     {
23347       rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
23348       output_asm_insn ("bl\t%0", &stub_label);
23349     }
23350   else
23351    output_asm_insn ("blr\t%0", &addr);
23352   return "";
23353 }
23354
23355 /* Target-specific selftests.  */
23356
23357 #if CHECKING_P
23358
23359 namespace selftest {
23360
23361 /* Selftest for the RTL loader.
23362    Verify that the RTL loader copes with a dump from
23363    print_rtx_function.  This is essentially just a test that class
23364    function_reader can handle a real dump, but it also verifies
23365    that lookup_reg_by_dump_name correctly handles hard regs.
23366    The presence of hard reg names in the dump means that the test is
23367    target-specific, hence it is in this file.  */
23368
23369 static void
23370 aarch64_test_loading_full_dump ()
23371 {
23372   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
23373
23374   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
23375
23376   rtx_insn *insn_1 = get_insn_by_uid (1);
23377   ASSERT_EQ (NOTE, GET_CODE (insn_1));
23378
23379   rtx_insn *insn_15 = get_insn_by_uid (15);
23380   ASSERT_EQ (INSN, GET_CODE (insn_15));
23381   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
23382
23383   /* Verify crtl->return_rtx.  */
23384   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
23385   ASSERT_EQ (0, REGNO (crtl->return_rtx));
23386   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
23387 }
23388
23389 /* Run all target-specific selftests.  */
23390
23391 static void
23392 aarch64_run_selftests (void)
23393 {
23394   aarch64_test_loading_full_dump ();
23395 }
23396
23397 } // namespace selftest
23398
23399 #endif /* #if CHECKING_P */
23400
23401 #undef TARGET_STACK_PROTECT_GUARD
23402 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
23403
23404 #undef TARGET_ADDRESS_COST
23405 #define TARGET_ADDRESS_COST aarch64_address_cost
23406
23407 /* This hook will determines whether unnamed bitfields affect the alignment
23408    of the containing structure.  The hook returns true if the structure
23409    should inherit the alignment requirements of an unnamed bitfield's
23410    type.  */
23411 #undef TARGET_ALIGN_ANON_BITFIELD
23412 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
23413
23414 #undef TARGET_ASM_ALIGNED_DI_OP
23415 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
23416
23417 #undef TARGET_ASM_ALIGNED_HI_OP
23418 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
23419
23420 #undef TARGET_ASM_ALIGNED_SI_OP
23421 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
23422
23423 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
23424 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
23425   hook_bool_const_tree_hwi_hwi_const_tree_true
23426
23427 #undef TARGET_ASM_FILE_START
23428 #define TARGET_ASM_FILE_START aarch64_start_file
23429
23430 #undef TARGET_ASM_OUTPUT_MI_THUNK
23431 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
23432
23433 #undef TARGET_ASM_SELECT_RTX_SECTION
23434 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
23435
23436 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
23437 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
23438
23439 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
23440 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
23441
23442 #undef TARGET_BUILD_BUILTIN_VA_LIST
23443 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
23444
23445 #undef TARGET_CALLEE_COPIES
23446 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
23447
23448 #undef TARGET_CAN_ELIMINATE
23449 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
23450
23451 #undef TARGET_CAN_INLINE_P
23452 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
23453
23454 #undef TARGET_CANNOT_FORCE_CONST_MEM
23455 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
23456
23457 #undef TARGET_CASE_VALUES_THRESHOLD
23458 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
23459
23460 #undef TARGET_CONDITIONAL_REGISTER_USAGE
23461 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
23462
23463 #undef TARGET_MEMBER_TYPE_FORCES_BLK
23464 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
23465
23466 /* Only the least significant bit is used for initialization guard
23467    variables.  */
23468 #undef TARGET_CXX_GUARD_MASK_BIT
23469 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
23470
23471 #undef TARGET_C_MODE_FOR_SUFFIX
23472 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
23473
23474 #ifdef TARGET_BIG_ENDIAN_DEFAULT
23475 #undef  TARGET_DEFAULT_TARGET_FLAGS
23476 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
23477 #endif
23478
23479 #undef TARGET_CLASS_MAX_NREGS
23480 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
23481
23482 #undef TARGET_BUILTIN_DECL
23483 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
23484
23485 #undef TARGET_BUILTIN_RECIPROCAL
23486 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
23487
23488 #undef TARGET_C_EXCESS_PRECISION
23489 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
23490
23491 #undef  TARGET_EXPAND_BUILTIN
23492 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
23493
23494 #undef TARGET_EXPAND_BUILTIN_VA_START
23495 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
23496
23497 #undef TARGET_FOLD_BUILTIN
23498 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
23499
23500 #undef TARGET_FUNCTION_ARG
23501 #define TARGET_FUNCTION_ARG aarch64_function_arg
23502
23503 #undef TARGET_FUNCTION_ARG_ADVANCE
23504 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
23505
23506 #undef TARGET_FUNCTION_ARG_BOUNDARY
23507 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
23508
23509 #undef TARGET_FUNCTION_ARG_PADDING
23510 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
23511
23512 #undef TARGET_GET_RAW_RESULT_MODE
23513 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
23514 #undef TARGET_GET_RAW_ARG_MODE
23515 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
23516
23517 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
23518 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
23519
23520 #undef TARGET_FUNCTION_VALUE
23521 #define TARGET_FUNCTION_VALUE aarch64_function_value
23522
23523 #undef TARGET_FUNCTION_VALUE_REGNO_P
23524 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
23525
23526 #undef TARGET_GIMPLE_FOLD_BUILTIN
23527 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
23528
23529 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
23530 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
23531
23532 #undef  TARGET_INIT_BUILTINS
23533 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
23534
23535 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
23536 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
23537   aarch64_ira_change_pseudo_allocno_class
23538
23539 #undef TARGET_LEGITIMATE_ADDRESS_P
23540 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
23541
23542 #undef TARGET_LEGITIMATE_CONSTANT_P
23543 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
23544
23545 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
23546 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
23547   aarch64_legitimize_address_displacement
23548
23549 #undef TARGET_LIBGCC_CMP_RETURN_MODE
23550 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
23551
23552 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
23553 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
23554 aarch64_libgcc_floating_mode_supported_p
23555
23556 #undef TARGET_MANGLE_TYPE
23557 #define TARGET_MANGLE_TYPE aarch64_mangle_type
23558
23559 #undef TARGET_INVALID_CONVERSION
23560 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
23561
23562 #undef TARGET_INVALID_UNARY_OP
23563 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
23564
23565 #undef TARGET_INVALID_BINARY_OP
23566 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
23567
23568 #undef TARGET_VERIFY_TYPE_CONTEXT
23569 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
23570
23571 #undef TARGET_MEMORY_MOVE_COST
23572 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
23573
23574 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
23575 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
23576
23577 #undef TARGET_MUST_PASS_IN_STACK
23578 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
23579
23580 /* This target hook should return true if accesses to volatile bitfields
23581    should use the narrowest mode possible.  It should return false if these
23582    accesses should use the bitfield container type.  */
23583 #undef TARGET_NARROW_VOLATILE_BITFIELD
23584 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
23585
23586 #undef  TARGET_OPTION_OVERRIDE
23587 #define TARGET_OPTION_OVERRIDE aarch64_override_options
23588
23589 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
23590 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
23591   aarch64_override_options_after_change
23592
23593 #undef TARGET_OFFLOAD_OPTIONS
23594 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
23595
23596 #undef TARGET_OPTION_SAVE
23597 #define TARGET_OPTION_SAVE aarch64_option_save
23598
23599 #undef TARGET_OPTION_RESTORE
23600 #define TARGET_OPTION_RESTORE aarch64_option_restore
23601
23602 #undef TARGET_OPTION_PRINT
23603 #define TARGET_OPTION_PRINT aarch64_option_print
23604
23605 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
23606 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
23607
23608 #undef TARGET_SET_CURRENT_FUNCTION
23609 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
23610
23611 #undef TARGET_PASS_BY_REFERENCE
23612 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
23613
23614 #undef TARGET_PREFERRED_RELOAD_CLASS
23615 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
23616
23617 #undef TARGET_SCHED_REASSOCIATION_WIDTH
23618 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
23619
23620 #undef TARGET_PROMOTED_TYPE
23621 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
23622
23623 #undef TARGET_SECONDARY_RELOAD
23624 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
23625
23626 #undef TARGET_SHIFT_TRUNCATION_MASK
23627 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
23628
23629 #undef TARGET_SETUP_INCOMING_VARARGS
23630 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
23631
23632 #undef TARGET_STRUCT_VALUE_RTX
23633 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
23634
23635 #undef TARGET_REGISTER_MOVE_COST
23636 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
23637
23638 #undef TARGET_RETURN_IN_MEMORY
23639 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
23640
23641 #undef TARGET_RETURN_IN_MSB
23642 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
23643
23644 #undef TARGET_RTX_COSTS
23645 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
23646
23647 #undef TARGET_SCALAR_MODE_SUPPORTED_P
23648 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
23649
23650 #undef TARGET_SCHED_ISSUE_RATE
23651 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
23652
23653 #undef TARGET_SCHED_VARIABLE_ISSUE
23654 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
23655
23656 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
23657 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
23658   aarch64_sched_first_cycle_multipass_dfa_lookahead
23659
23660 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
23661 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
23662   aarch64_first_cycle_multipass_dfa_lookahead_guard
23663
23664 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
23665 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
23666   aarch64_get_separate_components
23667
23668 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
23669 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
23670   aarch64_components_for_bb
23671
23672 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
23673 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
23674   aarch64_disqualify_components
23675
23676 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
23677 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
23678   aarch64_emit_prologue_components
23679
23680 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
23681 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
23682   aarch64_emit_epilogue_components
23683
23684 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
23685 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
23686   aarch64_set_handled_components
23687
23688 #undef TARGET_TRAMPOLINE_INIT
23689 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
23690
23691 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
23692 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
23693
23694 #undef TARGET_VECTOR_MODE_SUPPORTED_P
23695 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
23696
23697 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
23698 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
23699
23700 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
23701 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
23702   aarch64_builtin_support_vector_misalignment
23703
23704 #undef TARGET_ARRAY_MODE
23705 #define TARGET_ARRAY_MODE aarch64_array_mode
23706
23707 #undef TARGET_ARRAY_MODE_SUPPORTED_P
23708 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
23709
23710 #undef TARGET_VECTORIZE_ADD_STMT_COST
23711 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
23712
23713 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
23714 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
23715   aarch64_builtin_vectorization_cost
23716
23717 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
23718 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
23719
23720 #undef TARGET_VECTORIZE_BUILTINS
23721 #define TARGET_VECTORIZE_BUILTINS
23722
23723 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
23724 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
23725   aarch64_builtin_vectorized_function
23726
23727 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
23728 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
23729   aarch64_autovectorize_vector_modes
23730
23731 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
23732 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
23733   aarch64_atomic_assign_expand_fenv
23734
23735 /* Section anchor support.  */
23736
23737 #undef TARGET_MIN_ANCHOR_OFFSET
23738 #define TARGET_MIN_ANCHOR_OFFSET -256
23739
23740 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
23741    byte offset; we can do much more for larger data types, but have no way
23742    to determine the size of the access.  We assume accesses are aligned.  */
23743 #undef TARGET_MAX_ANCHOR_OFFSET
23744 #define TARGET_MAX_ANCHOR_OFFSET 4095
23745
23746 #undef TARGET_VECTOR_ALIGNMENT
23747 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
23748
23749 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
23750 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
23751   aarch64_vectorize_preferred_vector_alignment
23752 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
23753 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
23754   aarch64_simd_vector_alignment_reachable
23755
23756 /* vec_perm support.  */
23757
23758 #undef TARGET_VECTORIZE_VEC_PERM_CONST
23759 #define TARGET_VECTORIZE_VEC_PERM_CONST \
23760   aarch64_vectorize_vec_perm_const
23761
23762 #undef TARGET_VECTORIZE_RELATED_MODE
23763 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
23764 #undef TARGET_VECTORIZE_GET_MASK_MODE
23765 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
23766 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
23767 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
23768   aarch64_empty_mask_is_expensive
23769 #undef TARGET_PREFERRED_ELSE_VALUE
23770 #define TARGET_PREFERRED_ELSE_VALUE \
23771   aarch64_preferred_else_value
23772
23773 #undef TARGET_INIT_LIBFUNCS
23774 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
23775
23776 #undef TARGET_FIXED_CONDITION_CODE_REGS
23777 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
23778
23779 #undef TARGET_FLAGS_REGNUM
23780 #define TARGET_FLAGS_REGNUM CC_REGNUM
23781
23782 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
23783 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
23784
23785 #undef TARGET_ASAN_SHADOW_OFFSET
23786 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
23787
23788 #undef TARGET_LEGITIMIZE_ADDRESS
23789 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
23790
23791 #undef TARGET_SCHED_CAN_SPECULATE_INSN
23792 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
23793
23794 #undef TARGET_CAN_USE_DOLOOP_P
23795 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
23796
23797 #undef TARGET_SCHED_ADJUST_PRIORITY
23798 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
23799
23800 #undef TARGET_SCHED_MACRO_FUSION_P
23801 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
23802
23803 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
23804 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
23805
23806 #undef TARGET_SCHED_FUSION_PRIORITY
23807 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
23808
23809 #undef TARGET_UNSPEC_MAY_TRAP_P
23810 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
23811
23812 #undef TARGET_USE_PSEUDO_PIC_REG
23813 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
23814
23815 #undef TARGET_PRINT_OPERAND
23816 #define TARGET_PRINT_OPERAND aarch64_print_operand
23817
23818 #undef TARGET_PRINT_OPERAND_ADDRESS
23819 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
23820
23821 #undef TARGET_OPTAB_SUPPORTED_P
23822 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
23823
23824 #undef TARGET_OMIT_STRUCT_RETURN_REG
23825 #define TARGET_OMIT_STRUCT_RETURN_REG true
23826
23827 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
23828 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
23829   aarch64_dwarf_poly_indeterminate_value
23830
23831 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
23832 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
23833 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
23834
23835 #undef TARGET_HARD_REGNO_NREGS
23836 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
23837 #undef TARGET_HARD_REGNO_MODE_OK
23838 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
23839
23840 #undef TARGET_MODES_TIEABLE_P
23841 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
23842
23843 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
23844 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
23845   aarch64_hard_regno_call_part_clobbered
23846
23847 #undef TARGET_INSN_CALLEE_ABI
23848 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
23849
23850 #undef TARGET_CONSTANT_ALIGNMENT
23851 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
23852
23853 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
23854 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
23855   aarch64_stack_clash_protection_alloca_probe_range
23856
23857 #undef TARGET_COMPUTE_PRESSURE_CLASSES
23858 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
23859
23860 #undef TARGET_CAN_CHANGE_MODE_CLASS
23861 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
23862
23863 #undef TARGET_SELECT_EARLY_REMAT_MODES
23864 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
23865
23866 #undef TARGET_SPECULATION_SAFE_VALUE
23867 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
23868
23869 #undef TARGET_ESTIMATED_POLY_VALUE
23870 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
23871
23872 #undef TARGET_ATTRIBUTE_TABLE
23873 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
23874
23875 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
23876 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
23877   aarch64_simd_clone_compute_vecsize_and_simdlen
23878
23879 #undef TARGET_SIMD_CLONE_ADJUST
23880 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
23881
23882 #undef TARGET_SIMD_CLONE_USABLE
23883 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
23884
23885 #undef TARGET_COMP_TYPE_ATTRIBUTES
23886 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
23887
23888 #undef TARGET_GET_MULTILIB_ABI_NAME
23889 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
23890
23891 #undef TARGET_FNTYPE_ABI
23892 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
23893
23894 #if CHECKING_P
23895 #undef TARGET_RUN_TARGET_SELFTESTS
23896 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
23897 #endif /* #if CHECKING_P */
23898
23899 #undef TARGET_ASM_POST_CFI_STARTPROC
23900 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
23901
23902 #undef TARGET_STRICT_ARGUMENT_NAMING
23903 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
23904
23905 #undef TARGET_MD_ASM_ADJUST
23906 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
23907
23908 #undef TARGET_ASM_FILE_END
23909 #define TARGET_ASM_FILE_END aarch64_asm_file_end
23910
23911 #undef TARGET_ASM_FUNCTION_EPILOGUE
23912 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
23913
23914 struct gcc_target targetm = TARGET_INITIALIZER;
23915
23916 #include "gt-aarch64.h"