[AArch64] Set jump-align=4 for neoversen1
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob e40750380cce202473da3cf572ebdbc28a4ecc06
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
78 /* This file should be included last. */
79 #include "target-def.h"
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
104 union
106 /* For MOV and MVN. */
107 struct
109 /* The value of each element. */
110 rtx value;
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
118 /* For INDEX. */
119 struct
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
163 u.index.base = base_in;
164 u.index.step = step_in;
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
174 u.pattern = pattern_in;
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads;
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
232 const char* name;
233 unsigned int flag;
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
256 /* Tuning parameters. */
258 static const struct cpu_addrcost_table generic_addrcost_table =
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
354 static const struct cpu_regmove_cost generic_regmove_cost =
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (the actual costs are 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 2, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
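/* These per-class numbers are relative weights rather than cycle counts:
   the vectorizer's cost hooks multiply statement counts by them when
   deciding whether a vectorized loop beats the scalar version, so what
   matters is the ratio of the vec_* entries to the scalar_* entries and
   to the branch costs (a reading of the cost model, not something stated
   by this patch).  */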
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
500 static const struct cpu_vector_cost tsv110_vector_cost =
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
519 /* Costs for vector insn classes for Cortex-A57. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
539 static const struct cpu_vector_cost exynosm1_vector_cost =
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
558 /* Costs for vector insn classes for X-Gene 1. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 10, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
601 1, /* Predictable. */
602 3 /* Unpredictable. */
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
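/* Rough meaning of the cpu_prefetch_tune fields, as the option-override
   code later in this file appears to use them: the cache sizes are in
   kilobytes and the line size in bytes, a value of -1 leaves the
   corresponding --param at its generic default, and default_opt_level is
   the lowest -O level at which software prefetching is enabled
   automatically (-1 meaning never by default).  This is a summary of
   intent; the option-override code is authoritative.  */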
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
718 static const struct tune_params generic_tunings =
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
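/* Notes on the tune_params fields above (a sketch, not normative): the
   function_align/jump_align/loop_align strings use the same syntax as the
   -falign-functions/-falign-jumps/-falign-loops options, i.e. "N" or
   "N:M", roughly "align to N bytes, but skip the alignment if it would
   need more than M bytes of padding"; memmov_cost is the value returned
   for memory moves by the target's memory_move_cost hook; issue_rate
   feeds the scheduler's issue-rate hook.  */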
744 static const struct tune_params cortexa35_tunings =
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
771 static const struct tune_params cortexa53_tunings =
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
798 static const struct tune_params cortexa57_tunings =
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
825 static const struct tune_params cortexa72_tunings =
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
852 static const struct tune_params cortexa73_tunings =
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
881 static const struct tune_params exynosm1_tunings =
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
907 static const struct tune_params thunderxt88_tunings =
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
933 static const struct tune_params thunderx_tunings =
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
960 static const struct tune_params tsv110_tunings =
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
972 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
987 static const struct tune_params xgene1_tunings =
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1013 static const struct tune_params emag_tunings =
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1039 static const struct tune_params qdf24xx_tunings =
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1095 static const struct tune_params thunderx2t99_tunings =
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1122 static const struct tune_params neoversen1_tunings =
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "4", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
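/* The "4" jump_align entry above is the value this change (see the commit
   subject at the top of the page) installs for Neoverse N1.  The likely
   intent, though the patch itself is the authority here, is that aligning
   branch-only targets beyond the 4-byte instruction size mostly costs
   code size for little gain on this core, while function and loop starts
   keep the larger "32:16" alignment.  */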
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
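/* This table backs the -moverride= option: the argument is a
   comma-separated list of <name>=<value> pairs, and each name here is
   dispatched to its parse_override callback, e.g. (illustrative values
   only) -moverride=tune=none or -moverride=sve_width=256.  The exact
   accepted values are defined by the individual parsers, not by this
   table.  */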
1168 /* A processor implementing AArch64. */
1169 struct processor
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1214 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1216 static tree
1217 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1218 int, bool *no_add_attrs)
1220 /* Since we set fn_type_req to true, the caller should have checked
1221 this for us. */
1222 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1223 switch ((arm_pcs) fntype_abi (*node).id ())
1225 case ARM_PCS_AAPCS64:
1226 case ARM_PCS_SIMD:
1227 return NULL_TREE;
1229 case ARM_PCS_SVE:
1230 error ("the %qE attribute cannot be applied to an SVE function type",
1231 name);
1232 *no_add_attrs = true;
1233 return NULL_TREE;
1235 case ARM_PCS_TLSDESC:
1236 case ARM_PCS_UNKNOWN:
1237 break;
1239 gcc_unreachable ();
1242 /* Table of machine attributes. */
1243 static const struct attribute_spec aarch64_attribute_table[] =
1245 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1246 affects_type_identity, handler, exclude } */
1247 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1248 handle_aarch64_vector_pcs_attribute, NULL },
1249 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1250 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1253 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1255 /* An ISA extension in the co-processor and main instruction set space. */
1256 struct aarch64_option_extension
1258 const char *const name;
1259 const unsigned long flags_on;
1260 const unsigned long flags_off;
1263 typedef enum aarch64_cond_code
1265 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1266 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1267 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1269 aarch64_cc;
1271 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
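/* The XOR with 1 works because the codes above are laid out in
   complementary pairs that differ only in bit 0, matching the
   architectural encoding: EQ(0)/NE(1), CS(2)/CC(3), ..., GT(12)/LE(13).
   For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT,
   since 10 ^ 1 == 11.  */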
1273 struct aarch64_branch_protect_type
1275 /* The type's name that the user passes to the branch-protection option
1276 string. */
1277 const char* name;
1278 /* Function to handle the protection type and set global variables.
1279 First argument is the string token corresponding with this type and the
1280 second argument is the next token in the option string.
1281 Return values:
1282 * AARCH64_PARSE_OK: Handling was successful.
1283 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1284 should print an error.
1285 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1286 own error. */
1287 enum aarch64_parse_opt_result (*handler)(char*, char*);
1288 /* A list of types that can follow this type in the option string. */
1289 const aarch64_branch_protect_type* subtypes;
1290 unsigned int num_subtypes;
1293 static enum aarch64_parse_opt_result
1294 aarch64_handle_no_branch_protection (char* str, char* rest)
1296 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1297 aarch64_enable_bti = 0;
1298 if (rest)
1300 error ("unexpected %<%s%> after %<%s%>", rest, str);
1301 return AARCH64_PARSE_INVALID_FEATURE;
1303 return AARCH64_PARSE_OK;
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_standard_branch_protection (char* str, char* rest)
1309 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1310 aarch64_ra_sign_key = AARCH64_KEY_A;
1311 aarch64_enable_bti = 1;
1312 if (rest)
1314 error ("unexpected %<%s%> after %<%s%>", rest, str);
1315 return AARCH64_PARSE_INVALID_FEATURE;
1317 return AARCH64_PARSE_OK;
1320 static enum aarch64_parse_opt_result
1321 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1322 char* rest ATTRIBUTE_UNUSED)
1324 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1325 aarch64_ra_sign_key = AARCH64_KEY_A;
1326 return AARCH64_PARSE_OK;
1329 static enum aarch64_parse_opt_result
1330 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1331 char* rest ATTRIBUTE_UNUSED)
1333 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1334 return AARCH64_PARSE_OK;
1337 static enum aarch64_parse_opt_result
1338 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1339 char* rest ATTRIBUTE_UNUSED)
1341 aarch64_ra_sign_key = AARCH64_KEY_B;
1342 return AARCH64_PARSE_OK;
1345 static enum aarch64_parse_opt_result
1346 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1347 char* rest ATTRIBUTE_UNUSED)
1349 aarch64_enable_bti = 1;
1350 return AARCH64_PARSE_OK;
1353 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1354 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1355 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1356 { NULL, NULL, NULL, 0 }
1359 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1360 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1361 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1362 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1363 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1364 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1365 { NULL, NULL, NULL, 0 }
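/* These tables describe the grammar of the -mbranch-protection= option:
   a top-level type ("none", "standard", "pac-ret" or "bti"), optionally
   followed by '+'-separated subtypes such as "leaf" and "b-key" for
   pac-ret, with multiple types themselves combined using '+', e.g.
   -mbranch-protection=pac-ret+leaf+bti (an illustrative invocation, not
   taken from this patch).  */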
1368 /* The condition codes of the processor, and the inverse function. */
1369 static const char * const aarch64_condition_codes[] =
1371 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1372 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1375 /* The preferred condition codes for SVE conditions. */
1376 static const char *const aarch64_sve_condition_codes[] =
1378 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1379 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1382 /* Return the assembly token for svpattern value VALUE. */
1384 static const char *
1385 svpattern_token (enum aarch64_svpattern pattern)
1387 switch (pattern)
1389 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1390 AARCH64_FOR_SVPATTERN (CASE)
1391 #undef CASE
1392 case AARCH64_NUM_SVPATTERNS:
1393 break;
1395 gcc_unreachable ();
1398 /* Return the descriptor of the SIMD ABI. */
1400 static const predefined_function_abi &
1401 aarch64_simd_abi (void)
1403 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1404 if (!simd_abi.initialized_p ())
1406 HARD_REG_SET full_reg_clobbers
1407 = default_function_abi.full_reg_clobbers ();
1408 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1409 if (FP_SIMD_SAVED_REGNUM_P (regno))
1410 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1411 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1413 return simd_abi;
1416 /* Return the descriptor of the SVE PCS. */
1418 static const predefined_function_abi &
1419 aarch64_sve_abi (void)
1421 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1422 if (!sve_abi.initialized_p ())
1424 HARD_REG_SET full_reg_clobbers
1425 = default_function_abi.full_reg_clobbers ();
1426 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1427 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1428 for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno)
1429 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1430 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1432 return sve_abi;
1435 /* Generate code to enable conditional branches in functions over 1 MiB. */
1436 const char *
1437 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1438 const char * branch_format)
1440 rtx_code_label * tmp_label = gen_label_rtx ();
1441 char label_buf[256];
1442 char buffer[128];
1443 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1444 CODE_LABEL_NUMBER (tmp_label));
1445 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1446 rtx dest_label = operands[pos_label];
1447 operands[pos_label] = tmp_label;
1449 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1450 output_asm_insn (buffer, operands);
1452 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1453 operands[pos_label] = dest_label;
1454 output_asm_insn (buffer, operands);
1455 return "";
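/* The sequence emitted above is the usual out-of-range-branch trick: the
   caller passes an inverted conditional branch in BRANCH_FORMAT, which
   jumps over an unconditional "b" to the real destination, and the
   freshly generated local label marks the fall-through point.  This is
   needed because conditional branches reach only about +/-1 MiB (and
   TBZ/TBNZ much less), whereas an unconditional B reaches +/-128 MiB.  */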
1458 void
1459 aarch64_err_no_fpadvsimd (machine_mode mode)
1461 if (TARGET_GENERAL_REGS_ONLY)
1462 if (FLOAT_MODE_P (mode))
1463 error ("%qs is incompatible with the use of floating-point types",
1464 "-mgeneral-regs-only");
1465 else
1466 error ("%qs is incompatible with the use of vector types",
1467 "-mgeneral-regs-only");
1468 else
1469 if (FLOAT_MODE_P (mode))
1470 error ("%qs feature modifier is incompatible with the use of"
1471 " floating-point types", "+nofp");
1472 else
1473 error ("%qs feature modifier is incompatible with the use of"
1474 " vector types", "+nofp");
1477 /* Report when we try to do something that requires SVE when SVE is disabled.
1478 This is an error of last resort and isn't very high-quality. It usually
1479 involves attempts to measure the vector length in some way. */
1480 static void
1481 aarch64_report_sve_required (void)
1483 static bool reported_p = false;
1485 /* Avoid reporting a slew of messages for a single oversight. */
1486 if (reported_p)
1487 return;
1489 error ("this operation requires the SVE ISA extension");
1490 inform (input_location, "you can enable SVE using the command-line"
1491 " option %<-march%>, or by using the %<target%>"
1492 " attribute or pragma");
1493 reported_p = true;
1496 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1497 registers. */
1498 inline bool
1499 pr_or_ffr_regnum_p (unsigned int regno)
1501 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1504 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1505 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1506 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1507 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1508 and GENERAL_REGS is lower than the memory cost (in this case the best class
1509 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1510 cost results in bad allocations with many redundant int<->FP moves which
1511 are expensive on various cores.
1512 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1513 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1514 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1515 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1516 The result of this is that it is no longer inefficient to have a higher
1517 memory move cost than the register move cost.
1520 static reg_class_t
1521 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1522 reg_class_t best_class)
1524 machine_mode mode;
1526 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1527 || !reg_class_subset_p (FP_REGS, allocno_class))
1528 return allocno_class;
1530 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1531 || !reg_class_subset_p (FP_REGS, best_class))
1532 return best_class;
1534 mode = PSEUDO_REGNO_MODE (regno);
1535 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1538 static unsigned int
1539 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1541 if (GET_MODE_UNIT_SIZE (mode) == 4)
1542 return aarch64_tune_params.min_div_recip_mul_sf;
1543 return aarch64_tune_params.min_div_recip_mul_df;
1546 /* Return the reassociation width of treeop OPC with mode MODE. */
1547 static int
1548 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1550 if (VECTOR_MODE_P (mode))
1551 return aarch64_tune_params.vec_reassoc_width;
1552 if (INTEGRAL_MODE_P (mode))
1553 return aarch64_tune_params.int_reassoc_width;
1554 /* Avoid reassociating floating point addition so we emit more FMAs. */
1555 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1556 return aarch64_tune_params.fp_reassoc_width;
1557 return 1;
1560 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1561 unsigned
1562 aarch64_dbx_register_number (unsigned regno)
1564 if (GP_REGNUM_P (regno))
1565 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1566 else if (regno == SP_REGNUM)
1567 return AARCH64_DWARF_SP;
1568 else if (FP_REGNUM_P (regno))
1569 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1570 else if (PR_REGNUM_P (regno))
1571 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1572 else if (regno == VG_REGNUM)
1573 return AARCH64_DWARF_VG;
1575 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1576 equivalent DWARF register. */
1577 return DWARF_FRAME_REGISTERS;
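/* For reference, the resulting numbers follow the AArch64 DWARF register
   numbering (x0-x30 -> 0-30, sp -> 31, v0-v31 -> 64-95); the SVE
   predicate registers and VG take the numbers given by AARCH64_DWARF_P0
   and AARCH64_DWARF_VG, which are defined elsewhere in the port.  */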
1580 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1581 integer, otherwise return X unmodified. */
1582 static rtx
1583 aarch64_bit_representation (rtx x)
1585 if (CONST_DOUBLE_P (x))
1586 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1587 return x;
1590 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1591 static bool
1592 aarch64_advsimd_struct_mode_p (machine_mode mode)
1594 return (TARGET_SIMD
1595 && (mode == OImode || mode == CImode || mode == XImode));
1598 /* Return true if MODE is an SVE predicate mode. */
1599 static bool
1600 aarch64_sve_pred_mode_p (machine_mode mode)
1602 return (TARGET_SVE
1603 && (mode == VNx16BImode
1604 || mode == VNx8BImode
1605 || mode == VNx4BImode
1606 || mode == VNx2BImode));
1609 /* Three mutually-exclusive flags describing a vector or predicate type. */
1610 const unsigned int VEC_ADVSIMD = 1;
1611 const unsigned int VEC_SVE_DATA = 2;
1612 const unsigned int VEC_SVE_PRED = 4;
1613 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1614 a structure of 2, 3 or 4 vectors. */
1615 const unsigned int VEC_STRUCT = 8;
1616 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1617 vector has fewer significant bytes than a full SVE vector. */
1618 const unsigned int VEC_PARTIAL = 16;
1619 /* Useful combinations of the above. */
1620 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1621 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1623 /* Return a set of flags describing the vector properties of mode MODE.
1624 Ignore modes that are not supported by the current target. */
1625 static unsigned int
1626 aarch64_classify_vector_mode (machine_mode mode)
1628 if (aarch64_advsimd_struct_mode_p (mode))
1629 return VEC_ADVSIMD | VEC_STRUCT;
1631 if (aarch64_sve_pred_mode_p (mode))
1632 return VEC_SVE_PRED;
1634 /* Make the decision based on the mode's enum value rather than its
1635 properties, so that we keep the correct classification regardless
1636 of -msve-vector-bits. */
1637 switch (mode)
1639 /* Partial SVE QI vectors. */
1640 case E_VNx2QImode:
1641 case E_VNx4QImode:
1642 case E_VNx8QImode:
1643 /* Partial SVE HI vectors. */
1644 case E_VNx2HImode:
1645 case E_VNx4HImode:
1646 /* Partial SVE SI vector. */
1647 case E_VNx2SImode:
1648 /* Partial SVE HF vectors. */
1649 case E_VNx2HFmode:
1650 case E_VNx4HFmode:
1651 /* Partial SVE SF vector. */
1652 case E_VNx2SFmode:
1653 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1655 case E_VNx16QImode:
1656 case E_VNx8HImode:
1657 case E_VNx4SImode:
1658 case E_VNx2DImode:
1659 case E_VNx8HFmode:
1660 case E_VNx4SFmode:
1661 case E_VNx2DFmode:
1662 return TARGET_SVE ? VEC_SVE_DATA : 0;
1664 /* x2 SVE vectors. */
1665 case E_VNx32QImode:
1666 case E_VNx16HImode:
1667 case E_VNx8SImode:
1668 case E_VNx4DImode:
1669 case E_VNx16HFmode:
1670 case E_VNx8SFmode:
1671 case E_VNx4DFmode:
1672 /* x3 SVE vectors. */
1673 case E_VNx48QImode:
1674 case E_VNx24HImode:
1675 case E_VNx12SImode:
1676 case E_VNx6DImode:
1677 case E_VNx24HFmode:
1678 case E_VNx12SFmode:
1679 case E_VNx6DFmode:
1680 /* x4 SVE vectors. */
1681 case E_VNx64QImode:
1682 case E_VNx32HImode:
1683 case E_VNx16SImode:
1684 case E_VNx8DImode:
1685 case E_VNx32HFmode:
1686 case E_VNx16SFmode:
1687 case E_VNx8DFmode:
1688 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1690 /* 64-bit Advanced SIMD vectors. */
1691 case E_V8QImode:
1692 case E_V4HImode:
1693 case E_V2SImode:
1694 /* ...E_V1DImode doesn't exist. */
1695 case E_V4HFmode:
1696 case E_V4BFmode:
1697 case E_V2SFmode:
1698 case E_V1DFmode:
1699 /* 128-bit Advanced SIMD vectors. */
1700 case E_V16QImode:
1701 case E_V8HImode:
1702 case E_V4SImode:
1703 case E_V2DImode:
1704 case E_V8HFmode:
1705 case E_V8BFmode:
1706 case E_V4SFmode:
1707 case E_V2DFmode:
1708 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1710 default:
1711 return 0;
1715 /* Return true if MODE is any of the data vector modes, including
1716 structure modes. */
1717 static bool
1718 aarch64_vector_data_mode_p (machine_mode mode)
1720 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1723 /* Return true if MODE is any form of SVE mode, including predicates,
1724 vectors and structures. */
1725 bool
1726 aarch64_sve_mode_p (machine_mode mode)
1728 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1731 /* Return true if MODE is an SVE data vector mode; either a single vector
1732 or a structure of vectors. */
1733 static bool
1734 aarch64_sve_data_mode_p (machine_mode mode)
1736 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1739 /* Return the number of defined bytes in one constituent vector of
1740 SVE mode MODE, which has vector flags VEC_FLAGS. */
1741 static poly_int64
1742 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1744 if (vec_flags & VEC_PARTIAL)
1745 /* A single partial vector. */
1746 return GET_MODE_SIZE (mode);
1748 if (vec_flags & VEC_SVE_DATA)
1749 /* A single vector or a tuple. */
1750 return BYTES_PER_SVE_VECTOR;
1752 /* A single predicate. */
1753 gcc_assert (vec_flags & VEC_SVE_PRED);
1754 return BYTES_PER_SVE_PRED;
1757 /* Implement target hook TARGET_ARRAY_MODE. */
1758 static opt_machine_mode
1759 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1761 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1762 && IN_RANGE (nelems, 2, 4))
1763 return mode_for_vector (GET_MODE_INNER (mode),
1764 GET_MODE_NUNITS (mode) * nelems);
1766 return opt_machine_mode ();
1769 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1770 static bool
1771 aarch64_array_mode_supported_p (machine_mode mode,
1772 unsigned HOST_WIDE_INT nelems)
1774 if (TARGET_SIMD
1775 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1776 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1777 && (nelems >= 2 && nelems <= 4))
1778 return true;
1780 return false;
1783 /* MODE is some form of SVE vector mode. For data modes, return the number
1784 of vector register bits that each element of MODE occupies, such as 64
1785 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1786 in a 64-bit container). For predicate modes, return the number of
1787 data bits controlled by each significant predicate bit. */
1789 static unsigned int
1790 aarch64_sve_container_bits (machine_mode mode)
1792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1793 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1794 ? BITS_PER_SVE_VECTOR
1795 : GET_MODE_BITSIZE (mode));
1796 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1799 /* Return the SVE predicate mode to use for elements that have
1800 ELEM_NBYTES bytes, if such a mode exists. */
1802 opt_machine_mode
1803 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1805 if (TARGET_SVE)
1807 if (elem_nbytes == 1)
1808 return VNx16BImode;
1809 if (elem_nbytes == 2)
1810 return VNx8BImode;
1811 if (elem_nbytes == 4)
1812 return VNx4BImode;
1813 if (elem_nbytes == 8)
1814 return VNx2BImode;
1816 return opt_machine_mode ();
1819 /* Return the SVE predicate mode that should be used to control
1820 SVE mode MODE. */
1822 machine_mode
1823 aarch64_sve_pred_mode (machine_mode mode)
1825 unsigned int bits = aarch64_sve_container_bits (mode);
1826 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1829 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1831 static opt_machine_mode
1832 aarch64_get_mask_mode (machine_mode mode)
1834 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1835 if (vec_flags & VEC_SVE_DATA)
1836 return aarch64_sve_pred_mode (mode);
1838 return default_get_mask_mode (mode);
1841 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1843 opt_machine_mode
1844 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1846 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1847 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1848 machine_mode mode;
1849 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1850 if (inner_mode == GET_MODE_INNER (mode)
1851 && known_eq (nunits, GET_MODE_NUNITS (mode))
1852 && aarch64_sve_data_mode_p (mode))
1853 return mode;
1854 return opt_machine_mode ();
1857 /* Return the integer element mode associated with SVE mode MODE. */
1859 static scalar_int_mode
1860 aarch64_sve_element_int_mode (machine_mode mode)
1862 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1863 ? BITS_PER_SVE_VECTOR
1864 : GET_MODE_BITSIZE (mode));
1865 unsigned int elt_bits = vector_element_size (vector_bits,
1866 GET_MODE_NUNITS (mode));
1867 return int_mode_for_size (elt_bits, 0).require ();
1870 /* Return an integer element mode that contains exactly
1871 aarch64_sve_container_bits (MODE) bits. This is wider than
1872 aarch64_sve_element_int_mode if MODE is a partial vector,
1873 otherwise it's the same. */
1875 static scalar_int_mode
1876 aarch64_sve_container_int_mode (machine_mode mode)
1878 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1881 /* Return the integer vector mode associated with SVE mode MODE.
1882 Unlike related_int_vector_mode, this can handle the case in which
1883 MODE is a predicate (and thus has a different total size). */
1885 machine_mode
1886 aarch64_sve_int_mode (machine_mode mode)
1888 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1889 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1892 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1894 static opt_machine_mode
1895 aarch64_vectorize_related_mode (machine_mode vector_mode,
1896 scalar_mode element_mode,
1897 poly_uint64 nunits)
1899 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1901 /* If we're operating on SVE vectors, try to return an SVE mode. */
1902 poly_uint64 sve_nunits;
1903 if ((vec_flags & VEC_SVE_DATA)
1904 && multiple_p (BYTES_PER_SVE_VECTOR,
1905 GET_MODE_SIZE (element_mode), &sve_nunits))
1907 machine_mode sve_mode;
1908 if (maybe_ne (nunits, 0U))
1910 /* Try to find a full or partial SVE mode with exactly
1911 NUNITS units. */
1912 if (multiple_p (sve_nunits, nunits)
1913 && aarch64_sve_data_mode (element_mode,
1914 nunits).exists (&sve_mode))
1915 return sve_mode;
1917 else
1919 /* Take the preferred number of units from the number of bytes
1920 that fit in VECTOR_MODE. We always start by "autodetecting"
1921 a full vector mode with preferred_simd_mode, so vectors
1922 chosen here will also be full vector modes. Then
1923 autovectorize_vector_modes tries smaller starting modes
1924 and thus smaller preferred numbers of units. */
1925 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1926 if (aarch64_sve_data_mode (element_mode,
1927 sve_nunits).exists (&sve_mode))
1928 return sve_mode;
1932 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1933 if ((vec_flags & VEC_ADVSIMD)
1934 && known_eq (nunits, 0U)
1935 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1936 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1937 * GET_MODE_NUNITS (vector_mode), 128U))
1939 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1940 if (VECTOR_MODE_P (res))
1941 return res;
1944 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
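/* Worked example (illustrative): if VECTOR_MODE is V8QImode (a 64-bit
   vector) and ELEMENT_MODE is SImode with no fixed NUNITS, the Advanced
   SIMD case above should return V4SImode, i.e. a single 128-bit vector,
   instead of splitting the elements across two 64-bit vectors.  */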
1947 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1948 prefer to use the first arithmetic operand as the else value if
1949 the else value doesn't matter, since that exactly matches the SVE
1950 destructive merging form. For ternary operations we could either
1951 pick the first operand and use FMAD-like instructions or the last
1952 operand and use FMLA-like instructions; the latter seems more
1953 natural. */
1955 static tree
1956 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1958 return nops == 3 ? ops[2] : ops[0];
1961 /* Implement TARGET_HARD_REGNO_NREGS. */
1963 static unsigned int
1964 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1966 /* ??? Logically we should only need to provide a value when
1967 HARD_REGNO_MODE_OK says that the combination is valid,
1968 but at the moment we need to handle all modes. Just ignore
1969 any runtime parts for registers that can't store them. */
1970 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1971 switch (aarch64_regno_regclass (regno))
1973 case FP_REGS:
1974 case FP_LO_REGS:
1975 case FP_LO8_REGS:
1977 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1978 if (vec_flags & VEC_SVE_DATA)
1979 return exact_div (GET_MODE_SIZE (mode),
1980 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
1981 return CEIL (lowest_size, UNITS_PER_VREG);
1983 case PR_REGS:
1984 case PR_LO_REGS:
1985 case PR_HI_REGS:
1986 case FFR_REGS:
1987 case PR_AND_FFR_REGS:
1988 return 1;
1989 default:
1990 return CEIL (lowest_size, UNITS_PER_WORD);
1992 gcc_unreachable ();
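/* Worked example: a TFmode value (16 bytes) needs one FP register
   (CEIL (16, UNITS_PER_VREG)) but two GP registers
   (CEIL (16, UNITS_PER_WORD)), while an SVE data mode such as VNx2DImode
   needs exactly one FP register whatever the runtime vector length is.  */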
1995 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1997 static bool
1998 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2000 if (GET_MODE_CLASS (mode) == MODE_CC)
2001 return regno == CC_REGNUM;
2003 if (regno == VG_REGNUM)
2004 /* This must have the same size as _Unwind_Word. */
2005 return mode == DImode;
2007 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2008 if (vec_flags & VEC_SVE_PRED)
2009 return pr_or_ffr_regnum_p (regno);
2011 if (pr_or_ffr_regnum_p (regno))
2012 return false;
2014 if (regno == SP_REGNUM)
2015 /* The purpose of comparing with ptr_mode is to support the
2016 global register variable associated with the stack pointer
2017 register via the syntax of asm ("wsp") in ILP32. */
2018 return mode == Pmode || mode == ptr_mode;
2020 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2021 return mode == Pmode;
2023 if (GP_REGNUM_P (regno))
2025 if (vec_flags & VEC_ANY_SVE)
2026 return false;
2027 if (known_le (GET_MODE_SIZE (mode), 8))
2028 return true;
2029 if (known_le (GET_MODE_SIZE (mode), 16))
2030 return (regno & 1) == 0;
2032 else if (FP_REGNUM_P (regno))
2034 if (vec_flags & VEC_STRUCT)
2035 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2036 else
2037 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2040 return false;
2043 /* Return true if TYPE is a type that should be passed or returned in
2044 SVE registers, assuming enough registers are available. When returning
2045 true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
2046 respectively. */
2048 /* Return true if a function with type FNTYPE returns its value in
2049 SVE vector or predicate registers. */
2051 static bool
2052 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2054 tree return_type = TREE_TYPE (fntype);
2055 return (return_type != error_mark_node
2056 && aarch64_sve::builtin_type_p (return_type));
2059 /* Return true if a function with type FNTYPE takes arguments in
2060 SVE vector or predicate registers. */
2062 static bool
2063 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2065 CUMULATIVE_ARGS args_so_far_v;
2066 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2067 NULL_TREE, 0, true);
2068 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2070 for (tree chain = TYPE_ARG_TYPES (fntype);
2071 chain && chain != void_list_node;
2072 chain = TREE_CHAIN (chain))
2074 tree arg_type = TREE_VALUE (chain);
2075 if (arg_type == error_mark_node)
2076 return false;
2078 function_arg_info arg (arg_type, /*named=*/true);
2079 apply_pass_by_reference_rules (&args_so_far_v, arg);
2080 if (aarch64_sve::builtin_type_p (arg.type))
2081 return true;
2083 targetm.calls.function_arg_advance (args_so_far, arg);
2085 return false;
2088 /* Implement TARGET_FNTYPE_ABI. */
2090 static const predefined_function_abi &
2091 aarch64_fntype_abi (const_tree fntype)
2093 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2094 return aarch64_simd_abi ();
2096 if (aarch64_returns_value_in_sve_regs_p (fntype)
2097 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2098 return aarch64_sve_abi ();
2100 return default_function_abi;
2103 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2105 static bool
2106 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2108 return (aarch64_sve::builtin_type_p (type1)
2109 == aarch64_sve::builtin_type_p (type2));
2112 /* Return true if we should emit CFI for register REGNO. */
2114 static bool
2115 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2117 return (GP_REGNUM_P (regno)
2118 || !default_function_abi.clobbers_full_reg_p (regno));
2121 /* Return the mode we should use to save and restore register REGNO. */
2123 static machine_mode
2124 aarch64_reg_save_mode (unsigned int regno)
2126 if (GP_REGNUM_P (regno))
2127 return DImode;
2129 if (FP_REGNUM_P (regno))
2130 switch (crtl->abi->id ())
2132 case ARM_PCS_AAPCS64:
2133 /* Only the low 64 bits are saved by the base PCS. */
2134 return DFmode;
2136 case ARM_PCS_SIMD:
2137 /* The vector PCS saves the low 128 bits (which is the full
2138 register on non-SVE targets). */
2139 return TFmode;
2141 case ARM_PCS_SVE:
2142 /* Use vectors of DImode for registers that need frame
2143 information, so that the first 64 bits of the save slot
2144 are always the equivalent of what storing D<n> would give. */
2145 if (aarch64_emit_cfi_for_reg_p (regno))
2146 return VNx2DImode;
2148 /* Use vectors of bytes otherwise, so that the layout is
2149 endian-agnostic, and so that we can use LDR and STR for
2150 big-endian targets. */
2151 return VNx16QImode;
2153 case ARM_PCS_TLSDESC:
2154 case ARM_PCS_UNKNOWN:
2155 break;
2158 if (PR_REGNUM_P (regno))
2159 /* Save the full predicate register. */
2160 return VNx16BImode;
2162 gcc_unreachable ();
2165 /* Implement TARGET_INSN_CALLEE_ABI. */
2167 const predefined_function_abi &
2168 aarch64_insn_callee_abi (const rtx_insn *insn)
2170 rtx pat = PATTERN (insn);
2171 gcc_assert (GET_CODE (pat) == PARALLEL);
2172 rtx unspec = XVECEXP (pat, 0, 1);
2173 gcc_assert (GET_CODE (unspec) == UNSPEC
2174 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2175 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2178 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2179 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2180 clobbers the top 64 bits when restoring the bottom 64 bits. */
2182 static bool
2183 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2184 unsigned int regno,
2185 machine_mode mode)
2187 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2189 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2190 unsigned int nregs = hard_regno_nregs (regno, mode);
2191 if (nregs > 1)
2192 per_register_size = exact_div (per_register_size, nregs);
2193 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2194 return maybe_gt (per_register_size, 16);
2195 return maybe_gt (per_register_size, 8);
2197 return false;
2200 /* Implement REGMODE_NATURAL_SIZE. */
2201 poly_uint64
2202 aarch64_regmode_natural_size (machine_mode mode)
2204 /* The natural size for SVE data modes is one SVE data vector,
2205 and similarly for predicates. We can't independently modify
2206 anything smaller than that. */
2207 /* ??? For now, only do this for variable-width SVE registers.
2208 Doing it for constant-sized registers breaks lower-subreg.c. */
2209 /* ??? And once that's fixed, we should probably have similar
2210 code for Advanced SIMD. */
2211 if (!aarch64_sve_vg.is_constant ())
2213 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2214 if (vec_flags & VEC_SVE_PRED)
2215 return BYTES_PER_SVE_PRED;
2216 if (vec_flags & VEC_SVE_DATA)
2217 return BYTES_PER_SVE_VECTOR;
2219 return UNITS_PER_WORD;
2222 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2223 machine_mode
2224 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2225 machine_mode mode)
2227 /* The predicate mode determines which bits are significant and
2228 which are "don't care". Decreasing the number of lanes would
2229 lose data while increasing the number of lanes would make bits
2230 unnecessarily significant. */
2231 if (PR_REGNUM_P (regno))
2232 return mode;
2233 if (known_ge (GET_MODE_SIZE (mode), 4))
2234 return mode;
2235 else
2236 return SImode;
2239 /* Return true if I's bits are consecutive ones from the MSB. */
2240 bool
2241 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2243 return exact_log2 (-i) != HOST_WIDE_INT_M1;
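/* For example, 0xffffffffffff0000 gives true, since -i == 0x10000 and
   exact_log2 returns 16; so does -1 (all bits set).  0 and
   0x00ffffffffff0000 give false.  */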
2246 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2247 that strcpy from constants will be faster. */
2249 static HOST_WIDE_INT
2250 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2252 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2253 return MAX (align, BITS_PER_WORD);
2254 return align;
2257 /* Return true if calls to DECL should be treated as
2258 long-calls (i.e. called via a register). */
2259 static bool
2260 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2262 return false;
2265 /* Return true if calls to symbol-ref SYM should be treated as
2266 long-calls (i.e. called via a register). */
2267 bool
2268 aarch64_is_long_call_p (rtx sym)
2270 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2273 /* Return true if calls to symbol-ref SYM should not go through
2274 plt stubs. */
2276 bool
2277 aarch64_is_noplt_call_p (rtx sym)
2279 const_tree decl = SYMBOL_REF_DECL (sym);
2281 if (flag_pic
2282 && decl
2283 && (!flag_plt
2284 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2285 && !targetm.binds_local_p (decl))
2286 return true;
2288 return false;
2291 /* Return true if the offsets to a zero/sign-extract operation
2292 represent an expression that matches an extend operation. The
2293 operands represent the parameters from
2295 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2296 bool
2297 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2298 rtx extract_imm)
2300 HOST_WIDE_INT mult_val, extract_val;
2302 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2303 return false;
2305 mult_val = INTVAL (mult_imm);
2306 extract_val = INTVAL (extract_imm);
2308 if (extract_val > 8
2309 && extract_val < GET_MODE_BITSIZE (mode)
2310 && exact_log2 (extract_val & ~7) > 0
2311 && (extract_val & 7) <= 4
2312 && mult_val == (1 << (extract_val & 7)))
2313 return true;
2315 return false;
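/* Worked example (illustrative): with MODE == DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34, all of the checks above pass: 34 & ~7 == 32 is a
   power of two, 34 & 7 == 2 and 4 == 1 << 2.  Extracting the low 34 bits
   of (reg * 4) gives the same value as extending the low 32 bits of reg
   and shifting the result left by 2, which is the extend-plus-shift form
   used by extended-register operands.  */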
2318 /* Emit an insn that's a simple single-set. Both the operands must be
2319 known to be valid. */
2320 inline static rtx_insn *
2321 emit_set_insn (rtx x, rtx y)
2323 return emit_insn (gen_rtx_SET (x, y));
2326 /* X and Y are two things to compare using CODE. Emit the compare insn and
2327 return the rtx for register 0 in the proper mode. */
2329 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2331 machine_mode cmp_mode = GET_MODE (x);
2332 machine_mode cc_mode;
2333 rtx cc_reg;
2335 if (cmp_mode == TImode)
2337 gcc_assert (code == NE);
2339 cc_mode = CCmode;
2340 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2342 rtx x_lo = operand_subword (x, 0, 0, TImode);
2343 rtx y_lo = operand_subword (y, 0, 0, TImode);
2344 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2346 rtx x_hi = operand_subword (x, 1, 0, TImode);
2347 rtx y_hi = operand_subword (y, 1, 0, TImode);
2348 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2349 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2350 GEN_INT (AARCH64_EQ)));
2352 else
2354 cc_mode = SELECT_CC_MODE (code, x, y);
2355 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2356 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2358 return cc_reg;
2361 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2363 static rtx
2364 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2365 machine_mode y_mode)
2367 if (y_mode == E_QImode || y_mode == E_HImode)
2369 if (CONST_INT_P (y))
2370 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2371 else
2373 rtx t, cc_reg;
2374 machine_mode cc_mode;
2376 t = gen_rtx_ZERO_EXTEND (SImode, y);
2377 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2378 cc_mode = CC_SWPmode;
2379 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2380 emit_set_insn (cc_reg, t);
2381 return cc_reg;
2385 if (!aarch64_plus_operand (y, y_mode))
2386 y = force_reg (y_mode, y);
2388 return aarch64_gen_compare_reg (code, x, y);
2391 /* Build the SYMBOL_REF for __tls_get_addr. */
2393 static GTY(()) rtx tls_get_addr_libfunc;
2396 aarch64_tls_get_addr (void)
2398 if (!tls_get_addr_libfunc)
2399 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2400 return tls_get_addr_libfunc;
2403 /* Return the TLS model to use for ADDR. */
2405 static enum tls_model
2406 tls_symbolic_operand_type (rtx addr)
2408 enum tls_model tls_kind = TLS_MODEL_NONE;
2409 if (GET_CODE (addr) == CONST)
2411 poly_int64 addend;
2412 rtx sym = strip_offset (addr, &addend);
2413 if (GET_CODE (sym) == SYMBOL_REF)
2414 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2416 else if (GET_CODE (addr) == SYMBOL_REF)
2417 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2419 return tls_kind;
2422 /* We'll allow lo_sum's in addresses in our legitimate addresses
2423 so that combine would take care of combining addresses where
2424 necessary, but for generation purposes, we'll generate the address
2425 as:
2426 RTL                               Absolute
2427 tmp = hi (symbol_ref);            adrp  x1, foo
2428 dest = lo_sum (tmp, symbol_ref);  add   dest, x1, :lo_12:foo
2431 PIC                               TLS
2432 adrp x1, :got:foo                 adrp  tmp, :tlsgd:foo
2433 ldr  x1, [:got_lo12:foo]          add   dest, tmp, :tlsgd_lo12:foo
2434                                   bl    __tls_get_addr
2437 Load TLS symbol, depending on TLS mechanism and TLS access model.
2439 Global Dynamic - Traditional TLS:
2440 adrp tmp, :tlsgd:imm
2441 add dest, tmp, #:tlsgd_lo12:imm
2442 bl __tls_get_addr
2444 Global Dynamic - TLS Descriptors:
2445 adrp dest, :tlsdesc:imm
2446 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2447 add dest, dest, #:tlsdesc_lo12:imm
2448 blr tmp
2449 mrs tp, tpidr_el0
2450 add dest, dest, tp
2452 Initial Exec:
2453 mrs tp, tpidr_el0
2454 adrp tmp, :gottprel:imm
2455 ldr dest, [tmp, #:gottprel_lo12:imm]
2456 add dest, dest, tp
2458 Local Exec:
2459 mrs tp, tpidr_el0
2460 add t0, tp, #:tprel_hi12:imm, lsl #12
2461 add t0, t0, #:tprel_lo12_nc:imm
2464 static void
2465 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2466 enum aarch64_symbol_type type)
2468 switch (type)
2470 case SYMBOL_SMALL_ABSOLUTE:
2472 /* In ILP32, the mode of dest can be either SImode or DImode. */
2473 rtx tmp_reg = dest;
2474 machine_mode mode = GET_MODE (dest);
2476 gcc_assert (mode == Pmode || mode == ptr_mode);
2478 if (can_create_pseudo_p ())
2479 tmp_reg = gen_reg_rtx (mode);
2481 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2482 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2483 return;
2486 case SYMBOL_TINY_ABSOLUTE:
2487 emit_insn (gen_rtx_SET (dest, imm));
2488 return;
2490 case SYMBOL_SMALL_GOT_28K:
2492 machine_mode mode = GET_MODE (dest);
2493 rtx gp_rtx = pic_offset_table_rtx;
2494 rtx insn;
2495 rtx mem;
2497 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2498 here before rtl expand. Tree IVOPT will generate rtl pattern to
2499 decide rtx costs, in which case pic_offset_table_rtx is not
2500 initialized. For that case no need to generate the first adrp
2501 instruction as the final cost for global variable access is
2502 one instruction. */
2503 if (gp_rtx != NULL)
2505 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2506 use the page base as the GOT base, the first page may be wasted;
2507 in the worst case there is only 28K of space for the GOT).
2509 The instruction sequence generated for accessing a global variable is:
2512 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2514 Only one instruction is needed. But we must initialize
2515 pic_offset_table_rtx properly. We generate an initialization insn for
2516 every global access, and rely on CSE to remove all redundant copies.
2518 The final instruction sequence will look like the following
2519 for multiple global variable accesses.
2521 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2523 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2524 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2525 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2526 ... */
2528 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2529 crtl->uses_pic_offset_table = 1;
2530 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2532 if (mode != GET_MODE (gp_rtx))
2533 gp_rtx = gen_lowpart (mode, gp_rtx);
2537 if (mode == ptr_mode)
2539 if (mode == DImode)
2540 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2541 else
2542 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2544 mem = XVECEXP (SET_SRC (insn), 0, 0);
2546 else
2548 gcc_assert (mode == Pmode);
2550 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2551 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2554 /* The operand is expected to be a MEM. Whenever the related insn
2555 pattern changes, the code above that computes MEM should be
2556 updated. */
2557 gcc_assert (GET_CODE (mem) == MEM);
2558 MEM_READONLY_P (mem) = 1;
2559 MEM_NOTRAP_P (mem) = 1;
2560 emit_insn (insn);
2561 return;
2564 case SYMBOL_SMALL_GOT_4G:
2566 /* In ILP32, the mode of dest can be either SImode or DImode,
2567 while the got entry is always of SImode size. The mode of
2568 dest depends on how dest is used: if dest is assigned to a
2569 pointer (e.g. in the memory), it has SImode; it may have
2570 DImode if dest is dereferenced to access the memory.
2571 This is why we have to handle three different ldr_got_small
2572 patterns here (two patterns for ILP32). */
2574 rtx insn;
2575 rtx mem;
2576 rtx tmp_reg = dest;
2577 machine_mode mode = GET_MODE (dest);
2579 if (can_create_pseudo_p ())
2580 tmp_reg = gen_reg_rtx (mode);
2582 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2583 if (mode == ptr_mode)
2585 if (mode == DImode)
2586 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2587 else
2588 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2590 mem = XVECEXP (SET_SRC (insn), 0, 0);
2592 else
2594 gcc_assert (mode == Pmode);
2596 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2597 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2600 gcc_assert (GET_CODE (mem) == MEM);
2601 MEM_READONLY_P (mem) = 1;
2602 MEM_NOTRAP_P (mem) = 1;
2603 emit_insn (insn);
2604 return;
2607 case SYMBOL_SMALL_TLSGD:
2609 rtx_insn *insns;
2610 machine_mode mode = GET_MODE (dest);
2611 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2613 start_sequence ();
2614 if (TARGET_ILP32)
2615 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2616 else
2617 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2618 insns = get_insns ();
2619 end_sequence ();
2621 RTL_CONST_CALL_P (insns) = 1;
2622 emit_libcall_block (insns, dest, result, imm);
2623 return;
2626 case SYMBOL_SMALL_TLSDESC:
2628 machine_mode mode = GET_MODE (dest);
2629 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2630 rtx tp;
2632 gcc_assert (mode == Pmode || mode == ptr_mode);
2634 /* In ILP32, the got entry is always of SImode size. Unlike
2635 small GOT, the dest is fixed at reg 0. */
2636 if (TARGET_ILP32)
2637 emit_insn (gen_tlsdesc_small_si (imm));
2638 else
2639 emit_insn (gen_tlsdesc_small_di (imm));
2640 tp = aarch64_load_tp (NULL);
2642 if (mode != Pmode)
2643 tp = gen_lowpart (mode, tp);
2645 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2646 if (REG_P (dest))
2647 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2648 return;
2651 case SYMBOL_SMALL_TLSIE:
2653 /* In ILP32, the mode of dest can be either SImode or DImode,
2654 while the got entry is always of SImode size. The mode of
2655 dest depends on how dest is used: if dest is assigned to a
2656 pointer (e.g. in the memory), it has SImode; it may have
2657 DImode if dest is dereferenced to access the memory.
2658 This is why we have to handle three different tlsie_small
2659 patterns here (two patterns for ILP32). */
2660 machine_mode mode = GET_MODE (dest);
2661 rtx tmp_reg = gen_reg_rtx (mode);
2662 rtx tp = aarch64_load_tp (NULL);
2664 if (mode == ptr_mode)
2666 if (mode == DImode)
2667 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2668 else
2670 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2671 tp = gen_lowpart (mode, tp);
2674 else
2676 gcc_assert (mode == Pmode);
2677 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2680 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2681 if (REG_P (dest))
2682 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2683 return;
2686 case SYMBOL_TLSLE12:
2687 case SYMBOL_TLSLE24:
2688 case SYMBOL_TLSLE32:
2689 case SYMBOL_TLSLE48:
2691 machine_mode mode = GET_MODE (dest);
2692 rtx tp = aarch64_load_tp (NULL);
2694 if (mode != Pmode)
2695 tp = gen_lowpart (mode, tp);
2697 switch (type)
2699 case SYMBOL_TLSLE12:
2700 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2701 (dest, tp, imm));
2702 break;
2703 case SYMBOL_TLSLE24:
2704 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2705 (dest, tp, imm));
2706 break;
2707 case SYMBOL_TLSLE32:
2708 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2709 (dest, imm));
2710 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2711 (dest, dest, tp));
2712 break;
2713 case SYMBOL_TLSLE48:
2714 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2715 (dest, imm));
2716 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2717 (dest, dest, tp));
2718 break;
2719 default:
2720 gcc_unreachable ();
2723 if (REG_P (dest))
2724 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2725 return;
2728 case SYMBOL_TINY_GOT:
2729 emit_insn (gen_ldr_got_tiny (dest, imm));
2730 return;
2732 case SYMBOL_TINY_TLSIE:
2734 machine_mode mode = GET_MODE (dest);
2735 rtx tp = aarch64_load_tp (NULL);
2737 if (mode == ptr_mode)
2739 if (mode == DImode)
2740 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2741 else
2743 tp = gen_lowpart (mode, tp);
2744 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2747 else
2749 gcc_assert (mode == Pmode);
2750 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2753 if (REG_P (dest))
2754 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2755 return;
2758 default:
2759 gcc_unreachable ();
2763 /* Emit a move from SRC to DEST. Assume that the move expanders can
2764 handle all moves if !can_create_pseudo_p (). The distinction is
2765 important because, unlike emit_move_insn, the move expanders know
2766 how to force Pmode objects into the constant pool even when the
2767 constant pool address is not itself legitimate. */
2768 static rtx
2769 aarch64_emit_move (rtx dest, rtx src)
2771 return (can_create_pseudo_p ()
2772 ? emit_move_insn (dest, src)
2773 : emit_move_insn_1 (dest, src));
2776 /* Apply UNOPTAB to OP and store the result in DEST. */
2778 static void
2779 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2781 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2782 if (dest != tmp)
2783 emit_move_insn (dest, tmp);
2786 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2788 static void
2789 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2791 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2792 OPTAB_DIRECT);
2793 if (dest != tmp)
2794 emit_move_insn (dest, tmp);
2797 /* Split a 128-bit move operation into two 64-bit move operations,
2798 taking care to handle partial overlap of register to register
2799 copies. Special cases are needed when moving between GP regs and
2800 FP regs. SRC can be a register, constant or memory; DST a register
2801 or memory. If either operand is memory it must not have any side
2802 effects. */
2803 void
2804 aarch64_split_128bit_move (rtx dst, rtx src)
2806 rtx dst_lo, dst_hi;
2807 rtx src_lo, src_hi;
2809 machine_mode mode = GET_MODE (dst);
2811 gcc_assert (mode == TImode || mode == TFmode);
2812 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2813 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2815 if (REG_P (dst) && REG_P (src))
2817 int src_regno = REGNO (src);
2818 int dst_regno = REGNO (dst);
2820 /* Handle FP <-> GP regs. */
2821 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2823 src_lo = gen_lowpart (word_mode, src);
2824 src_hi = gen_highpart (word_mode, src);
2826 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2827 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2828 return;
2830 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2832 dst_lo = gen_lowpart (word_mode, dst);
2833 dst_hi = gen_highpart (word_mode, dst);
2835 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2836 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2837 return;
2841 dst_lo = gen_lowpart (word_mode, dst);
2842 dst_hi = gen_highpart (word_mode, dst);
2843 src_lo = gen_lowpart (word_mode, src);
2844 src_hi = gen_highpart_mode (word_mode, mode, src);
2846 /* At most one pairing may overlap. */
2847 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2849 aarch64_emit_move (dst_hi, src_hi);
2850 aarch64_emit_move (dst_lo, src_lo);
2852 else
2854 aarch64_emit_move (dst_lo, src_lo);
2855 aarch64_emit_move (dst_hi, src_hi);
2859 bool
2860 aarch64_split_128bit_move_p (rtx dst, rtx src)
2862 return (! REG_P (src)
2863 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2866 /* Split a complex SIMD combine. */
2868 void
2869 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2871 machine_mode src_mode = GET_MODE (src1);
2872 machine_mode dst_mode = GET_MODE (dst);
2874 gcc_assert (VECTOR_MODE_P (dst_mode));
2875 gcc_assert (register_operand (dst, dst_mode)
2876 && register_operand (src1, src_mode)
2877 && register_operand (src2, src_mode));
2879 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2880 return;
2883 /* Split a complex SIMD move. */
2885 void
2886 aarch64_split_simd_move (rtx dst, rtx src)
2888 machine_mode src_mode = GET_MODE (src);
2889 machine_mode dst_mode = GET_MODE (dst);
2891 gcc_assert (VECTOR_MODE_P (dst_mode));
2893 if (REG_P (dst) && REG_P (src))
2895 gcc_assert (VECTOR_MODE_P (src_mode));
2896 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2900 bool
2901 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2902 machine_mode ymode, rtx y)
2904 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2905 gcc_assert (r != NULL);
2906 return rtx_equal_p (x, r);
2909 /* Return TARGET if it is nonnull and a register of mode MODE.
2910 Otherwise, return a fresh register of mode MODE if we can,
2911 or TARGET reinterpreted as MODE if we can't. */
2913 static rtx
2914 aarch64_target_reg (rtx target, machine_mode mode)
2916 if (target && REG_P (target) && GET_MODE (target) == mode)
2917 return target;
2918 if (!can_create_pseudo_p ())
2920 gcc_assert (target);
2921 return gen_lowpart (mode, target);
2923 return gen_reg_rtx (mode);
2926 /* Return a register that contains the constant in BUILDER, given that
2927 the constant is a legitimate move operand. Use TARGET as the register
2928 if it is nonnull and convenient. */
2930 static rtx
2931 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2933 rtx src = builder.build ();
2934 target = aarch64_target_reg (target, GET_MODE (src));
2935 emit_insn (gen_rtx_SET (target, src));
2936 return target;
2939 static rtx
2940 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2942 if (can_create_pseudo_p ())
2943 return force_reg (mode, value);
2944 else
2946 gcc_assert (x);
2947 aarch64_emit_move (x, value);
2948 return x;
2952 /* Return true if predicate value X is a constant in which every element
2953 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2954 value, i.e. as a predicate in which all bits are significant. */
2956 static bool
2957 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2959 if (GET_CODE (x) != CONST_VECTOR)
2960 return false;
2962 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2963 GET_MODE_NUNITS (GET_MODE (x)));
2964 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2965 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2966 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2968 unsigned int nelts = const_vector_encoded_nelts (x);
2969 for (unsigned int i = 0; i < nelts; ++i)
2971 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2972 if (!CONST_INT_P (elt))
2973 return false;
2975 builder.quick_push (elt);
2976 for (unsigned int j = 1; j < factor; ++j)
2977 builder.quick_push (const0_rtx);
2979 builder.finalize ();
2980 return true;
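/* For example, an all-true VNx8BImode constant (one significant bit per
   16-bit element) is described in BUILDER as the repeating VNx16BImode
   sequence { 1, 0 }: FACTOR is 2, so every significant bit is followed by
   one explicit zero bit.  */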
2983 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2984 widest predicate element size it can have (that is, the largest size
2985 for which each element would still be 0 or 1). */
2987 unsigned int
2988 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2990 /* Start with the most optimistic assumption: that we only need
2991 one bit per pattern. This is what we will use if only the first
2992 bit in each pattern is ever set. */
2993 unsigned int mask = GET_MODE_SIZE (DImode);
2994 mask |= builder.npatterns ();
2996 /* Look for set bits. */
2997 unsigned int nelts = builder.encoded_nelts ();
2998 for (unsigned int i = 1; i < nelts; ++i)
2999 if (INTVAL (builder.elt (i)) != 0)
3001 if (i & 1)
3002 return 1;
3003 mask |= i;
3005 return mask & -mask;
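/* For the { 1, 0 } example above, MASK starts as 8 | 2 == 10, the loop
   finds no further set bits, and mask & -mask == 2: the widest usable
   predicate element size is 2 bytes, i.e. a .H predicate.  */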
3008 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3009 return that predicate mode, otherwise return opt_machine_mode (). */
3011 opt_machine_mode
3012 aarch64_ptrue_all_mode (rtx x)
3014 gcc_assert (GET_MODE (x) == VNx16BImode);
3015 if (GET_CODE (x) != CONST_VECTOR
3016 || !CONST_VECTOR_DUPLICATE_P (x)
3017 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3018 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3019 return opt_machine_mode ();
3021 unsigned int nelts = const_vector_encoded_nelts (x);
3022 for (unsigned int i = 1; i < nelts; ++i)
3023 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3024 return opt_machine_mode ();
3026 return aarch64_sve_pred_mode (nelts);
3029 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3030 that the constant would have with predicate element size ELT_SIZE
3031 (ignoring the upper bits in each element) and return:
3033 * -1 if all bits are set
3034 * N if the predicate has N leading set bits followed by all clear bits
3035 * 0 if the predicate does not have any of these forms. */
3038 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3039 unsigned int elt_size)
3041 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3042 followed by set bits. */
3043 if (builder.nelts_per_pattern () == 3)
3044 return 0;
3046 /* Skip over leading set bits. */
3047 unsigned int nelts = builder.encoded_nelts ();
3048 unsigned int i = 0;
3049 for (; i < nelts; i += elt_size)
3050 if (INTVAL (builder.elt (i)) == 0)
3051 break;
3052 unsigned int vl = i / elt_size;
3054 /* Check for the all-true case. */
3055 if (i == nelts)
3056 return -1;
3058 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3059 repeating pattern of set bits followed by clear bits. */
3060 if (builder.nelts_per_pattern () != 2)
3061 return 0;
3063 /* We have a "foreground" value and a duplicated "background" value.
3064 If the background might repeat and the last set bit belongs to it,
3065 we might have set bits followed by clear bits followed by set bits. */
3066 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3067 return 0;
3069 /* Make sure that the rest are all clear. */
3070 for (; i < nelts; i += elt_size)
3071 if (INTVAL (builder.elt (i)) != 0)
3072 return 0;
3074 return vl;
3077 /* See if there is an svpattern that encodes an SVE predicate of mode
3078 PRED_MODE in which the first VL bits are set and the rest are clear.
3079 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3080 A VL of -1 indicates an all-true vector. */
3082 aarch64_svpattern
3083 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3085 if (vl < 0)
3086 return AARCH64_SV_ALL;
3088 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3089 return AARCH64_NUM_SVPATTERNS;
3091 if (vl >= 1 && vl <= 8)
3092 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3094 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3095 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3097 int max_vl;
3098 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3100 if (vl == (max_vl / 3) * 3)
3101 return AARCH64_SV_MUL3;
3102 /* These would only trigger for non-power-of-2 lengths. */
3103 if (vl == (max_vl & -4))
3104 return AARCH64_SV_MUL4;
3105 if (vl == (1 << floor_log2 (max_vl)))
3106 return AARCH64_SV_POW2;
3107 if (vl == max_vl)
3108 return AARCH64_SV_ALL;
3110 return AARCH64_NUM_SVPATTERNS;
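/* Worked examples: VL == 3 gives AARCH64_SV_VL3 and VL == 64 gives
   AARCH64_SV_VL64.  With -msve-vector-bits=256 and PRED_MODE == VNx16BImode
   (32 lanes), VL == 30 gives AARCH64_SV_MUL3, while VL == 31 matches no
   pattern and the function returns AARCH64_NUM_SVPATTERNS.  */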
3113 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3114 bits has the lowest bit set and the upper bits clear. This is the
3115 VNx16BImode equivalent of a PTRUE for controlling elements of
3116 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3117 all bits are significant, even the upper zeros. */
3120 aarch64_ptrue_all (unsigned int elt_size)
3122 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3123 builder.quick_push (const1_rtx);
3124 for (unsigned int i = 1; i < elt_size; ++i)
3125 builder.quick_push (const0_rtx);
3126 return builder.build ();
3129 /* Return an all-true predicate register of mode MODE. */
3132 aarch64_ptrue_reg (machine_mode mode)
3134 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3135 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3136 return gen_lowpart (mode, reg);
3139 /* Return an all-false predicate register of mode MODE. */
3142 aarch64_pfalse_reg (machine_mode mode)
3144 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3145 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3146 return gen_lowpart (mode, reg);
3149 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3150 true, or alternatively if we know that the operation predicated by
3151 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3152 aarch64_sve_gp_strictness operand that describes the operation
3153 predicated by PRED1[0]. */
3155 bool
3156 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3158 machine_mode mode = GET_MODE (pred2);
3159 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3160 && mode == GET_MODE (pred1[0])
3161 && aarch64_sve_gp_strictness (pred1[1], SImode));
3162 return (pred1[0] == CONSTM1_RTX (mode)
3163 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3164 || rtx_equal_p (pred1[0], pred2));
3167 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3168 for it. PRED2[0] is the predicate for the instruction whose result
3169 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3170 for it. Return true if we can prove that the two predicates are
3171 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3172 with PRED1[0] without changing behavior. */
3174 bool
3175 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3177 machine_mode mode = GET_MODE (pred1[0]);
3178 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3179 && mode == GET_MODE (pred2[0])
3180 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3181 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3183 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3184 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3185 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3186 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3187 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3190 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3191 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3192 Use TARGET as the target register if nonnull and convenient. */
3194 static rtx
3195 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3196 machine_mode data_mode, rtx op1, rtx op2)
3198 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3199 expand_operand ops[5];
3200 create_output_operand (&ops[0], target, pred_mode);
3201 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3202 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3203 create_input_operand (&ops[3], op1, data_mode);
3204 create_input_operand (&ops[4], op2, data_mode);
3205 expand_insn (icode, 5, ops);
3206 return ops[0].value;
3209 /* Use a comparison to convert integer vector SRC into MODE, which is
3210 the corresponding SVE predicate mode. Use TARGET for the result
3211 if it's nonnull and convenient. */
3214 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3216 machine_mode src_mode = GET_MODE (src);
3217 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3218 src, CONST0_RTX (src_mode));
3221 /* Return the assembly token for svprfop value PRFOP. */
3223 static const char *
3224 svprfop_token (enum aarch64_svprfop prfop)
3226 switch (prfop)
3228 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3229 AARCH64_FOR_SVPRFOP (CASE)
3230 #undef CASE
3231 case AARCH64_NUM_SVPRFOPS:
3232 break;
3234 gcc_unreachable ();
3237 /* Return the assembly string for an SVE prefetch operation with
3238 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3239 and that SUFFIX is the format for the remaining operands. */
3241 char *
3242 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3243 const char *suffix)
3245 static char buffer[128];
3246 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3247 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3248 mnemonic, svprfop_token (prfop), suffix);
3249 gcc_assert (written < sizeof (buffer));
3250 return buffer;
3253 /* Check whether we can calculate the number of elements in PATTERN
3254 at compile time, given that there are NELTS_PER_VQ elements per
3255 128-bit block. Return the value if so, otherwise return -1. */
3257 HOST_WIDE_INT
3258 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3260 unsigned int vl, const_vg;
3261 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3262 vl = 1 + (pattern - AARCH64_SV_VL1);
3263 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3264 vl = 16 << (pattern - AARCH64_SV_VL16);
3265 else if (aarch64_sve_vg.is_constant (&const_vg))
3267 /* There are two vector granules per quadword. */
3268 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3269 switch (pattern)
3271 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3272 case AARCH64_SV_MUL4: return nelts & -4;
3273 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3274 case AARCH64_SV_ALL: return nelts;
3275 default: gcc_unreachable ();
3278 else
3279 return -1;
3281 /* There are two vector granules per quadword. */
3282 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3283 if (known_le (vl, nelts_all))
3284 return vl;
3286 /* Requesting more elements than are available results in a PFALSE. */
3287 if (known_gt (vl, nelts_all))
3288 return 0;
3290 return -1;
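/* Worked example with NELTS_PER_VQ == 4 (32-bit elements): for
   variable-length vectors, AARCH64_SV_VL4 folds to 4 because at least four
   .S elements are always available, whereas AARCH64_SV_VL8 and
   AARCH64_SV_ALL return -1 since the result depends on the runtime vector
   length.  With -msve-vector-bits=512 (const_vg == 8), AARCH64_SV_ALL folds
   to 16 and AARCH64_SV_MUL3 to 15.  */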
3293 /* Return true if we can move VALUE into a register using a single
3294 CNT[BHWD] instruction. */
3296 static bool
3297 aarch64_sve_cnt_immediate_p (poly_int64 value)
3299 HOST_WIDE_INT factor = value.coeffs[0];
3300 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3301 return (value.coeffs[1] == factor
3302 && IN_RANGE (factor, 2, 16 * 16)
3303 && (factor & 1) == 0
3304 && factor <= 16 * (factor & -factor));
3307 /* Likewise for rtx X. */
3309 bool
3310 aarch64_sve_cnt_immediate_p (rtx x)
3312 poly_int64 value;
3313 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3316 /* Return the asm string for an instruction with a CNT-like vector size
3317 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3318 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3319 first part of the operands template (the part that comes before the
3320 vector size itself). PATTERN is the pattern to use. FACTOR is the
3321 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3322 in each quadword. If it is zero, we can use any element size. */
3324 static char *
3325 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3326 aarch64_svpattern pattern,
3327 unsigned int factor,
3328 unsigned int nelts_per_vq)
3330 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3332 if (nelts_per_vq == 0)
3333 /* There is some overlap in the ranges of the four CNT instructions.
3334 Here we always use the smallest possible element size, so that the
3335 multiplier is 1 wherever possible. */
3336 nelts_per_vq = factor & -factor;
3337 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3338 gcc_assert (IN_RANGE (shift, 1, 4));
3339 char suffix = "dwhb"[shift - 1];
3341 factor >>= shift;
3342 unsigned int written;
3343 if (pattern == AARCH64_SV_ALL && factor == 1)
3344 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3345 prefix, suffix, operands);
3346 else if (factor == 1)
3347 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3348 prefix, suffix, operands, svpattern_token (pattern));
3349 else
3350 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3351 prefix, suffix, operands, svpattern_token (pattern),
3352 factor);
3353 gcc_assert (written < sizeof (buffer));
3354 return buffer;
3357 /* Return the asm string for an instruction with a CNT-like vector size
3358 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3359 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3360 first part of the operands template (the part that comes before the
3361 vector size itself). X is the value of the vector size operand,
3362 as a polynomial integer rtx; we need to convert this into an "all"
3363 pattern with a multiplier. */
3365 char *
3366 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3367 rtx x)
3369 poly_int64 value = rtx_to_poly_int64 (x);
3370 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3371 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3372 value.coeffs[1], 0);
3375 /* Return the asm string for an instruction with a CNT-like vector size
3376 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3377 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3378 first part of the operands template (the part that comes before the
3379 vector size itself). CNT_PAT[0..2] are the operands of the
3380 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3382 char *
3383 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3384 const char *operands, rtx *cnt_pat)
3386 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3387 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3388 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3389 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3390 factor, nelts_per_vq);
3393 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3395 bool
3396 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3398 poly_int64 value;
3399 return (poly_int_rtx_p (x, &value)
3400 && (aarch64_sve_cnt_immediate_p (value)
3401 || aarch64_sve_cnt_immediate_p (-value)));
3404 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3405 operand 0. */
3407 char *
3408 aarch64_output_sve_scalar_inc_dec (rtx offset)
3410 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3411 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3412 if (offset_value.coeffs[1] > 0)
3413 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3414 offset_value.coeffs[1], 0);
3415 else
3416 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3417 -offset_value.coeffs[1], 0);
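/* For example, an OFFSET with coefficients (32, 32), i.e. twice the vector
   length in bytes, should be printed as "incb %x0, all, mul #2"; a negative
   offset of the same magnitude would use "decb" instead.  */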
3420 /* Return true if we can add VALUE to a register using a single ADDVL
3421 or ADDPL instruction. */
3423 static bool
3424 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3426 HOST_WIDE_INT factor = value.coeffs[0];
3427 if (factor == 0 || value.coeffs[1] != factor)
3428 return false;
3429 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3430 and a value of 16 is one vector width. */
3431 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3432 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3435 /* Likewise for rtx X. */
3437 bool
3438 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3440 poly_int64 value;
3441 return (poly_int_rtx_p (x, &value)
3442 && aarch64_sve_addvl_addpl_immediate_p (value));
3445 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3446 to operand 1 and storing the result in operand 0. */
3448 char *
3449 aarch64_output_sve_addvl_addpl (rtx offset)
3451 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3452 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3453 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3455 int factor = offset_value.coeffs[1];
3456 if ((factor & 15) == 0)
3457 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3458 else
3459 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3460 return buffer;
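/* For example, an OFFSET with coefficients (32, 32), i.e. two vector
   lengths, prints as "addvl %x0, %x1, #2", while coefficients (6, 6),
   i.e. three predicate lengths, print as "addpl %x0, %x1, #3".  */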
3463 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3464 instruction. If it is, store the number of elements in each vector
3465 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3466 factor in *FACTOR_OUT (if nonnull). */
3468 bool
3469 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3470 unsigned int *nelts_per_vq_out)
3472 rtx elt;
3473 poly_int64 value;
3475 if (!const_vec_duplicate_p (x, &elt)
3476 || !poly_int_rtx_p (elt, &value))
3477 return false;
3479 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3480 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3481 /* There's no vector INCB. */
3482 return false;
3484 HOST_WIDE_INT factor = value.coeffs[0];
3485 if (value.coeffs[1] != factor)
3486 return false;
3488 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3489 if ((factor % nelts_per_vq) != 0
3490 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3491 return false;
3493 if (factor_out)
3494 *factor_out = factor;
3495 if (nelts_per_vq_out)
3496 *nelts_per_vq_out = nelts_per_vq;
3497 return true;
3500 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3501 instruction. */
3503 bool
3504 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3506 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3509 /* Return the asm template for an SVE vector INC or DEC instruction.
3510 OPERANDS gives the operands before the vector count and X is the
3511 value of the vector count operand itself. */
3513 char *
3514 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3516 int factor;
3517 unsigned int nelts_per_vq;
3518 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3519 gcc_unreachable ();
3520 if (factor < 0)
3521 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3522 -factor, nelts_per_vq);
3523 else
3524 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3525 factor, nelts_per_vq);
3528 static int
3529 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3530 scalar_int_mode mode)
3532 int i;
3533 unsigned HOST_WIDE_INT val, val2, mask;
3534 int one_match, zero_match;
3535 int num_insns;
3537 val = INTVAL (imm);
3539 if (aarch64_move_imm (val, mode))
3541 if (generate)
3542 emit_insn (gen_rtx_SET (dest, imm));
3543 return 1;
3546 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3547 (with XXXX non-zero). In that case check to see if the move can be done in
3548 a smaller mode. */
3549 val2 = val & 0xffffffff;
3550 if (mode == DImode
3551 && aarch64_move_imm (val2, SImode)
3552 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3554 if (generate)
3555 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3557 /* Check if we have to emit a second instruction by checking to see
3558 if any of the upper 32 bits of the original DI mode value is set. */
3559 if (val == val2)
3560 return 1;
3562 i = (val >> 48) ? 48 : 32;
3564 if (generate)
3565 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3566 GEN_INT ((val >> i) & 0xffff)));
3568 return 2;
3571 if ((val >> 32) == 0 || mode == SImode)
3573 if (generate)
3575 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3576 if (mode == SImode)
3577 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3578 GEN_INT ((val >> 16) & 0xffff)));
3579 else
3580 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3581 GEN_INT ((val >> 16) & 0xffff)));
3583 return 2;
3586 /* Remaining cases are all for DImode. */
3588 mask = 0xffff;
3589 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3590 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3591 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3592 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3594 if (zero_match != 2 && one_match != 2)
3596 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3597 For a 64-bit bitmask try whether changing 16 bits to all ones or
3598 zeroes creates a valid bitmask. To check any repeated bitmask,
3599 try using 16 bits from the other 32-bit half of val. */
3601 for (i = 0; i < 64; i += 16, mask <<= 16)
3603 val2 = val & ~mask;
3604 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3605 break;
3606 val2 = val | mask;
3607 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3608 break;
3609 val2 = val2 & ~mask;
3610 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3611 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3612 break;
3614 if (i != 64)
3616 if (generate)
3618 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3619 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3620 GEN_INT ((val >> i) & 0xffff)));
3622 return 2;
3626 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3627 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3628 otherwise skip zero bits. */
3630 num_insns = 1;
3631 mask = 0xffff;
3632 val2 = one_match > zero_match ? ~val : val;
3633 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3635 if (generate)
3636 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3637 ? (val | ~(mask << i))
3638 : (val & (mask << i)))));
3639 for (i += 16; i < 64; i += 16)
3641 if ((val2 & (mask << i)) == 0)
3642 continue;
3643 if (generate)
3644 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3645 GEN_INT ((val >> i) & 0xffff)));
3646 num_insns ++;
3649 return num_insns;
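/* Worked example: for 0x1234567800000000 the two low 16-bit chunks are
   zero (zero_match == 2), so the code above moves 0x5678 << 32 into DEST
   and then inserts 0x1234 at bit 48 with a MOVK, returning 2.  */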
3652 /* Return whether imm is a 128-bit immediate which is simple enough to
3653 expand inline. */
3654 bool
3655 aarch64_mov128_immediate (rtx imm)
3657 if (GET_CODE (imm) == CONST_INT)
3658 return true;
3660 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3662 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3663 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3665 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3666 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3670 /* Return the number of temporary registers that aarch64_add_offset_1
3671 would need to add OFFSET to a register. */
3673 static unsigned int
3674 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3676 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3679 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3680 a non-polynomial OFFSET. MODE is the mode of the addition.
3681 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3682 be set and CFA adjustments added to the generated instructions.
3684 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3685 temporary if register allocation is already complete. This temporary
3686 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3687 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3688 the immediate again.
3690 Since this function may be used to adjust the stack pointer, we must
3691 ensure that it cannot cause transient stack deallocation (for example
3692 by first incrementing SP and then decrementing when adjusting by a
3693 large immediate). */
3695 static void
3696 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3697 rtx src, HOST_WIDE_INT offset, rtx temp1,
3698 bool frame_related_p, bool emit_move_imm)
3700 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3701 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3703 HOST_WIDE_INT moffset = abs_hwi (offset);
3704 rtx_insn *insn;
3706 if (!moffset)
3708 if (!rtx_equal_p (dest, src))
3710 insn = emit_insn (gen_rtx_SET (dest, src));
3711 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3713 return;
3716 /* Single instruction adjustment. */
3717 if (aarch64_uimm12_shift (moffset))
3719 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3720 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3721 return;
3724 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3725 and either:
3727 a) the offset cannot be loaded by a 16-bit move or
3728 b) there is no spare register into which we can move it. */
3729 if (moffset < 0x1000000
3730 && ((!temp1 && !can_create_pseudo_p ())
3731 || !aarch64_move_imm (moffset, mode)))
3733 HOST_WIDE_INT low_off = moffset & 0xfff;
3735 low_off = offset < 0 ? -low_off : low_off;
3736 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3737 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3738 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3739 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3740 return;
3743 /* Emit a move immediate if required and an addition/subtraction. */
3744 if (emit_move_imm)
3746 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3747 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3749 insn = emit_insn (offset < 0
3750 ? gen_sub3_insn (dest, src, temp1)
3751 : gen_add3_insn (dest, src, temp1));
3752 if (frame_related_p)
3754 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3755 rtx adj = plus_constant (mode, src, offset);
3756 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
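/* As an illustration (a sketch, not tied to a particular caller):
   with SRC == SP and OFFSET == -0x123456 the two-instruction path
   above emits

     sub sp, sp, #0x456
     sub sp, sp, #0x123, lsl #12

   Both steps move SP in the same direction, which is what keeps the
   adjustment free of transient stack deallocation.  Offsets that do
   not fit this form use "mov tmp, #imm" followed by a single
   register-register add or subtract instead.  */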
3760 /* Return the number of temporary registers that aarch64_add_offset
3761 would need to move OFFSET into a register or add OFFSET to a register;
3762 ADD_P is true if we want the latter rather than the former. */
3764 static unsigned int
3765 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3767 /* This follows the same structure as aarch64_add_offset. */
3768 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3769 return 0;
3771 unsigned int count = 0;
3772 HOST_WIDE_INT factor = offset.coeffs[1];
3773 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3774 poly_int64 poly_offset (factor, factor);
3775 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3776 /* Need one register for the ADDVL/ADDPL result. */
3777 count += 1;
3778 else if (factor != 0)
3780 factor = abs (factor);
3781 if (factor > 16 * (factor & -factor))
3782 /* Need one register for the CNT result and one for the multiplication
3783 factor. If necessary, the second temporary can be reused for the
3784 constant part of the offset. */
3785 return 2;
3786 /* Need one register for the CNT result (which might then
3787 be shifted). */
3788 count += 1;
3790 return count + aarch64_add_offset_1_temporaries (constant);
3793 /* If X can be represented as a poly_int64, return the number
3794 of temporaries that are required to add it to a register.
3795 Return -1 otherwise. */
3798 aarch64_add_offset_temporaries (rtx x)
3800 poly_int64 offset;
3801 if (!poly_int_rtx_p (x, &offset))
3802 return -1;
3803 return aarch64_offset_temporaries (true, offset);
3806 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3807 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3808 be set and CFA adjustments added to the generated instructions.
3810 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3811 temporary if register allocation is already complete. This temporary
3812 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3813 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3814 false to avoid emitting the immediate again.
3816 TEMP2, if nonnull, is a second temporary register that doesn't
3817 overlap either DEST or SRC.
3819 Since this function may be used to adjust the stack pointer, we must
3820 ensure that it cannot cause transient stack deallocation (for example
3821 by first incrementing SP and then decrementing when adjusting by a
3822 large immediate). */
3824 static void
3825 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3826 poly_int64 offset, rtx temp1, rtx temp2,
3827 bool frame_related_p, bool emit_move_imm = true)
3829 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3830 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3831 gcc_assert (temp1 == NULL_RTX
3832 || !frame_related_p
3833 || !reg_overlap_mentioned_p (temp1, dest));
3834 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3836 /* Try using ADDVL or ADDPL to add the whole value. */
3837 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3839 rtx offset_rtx = gen_int_mode (offset, mode);
3840 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3841 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3842 return;
3845 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3846 SVE vector register, over and above the minimum size of 128 bits.
3847 This is equivalent to half the value returned by CNTD with a
3848 vector shape of ALL. */
3849 HOST_WIDE_INT factor = offset.coeffs[1];
3850 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3852 /* Try using ADDVL or ADDPL to add the VG-based part. */
3853 poly_int64 poly_offset (factor, factor);
3854 if (src != const0_rtx
3855 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3857 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3858 if (frame_related_p)
3860 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3861 RTX_FRAME_RELATED_P (insn) = true;
3862 src = dest;
3864 else
3866 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3867 src = aarch64_force_temporary (mode, temp1, addr);
3868 temp1 = temp2;
3869 temp2 = NULL_RTX;
3872 /* Otherwise use a CNT-based sequence. */
3873 else if (factor != 0)
3875 /* Use a subtraction if we have a negative factor. */
3876 rtx_code code = PLUS;
3877 if (factor < 0)
3879 factor = -factor;
3880 code = MINUS;
3883 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3884 into the multiplication. */
3885 rtx val;
3886 int shift = 0;
3887 if (factor & 1)
3888 /* Use a right shift by 1. */
3889 shift = -1;
3890 else
3891 factor /= 2;
3892 HOST_WIDE_INT low_bit = factor & -factor;
3893 if (factor <= 16 * low_bit)
3895 if (factor > 16 * 8)
3897 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3898 the value with the minimum multiplier and shift it into
3899 position. */
3900 int extra_shift = exact_log2 (low_bit);
3901 shift += extra_shift;
3902 factor >>= extra_shift;
3904 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3906 else
3908 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3909 directly, since that should increase the chances of being
3910 able to use a shift and add sequence. If LOW_BIT itself
3911 is out of range, just use CNTD. */
3912 if (low_bit <= 16 * 8)
3913 factor /= low_bit;
3914 else
3915 low_bit = 1;
3917 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3918 val = aarch64_force_temporary (mode, temp1, val);
3920 if (can_create_pseudo_p ())
3922 rtx coeff1 = gen_int_mode (factor, mode);
3923 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3925 else
3927 /* Go back to using a negative multiplication factor if we have
3928 no register from which to subtract. */
3929 if (code == MINUS && src == const0_rtx)
3931 factor = -factor;
3932 code = PLUS;
3934 rtx coeff1 = gen_int_mode (factor, mode);
3935 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3936 val = gen_rtx_MULT (mode, val, coeff1);
3940 if (shift > 0)
3942 /* Multiply by 1 << SHIFT. */
3943 val = aarch64_force_temporary (mode, temp1, val);
3944 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3946 else if (shift == -1)
3948 /* Divide by 2. */
3949 val = aarch64_force_temporary (mode, temp1, val);
3950 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3953 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3954 if (src != const0_rtx)
3956 val = aarch64_force_temporary (mode, temp1, val);
3957 val = gen_rtx_fmt_ee (code, mode, src, val);
3959 else if (code == MINUS)
3961 val = aarch64_force_temporary (mode, temp1, val);
3962 val = gen_rtx_NEG (mode, val);
3965 if (constant == 0 || frame_related_p)
3967 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3968 if (frame_related_p)
3970 RTX_FRAME_RELATED_P (insn) = true;
3971 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3972 gen_rtx_SET (dest, plus_constant (Pmode, src,
3973 poly_offset)));
3975 src = dest;
3976 if (constant == 0)
3977 return;
3979 else
3981 src = aarch64_force_temporary (mode, temp1, val);
3982 temp1 = temp2;
3983 temp2 = NULL_RTX;
3986 emit_move_imm = true;
3989 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3990 frame_related_p, emit_move_imm);
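/* As an illustration (a sketch of typical output, assuming SVE): an
   offset equal to the size in bytes of one full SVE vector is the
   poly_int64 (16, 16) and is handled by the ADDVL path above as

     addvl sp, sp, #1

   while multiples of the predicate size (2, 2) map to ADDPL.  Factors
   outside the ADDVL/ADDPL range use the CNT-based sequence instead,
   with aarch64_add_offset_1 adding any constant remainder.  */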
3993 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3994 than a poly_int64. */
3996 void
3997 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3998 rtx offset_rtx, rtx temp1, rtx temp2)
4000 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4001 temp1, temp2, false);
4004 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4005 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4006 if TEMP1 already contains abs (DELTA). */
4008 static inline void
4009 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4011 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4012 temp1, temp2, true, emit_move_imm);
4015 /* Subtract DELTA from the stack pointer, marking the instructions
4016 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4017 if nonnull. */
4019 static inline void
4020 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4021 bool emit_move_imm = true)
4023 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4024 temp1, temp2, frame_related_p, emit_move_imm);
4027 /* Set DEST to (vec_series BASE STEP). */
4029 static void
4030 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4032 machine_mode mode = GET_MODE (dest);
4033 scalar_mode inner = GET_MODE_INNER (mode);
4035 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4036 if (!aarch64_sve_index_immediate_p (base))
4037 base = force_reg (inner, base);
4038 if (!aarch64_sve_index_immediate_p (step))
4039 step = force_reg (inner, step);
4041 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
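/* As an illustration (a sketch): for a VNx4SI destination with
   BASE == 0 and STEP == 1, the VEC_SERIES above becomes

     index z0.s, #0, #1

   Values outside the immediate range [-16, 15] are first forced into
   scalar registers, giving register forms such as
   "index z0.s, w1, w2".  */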
4044 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4045 register of mode MODE. Use TARGET for the result if it's nonnull
4046 and convenient.
4048 The two vector modes must have the same element mode. The behavior
4049 is to duplicate architectural lane N of SRC into architectural lanes
4050 N + I * STEP of the result. On big-endian targets, architectural
4051 lane 0 of an Advanced SIMD vector is the last element of the vector
4052 in memory layout, so for big-endian targets this operation has the
4053 effect of reversing SRC before duplicating it. Callers need to
4054 account for this. */
4057 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4059 machine_mode src_mode = GET_MODE (src);
4060 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4061 insn_code icode = (BYTES_BIG_ENDIAN
4062 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4063 : code_for_aarch64_vec_duplicate_vq_le (mode));
4065 unsigned int i = 0;
4066 expand_operand ops[3];
4067 create_output_operand (&ops[i++], target, mode);
4068 create_output_operand (&ops[i++], src, src_mode);
4069 if (BYTES_BIG_ENDIAN)
4071 /* Create a PARALLEL describing the reversal of SRC. */
4072 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4073 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4074 nelts_per_vq - 1, -1);
4075 create_fixed_operand (&ops[i++], sel);
4077 expand_insn (icode, i, ops);
4078 return ops[0].value;
4081 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4082 the memory image into DEST. Return true on success. */
4084 static bool
4085 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4087 src = force_const_mem (GET_MODE (src), src);
4088 if (!src)
4089 return false;
4091 /* Make sure that the address is legitimate. */
4092 if (!aarch64_sve_ld1rq_operand_p (src))
4094 rtx addr = force_reg (Pmode, XEXP (src, 0));
4095 src = replace_equiv_address (src, addr);
4098 machine_mode mode = GET_MODE (dest);
4099 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4100 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4101 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4102 return true;
4105 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4106 SVE data mode and isn't a legitimate constant. Use TARGET for the
4107 result if convenient.
4109 The returned register can have whatever mode seems most natural
4110 given the contents of SRC. */
4112 static rtx
4113 aarch64_expand_sve_const_vector (rtx target, rtx src)
4115 machine_mode mode = GET_MODE (src);
4116 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4117 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4118 scalar_mode elt_mode = GET_MODE_INNER (mode);
4119 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4120 unsigned int container_bits = aarch64_sve_container_bits (mode);
4121 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4123 if (nelts_per_pattern == 1
4124 && encoded_bits <= 128
4125 && container_bits != elt_bits)
4127 /* We have a partial vector mode and a constant whose full-vector
4128 equivalent would occupy a repeating 128-bit sequence. Build that
4129 full-vector equivalent instead, so that we have the option of
4130 using LD1RQ and Advanced SIMD operations. */
4131 unsigned int repeat = container_bits / elt_bits;
4132 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4133 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4134 for (unsigned int i = 0; i < npatterns; ++i)
4135 for (unsigned int j = 0; j < repeat; ++j)
4136 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4137 target = aarch64_target_reg (target, full_mode);
4138 return aarch64_expand_sve_const_vector (target, builder.build ());
4141 if (nelts_per_pattern == 1 && encoded_bits == 128)
4143 /* The constant is a duplicated quadword but can't be narrowed
4144 beyond a quadword. Get the memory image of the first quadword
4145 as a 128-bit vector and try using LD1RQ to load it from memory.
4147 The effect for both endiannesses is to load memory lane N into
4148 architectural lanes N + I * STEP of the result. On big-endian
4149 targets, the layout of the 128-bit vector in an Advanced SIMD
4150 register would be different from its layout in an SVE register,
4151 but this 128-bit vector is a memory value only. */
4152 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4153 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4154 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4155 return target;
4158 if (nelts_per_pattern == 1 && encoded_bits < 128)
4160 /* The vector is a repeating sequence of 64 bits or fewer.
4161 See if we can load them using an Advanced SIMD move and then
4162 duplicate it to fill a vector. This is better than using a GPR
4163 move because it keeps everything in the same register file. */
4164 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4165 rtx_vector_builder builder (vq_mode, npatterns, 1);
4166 for (unsigned int i = 0; i < npatterns; ++i)
4168 /* We want memory lane N to go into architectural lane N,
4169 so reverse for big-endian targets. The DUP .Q pattern
4170 has a compensating reverse built-in. */
4171 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4172 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4174 rtx vq_src = builder.build ();
4175 if (aarch64_simd_valid_immediate (vq_src, NULL))
4177 vq_src = force_reg (vq_mode, vq_src);
4178 return aarch64_expand_sve_dupq (target, mode, vq_src);
4181 /* Get an integer representation of the repeating part of Advanced
4182 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4183 which for big-endian targets is lane-swapped wrt a normal
4184 Advanced SIMD vector. This means that for both endiannesses,
4185 memory lane N of SVE vector SRC corresponds to architectural
4186 lane N of a register holding VQ_SRC. This in turn means that
4187 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4188 as a single 128-bit value) and thus that memory lane 0 of SRC is
4189 in the lsb of the integer. Duplicating the integer therefore
4190 ensures that memory lane N of SRC goes into architectural lane
4191 N + I * INDEX of the SVE register. */
4192 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4193 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4194 if (elt_value)
4196 /* Pretend that we had a vector of INT_MODE to start with. */
4197 elt_mode = int_mode;
4198 mode = aarch64_full_sve_mode (int_mode).require ();
4200 /* If the integer can be moved into a general register by a
4201 single instruction, do that and duplicate the result. */
4202 if (CONST_INT_P (elt_value)
4203 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4205 elt_value = force_reg (elt_mode, elt_value);
4206 return expand_vector_broadcast (mode, elt_value);
4209 else if (npatterns == 1)
4210 /* We're duplicating a single value, but can't do better than
4211 force it to memory and load from there. This handles things
4212 like symbolic constants. */
4213 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4215 if (elt_value)
4217 /* Load the element from memory if we can, otherwise move it into
4218 a register and use a DUP. */
4219 rtx op = force_const_mem (elt_mode, elt_value);
4220 if (!op)
4221 op = force_reg (elt_mode, elt_value);
4222 return expand_vector_broadcast (mode, op);
4226 /* Try using INDEX. */
4227 rtx base, step;
4228 if (const_vec_series_p (src, &base, &step))
4230 aarch64_expand_vec_series (target, base, step);
4231 return target;
4234 /* From here on, it's better to force the whole constant to memory
4235 if we can. */
4236 if (GET_MODE_NUNITS (mode).is_constant ())
4237 return NULL_RTX;
4239 /* Expand each pattern individually. */
4240 gcc_assert (npatterns > 1);
4241 rtx_vector_builder builder;
4242 auto_vec<rtx, 16> vectors (npatterns);
4243 for (unsigned int i = 0; i < npatterns; ++i)
4245 builder.new_vector (mode, 1, nelts_per_pattern);
4246 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4247 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4248 vectors.quick_push (force_reg (mode, builder.build ()));
4251 /* Use permutes to interleave the separate vectors. */
4252 while (npatterns > 1)
4254 npatterns /= 2;
4255 for (unsigned int i = 0; i < npatterns; ++i)
4257 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4258 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4259 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4260 vectors[i] = tmp;
4263 gcc_assert (vectors[0] == target);
4264 return target;
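/* As an illustration of the interleaving fallback (a sketch): a
   variable-length VNx4SI constant such as { 0, 8, 1, 9, 2, 10, ... }
   has two stepped patterns.  Each pattern is expanded on its own
   (here via INDEX) and the results are then zipped together:

     index z1.s, #0, #1
     index z2.s, #8, #1
     zip1  z0.s, z1.s, z2.s  */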
4267 /* Use WHILE to set a predicate register of mode MODE in which the first
4268 VL bits are set and the rest are clear. Use TARGET for the register
4269 if it's nonnull and convenient. */
4271 static rtx
4272 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4273 unsigned int vl)
4275 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4276 target = aarch64_target_reg (target, mode);
4277 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4278 target, const0_rtx, limit));
4279 return target;
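/* As an illustration (a sketch): setting the first nine .B lanes of
   a VNx16BI predicate (a length with no matching PTRUE pattern)
   comes out as

     mov     x0, #9
     whilelo p0.b, xzr, x0  */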
4282 static rtx
4283 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4285 /* BUILDER is a constant predicate in which the index of every set bit
4286 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4287 by inverting every element at a multiple of ELT_SIZE and EORing the
4288 result with an ELT_SIZE PTRUE.
4290 Return a register that contains the constant on success, otherwise
4291 return null. Use TARGET as the register if it is nonnull and
4292 convenient. */
4294 static rtx
4295 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4296 unsigned int elt_size)
4298 /* Invert every element at a multiple of ELT_SIZE, keeping the
4299 other bits zero. */
4300 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4301 builder.nelts_per_pattern ());
4302 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4303 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4304 inv_builder.quick_push (const1_rtx);
4305 else
4306 inv_builder.quick_push (const0_rtx);
4307 inv_builder.finalize ();
4309 /* See if we can load the constant cheaply. */
4310 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4311 if (!inv)
4312 return NULL_RTX;
4314 /* EOR the result with an ELT_SIZE PTRUE. */
4315 rtx mask = aarch64_ptrue_all (elt_size);
4316 mask = force_reg (VNx16BImode, mask);
4317 target = aarch64_target_reg (target, VNx16BImode);
4318 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4319 return target;
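/* As an illustration (a sketch): the .B predicate "every lane set
   except lane 0" is awkward to build directly, but its inverse is
   "only lane 0 set", which is a simple VL1 constant.  The code above
   loads that cheap inverse and EORs it with PTRUE ALL:

     ptrue p1.b, vl1
     ptrue p2.b
     eor   p0.b, p2/z, p1.b, p2.b  */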
4322 /* BUILDER is a constant predicate in which the index of every set bit
4323 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4324 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4325 register on success, otherwise return null. Use TARGET as the register
4326 if nonnull and convenient. */
4328 static rtx
4329 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4330 unsigned int elt_size,
4331 unsigned int permute_size)
4333 /* We're going to split the constant into two new constants A and B,
4334 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4335 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4337 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4338 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4340 where _ indicates elements that will be discarded by the permute.
4342 First calculate the ELT_SIZEs for A and B. */
4343 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4344 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4345 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4346 if (INTVAL (builder.elt (i)) != 0)
4348 if (i & permute_size)
4349 b_elt_size |= i - permute_size;
4350 else
4351 a_elt_size |= i;
4353 a_elt_size &= -a_elt_size;
4354 b_elt_size &= -b_elt_size;
4356 /* Now construct the vectors themselves. */
4357 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4358 builder.nelts_per_pattern ());
4359 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4360 builder.nelts_per_pattern ());
4361 unsigned int nelts = builder.encoded_nelts ();
4362 for (unsigned int i = 0; i < nelts; ++i)
4363 if (i & (elt_size - 1))
4365 a_builder.quick_push (const0_rtx);
4366 b_builder.quick_push (const0_rtx);
4368 else if ((i & permute_size) == 0)
4370 /* The A and B elements are significant. */
4371 a_builder.quick_push (builder.elt (i));
4372 b_builder.quick_push (builder.elt (i + permute_size));
4374 else
4376 /* The A and B elements are going to be discarded, so pick whatever
4377 is likely to give a nice constant. We are targeting element
4378 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4379 with the aim of each being a sequence of ones followed by
4380 a sequence of zeros. So:
4382 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4383 duplicate the last X_ELT_SIZE element, to extend the
4384 current sequence of ones or zeros.
4386 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4387 zero, so that the constant really does have X_ELT_SIZE and
4388 not a smaller size. */
4389 if (a_elt_size > permute_size)
4390 a_builder.quick_push (const0_rtx);
4391 else
4392 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4393 if (b_elt_size > permute_size)
4394 b_builder.quick_push (const0_rtx);
4395 else
4396 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4398 a_builder.finalize ();
4399 b_builder.finalize ();
4401 /* Try loading A into a register. */
4402 rtx_insn *last = get_last_insn ();
4403 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4404 if (!a)
4405 return NULL_RTX;
4407 /* Try loading B into a register. */
4408 rtx b = a;
4409 if (a_builder != b_builder)
4411 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4412 if (!b)
4414 delete_insns_since (last);
4415 return NULL_RTX;
4419 /* Emit the TRN1 itself. */
4420 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4421 target = aarch64_target_reg (target, mode);
4422 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4423 gen_lowpart (mode, a),
4424 gen_lowpart (mode, b)));
4425 return target;
4428 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4429 constant in BUILDER into an SVE predicate register. Return the register
4430 on success, otherwise return null. Use TARGET for the register if
4431 nonnull and convenient.
4433 ALLOW_RECURSE_P is true if we can use methods that would call this
4434 function recursively. */
4436 static rtx
4437 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4438 bool allow_recurse_p)
4440 if (builder.encoded_nelts () == 1)
4441 /* A PFALSE or a PTRUE .B ALL. */
4442 return aarch64_emit_set_immediate (target, builder);
4444 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4445 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4447 /* If we can load the constant using PTRUE, use it as-is. */
4448 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4449 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4450 return aarch64_emit_set_immediate (target, builder);
4452 /* Otherwise use WHILE to set the first VL bits. */
4453 return aarch64_sve_move_pred_via_while (target, mode, vl);
4456 if (!allow_recurse_p)
4457 return NULL_RTX;
4459 /* Try inverting the vector in element size ELT_SIZE and then EORing
4460 the result with an ELT_SIZE PTRUE. */
4461 if (INTVAL (builder.elt (0)) == 0)
4462 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4463 elt_size))
4464 return res;
4466 /* Try using TRN1 to permute two simpler constants. */
4467 for (unsigned int i = elt_size; i <= 8; i *= 2)
4468 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4469 elt_size, i))
4470 return res;
4472 return NULL_RTX;
4475 /* Return an SVE predicate register that contains the VNx16BImode
4476 constant in BUILDER, without going through the move expanders.
4478 The returned register can have whatever mode seems most natural
4479 given the contents of BUILDER. Use TARGET for the result if
4480 convenient. */
4482 static rtx
4483 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4485 /* Try loading the constant using pure predicate operations. */
4486 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4487 return res;
4489 /* Try forcing the constant to memory. */
4490 if (builder.full_nelts ().is_constant ())
4491 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4493 target = aarch64_target_reg (target, VNx16BImode);
4494 emit_move_insn (target, mem);
4495 return target;
4498 /* The last resort is to load the constant as an integer and then
4499 compare it against zero. Use -1 for set bits in order to increase
4500 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4501 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4502 builder.nelts_per_pattern ());
4503 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4504 int_builder.quick_push (INTVAL (builder.elt (i))
4505 ? constm1_rtx : const0_rtx);
4506 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4507 int_builder.build ());
4510 /* Set DEST to immediate IMM. */
4512 void
4513 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4515 machine_mode mode = GET_MODE (dest);
4517 /* Check on what type of symbol it is. */
4518 scalar_int_mode int_mode;
4519 if ((GET_CODE (imm) == SYMBOL_REF
4520 || GET_CODE (imm) == LABEL_REF
4521 || GET_CODE (imm) == CONST
4522 || GET_CODE (imm) == CONST_POLY_INT)
4523 && is_a <scalar_int_mode> (mode, &int_mode))
4525 rtx mem;
4526 poly_int64 offset;
4527 HOST_WIDE_INT const_offset;
4528 enum aarch64_symbol_type sty;
4530 /* If we have (const (plus symbol offset)), separate out the offset
4531 before we start classifying the symbol. */
4532 rtx base = strip_offset (imm, &offset);
4534 /* We must always add an offset involving VL separately, rather than
4535 folding it into the relocation. */
4536 if (!offset.is_constant (&const_offset))
4538 if (!TARGET_SVE)
4540 aarch64_report_sve_required ();
4541 return;
4543 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4544 emit_insn (gen_rtx_SET (dest, imm));
4545 else
4547 /* Do arithmetic on 32-bit values if the result is smaller
4548 than that. */
4549 if (partial_subreg_p (int_mode, SImode))
4551 /* It is invalid to do symbol calculations in modes
4552 narrower than SImode. */
4553 gcc_assert (base == const0_rtx);
4554 dest = gen_lowpart (SImode, dest);
4555 int_mode = SImode;
4557 if (base != const0_rtx)
4559 base = aarch64_force_temporary (int_mode, dest, base);
4560 aarch64_add_offset (int_mode, dest, base, offset,
4561 NULL_RTX, NULL_RTX, false);
4563 else
4564 aarch64_add_offset (int_mode, dest, base, offset,
4565 dest, NULL_RTX, false);
4567 return;
4570 sty = aarch64_classify_symbol (base, const_offset);
4571 switch (sty)
4573 case SYMBOL_FORCE_TO_MEM:
4574 if (const_offset != 0
4575 && targetm.cannot_force_const_mem (int_mode, imm))
4577 gcc_assert (can_create_pseudo_p ());
4578 base = aarch64_force_temporary (int_mode, dest, base);
4579 aarch64_add_offset (int_mode, dest, base, const_offset,
4580 NULL_RTX, NULL_RTX, false);
4581 return;
4584 mem = force_const_mem (ptr_mode, imm);
4585 gcc_assert (mem);
4587 /* If we aren't generating PC relative literals, then
4588 we need to expand the literal pool access carefully.
4589 This is something that needs to be done in a number
4590 of places, so could well live as a separate function. */
4591 if (!aarch64_pcrelative_literal_loads)
4593 gcc_assert (can_create_pseudo_p ());
4594 base = gen_reg_rtx (ptr_mode);
4595 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4596 if (ptr_mode != Pmode)
4597 base = convert_memory_address (Pmode, base);
4598 mem = gen_rtx_MEM (ptr_mode, base);
4601 if (int_mode != ptr_mode)
4602 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4604 emit_insn (gen_rtx_SET (dest, mem));
4606 return;
4608 case SYMBOL_SMALL_TLSGD:
4609 case SYMBOL_SMALL_TLSDESC:
4610 case SYMBOL_SMALL_TLSIE:
4611 case SYMBOL_SMALL_GOT_28K:
4612 case SYMBOL_SMALL_GOT_4G:
4613 case SYMBOL_TINY_GOT:
4614 case SYMBOL_TINY_TLSIE:
4615 if (const_offset != 0)
4617 gcc_assert (can_create_pseudo_p ());
4618 base = aarch64_force_temporary (int_mode, dest, base);
4619 aarch64_add_offset (int_mode, dest, base, const_offset,
4620 NULL_RTX, NULL_RTX, false);
4621 return;
4623 /* FALLTHRU */
4625 case SYMBOL_SMALL_ABSOLUTE:
4626 case SYMBOL_TINY_ABSOLUTE:
4627 case SYMBOL_TLSLE12:
4628 case SYMBOL_TLSLE24:
4629 case SYMBOL_TLSLE32:
4630 case SYMBOL_TLSLE48:
4631 aarch64_load_symref_appropriately (dest, imm, sty);
4632 return;
4634 default:
4635 gcc_unreachable ();
4639 if (!CONST_INT_P (imm))
4641 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4643 /* Only the low bit of each .H, .S and .D element is defined,
4644 so we can set the upper bits to whatever we like. If the
4645 predicate is all-true in MODE, prefer to set all the undefined
4646 bits as well, so that we can share a single .B predicate for
4647 all modes. */
4648 if (imm == CONSTM1_RTX (mode))
4649 imm = CONSTM1_RTX (VNx16BImode);
4651 /* All methods for constructing predicate modes wider than VNx16BI
4652 will set the upper bits of each element to zero. Expose this
4653 by moving such constants as a VNx16BI, so that all bits are
4654 significant and so that constants for different modes can be
4655 shared. The wider constant will still be available as a
4656 REG_EQUAL note. */
4657 rtx_vector_builder builder;
4658 if (aarch64_get_sve_pred_bits (builder, imm))
4660 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4661 if (dest != res)
4662 emit_move_insn (dest, gen_lowpart (mode, res));
4663 return;
4667 if (GET_CODE (imm) == HIGH
4668 || aarch64_simd_valid_immediate (imm, NULL))
4670 emit_insn (gen_rtx_SET (dest, imm));
4671 return;
4674 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4675 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4677 if (dest != res)
4678 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4679 return;
4682 rtx mem = force_const_mem (mode, imm);
4683 gcc_assert (mem);
4684 emit_move_insn (dest, mem);
4685 return;
4688 aarch64_internal_mov_immediate (dest, imm, true,
4689 as_a <scalar_int_mode> (mode));
4692 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4693 that is known to contain PTRUE. */
4695 void
4696 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4698 expand_operand ops[3];
4699 machine_mode mode = GET_MODE (dest);
4700 create_output_operand (&ops[0], dest, mode);
4701 create_input_operand (&ops[1], pred, GET_MODE (pred));
4702 create_input_operand (&ops[2], src, mode);
4703 temporary_volatile_ok v (true);
4704 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4707 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4708 operand is in memory. In this case we need to use the predicated LD1
4709 and ST1 instead of LDR and STR, both for correctness on big-endian
4710 targets and because LD1 and ST1 support a wider range of addressing modes.
4711 PRED_MODE is the mode of the predicate.
4713 See the comment at the head of aarch64-sve.md for details about the
4714 big-endian handling. */
4716 void
4717 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4719 machine_mode mode = GET_MODE (dest);
4720 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4721 if (!register_operand (src, mode)
4722 && !register_operand (dest, mode))
4724 rtx tmp = gen_reg_rtx (mode);
4725 if (MEM_P (src))
4726 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4727 else
4728 emit_move_insn (tmp, src);
4729 src = tmp;
4731 aarch64_emit_sve_pred_move (dest, ptrue, src);
4734 /* Called only on big-endian targets. See whether an SVE vector move
4735 from SRC to DEST is effectively a REV[BHW] instruction, because at
4736 least one operand is a subreg of an SVE vector that has wider or
4737 narrower elements. Return true and emit the instruction if so.
4739 For example:
4741 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4743 represents a VIEW_CONVERT between the following vectors, viewed
4744 in memory order:
4746 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4747 R1: { [0], [1], [2], [3], ... }
4749 The high part of lane X in R2 should therefore correspond to lane X*2
4750 of R1, but the register representations are:
4752 msb lsb
4753 R2: ...... [1].high [1].low [0].high [0].low
4754 R1: ...... [3] [2] [1] [0]
4756 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4757 We therefore need a reverse operation to swap the high and low values
4758 around.
4760 This is purely an optimization. Without it we would spill the
4761 subreg operand to the stack in one mode and reload it in the
4762 other mode, which has the same effect as the REV. */
4764 bool
4765 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4767 gcc_assert (BYTES_BIG_ENDIAN);
4768 if (GET_CODE (dest) == SUBREG)
4769 dest = SUBREG_REG (dest);
4770 if (GET_CODE (src) == SUBREG)
4771 src = SUBREG_REG (src);
4773 /* The optimization handles two single SVE REGs with different element
4774 sizes. */
4775 if (!REG_P (dest)
4776 || !REG_P (src)
4777 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4778 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4779 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4780 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4781 return false;
4783 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4784 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4785 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4786 UNSPEC_REV_SUBREG);
4787 emit_insn (gen_rtx_SET (dest, unspec));
4788 return true;
4791 /* Return a copy of X with mode MODE, without changing its other
4792 attributes. Unlike gen_lowpart, this doesn't care whether the
4793 mode change is valid. */
4796 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4798 if (GET_MODE (x) == mode)
4799 return x;
4801 x = shallow_copy_rtx (x);
4802 set_mode_and_regno (x, mode, REGNO (x));
4803 return x;
4806 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4807 stored in wider integer containers. */
4809 static unsigned int
4810 aarch64_sve_rev_unspec (machine_mode mode)
4812 switch (GET_MODE_UNIT_SIZE (mode))
4814 case 1: return UNSPEC_REVB;
4815 case 2: return UNSPEC_REVH;
4816 case 4: return UNSPEC_REVW;
4818 gcc_unreachable ();
4821 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4822 operands. */
4824 void
4825 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4827 /* Decide which REV operation we need. The mode with wider elements
4828 determines the mode of the operands and the mode with the narrower
4829 elements determines the reverse width. */
4830 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
4831 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
4832 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4833 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4834 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4836 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4837 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
4839 /* Get the operands in the appropriate modes and emit the instruction. */
4840 ptrue = gen_lowpart (pred_mode, ptrue);
4841 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4842 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4843 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4844 dest, ptrue, src));
4847 static bool
4848 aarch64_function_ok_for_sibcall (tree, tree exp)
4850 if (crtl->abi->id () != expr_callee_abi (exp).id ())
4851 return false;
4853 return true;
4856 /* Implement TARGET_PASS_BY_REFERENCE. */
4858 static bool
4859 aarch64_pass_by_reference (cumulative_args_t pcum_v,
4860 const function_arg_info &arg)
4862 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4863 HOST_WIDE_INT size;
4864 machine_mode dummymode;
4865 int nregs;
4867 unsigned int num_zr, num_pr;
4868 if (arg.type && aarch64_sve::builtin_type_p (arg.type, &num_zr, &num_pr))
4870 if (pcum && !pcum->silent_p && !TARGET_SVE)
4871 /* We can't gracefully recover at this point, so make this a
4872 fatal error. */
4873 fatal_error (input_location, "arguments of type %qT require"
4874 " the SVE ISA extension", arg.type);
4876 /* Variadic SVE types are passed by reference. Normal non-variadic
4877 arguments are too if we've run out of registers. */
4878 return (!arg.named
4879 || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
4880 || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
4883 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4884 if (arg.mode == BLKmode && arg.type)
4885 size = int_size_in_bytes (arg.type);
4886 else
4887 /* No frontends can create types with variable-sized modes, so we
4888 shouldn't be asked to pass or return them. */
4889 size = GET_MODE_SIZE (arg.mode).to_constant ();
4891 /* Aggregates are passed by reference based on their size. */
4892 if (arg.aggregate_type_p ())
4893 size = int_size_in_bytes (arg.type);
4895 /* Variable sized arguments are always returned by reference. */
4896 if (size < 0)
4897 return true;
4899 /* Can this be a candidate to be passed in fp/simd register(s)? */
4900 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4901 &dummymode, &nregs,
4902 NULL))
4903 return false;
4905 /* Arguments which are variable sized or larger than 2 registers are
4906 passed by reference unless they are a homogeneous floating-point
4907 aggregate. */
4908 return size > 2 * UNITS_PER_WORD;
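/* As an illustration of the rules above (sketches using hypothetical
   C types):

     struct { long x[2]; }    16 bytes, passed by value in two GPRs
     struct { long x[3]; }    24 bytes, not an HFA, passed by reference
     struct { double d[4]; }  a 32-byte HFA, still passed by value in
                              d0-d3 because HFAs are exempt from the
                              two-register limit  */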
4911 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4912 static bool
4913 aarch64_return_in_msb (const_tree valtype)
4915 machine_mode dummy_mode;
4916 int dummy_int;
4918 /* Never happens in little-endian mode. */
4919 if (!BYTES_BIG_ENDIAN)
4920 return false;
4922 /* Only composite types smaller than or equal to 16 bytes can
4923 be potentially returned in registers. */
4924 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4925 || int_size_in_bytes (valtype) <= 0
4926 || int_size_in_bytes (valtype) > 16)
4927 return false;
4929 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4930 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4931 is always passed/returned in the least significant bits of fp/simd
4932 register(s). */
4933 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4934 &dummy_mode, &dummy_int, NULL))
4935 return false;
4937 return true;
4940 /* Subroutine of aarch64_function_value. MODE is the mode of the argument
4941 after promotion, and after partial SVE types have been replaced by
4942 their integer equivalents. */
4943 static rtx
4944 aarch64_function_value_1 (const_tree type, machine_mode mode)
4946 unsigned int num_zr, num_pr;
4947 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
4949 /* Don't raise an error here if we're called when SVE is disabled,
4950 since this is really just a query function. Other code must
4951 do that where appropriate. */
4952 mode = TYPE_MODE_RAW (type);
4953 gcc_assert (VECTOR_MODE_P (mode)
4954 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
4956 if (num_zr > 0 && num_pr == 0)
4957 return gen_rtx_REG (mode, V0_REGNUM);
4959 if (num_zr == 0 && num_pr == 1)
4960 return gen_rtx_REG (mode, P0_REGNUM);
4962 gcc_unreachable ();
4965 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4966 returned in memory, not by value. */
4967 gcc_assert (!aarch64_sve_mode_p (mode));
4969 if (aarch64_return_in_msb (type))
4971 HOST_WIDE_INT size = int_size_in_bytes (type);
4973 if (size % UNITS_PER_WORD != 0)
4975 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4976 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4980 int count;
4981 machine_mode ag_mode;
4982 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4983 &ag_mode, &count, NULL))
4985 if (!aarch64_composite_type_p (type, mode))
4987 gcc_assert (count == 1 && mode == ag_mode);
4988 return gen_rtx_REG (mode, V0_REGNUM);
4990 else
4992 int i;
4993 rtx par;
4995 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4996 for (i = 0; i < count; i++)
4998 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4999 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5000 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5001 XVECEXP (par, 0, i) = tmp;
5003 return par;
5006 else
5007 return gen_rtx_REG (mode, R0_REGNUM);
5010 /* Implement TARGET_FUNCTION_VALUE.
5011 Define how to find the value returned by a function. */
5013 static rtx
5014 aarch64_function_value (const_tree type, const_tree func,
5015 bool outgoing ATTRIBUTE_UNUSED)
5017 machine_mode mode;
5018 int unsignedp;
5020 mode = TYPE_MODE (type);
5021 if (INTEGRAL_TYPE_P (type))
5022 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5024 /* Vector types can acquire a partial SVE mode using things like
5025 __attribute__((vector_size(N))), and this is potentially useful.
5026 However, the choice of mode doesn't affect the type's ABI identity,
5027 so we should treat the types as though they had the associated
5028 integer mode, just like they did before SVE was introduced.
5030 We know that the vector must be 128 bits or smaller, otherwise we'd
5031 have returned it in memory instead. */
5032 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5033 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5035 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5036 rtx reg = aarch64_function_value_1 (type, int_mode);
5037 /* Vector types are never returned in the MSB and are never split. */
5038 gcc_assert (REG_P (reg) && GET_MODE (reg) == int_mode);
5039 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5040 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, pair));
5043 return aarch64_function_value_1 (type, mode);
5046 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5047 Return true if REGNO is the number of a hard register in which the values
5048 of a called function may come back. */
5050 static bool
5051 aarch64_function_value_regno_p (const unsigned int regno)
5053 /* Maximum of 16 bytes can be returned in the general registers. Examples
5054 of 16-byte return values are: 128-bit integers and 16-byte small
5055 structures (excluding homogeneous floating-point aggregates). */
5056 if (regno == R0_REGNUM || regno == R1_REGNUM)
5057 return true;
5059 /* Up to four fp/simd registers can return a function value, e.g. a
5060 homogeneous floating-point aggregate having four members. */
5061 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5062 return TARGET_FLOAT;
5064 return false;
5067 /* Implement TARGET_RETURN_IN_MEMORY.
5069 If the type T of the result of a function is such that
5070 void func (T arg)
5071 would require that arg be passed as a value in a register (or set of
5072 registers) according to the parameter passing rules, then the result
5073 is returned in the same registers as would be used for such an
5074 argument. */
5076 static bool
5077 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5079 HOST_WIDE_INT size;
5080 machine_mode ag_mode;
5081 int count;
5083 if (!AGGREGATE_TYPE_P (type)
5084 && TREE_CODE (type) != COMPLEX_TYPE
5085 && TREE_CODE (type) != VECTOR_TYPE)
5086 /* Simple scalar types are always returned in registers. */
5087 return false;
5089 unsigned int num_zr, num_pr;
5090 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
5092 /* All SVE types we support fit in registers. For example, it isn't
5093 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
5094 predicates. */
5095 gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
5096 return false;
5099 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
5100 type,
5101 &ag_mode,
5102 &count,
5103 NULL))
5104 return false;
5106 /* Types larger than 2 registers are returned in memory. */
5107 size = int_size_in_bytes (type);
5108 return (size < 0 || size > 2 * UNITS_PER_WORD);
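/* As an illustration (a sketch): by the rule above a 32-byte non-HFA
   structure result is returned in memory, with the caller passing the
   address of that memory in x8 as per AAPCS64, whereas a 16-byte
   struct of two longs comes back in x0/x1.  */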
5111 static bool
5112 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5113 const_tree type, int *nregs)
5115 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5116 return aarch64_vfp_is_call_or_return_candidate (mode,
5117 type,
5118 &pcum->aapcs_vfp_rmode,
5119 nregs,
5120 NULL);
5123 /* Given MODE and TYPE of a function argument, return the alignment in
5124 bits. The idea is to suppress any stronger alignment requested by
5125 the user and opt for the natural alignment (specified in AAPCS64 \S
5126 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5127 calculated in versions of GCC prior to GCC-9. This is a helper
5128 function for local use only. */
5130 static unsigned int
5131 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5132 bool *abi_break)
5134 *abi_break = false;
5135 if (!type)
5136 return GET_MODE_ALIGNMENT (mode);
5138 if (integer_zerop (TYPE_SIZE (type)))
5139 return 0;
5141 gcc_assert (TYPE_MODE (type) == mode);
5143 if (!AGGREGATE_TYPE_P (type))
5144 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5146 if (TREE_CODE (type) == ARRAY_TYPE)
5147 return TYPE_ALIGN (TREE_TYPE (type));
5149 unsigned int alignment = 0;
5150 unsigned int bitfield_alignment = 0;
5151 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5152 if (TREE_CODE (field) == FIELD_DECL)
5154 alignment = std::max (alignment, DECL_ALIGN (field));
5155 if (DECL_BIT_FIELD_TYPE (field))
5156 bitfield_alignment
5157 = std::max (bitfield_alignment,
5158 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5161 if (bitfield_alignment > alignment)
5163 *abi_break = true;
5164 return bitfield_alignment;
5167 return alignment;
5170 /* Layout a function argument according to the AAPCS64 rules. The rule
5171 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5172 mode that was originally given to us by the target hook, whereas the
5173 mode in ARG might be the result of replacing partial SVE modes with
5174 the equivalent integer mode. */
5176 static void
5177 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg,
5178 machine_mode orig_mode)
5180 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5181 tree type = arg.type;
5182 machine_mode mode = arg.mode;
5183 int ncrn, nvrn, nregs;
5184 bool allocate_ncrn, allocate_nvrn;
5185 HOST_WIDE_INT size;
5186 bool abi_break;
5188 /* We need to do this once per argument. */
5189 if (pcum->aapcs_arg_processed)
5190 return;
5192 /* Vector types can acquire a partial SVE mode using things like
5193 __attribute__((vector_size(N))), and this is potentially useful.
5194 However, the choice of mode doesn't affect the type's ABI identity,
5195 so we should treat the types as though they had the associated
5196 integer mode, just like they did before SVE was introduced.
5198 We know that the vector must be 128 bits or smaller, otherwise we'd
5199 have passed it by reference instead. */
5200 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5201 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5203 function_arg_info tmp_arg = arg;
5204 tmp_arg.mode = int_mode_for_mode (mode).require ();
5205 aarch64_layout_arg (pcum_v, tmp_arg, orig_mode);
5206 if (rtx reg = pcum->aapcs_reg)
5208 gcc_assert (REG_P (reg) && GET_MODE (reg) == tmp_arg.mode);
5209 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5210 pcum->aapcs_reg = gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5212 return;
5215 pcum->aapcs_arg_processed = true;
5217 unsigned int num_zr, num_pr;
5218 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
5220 /* The PCS says that it is invalid to pass an SVE value to an
5221 unprototyped function. There is no ABI-defined location we
5222 can return in this case, so we have no real choice but to raise
5223 an error immediately, even though this is only a query function. */
5224 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5226 gcc_assert (!pcum->silent_p);
5227 error ("SVE type %qT cannot be passed to an unprototyped function",
5228 arg.type);
5229 /* Avoid repeating the message, and avoid tripping the assert
5230 below. */
5231 pcum->pcs_variant = ARM_PCS_SVE;
5234 /* We would have converted the argument into pass-by-reference
5235 form if it didn't fit in registers. */
5236 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
5237 pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
5238 gcc_assert (arg.named
5239 && pcum->pcs_variant == ARM_PCS_SVE
5240 && aarch64_sve_mode_p (mode)
5241 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5242 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5244 if (num_zr > 0 && num_pr == 0)
5245 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
5246 else if (num_zr == 0 && num_pr == 1)
5247 pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
5248 else
5249 gcc_unreachable ();
5250 return;
5253 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5254 passed by reference, not by value. */
5255 gcc_assert (!aarch64_sve_mode_p (mode));
5257 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5258 if (type)
5259 size = int_size_in_bytes (type);
5260 else
5261 /* No frontends can create types with variable-sized modes, so we
5262 shouldn't be asked to pass or return them. */
5263 size = GET_MODE_SIZE (mode).to_constant ();
5264 size = ROUND_UP (size, UNITS_PER_WORD);
5266 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5267 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5268 mode,
5269 type,
5270 &nregs);
5272 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5273 The following code thus handles passing by SIMD/FP registers first. */
5275 nvrn = pcum->aapcs_nvrn;
5277 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
5278 and homogeneous short-vector aggregates (HVA). */
5279 if (allocate_nvrn)
5281 if (!pcum->silent_p && !TARGET_FLOAT)
5282 aarch64_err_no_fpadvsimd (mode);
5284 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5286 pcum->aapcs_nextnvrn = nvrn + nregs;
5287 if (!aarch64_composite_type_p (type, mode))
5289 gcc_assert (nregs == 1);
5290 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5292 else
5294 rtx par;
5295 int i;
5296 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5297 for (i = 0; i < nregs; i++)
5299 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5300 V0_REGNUM + nvrn + i);
5301 rtx offset = gen_int_mode
5302 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5303 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5304 XVECEXP (par, 0, i) = tmp;
5306 pcum->aapcs_reg = par;
5308 return;
5310 else
5312 /* C.3 NSRN is set to 8. */
5313 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5314 goto on_stack;
5318 ncrn = pcum->aapcs_ncrn;
5319 nregs = size / UNITS_PER_WORD;
5321 /* C6 - C9, though the sign and zero extension semantics are
5322 handled elsewhere. This is the case where the argument fits
5323 entirely in general registers. */
5324 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5326 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5328 /* C.8 if the argument has an alignment of 16 then the NGRN is
5329 rounded up to the next even number. */
5330 if (nregs == 2
5331 && ncrn % 2
5332 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5333 comparison is there because for > 16 * BITS_PER_UNIT
5334 alignment nregs should be > 2 and therefore it should be
5335 passed by reference rather than value. */
5336 && (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5337 == 16 * BITS_PER_UNIT))
5339 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5340 inform (input_location, "parameter passing for argument of type "
5341 "%qT changed in GCC 9.1", type);
5342 ++ncrn;
5343 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5346 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5347 A reg is still generated for it, but the caller should be smart
5348 enough not to use it. */
5349 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
5350 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5351 else
5353 rtx par;
5354 int i;
5356 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5357 for (i = 0; i < nregs; i++)
5359 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
5360 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5361 GEN_INT (i * UNITS_PER_WORD));
5362 XVECEXP (par, 0, i) = tmp;
5364 pcum->aapcs_reg = par;
5367 pcum->aapcs_nextncrn = ncrn + nregs;
5368 return;
5371 /* C.11 */
5372 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5374 /* The argument is passed on stack; record the needed number of words for
5375 this argument and align the total size if necessary. */
5376 on_stack:
5377 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5379 if (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5380 == 16 * BITS_PER_UNIT)
5382 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5383 if (pcum->aapcs_stack_size != new_size)
5385 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5386 inform (input_location, "parameter passing for argument of type "
5387 "%qT changed in GCC 9.1", type);
5388 pcum->aapcs_stack_size = new_size;
5391 return;
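/* As an illustration (a sketch): for a call such as
   f (int x, __int128 y), the 16-byte-aligned __int128 triggers rule
   C.8 above: x goes in w0, the odd NGRN is rounded up, and y is
   passed in the even/odd pair x2/x3 rather than x1/x2.  The same
   16-byte alignment rounds the NSAA up when the argument goes on the
   stack.  */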
5394 /* Implement TARGET_FUNCTION_ARG. */
5396 static rtx
5397 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5399 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5400 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5401 || pcum->pcs_variant == ARM_PCS_SIMD
5402 || pcum->pcs_variant == ARM_PCS_SVE);
5404 if (arg.end_marker_p ())
5405 return gen_int_mode (pcum->pcs_variant, DImode);
5407 aarch64_layout_arg (pcum_v, arg, arg.mode);
5408 return pcum->aapcs_reg;
5411 void
5412 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5413 const_tree fntype,
5414 rtx libname ATTRIBUTE_UNUSED,
5415 const_tree fndecl ATTRIBUTE_UNUSED,
5416 unsigned n_named ATTRIBUTE_UNUSED,
5417 bool silent_p)
5419 pcum->aapcs_ncrn = 0;
5420 pcum->aapcs_nvrn = 0;
5421 pcum->aapcs_nprn = 0;
5422 pcum->aapcs_nextncrn = 0;
5423 pcum->aapcs_nextnvrn = 0;
5424 pcum->aapcs_nextnprn = 0;
5425 if (fntype)
5426 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5427 else
5428 pcum->pcs_variant = ARM_PCS_AAPCS64;
5429 pcum->aapcs_reg = NULL_RTX;
5430 pcum->aapcs_arg_processed = false;
5431 pcum->aapcs_stack_words = 0;
5432 pcum->aapcs_stack_size = 0;
5433 pcum->silent_p = silent_p;
5435 if (!silent_p
5436 && !TARGET_FLOAT
5437 && fndecl && TREE_PUBLIC (fndecl)
5438 && fntype && fntype != error_mark_node)
5440 const_tree type = TREE_TYPE (fntype);
5441 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5442 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5443 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5444 &mode, &nregs, NULL))
5445 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5448 if (!silent_p
5449 && !TARGET_SVE
5450 && pcum->pcs_variant == ARM_PCS_SVE)
5452 /* We can't gracefully recover at this point, so make this a
5453 fatal error. */
5454 if (fndecl)
5455 fatal_error (input_location, "%qE requires the SVE ISA extension",
5456 fndecl);
5457 else
5458 fatal_error (input_location, "calls to functions of type %qT require"
5459 " the SVE ISA extension", fntype);
5463 static void
5464 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5465 const function_arg_info &arg)
5467 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5468 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5469 || pcum->pcs_variant == ARM_PCS_SIMD
5470 || pcum->pcs_variant == ARM_PCS_SVE)
5472 aarch64_layout_arg (pcum_v, arg, arg.mode);
5473 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5474 != (pcum->aapcs_stack_words != 0));
5475 pcum->aapcs_arg_processed = false;
5476 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5477 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5478 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5479 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5480 pcum->aapcs_stack_words = 0;
5481 pcum->aapcs_reg = NULL_RTX;
5485 bool
5486 aarch64_function_arg_regno_p (unsigned regno)
5488 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5489 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5492 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5493 PARM_BOUNDARY bits of alignment, but will be given anything up
5494 to STACK_BOUNDARY bits if the type requires it. This makes sure
5495 that both before and after the layout of each argument, the Next
5496 Stacked Argument Address (NSAA) will have a minimum alignment of
5497 8 bytes. */
5499 static unsigned int
5500 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5502 bool abi_break;
5503 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5504 &abi_break);
5505 if (abi_break && warn_psabi)
5506 inform (input_location, "parameter passing for argument of type "
5507 "%qT changed in GCC 9.1", type);
5509 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
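/* As a worked example (illustrative only, assuming the usual AArch64
   values of PARM_BOUNDARY == 64 bits and STACK_BOUNDARY == 128 bits):
   an argument with 32-bit natural alignment is bumped up to
   MIN (MAX (32, 64), 128) == 64, while an over-aligned 256-bit type is
   clamped to MIN (MAX (256, 64), 128) == 128.  */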
5512 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5514 static fixed_size_mode
5515 aarch64_get_reg_raw_mode (int regno)
5517 if (TARGET_SVE && FP_REGNUM_P (regno))
5518 /* Don't use the SVE part of the register for __builtin_apply and
5519 __builtin_return. The SVE registers aren't used by the normal PCS,
5520 so using them there would be a waste of time. The PCS extensions
5521 for SVE types are fundamentally incompatible with the
5522 __builtin_return/__builtin_apply interface. */
5523 return as_a <fixed_size_mode> (V16QImode);
5524 return default_get_reg_raw_mode (regno);
5527 /* Implement TARGET_FUNCTION_ARG_PADDING.
5529 Small aggregate types are placed in the lowest memory address.
5531 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5533 static pad_direction
5534 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5536 /* On little-endian targets, the least significant byte of every stack
5537 argument is passed at the lowest byte address of the stack slot. */
5538 if (!BYTES_BIG_ENDIAN)
5539 return PAD_UPWARD;
5541 /* Otherwise, integral, floating-point and pointer types are padded downward:
5542 the least significant byte of a stack argument is passed at the highest
5543 byte address of the stack slot. */
5544 if (type
5545 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5546 || POINTER_TYPE_P (type))
5547 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5548 return PAD_DOWNWARD;
5550 /* Everything else padded upward, i.e. data in first byte of stack slot. */
5551 return PAD_UPWARD;
5554 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5556 It specifies padding for the last (and possibly only)
5557 element of a block move between registers and memory. Assuming
5558 the block is in memory, padding upward means that the last
5559 element is padded after its most significant byte, while in
5560 downward padding, the last element is padded at its least
5561 significant byte side.
5563 Small aggregates and small complex types are always padded
5564 upwards.
5566 We don't need to worry about homogeneous floating-point or
5567 short-vector aggregates; their move is not affected by the
5568 padding direction determined here. Regardless of endianness,
5569 each element of such an aggregate is put in the least
5570 significant bits of a fp/simd register.
5572 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5573 register has useful data, and return the opposite if the most
5574 significant byte does. */
5576 bool
5577 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5578 bool first ATTRIBUTE_UNUSED)
5581 /* Small composite types are always padded upward. */
5582 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5584 HOST_WIDE_INT size;
5585 if (type)
5586 size = int_size_in_bytes (type);
5587 else
5588 /* No frontends can create types with variable-sized modes, so we
5589 shouldn't be asked to pass or return them. */
5590 size = GET_MODE_SIZE (mode).to_constant ();
5591 if (size < 2 * UNITS_PER_WORD)
5592 return true;
5595 /* Otherwise, use the default padding. */
5596 return !BYTES_BIG_ENDIAN;
5599 static scalar_int_mode
5600 aarch64_libgcc_cmp_return_mode (void)
5602 return SImode;
5605 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5607 /* We use the 12-bit shifted immediate arithmetic instructions so values
5608 must be multiple of (1 << 12), i.e. 4096. */
5609 #define ARITH_FACTOR 4096
5611 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5612 #error Cannot use simple address calculation for stack probing
5613 #endif
5615 /* The pair of scratch registers used for stack probing. */
5616 #define PROBE_STACK_FIRST_REG R9_REGNUM
5617 #define PROBE_STACK_SECOND_REG R10_REGNUM
5619 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5620 inclusive. These are offsets from the current stack pointer. */
5622 static void
5623 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5625 HOST_WIDE_INT size;
5626 if (!poly_size.is_constant (&size))
5628 sorry ("stack probes for SVE frames");
5629 return;
5632 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5634 /* See the same assertion on PROBE_INTERVAL above. */
5635 gcc_assert ((first % ARITH_FACTOR) == 0);
5637 /* See if we have a constant small number of probes to generate. If so,
5638 that's the easy case. */
5639 if (size <= PROBE_INTERVAL)
5641 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5643 emit_set_insn (reg1,
5644 plus_constant (Pmode,
5645 stack_pointer_rtx, -(first + base)));
5646 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
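/* For example (illustrative numbers only): with FIRST == 0 and
   SIZE == 3000, BASE is ROUND_UP (3000, 4096) == 4096, so REG1 is set
   to SP - 4096 and the single probe lands at REG1 + (4096 - 3000),
   i.e. exactly at SP - 3000.  */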
5649 /* The run-time loop is made up of 8 insns in the generic case while the
5650 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5651 else if (size <= 4 * PROBE_INTERVAL)
5653 HOST_WIDE_INT i, rem;
5655 emit_set_insn (reg1,
5656 plus_constant (Pmode,
5657 stack_pointer_rtx,
5658 -(first + PROBE_INTERVAL)));
5659 emit_stack_probe (reg1);
5661 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5662 it exceeds SIZE. If only two probes are needed, this will not
5663 generate any code. Then probe at FIRST + SIZE. */
5664 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5666 emit_set_insn (reg1,
5667 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5668 emit_stack_probe (reg1);
5671 rem = size - (i - PROBE_INTERVAL);
5672 if (rem > 256)
5674 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5676 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5677 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5679 else
5680 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5683 /* Otherwise, do the same as above, but in a loop. Note that we must be
5684 extra careful with variables wrapping around because we might be at
5685 the very top (or the very bottom) of the address space and we have
5686 to be able to handle this case properly; in particular, we use an
5687 equality test for the loop condition. */
5688 else
5690 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5692 /* Step 1: round SIZE to the previous multiple of the interval. */
5694 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5697 /* Step 2: compute initial and final value of the loop counter. */
5699 /* TEST_ADDR = SP + FIRST. */
5700 emit_set_insn (reg1,
5701 plus_constant (Pmode, stack_pointer_rtx, -first));
5703 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5704 HOST_WIDE_INT adjustment = - (first + rounded_size);
5705 if (! aarch64_uimm12_shift (adjustment))
5707 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5708 true, Pmode);
5709 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5711 else
5712 emit_set_insn (reg2,
5713 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5715 /* Step 3: the loop
5719 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5720 probe at TEST_ADDR
5722 while (TEST_ADDR != LAST_ADDR)
5724 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5725 until it is equal to ROUNDED_SIZE. */
5727 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5730 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5731 that SIZE is equal to ROUNDED_SIZE. */
5733 if (size != rounded_size)
5735 HOST_WIDE_INT rem = size - rounded_size;
5737 if (rem > 256)
5739 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5741 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5742 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5744 else
5745 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5749 /* Make sure nothing is scheduled before we are done. */
5750 emit_insn (gen_blockage ());
5753 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5754 absolute addresses. */
5756 const char *
5757 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5759 static int labelno = 0;
5760 char loop_lab[32];
5761 rtx xops[2];
5763 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5765 /* Loop. */
5766 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5768 HOST_WIDE_INT stack_clash_probe_interval
5769 = 1 << param_stack_clash_protection_guard_size;
5771 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5772 xops[0] = reg1;
5773 HOST_WIDE_INT interval;
5774 if (flag_stack_clash_protection)
5775 interval = stack_clash_probe_interval;
5776 else
5777 interval = PROBE_INTERVAL;
5779 gcc_assert (aarch64_uimm12_shift (interval));
5780 xops[1] = GEN_INT (interval);
5782 output_asm_insn ("sub\t%0, %0, %1", xops);
5784 /* If doing stack clash protection then we probe up by the ABI specified
5785 amount. We do this because we're dropping full pages at a time in the
5786 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5787 if (flag_stack_clash_protection)
5788 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5789 else
5790 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5792 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5793 by this amount for each iteration. */
5794 output_asm_insn ("str\txzr, [%0, %1]", xops);
5796 /* Test if TEST_ADDR == LAST_ADDR. */
5797 xops[1] = reg2;
5798 output_asm_insn ("cmp\t%0, %1", xops);
5800 /* Branch. */
5801 fputs ("\tb.ne\t", asm_out_file);
5802 assemble_name_raw (asm_out_file, loop_lab);
5803 fputc ('\n', asm_out_file);
5805 return "";
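/* As an illustrative sketch (assuming the non-stack-clash path, the
   default PROBE_INTERVAL of 4096 and the x9/x10 scratch registers
   chosen by aarch64_emit_probe_stack_range), the loop printed here
   looks something like:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0  */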
5808 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5809 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5810 of GUARD_SIZE. When a probe is emitted it is done at most
5811 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5812 at most MIN_PROBE_THRESHOLD. By the end of this function
5813 BASE = BASE - ADJUSTMENT. */
5815 const char *
5816 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5817 rtx min_probe_threshold, rtx guard_size)
5819 /* This function is not allowed to use any instruction generation function
5820 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5821 so instead emit the code you want using output_asm_insn. */
5822 gcc_assert (flag_stack_clash_protection);
5823 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5824 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5826 /* The minimum required allocation before the residual requires probing. */
5827 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5829 /* Clamp the value down to the nearest value that can be used with a cmp. */
5830 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5831 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5833 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5834 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5836 static int labelno = 0;
5837 char loop_start_lab[32];
5838 char loop_end_lab[32];
5839 rtx xops[2];
5841 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5842 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5844 /* Emit loop start label. */
5845 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5847 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5848 xops[0] = adjustment;
5849 xops[1] = probe_offset_value_rtx;
5850 output_asm_insn ("cmp\t%0, %1", xops);
5852 /* Branch to end if not enough adjustment to probe. */
5853 fputs ("\tb.lt\t", asm_out_file);
5854 assemble_name_raw (asm_out_file, loop_end_lab);
5855 fputc ('\n', asm_out_file);
5857 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5858 xops[0] = base;
5859 xops[1] = probe_offset_value_rtx;
5860 output_asm_insn ("sub\t%0, %0, %1", xops);
5862 /* Probe at BASE. */
5863 xops[1] = const0_rtx;
5864 output_asm_insn ("str\txzr, [%0, %1]", xops);
5866 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5867 xops[0] = adjustment;
5868 xops[1] = probe_offset_value_rtx;
5869 output_asm_insn ("sub\t%0, %0, %1", xops);
5871 /* Branch to start if still more bytes to allocate. */
5872 fputs ("\tb\t", asm_out_file);
5873 assemble_name_raw (asm_out_file, loop_start_lab);
5874 fputc ('\n', asm_out_file);
5876 /* No probe needed; leave the loop. */
5877 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5879 /* BASE = BASE - ADJUSTMENT. */
5880 xops[0] = base;
5881 xops[1] = adjustment;
5882 output_asm_insn ("sub\t%0, %0, %1", xops);
5883 return "";
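/* As an illustrative sketch (register names are placeholders for the
   BASE and ADJUSTMENT operands, and GUARD stands for the clamped
   RESIDUAL_PROBE_GUARD immediate), the sequence printed above is:

	.SVLPSPL0:
	cmp	<adjustment>, GUARD
	b.lt	.SVLPEND0
	sub	<base>, <base>, GUARD
	str	xzr, [<base>, 0]
	sub	<adjustment>, <adjustment>, GUARD
	b	.SVLPSPL0
	.SVLPEND0:
	sub	<base>, <base>, <adjustment>  */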
5886 /* Determine whether a frame chain needs to be generated. */
5887 static bool
5888 aarch64_needs_frame_chain (void)
5890 /* Force a frame chain for EH returns so the return address is at FP+8. */
5891 if (frame_pointer_needed || crtl->calls_eh_return)
5892 return true;
5894 /* A leaf function cannot have calls or write LR. */
5895 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5897 /* Don't use a frame chain in leaf functions if leaf frame pointers
5898 are disabled. */
5899 if (flag_omit_leaf_frame_pointer && is_leaf)
5900 return false;
5902 return aarch64_use_frame_pointer;
5905 /* Mark the registers that need to be saved by the callee and calculate
5906 the size of the callee-saved registers area and frame record (both FP
5907 and LR may be omitted). */
5908 static void
5909 aarch64_layout_frame (void)
5911 poly_int64 offset = 0;
5912 int regno, last_fp_reg = INVALID_REGNUM;
5913 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
5914 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
5915 bool frame_related_fp_reg_p = false;
5916 aarch64_frame &frame = cfun->machine->frame;
5918 frame.emit_frame_chain = aarch64_needs_frame_chain ();
5920 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5921 the mid-end is doing. */
5922 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5924 #define SLOT_NOT_REQUIRED (-2)
5925 #define SLOT_REQUIRED (-1)
5927 frame.wb_candidate1 = INVALID_REGNUM;
5928 frame.wb_candidate2 = INVALID_REGNUM;
5929 frame.spare_pred_reg = INVALID_REGNUM;
5931 /* First mark all the registers that really need to be saved... */
5932 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5933 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5935 /* ... that includes the eh data registers (if needed)... */
5936 if (crtl->calls_eh_return)
5937 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5938 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
5940 /* ... and any callee saved register that dataflow says is live. */
5941 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5942 if (df_regs_ever_live_p (regno)
5943 && !fixed_regs[regno]
5944 && (regno == R30_REGNUM
5945 || !crtl->abi->clobbers_full_reg_p (regno)))
5946 frame.reg_offset[regno] = SLOT_REQUIRED;
5948 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5949 if (df_regs_ever_live_p (regno)
5950 && !fixed_regs[regno]
5951 && !crtl->abi->clobbers_full_reg_p (regno))
5953 frame.reg_offset[regno] = SLOT_REQUIRED;
5954 last_fp_reg = regno;
5955 if (aarch64_emit_cfi_for_reg_p (regno))
5956 frame_related_fp_reg_p = true;
5959 /* Big-endian SVE frames need a spare predicate register in order
5960 to save Z8-Z15. Decide which register they should use. Prefer
5961 an unused argument register if possible, so that we don't force P4
5962 to be saved unnecessarily. */
5963 if (frame_related_fp_reg_p
5964 && crtl->abi->id () == ARM_PCS_SVE
5965 && BYTES_BIG_ENDIAN)
5967 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5968 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
5969 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
5970 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
5971 break;
5972 gcc_assert (regno <= P7_REGNUM);
5973 frame.spare_pred_reg = regno;
5974 df_set_regs_ever_live (regno, true);
5977 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5978 if (df_regs_ever_live_p (regno)
5979 && !fixed_regs[regno]
5980 && !crtl->abi->clobbers_full_reg_p (regno))
5981 frame.reg_offset[regno] = SLOT_REQUIRED;
5983 /* With stack-clash, LR must be saved in non-leaf functions. */
5984 gcc_assert (crtl->is_leaf
5985 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
5987 /* Now assign stack slots for the registers. Start with the predicate
5988 registers, since predicate LDR and STR have a relatively small
5989 offset range. These saves happen below the hard frame pointer. */
5990 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5991 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5993 frame.reg_offset[regno] = offset;
5994 offset += BYTES_PER_SVE_PRED;
5997 /* We save a maximum of 8 predicate registers, and since vector
5998 registers are 8 times the size of a predicate register, all the
5999 saved predicates fit within a single vector. Doing this also
6000 rounds the offset to a 128-bit boundary. */
6001 if (maybe_ne (offset, 0))
6003 gcc_assert (known_le (offset, vector_save_size));
6004 offset = vector_save_size;
6007 /* If we need to save any SVE vector registers, add them next. */
6008 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6009 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6010 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6012 frame.reg_offset[regno] = offset;
6013 offset += vector_save_size;
6016 /* OFFSET is now the offset of the hard frame pointer from the bottom
6017 of the callee save area. */
6018 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6019 frame.below_hard_fp_saved_regs_size = offset;
6020 if (frame.emit_frame_chain)
6022 /* FP and LR are placed in the linkage record. */
6023 frame.reg_offset[R29_REGNUM] = offset;
6024 frame.wb_candidate1 = R29_REGNUM;
6025 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6026 frame.wb_candidate2 = R30_REGNUM;
6027 offset += 2 * UNITS_PER_WORD;
6030 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6031 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6033 frame.reg_offset[regno] = offset;
6034 if (frame.wb_candidate1 == INVALID_REGNUM)
6035 frame.wb_candidate1 = regno;
6036 else if (frame.wb_candidate2 == INVALID_REGNUM)
6037 frame.wb_candidate2 = regno;
6038 offset += UNITS_PER_WORD;
6041 poly_int64 max_int_offset = offset;
6042 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6043 bool has_align_gap = maybe_ne (offset, max_int_offset);
6045 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6046 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6048 /* If there is an alignment gap between integer and fp callee-saves,
6049 allocate the last fp register to it if possible. */
6050 if (regno == last_fp_reg
6051 && has_align_gap
6052 && known_eq (vector_save_size, 8)
6053 && multiple_p (offset, 16))
6055 frame.reg_offset[regno] = max_int_offset;
6056 break;
6059 frame.reg_offset[regno] = offset;
6060 if (frame.wb_candidate1 == INVALID_REGNUM)
6061 frame.wb_candidate1 = regno;
6062 else if (frame.wb_candidate2 == INVALID_REGNUM
6063 && frame.wb_candidate1 >= V0_REGNUM)
6064 frame.wb_candidate2 = regno;
6065 offset += vector_save_size;
6068 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6070 frame.saved_regs_size = offset;
6072 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6074 poly_int64 above_outgoing_args
6075 = aligned_upper_bound (varargs_and_saved_regs_size
6076 + get_frame_size (),
6077 STACK_BOUNDARY / BITS_PER_UNIT);
6079 frame.hard_fp_offset
6080 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6082 /* Both these values are already aligned. */
6083 gcc_assert (multiple_p (crtl->outgoing_args_size,
6084 STACK_BOUNDARY / BITS_PER_UNIT));
6085 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6087 frame.locals_offset = frame.saved_varargs_size;
6089 frame.initial_adjust = 0;
6090 frame.final_adjust = 0;
6091 frame.callee_adjust = 0;
6092 frame.sve_callee_adjust = 0;
6093 frame.callee_offset = 0;
6095 HOST_WIDE_INT max_push_offset = 0;
6096 if (frame.wb_candidate2 != INVALID_REGNUM)
6097 max_push_offset = 512;
6098 else if (frame.wb_candidate1 != INVALID_REGNUM)
6099 max_push_offset = 256;
6101 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6102 HOST_WIDE_INT const_saved_regs_size;
6103 if (frame.frame_size.is_constant (&const_size)
6104 && const_size < max_push_offset
6105 && known_eq (frame.hard_fp_offset, const_size))
6107 /* Simple, small frame with no outgoing arguments:
6109 stp reg1, reg2, [sp, -frame_size]!
6110 stp reg3, reg4, [sp, 16] */
6111 frame.callee_adjust = const_size;
6113 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6114 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6115 && const_outgoing_args_size + const_saved_regs_size < 512
6116 /* We could handle this case even with outgoing args, provided
6117 that the number of args left us with valid offsets for all
6118 predicate and vector save slots. It's such a rare case that
6119 it hardly seems worth the effort though. */
6120 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6121 && !(cfun->calls_alloca
6122 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6123 && const_fp_offset < max_push_offset))
6125 /* Frame with small outgoing arguments:
6127 sub sp, sp, frame_size
6128 stp reg1, reg2, [sp, outgoing_args_size]
6129 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6130 frame.initial_adjust = frame.frame_size;
6131 frame.callee_offset = const_outgoing_args_size;
6133 else if (saves_below_hard_fp_p
6134 && known_eq (frame.saved_regs_size,
6135 frame.below_hard_fp_saved_regs_size))
6137 /* Frame in which all saves are SVE saves:
6139 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6140 save SVE registers relative to SP
6141 sub sp, sp, outgoing_args_size */
6142 frame.initial_adjust = (frame.hard_fp_offset
6143 + frame.below_hard_fp_saved_regs_size);
6144 frame.final_adjust = crtl->outgoing_args_size;
6146 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6147 && const_fp_offset < max_push_offset)
6149 /* Frame with large outgoing arguments or SVE saves, but with
6150 a small local area:
6152 stp reg1, reg2, [sp, -hard_fp_offset]!
6153 stp reg3, reg4, [sp, 16]
6154 [sub sp, sp, below_hard_fp_saved_regs_size]
6155 [save SVE registers relative to SP]
6156 sub sp, sp, outgoing_args_size */
6157 frame.callee_adjust = const_fp_offset;
6158 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6159 frame.final_adjust = crtl->outgoing_args_size;
6161 else
6163 /* Frame with large local area and outgoing arguments or SVE saves,
6164 using frame pointer:
6166 sub sp, sp, hard_fp_offset
6167 stp x29, x30, [sp, 0]
6168 add x29, sp, 0
6169 stp reg3, reg4, [sp, 16]
6170 [sub sp, sp, below_hard_fp_saved_regs_size]
6171 [save SVE registers relative to SP]
6172 sub sp, sp, outgoing_args_size */
6173 frame.initial_adjust = frame.hard_fp_offset;
6174 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6175 frame.final_adjust = crtl->outgoing_args_size;
6178 /* Make sure the individual adjustments add up to the full frame size. */
6179 gcc_assert (known_eq (frame.initial_adjust
6180 + frame.callee_adjust
6181 + frame.sve_callee_adjust
6182 + frame.final_adjust, frame.frame_size));
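/* A worked example of the invariant above (numbers are illustrative):
   for the "small outgoing arguments" case with frame_size == 96,
   saved_regs_size == 80 and outgoing_args_size == 16, we get
   initial_adjust == 96, callee_offset == 16 and all other adjustments
   zero, so 96 + 0 + 0 + 0 == frame_size as required.  */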
6184 frame.laid_out = true;
6187 /* Return true if the register REGNO is saved on entry to
6188 the current function. */
6190 static bool
6191 aarch64_register_saved_on_entry (int regno)
6193 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6196 /* Return the next register up from REGNO up to LIMIT for the callee
6197 to save. */
6199 static unsigned
6200 aarch64_next_callee_save (unsigned regno, unsigned limit)
6202 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6203 regno ++;
6204 return regno;
6207 /* Push the register number REGNO of mode MODE to the stack with write-back
6208 adjusting the stack by ADJUSTMENT. */
6210 static void
6211 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6212 HOST_WIDE_INT adjustment)
6214 rtx base_rtx = stack_pointer_rtx;
6215 rtx insn, reg, mem;
6217 reg = gen_rtx_REG (mode, regno);
6218 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6219 plus_constant (Pmode, base_rtx, -adjustment));
6220 mem = gen_frame_mem (mode, mem);
6222 insn = emit_move_insn (mem, reg);
6223 RTX_FRAME_RELATED_P (insn) = 1;
6226 /* Generate and return an instruction to store the pair of registers
6227 REG and REG2 of mode MODE to location BASE with write-back adjusting
6228 the stack location BASE by ADJUSTMENT. */
6230 static rtx
6231 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6232 HOST_WIDE_INT adjustment)
6234 switch (mode)
6236 case E_DImode:
6237 return gen_storewb_pairdi_di (base, base, reg, reg2,
6238 GEN_INT (-adjustment),
6239 GEN_INT (UNITS_PER_WORD - adjustment));
6240 case E_DFmode:
6241 return gen_storewb_pairdf_di (base, base, reg, reg2,
6242 GEN_INT (-adjustment),
6243 GEN_INT (UNITS_PER_WORD - adjustment));
6244 case E_TFmode:
6245 return gen_storewb_pairtf_di (base, base, reg, reg2,
6246 GEN_INT (-adjustment),
6247 GEN_INT (UNITS_PER_VREG - adjustment));
6248 default:
6249 gcc_unreachable ();
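/* For example (illustrative operands): in E_DImode with BASE == sp and
   ADJUSTMENT == 32 this expands to something like
	stp	x19, x20, [sp, -32]!
   which drops SP by 32 bytes and leaves REG at [sp] and REG2 at
   [sp, 8] relative to the updated stack pointer.  */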
6253 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6254 stack pointer by ADJUSTMENT. */
6256 static void
6257 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6259 rtx_insn *insn;
6260 machine_mode mode = aarch64_reg_save_mode (regno1);
6262 if (regno2 == INVALID_REGNUM)
6263 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6265 rtx reg1 = gen_rtx_REG (mode, regno1);
6266 rtx reg2 = gen_rtx_REG (mode, regno2);
6268 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6269 reg2, adjustment));
6270 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6271 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6272 RTX_FRAME_RELATED_P (insn) = 1;
6275 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
6276 adjusting it by ADJUSTMENT afterwards. */
6278 static rtx
6279 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6280 HOST_WIDE_INT adjustment)
6282 switch (mode)
6284 case E_DImode:
6285 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6286 GEN_INT (UNITS_PER_WORD));
6287 case E_DFmode:
6288 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6289 GEN_INT (UNITS_PER_WORD));
6290 case E_TFmode:
6291 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6292 GEN_INT (UNITS_PER_VREG));
6293 default:
6294 gcc_unreachable ();
6298 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6299 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6300 into CFI_OPS. */
6302 static void
6303 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6304 rtx *cfi_ops)
6306 machine_mode mode = aarch64_reg_save_mode (regno1);
6307 rtx reg1 = gen_rtx_REG (mode, regno1);
6309 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6311 if (regno2 == INVALID_REGNUM)
6313 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6314 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6315 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6317 else
6319 rtx reg2 = gen_rtx_REG (mode, regno2);
6320 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6321 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6322 reg2, adjustment));
6326 /* Generate and return a store pair instruction of mode MODE to store
6327 register REG1 to MEM1 and register REG2 to MEM2. */
6329 static rtx
6330 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6331 rtx reg2)
6333 switch (mode)
6335 case E_DImode:
6336 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6338 case E_DFmode:
6339 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6341 case E_TFmode:
6342 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6344 default:
6345 gcc_unreachable ();
6349 /* Generate and return a load pair instruction of mode MODE to load register
6350 REG1 from MEM1 and register REG2 from MEM2. */
6352 static rtx
6353 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6354 rtx mem2)
6356 switch (mode)
6358 case E_DImode:
6359 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6361 case E_DFmode:
6362 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6364 case E_TFmode:
6365 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6367 default:
6368 gcc_unreachable ();
6372 /* Return TRUE if return address signing should be enabled for the current
6373 function, otherwise return FALSE. */
6375 bool
6376 aarch64_return_address_signing_enabled (void)
6378 /* This function should only be called after the frame is laid out. */
6379 gcc_assert (cfun->machine->frame.laid_out);
6381 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6382 if its LR is pushed onto stack. */
6383 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6384 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6385 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6388 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6389 bool
6390 aarch64_bti_enabled (void)
6392 return (aarch64_enable_bti == 1);
6395 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6396 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6397 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6399 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6400 or LD1D address
6402 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6403 if the variable isn't already nonnull
6405 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6406 Handle this case using a temporary base register that is suitable for
6407 all offsets in that range. Use ANCHOR_REG as this base register if it
6408 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6410 static inline void
6411 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6412 rtx &anchor_reg, poly_int64 &offset,
6413 rtx &ptrue)
6415 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6417 /* This is the maximum valid offset of the anchor from the base.
6418 Lower values would be valid too. */
6419 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6420 if (!anchor_reg)
6422 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6423 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6424 gen_int_mode (anchor_offset, Pmode)));
6426 base_rtx = anchor_reg;
6427 offset -= anchor_offset;
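/* For example (illustrative): if OFFSET is 12 * GET_MODE_SIZE (MODE),
   it is at least 8 * the mode size, so an anchor at BASE + 16 * size is
   set up (or reused) and OFFSET becomes -4 * size, which falls within
   the signed [-8, 7] * size immediate range of ST1D/LD1D.  */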
6429 if (!ptrue)
6431 int pred_reg = cfun->machine->frame.spare_pred_reg;
6432 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6433 CONSTM1_RTX (VNx16BImode));
6434 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6438 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6439 is saved at BASE + OFFSET. */
6441 static void
6442 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6443 rtx base, poly_int64 offset)
6445 rtx mem = gen_frame_mem (GET_MODE (reg),
6446 plus_constant (Pmode, base, offset));
6447 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6450 /* Emit code to save the callee-saved registers from register number START
6451 to LIMIT to the stack at the location starting at offset START_OFFSET,
6452 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6453 is true if the hard frame pointer has been set up. */
6455 static void
6456 aarch64_save_callee_saves (poly_int64 start_offset,
6457 unsigned start, unsigned limit, bool skip_wb,
6458 bool hard_fp_valid_p)
6460 rtx_insn *insn;
6461 unsigned regno;
6462 unsigned regno2;
6463 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6465 for (regno = aarch64_next_callee_save (start, limit);
6466 regno <= limit;
6467 regno = aarch64_next_callee_save (regno + 1, limit))
6469 rtx reg, mem;
6470 poly_int64 offset;
6471 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6473 if (skip_wb
6474 && (regno == cfun->machine->frame.wb_candidate1
6475 || regno == cfun->machine->frame.wb_candidate2))
6476 continue;
6478 if (cfun->machine->reg_is_wrapped_separately[regno])
6479 continue;
6481 machine_mode mode = aarch64_reg_save_mode (regno);
6482 reg = gen_rtx_REG (mode, regno);
6483 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6484 rtx base_rtx = stack_pointer_rtx;
6485 poly_int64 sp_offset = offset;
6487 HOST_WIDE_INT const_offset;
6488 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6489 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6490 offset, ptrue);
6491 else if (GP_REGNUM_P (regno)
6492 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6494 gcc_assert (known_eq (start_offset, 0));
6495 poly_int64 fp_offset
6496 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6497 if (hard_fp_valid_p)
6498 base_rtx = hard_frame_pointer_rtx;
6499 else
6501 if (!anchor_reg)
6503 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6504 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6505 gen_int_mode (fp_offset, Pmode)));
6507 base_rtx = anchor_reg;
6509 offset -= fp_offset;
6511 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6512 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6514 if (!aarch64_sve_mode_p (mode)
6515 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6516 && !cfun->machine->reg_is_wrapped_separately[regno2]
6517 && known_eq (GET_MODE_SIZE (mode),
6518 cfun->machine->frame.reg_offset[regno2]
6519 - cfun->machine->frame.reg_offset[regno]))
6521 rtx reg2 = gen_rtx_REG (mode, regno2);
6522 rtx mem2;
6524 offset += GET_MODE_SIZE (mode);
6525 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6526 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
6527 reg2));
6529 /* The first part of a frame-related parallel insn is
6530 always assumed to be relevant to the frame
6531 calculations; subsequent parts are only
6532 frame-related if explicitly marked. */
6533 if (aarch64_emit_cfi_for_reg_p (regno2))
6535 if (need_cfa_note_p)
6536 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
6537 sp_offset + GET_MODE_SIZE (mode));
6538 else
6539 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6542 regno = regno2;
6544 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6546 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
6547 need_cfa_note_p = true;
6549 else if (aarch64_sve_mode_p (mode))
6550 insn = emit_insn (gen_rtx_SET (mem, reg));
6551 else
6552 insn = emit_move_insn (mem, reg);
6554 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6555 if (frame_related_p && need_cfa_note_p)
6556 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
6560 /* Emit code to restore the callee registers from register number START
6561 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6562 skipping any write-back candidates if SKIP_WB is true. Write the
6563 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6565 static void
6566 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
6567 unsigned limit, bool skip_wb, rtx *cfi_ops)
6569 unsigned regno;
6570 unsigned regno2;
6571 poly_int64 offset;
6572 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6574 for (regno = aarch64_next_callee_save (start, limit);
6575 regno <= limit;
6576 regno = aarch64_next_callee_save (regno + 1, limit))
6578 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6579 if (cfun->machine->reg_is_wrapped_separately[regno])
6580 continue;
6582 rtx reg, mem;
6584 if (skip_wb
6585 && (regno == cfun->machine->frame.wb_candidate1
6586 || regno == cfun->machine->frame.wb_candidate2))
6587 continue;
6589 machine_mode mode = aarch64_reg_save_mode (regno);
6590 reg = gen_rtx_REG (mode, regno);
6591 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6592 rtx base_rtx = stack_pointer_rtx;
6593 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6594 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6595 offset, ptrue);
6596 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6598 if (!aarch64_sve_mode_p (mode)
6599 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6600 && !cfun->machine->reg_is_wrapped_separately[regno2]
6601 && known_eq (GET_MODE_SIZE (mode),
6602 cfun->machine->frame.reg_offset[regno2]
6603 - cfun->machine->frame.reg_offset[regno]))
6605 rtx reg2 = gen_rtx_REG (mode, regno2);
6606 rtx mem2;
6608 offset += GET_MODE_SIZE (mode);
6609 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6610 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6612 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6613 regno = regno2;
6615 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6616 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
6617 else if (aarch64_sve_mode_p (mode))
6618 emit_insn (gen_rtx_SET (reg, mem));
6619 else
6620 emit_move_insn (reg, mem);
6621 if (frame_related_p)
6622 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
6626 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6627 of MODE. */
6629 static inline bool
6630 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6632 HOST_WIDE_INT multiple;
6633 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6634 && IN_RANGE (multiple, -8, 7));
6637 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
6638 of MODE. */
6640 static inline bool
6641 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6643 HOST_WIDE_INT multiple;
6644 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6645 && IN_RANGE (multiple, 0, 63));
6648 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6649 of MODE. */
6651 bool
6652 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6654 HOST_WIDE_INT multiple;
6655 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6656 && IN_RANGE (multiple, -64, 63));
6659 /* Return true if OFFSET is a signed 9-bit value. */
6661 bool
6662 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
6663 poly_int64 offset)
6665 HOST_WIDE_INT const_offset;
6666 return (offset.is_constant (&const_offset)
6667 && IN_RANGE (const_offset, -256, 255));
6670 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6671 of MODE. */
6673 static inline bool
6674 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6676 HOST_WIDE_INT multiple;
6677 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6678 && IN_RANGE (multiple, -256, 255));
6681 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6682 of MODE. */
6684 static inline bool
6685 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6687 HOST_WIDE_INT multiple;
6688 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6689 && IN_RANGE (multiple, 0, 4095));
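/* For instance, for DImode (an 8-byte element) the 12-bit unsigned scaled
   range above covers byte offsets 0, 8, ..., 8 * 4095 == 32760, which
   matches the unsigned-immediate addressing form of LDR/STR.  */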
6692 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6694 static sbitmap
6695 aarch64_get_separate_components (void)
6697 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6698 bitmap_clear (components);
6700 /* The registers we need saved to the frame. */
6701 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6702 if (aarch64_register_saved_on_entry (regno))
6704 /* Punt on saves and restores that use ST1D and LD1D. We could
6705 try to be smarter, but it would involve making sure that the
6706 spare predicate register itself is safe to use at the save
6707 and restore points. Also, when a frame pointer is being used,
6708 the slots are often out of reach of ST1D and LD1D anyway. */
6709 machine_mode mode = aarch64_reg_save_mode (regno);
6710 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6711 continue;
6713 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6715 /* If the register is saved in the first SVE save slot, we use
6716 it as a stack probe for -fstack-clash-protection. */
6717 if (flag_stack_clash_protection
6718 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
6719 && known_eq (offset, 0))
6720 continue;
6722 /* Get the offset relative to the register we'll use. */
6723 if (frame_pointer_needed)
6724 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6725 else
6726 offset += crtl->outgoing_args_size;
6728 /* Check that we can access the stack slot of the register with one
6729 direct load with no adjustments needed. */
6730 if (aarch64_sve_mode_p (mode)
6731 ? offset_9bit_signed_scaled_p (mode, offset)
6732 : offset_12bit_unsigned_scaled_p (mode, offset))
6733 bitmap_set_bit (components, regno);
6736 /* Don't mess with the hard frame pointer. */
6737 if (frame_pointer_needed)
6738 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
6740 /* If the spare predicate register used by big-endian SVE code
6741 is call-preserved, it must be saved in the main prologue
6742 before any saves that use it. */
6743 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
6744 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
6746 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6747 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6748 /* If registers have been chosen to be stored/restored with
6749 writeback don't interfere with them to avoid having to output explicit
6750 stack adjustment instructions. */
6751 if (reg2 != INVALID_REGNUM)
6752 bitmap_clear_bit (components, reg2);
6753 if (reg1 != INVALID_REGNUM)
6754 bitmap_clear_bit (components, reg1);
6756 bitmap_clear_bit (components, LR_REGNUM);
6757 bitmap_clear_bit (components, SP_REGNUM);
6759 return components;
6762 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6764 static sbitmap
6765 aarch64_components_for_bb (basic_block bb)
6767 bitmap in = DF_LIVE_IN (bb);
6768 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
6769 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
6771 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6772 bitmap_clear (components);
6774 /* Clobbered registers don't generate values in any meaningful sense,
6775 since nothing after the clobber can rely on their value. And we can't
6776 say that partially-clobbered registers are unconditionally killed,
6777 because whether they're killed or not depends on the mode of the
6778 value they're holding. Thus partially call-clobbered registers
6779 appear in neither the kill set nor the gen set.
6781 Check manually for any calls that clobber more of a register than the
6782 current function can. */
6783 function_abi_aggregator callee_abis;
6784 rtx_insn *insn;
6785 FOR_BB_INSNS (bb, insn)
6786 if (CALL_P (insn))
6787 callee_abis.note_callee_abi (insn_callee_abi (insn));
6788 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
6790 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
6791 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6792 if (!fixed_regs[regno]
6793 && !crtl->abi->clobbers_full_reg_p (regno)
6794 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
6795 || bitmap_bit_p (in, regno)
6796 || bitmap_bit_p (gen, regno)
6797 || bitmap_bit_p (kill, regno)))
6799 bitmap_set_bit (components, regno);
6801 /* If there is a callee-save at an adjacent offset, add it too
6802 to increase the use of LDP/STP. */
6803 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6804 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
6806 if (regno2 <= LAST_SAVED_REGNUM)
6808 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6809 if (regno < regno2
6810 ? known_eq (offset + 8, offset2)
6811 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
6812 bitmap_set_bit (components, regno2);
6816 return components;
6819 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6820 Nothing to do for aarch64. */
6822 static void
6823 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6827 /* Return the next set bit in BMP from START onwards. Return the total number
6828 of bits in BMP if no set bit is found at or after START. */
6830 static unsigned int
6831 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6833 unsigned int nbits = SBITMAP_SIZE (bmp);
6834 if (start == nbits)
6835 return start;
6837 gcc_assert (start < nbits);
6838 for (unsigned int i = start; i < nbits; i++)
6839 if (bitmap_bit_p (bmp, i))
6840 return i;
6842 return nbits;
6845 /* Do the work for aarch64_emit_prologue_components and
6846 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6847 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6848 for these components or the epilogue sequence. That is, it determines
6849 whether we should emit stores or loads and what kind of CFA notes to attach
6850 to the insns. Otherwise the logic for the two sequences is very
6851 similar. */
6853 static void
6854 aarch64_process_components (sbitmap components, bool prologue_p)
6856 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6857 ? HARD_FRAME_POINTER_REGNUM
6858 : STACK_POINTER_REGNUM);
6860 unsigned last_regno = SBITMAP_SIZE (components);
6861 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6862 rtx_insn *insn = NULL;
6864 while (regno != last_regno)
6866 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6867 machine_mode mode = aarch64_reg_save_mode (regno);
6869 rtx reg = gen_rtx_REG (mode, regno);
6870 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6871 if (frame_pointer_needed)
6872 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6873 else
6874 offset += crtl->outgoing_args_size;
6876 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6877 rtx mem = gen_frame_mem (mode, addr);
6879 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6880 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6881 /* No more registers to handle after REGNO.
6882 Emit a single save/restore and exit. */
6883 if (regno2 == last_regno)
6885 insn = emit_insn (set);
6886 if (frame_related_p)
6888 RTX_FRAME_RELATED_P (insn) = 1;
6889 if (prologue_p)
6890 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6891 else
6892 add_reg_note (insn, REG_CFA_RESTORE, reg);
6894 break;
6897 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6898 /* The next register is not of the same class or its offset is not
6899 mergeable with the current one into a pair. */
6900 if (aarch64_sve_mode_p (mode)
6901 || !satisfies_constraint_Ump (mem)
6902 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6903 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6904 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6905 GET_MODE_SIZE (mode)))
6907 insn = emit_insn (set);
6908 if (frame_related_p)
6910 RTX_FRAME_RELATED_P (insn) = 1;
6911 if (prologue_p)
6912 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6913 else
6914 add_reg_note (insn, REG_CFA_RESTORE, reg);
6917 regno = regno2;
6918 continue;
6921 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
6923 /* REGNO2 can be saved/restored in a pair with REGNO. */
6924 rtx reg2 = gen_rtx_REG (mode, regno2);
6925 if (frame_pointer_needed)
6926 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6927 else
6928 offset2 += crtl->outgoing_args_size;
6929 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6930 rtx mem2 = gen_frame_mem (mode, addr2);
6931 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6932 : gen_rtx_SET (reg2, mem2);
6934 if (prologue_p)
6935 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6936 else
6937 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6939 if (frame_related_p || frame_related2_p)
6941 RTX_FRAME_RELATED_P (insn) = 1;
6942 if (prologue_p)
6944 if (frame_related_p)
6945 add_reg_note (insn, REG_CFA_OFFSET, set);
6946 if (frame_related2_p)
6947 add_reg_note (insn, REG_CFA_OFFSET, set2);
6949 else
6951 if (frame_related_p)
6952 add_reg_note (insn, REG_CFA_RESTORE, reg);
6953 if (frame_related2_p)
6954 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6958 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6962 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6964 static void
6965 aarch64_emit_prologue_components (sbitmap components)
6967 aarch64_process_components (components, true);
6970 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6972 static void
6973 aarch64_emit_epilogue_components (sbitmap components)
6975 aarch64_process_components (components, false);
6978 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6980 static void
6981 aarch64_set_handled_components (sbitmap components)
6983 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6984 if (bitmap_bit_p (components, regno))
6985 cfun->machine->reg_is_wrapped_separately[regno] = true;
6988 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6989 determine the probe offset for alloca. */
6991 static HOST_WIDE_INT
6992 aarch64_stack_clash_protection_alloca_probe_range (void)
6994 return STACK_CLASH_CALLER_GUARD;
6998 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6999 registers. If POLY_SIZE is not large enough to require a probe this function
7000 will only adjust the stack. When allocating the stack space
7001 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7002 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7003 arguments. If we are, we ensure that any allocation larger than the
7004 ABI-defined buffer gets a probe, so that the invariant of a 1KB buffer is
7005 maintained.
7007 We emit barriers after each stack adjustment to prevent optimizations from
7008 breaking the invariant that we never drop the stack more than a page. This
7009 invariant is needed to make it easier to correctly handle asynchronous
7010 events: if we allowed the stack to be dropped by more than a page
7011 before the corresponding probes were emitted, a signal taken in between
7012 would leave the signal handler with no way of knowing which pages
7013 below the stack pointer have actually been probed. */
7015 static void
7016 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7017 poly_int64 poly_size,
7018 bool frame_related_p,
7019 bool final_adjustment_p)
7021 HOST_WIDE_INT guard_size
7022 = 1 << param_stack_clash_protection_guard_size;
7023 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7024 HOST_WIDE_INT min_probe_threshold
7025 = (final_adjustment_p
7026 ? guard_used_by_caller
7027 : guard_size - guard_used_by_caller);
7028 /* When doing the final adjustment for the outgoing arguments, take into
7029 account any unprobed space there is above the current SP. There are
7030 two cases:
7032 - When saving SVE registers below the hard frame pointer, we force
7033 the lowest save to take place in the prologue before doing the final
7034 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7035 This acts as a probe at SP, so there is no unprobed space.
7037 - When there are no SVE register saves, we use the store of the link
7038 register as a probe. We can't assume that LR was saved at position 0
7039 though, so treat any space below it as unprobed. */
7040 if (final_adjustment_p
7041 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7043 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7044 if (known_ge (lr_offset, 0))
7045 min_probe_threshold -= lr_offset.to_constant ();
7046 else
7047 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7050 poly_int64 frame_size = cfun->machine->frame.frame_size;
7052 /* We should always have a positive probe threshold. */
7053 gcc_assert (min_probe_threshold > 0);
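/* With the default 64KiB stack-clash guard and the 1KiB
   STACK_CLASH_CALLER_GUARD (illustrative values; the guard size is a
   --param), min_probe_threshold is 1024 bytes for the final (outgoing
   argument) adjustment and 65536 - 1024 == 64512 bytes otherwise.  */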
7055 if (flag_stack_clash_protection && !final_adjustment_p)
7057 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7058 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7059 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7061 if (known_eq (frame_size, 0))
7063 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7065 else if (known_lt (initial_adjust + sve_callee_adjust,
7066 guard_size - guard_used_by_caller)
7067 && known_lt (final_adjust, guard_used_by_caller))
7069 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7073 /* If SIZE is not large enough to require probing, just adjust the stack and
7074 exit. */
7075 if (known_lt (poly_size, min_probe_threshold)
7076 || !flag_stack_clash_protection)
7078 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7079 return;
7082 HOST_WIDE_INT size;
7083 /* Handle the SVE non-constant case first. */
7084 if (!poly_size.is_constant (&size))
7086 if (dump_file)
7088 fprintf (dump_file, "Stack clash SVE prologue: ");
7089 print_dec (poly_size, dump_file);
7090 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7093 /* First calculate the amount of bytes we're actually spilling. */
7094 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7095 poly_size, temp1, temp2, false, true);
7097 rtx_insn *insn = get_last_insn ();
7099 if (frame_related_p)
7101 /* This is done to provide unwinding information for the stack
7102 adjustments we're about to do. However, to prevent the optimizers
7103 from removing the R11 move and leaving the CFA note (which would be
7104 very wrong), we tie the old and new stack pointers together.
7105 The tie will expand to nothing but the optimizers will not touch
7106 the instruction. */
7107 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7108 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7109 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7111 /* We want the CFA independent of the stack pointer for the
7112 duration of the loop. */
7113 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7114 RTX_FRAME_RELATED_P (insn) = 1;
7117 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7118 rtx guard_const = gen_int_mode (guard_size, Pmode);
7120 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7121 stack_pointer_rtx, temp1,
7122 probe_const, guard_const));
7124 /* Now reset the CFA register if needed. */
7125 if (frame_related_p)
7127 add_reg_note (insn, REG_CFA_DEF_CFA,
7128 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7129 gen_int_mode (poly_size, Pmode)));
7130 RTX_FRAME_RELATED_P (insn) = 1;
7133 return;
7136 if (dump_file)
7137 fprintf (dump_file,
7138 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7139 " bytes, probing will be required.\n", size);
7141 /* Round size down to the nearest multiple of guard_size, and calculate the
7142 residual as the difference between the original size and the rounded
7143 size. */
7144 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7145 HOST_WIDE_INT residual = size - rounded_size;
7147 /* We can handle a small number of allocations/probes inline. Otherwise
7148 punt to a loop. */
7149 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7151 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7153 aarch64_sub_sp (NULL, temp2, guard_size, true);
7154 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7155 guard_used_by_caller));
7156 emit_insn (gen_blockage ());
7158 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7160 else
7162 /* Compute the ending address. */
7163 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7164 temp1, NULL, false, true);
7165 rtx_insn *insn = get_last_insn ();
7167 /* For the initial allocation, we don't have a frame pointer
7168 set up, so we always need CFI notes. If we're doing the
7169 final allocation, then we may have a frame pointer, in which
7170 case it is the CFA, otherwise we need CFI notes.
7172 We can determine which allocation we are doing by looking at
7173 the value of FRAME_RELATED_P since the final allocations are not
7174 frame related. */
7175 if (frame_related_p)
7177 /* We want the CFA independent of the stack pointer for the
7178 duration of the loop. */
7179 add_reg_note (insn, REG_CFA_DEF_CFA,
7180 plus_constant (Pmode, temp1, rounded_size));
7181 RTX_FRAME_RELATED_P (insn) = 1;
7184 /* This allocates and probes the stack. Note that this re-uses some of
7185 the existing Ada stack protection code. However we are guaranteed not
7186 to enter the non-loop or residual branches of that code.
7188 The non-loop part won't be entered because if our allocation amount
7189 doesn't require a loop, the case above would handle it.
7191 The residual amount won't be entered because TEMP1 is a multiple of
7192 the allocation size. The residual will always be 0. As such, the only
7193 part we are actually using from that code is the loop setup. The
7194 actual probing is done in aarch64_output_probe_stack_range. */
7195 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7196 stack_pointer_rtx, temp1));
7198 /* Now reset the CFA register if needed. */
7199 if (frame_related_p)
7201 add_reg_note (insn, REG_CFA_DEF_CFA,
7202 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7203 RTX_FRAME_RELATED_P (insn) = 1;
7206 emit_insn (gen_blockage ());
7207 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7210 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7211 be probed. This maintains the requirement that each page is probed at
7212 least once. For initial probing we probe only if the allocation is
7213 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7214 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7215 GUARD_SIZE.  This ensures that for any allocation that is large enough to
7216 trigger a probe here, we'll have at least one, and if an allocation is not
7217 large enough for this code to emit anything for it, the page would have been
7218 probed by the saving of FP/LR, either by this function or any callees.  If
7219 we don't have any callees then we won't have more stack adjustments and so
7220 are still safe. */
7221 if (residual)
7223 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7224 /* If we're doing final adjustments, and we've done any full page
7225 allocations then any residual needs to be probed. */
7226 if (final_adjustment_p && rounded_size != 0)
7227 min_probe_threshold = 0;
7228 /* If doing a small final adjustment, we always probe at offset 0.
7229 This is done to avoid issues when LR is not at position 0 or when
7230 the final adjustment is smaller than the probing offset. */
7231 else if (final_adjustment_p && rounded_size == 0)
7232 residual_probe_offset = 0;
7234 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7235 if (residual >= min_probe_threshold)
7237 if (dump_file)
7238 fprintf (dump_file,
7239 "Stack clash AArch64 prologue residuals: "
7240 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7241 "\n", residual);
7243 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7244 residual_probe_offset));
7245 emit_insn (gen_blockage ());
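/* Worked example (illustrative only; the exact instructions depend on the
   rest of the prologue): with the default 64KB guard and the 1KB buffer
   reserved for the caller, an inline-probed constant allocation of
   3 * 64KB + 512 bytes handled above would expand to roughly

	sub	sp, sp, #65536
	str	xzr, [sp, #1024]
	sub	sp, sp, #65536
	str	xzr, [sp, #1024]
	sub	sp, sp, #65536
	str	xzr, [sp, #1024]
	sub	sp, sp, #512

   i.e. each guard-sized chunk gets one probe at the guard_used_by_caller
   offset, and the 512-byte residual is below min_probe_threshold so it
   needs no probe of its own.  */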
7250 /* Return 1 if the register is used by the epilogue. We need to say the
7251 return register is used, but only after epilogue generation is complete.
7252 Note that in the case of sibcalls, the values "used by the epilogue" are
7253 considered live at the start of the called function.
7255 For SIMD functions we need to return 1 for FP registers that are saved and
7256 restored by a function but are not zero in call_used_regs. If we do not do
7257 this, optimizations may remove the restore of the register. */
7259 int
7260 aarch64_epilogue_uses (int regno)
7262 if (epilogue_completed)
7264 if (regno == LR_REGNUM)
7265 return 1;
7267 return 0;
7270 /* AArch64 stack frames generated by this compiler look like:
7272 +-------------------------------+
7274 | incoming stack arguments |
7276 +-------------------------------+
7277 | | <-- incoming stack pointer (aligned)
7278 | callee-allocated save area |
7279 | for register varargs |
7281 +-------------------------------+
7282 | local variables | <-- frame_pointer_rtx
7284 +-------------------------------+
7285 | padding | \
7286 +-------------------------------+ |
7287 | callee-saved registers | | frame.saved_regs_size
7288 +-------------------------------+ |
7289 | LR' | |
7290 +-------------------------------+ |
7291 | FP' | |
7292 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7293 | SVE vector registers | | \
7294 +-------------------------------+ | | below_hard_fp_saved_regs_size
7295 | SVE predicate registers | / /
7296 +-------------------------------+
7297 | dynamic allocation |
7298 +-------------------------------+
7299 | padding |
7300 +-------------------------------+
7301 | outgoing stack arguments | <-- arg_pointer
7303 +-------------------------------+
7304 | | <-- stack_pointer_rtx (aligned)
7306 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7307 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7308 unchanged.
7310 By default for stack-clash we assume the guard is at least 64KB, but this
7311 value is configurable to either 4KB or 64KB. We also force the guard size to
7312 be the same as the probing interval and both values are kept in sync.
7314 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7315 on the guard size) of stack space without probing.
7317 When probing is needed, we emit a probe at the start of the prologue
7318 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7320 We have to track how much space has been allocated and the only stores
7321 to the stack we track as implicit probes are the FP/LR stores.
7323 For outgoing arguments we probe if the size is larger than 1KB, such that
7324 the ABI specified buffer is maintained for the next callee.
7326 The following registers are reserved during frame layout and should not be
7327 used for any other purpose:
7329 - r11: Used by stack clash protection when SVE is enabled, and also
7330 as an anchor register when saving and restoring registers
7331 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7332 - r14 and r15: Used for speculation tracking.
7333 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7334 - r30(LR), r29(FP): Used by standard frame layout.
7336 These registers must be avoided in frame layout related code unless the
7337 explicit intention is to interact with one of the features listed above. */
7339 /* Generate the prologue instructions for entry into a function.
7340 Establish the stack frame by decreasing the stack pointer with a
7341 properly calculated size and, if necessary, create a frame record
7342 filled with the values of LR and previous frame pointer. The
7343 current FP is also set up if it is in use. */
7345 void
7346 aarch64_expand_prologue (void)
7348 poly_int64 frame_size = cfun->machine->frame.frame_size;
7349 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7350 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7351 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7352 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7353 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7354 poly_int64 below_hard_fp_saved_regs_size
7355 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7356 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7357 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7358 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7359 rtx_insn *insn;
7361 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7363 /* Fold the SVE allocation into the initial allocation.
7364 We don't do this in aarch64_layout_frame to avoid pessimizing
7365 the epilogue code. */
7366 initial_adjust += sve_callee_adjust;
7367 sve_callee_adjust = 0;
7370 /* Sign return address for functions. */
7371 if (aarch64_return_address_signing_enabled ())
7373 switch (aarch64_ra_sign_key)
7375 case AARCH64_KEY_A:
7376 insn = emit_insn (gen_paciasp ());
7377 break;
7378 case AARCH64_KEY_B:
7379 insn = emit_insn (gen_pacibsp ());
7380 break;
7381 default:
7382 gcc_unreachable ();
7384 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7385 RTX_FRAME_RELATED_P (insn) = 1;
7388 if (flag_stack_usage_info)
7389 current_function_static_stack_size = constant_lower_bound (frame_size);
7391 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7393 if (crtl->is_leaf && !cfun->calls_alloca)
7395 if (maybe_gt (frame_size, PROBE_INTERVAL)
7396 && maybe_gt (frame_size, get_stack_check_protect ()))
7397 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7398 (frame_size
7399 - get_stack_check_protect ()));
7401 else if (maybe_gt (frame_size, 0))
7402 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7405 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7406 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7408 /* In theory we should never have both an initial adjustment
7409 and a callee save adjustment. Verify that is the case since the
7410 code below does not handle it for -fstack-clash-protection. */
7411 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7413 /* Will only probe if the initial adjustment is larger than the guard
7414 less the amount of the guard reserved for use by the caller's
7415 outgoing args. */
7416 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7417 true, false);
7419 if (callee_adjust != 0)
7420 aarch64_push_regs (reg1, reg2, callee_adjust);
7422 /* The offset of the frame chain record (if any) from the current SP. */
7423 poly_int64 chain_offset = (initial_adjust + callee_adjust
7424 - cfun->machine->frame.hard_fp_offset);
7425 gcc_assert (known_ge (chain_offset, 0));
7427 /* The offset of the bottom of the save area from the current SP. */
7428 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7430 if (emit_frame_chain)
7432 if (callee_adjust == 0)
7434 reg1 = R29_REGNUM;
7435 reg2 = R30_REGNUM;
7436 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7437 false, false);
7439 else
7440 gcc_assert (known_eq (chain_offset, 0));
7441 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7442 stack_pointer_rtx, chain_offset,
7443 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7444 if (frame_pointer_needed && !frame_size.is_constant ())
7446 /* Variable-sized frames need to describe the save slot
7447 address using DW_CFA_expression rather than DW_CFA_offset.
7448 This means that, without taking further action, the
7449 locations of the registers that we've already saved would
7450 remain based on the stack pointer even after we redefine
7451 the CFA based on the frame pointer. We therefore need new
7452 DW_CFA_expressions to re-express the save slots with addresses
7453 based on the frame pointer. */
7454 rtx_insn *insn = get_last_insn ();
7455 gcc_assert (RTX_FRAME_RELATED_P (insn));
7457 /* Add an explicit CFA definition if this was previously
7458 implicit. */
7459 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7461 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7462 callee_offset);
7463 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7464 gen_rtx_SET (hard_frame_pointer_rtx, src));
7467 /* Change the save slot expressions for the registers that
7468 we've already saved. */
7469 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7470 hard_frame_pointer_rtx, UNITS_PER_WORD);
7471 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7472 hard_frame_pointer_rtx, 0);
7474 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7477 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7478 callee_adjust != 0 || emit_frame_chain,
7479 emit_frame_chain);
7480 if (maybe_ne (sve_callee_adjust, 0))
7482 gcc_assert (!flag_stack_clash_protection
7483 || known_eq (initial_adjust, 0));
7484 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7485 sve_callee_adjust,
7486 !frame_pointer_needed, false);
7487 saved_regs_offset += sve_callee_adjust;
7489 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7490 false, emit_frame_chain);
7491 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7492 callee_adjust != 0 || emit_frame_chain,
7493 emit_frame_chain);
7495 /* We may need to probe the final adjustment if it is larger than the guard
7496 that is assumed by the callee.  */
7497 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7498 !frame_pointer_needed, true);
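/* Worked example (illustrative only; the real sequence depends on the frame
   layout chosen by aarch64_layout_frame): for a simple function that needs
   a frame record and whose whole frame fits in the writeback range, the
   code above typically emits

	stp	x29, x30, [sp, #-<callee_adjust>]!	// push FP/LR, allocate
	mov	x29, sp					// set up the frame chain
	sub	sp, sp, #<final_adjust>			// only if a separate
							// outgoing-args area is needed

   where <callee_adjust> and <final_adjust> are placeholders for the values
   computed during frame layout.  */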
7501 /* Return TRUE if we can use a simple_return insn.
7503 This function checks whether the callee saved stack is empty, which
7504 means no restore actions are needed.  The pro_and_epilogue pass will use
7505 this to check whether the shrink-wrapping optimization is feasible. */
7507 bool
7508 aarch64_use_return_insn_p (void)
7510 if (!reload_completed)
7511 return false;
7513 if (crtl->profile)
7514 return false;
7516 return known_eq (cfun->machine->frame.frame_size, 0);
7519 /* Generate the epilogue instructions for returning from a function.
7520 This is almost exactly the reverse of the prologue sequence, except
7521 that we need to insert barriers to avoid scheduling loads that read
7522 from a deallocated stack, and we optimize the unwind records by
7523 emitting them all together if possible. */
7524 void
7525 aarch64_expand_epilogue (bool for_sibcall)
7527 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7528 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7529 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7530 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7531 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7532 poly_int64 below_hard_fp_saved_regs_size
7533 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7534 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7535 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7536 rtx cfi_ops = NULL;
7537 rtx_insn *insn;
7538 /* A stack clash protection prologue may not have left EP0_REGNUM or
7539 EP1_REGNUM in a usable state. The same is true for allocations
7540 with an SVE component, since we then need both temporary registers
7541 for each allocation. For stack clash we are in a usable state if
7542 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7543 HOST_WIDE_INT guard_size
7544 = 1 << param_stack_clash_protection_guard_size;
7545 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7547 /* We can re-use the registers when:
7549 (a) the deallocation amount is the same as the corresponding
7550 allocation amount (which is false if we combine the initial
7551 and SVE callee save allocations in the prologue); and
7553 (b) the allocation amount doesn't need a probe (which is false
7554 if the amount is guard_size - guard_used_by_caller or greater).
7556 In such situations the register should remain live with the correct
7557 value. */
7558 bool can_inherit_p = (initial_adjust.is_constant ()
7559 && final_adjust.is_constant ()
7560 && (!flag_stack_clash_protection
7561 || (known_lt (initial_adjust,
7562 guard_size - guard_used_by_caller)
7563 && known_eq (sve_callee_adjust, 0))));
7565 /* We need to add memory barrier to prevent read from deallocated stack. */
7566 bool need_barrier_p
7567 = maybe_ne (get_frame_size ()
7568 + cfun->machine->frame.saved_varargs_size, 0);
7570 /* Emit a barrier to prevent loads from a deallocated stack. */
7571 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
7572 || cfun->calls_alloca
7573 || crtl->calls_eh_return)
7575 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7576 need_barrier_p = false;
7579 /* Restore the stack pointer from the frame pointer if it may not
7580 be the same as the stack pointer. */
7581 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7582 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7583 if (frame_pointer_needed
7584 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
7585 /* If writeback is used when restoring callee-saves, the CFA
7586 is restored on the instruction doing the writeback. */
7587 aarch64_add_offset (Pmode, stack_pointer_rtx,
7588 hard_frame_pointer_rtx,
7589 -callee_offset - below_hard_fp_saved_regs_size,
7590 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
7591 else
7592 /* The case where we need to re-use the register here is very rare, so
7593 avoid the complicated condition and just always emit a move if the
7594 immediate doesn't fit. */
7595 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
7597 /* Restore the vector registers before the predicate registers,
7598 so that we can use P4 as a temporary for big-endian SVE frames. */
7599 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
7600 callee_adjust != 0, &cfi_ops);
7601 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
7602 false, &cfi_ops);
7603 if (maybe_ne (sve_callee_adjust, 0))
7604 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
7605 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
7606 R0_REGNUM, R30_REGNUM,
7607 callee_adjust != 0, &cfi_ops);
7609 if (need_barrier_p)
7610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7612 if (callee_adjust != 0)
7613 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
7615 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
7617 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7618 insn = get_last_insn ();
7619 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
7620 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
7621 RTX_FRAME_RELATED_P (insn) = 1;
7622 cfi_ops = NULL;
7625 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
7626 restrict the emit_move optimization to leaf functions. */
7627 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
7628 (!can_inherit_p || !crtl->is_leaf
7629 || df_regs_ever_live_p (EP0_REGNUM)));
7631 if (cfi_ops)
7633 /* Emit delayed restores and reset the CFA to be SP. */
7634 insn = get_last_insn ();
7635 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
7636 REG_NOTES (insn) = cfi_ops;
7637 RTX_FRAME_RELATED_P (insn) = 1;
7640 /* We prefer to emit the combined return/authenticate instruction RETAA,
7641 however there are three cases in which we must instead emit an explicit
7642 authentication instruction.
7644 1) Sibcalls don't return in a normal way, so if we're about to call one
7645 we must authenticate.
7647 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7648 generating code for !TARGET_ARMV8_3 we can't use it and must
7649 explicitly authenticate.
7651 3) On an eh_return path we make extra stack adjustments to update the
7652 canonical frame address to be the exception handler's CFA. We want
7653 to authenticate using the CFA of the function which calls eh_return.  */
7655 if (aarch64_return_address_signing_enabled ()
7656 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
7658 switch (aarch64_ra_sign_key)
7660 case AARCH64_KEY_A:
7661 insn = emit_insn (gen_autiasp ());
7662 break;
7663 case AARCH64_KEY_B:
7664 insn = emit_insn (gen_autibsp ());
7665 break;
7666 default:
7667 gcc_unreachable ();
7669 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7670 RTX_FRAME_RELATED_P (insn) = 1;
7673 /* Stack adjustment for exception handler. */
7674 if (crtl->calls_eh_return && !for_sibcall)
7676 /* We need to unwind the stack by the offset computed by
7677 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7678 to be SP; letting the CFA move during this adjustment
7679 is just as correct as retaining the CFA from the body
7680 of the function. Therefore, do nothing special. */
7681 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
7684 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
7685 if (!for_sibcall)
7686 emit_jump_insn (ret_rtx);
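/* Worked example (illustrative only): the reverse of the prologue sketch
   above, as typically emitted for the same simple frame:

	add	sp, sp, #<final_adjust>			// only if the prologue emitted the sub
	ldp	x29, x30, [sp], #<callee_adjust>	// restore FP/LR, deallocate
	ret						// or RETAA when return addresses are signed

   <callee_adjust> and <final_adjust> are the same placeholder values as in
   the prologue example.  */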
7689 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7690 normally or return to a previous frame after unwinding.
7692 An EH return uses a single shared return sequence. The epilogue is
7693 exactly like a normal epilogue except that it has an extra input
7694 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7695 that must be applied after the frame has been destroyed. An extra label
7696 is inserted before the epilogue which initializes this register to zero,
7697 and this is the entry point for a normal return.
7699 An actual EH return updates the return address, initializes the stack
7700 adjustment and jumps directly into the epilogue (bypassing the zeroing
7701 of the adjustment). Since the return address is typically saved on the
7702 stack when a function makes a call, the saved LR must be updated outside
7703 the epilogue.
7705 This poses problems as the store is generated well before the epilogue,
7706 so the offset of LR is not known yet. Also optimizations will remove the
7707 store as it appears dead, even after the epilogue is generated (as the
7708 base or offset for loading LR is different in many cases).
7710 To avoid these problems this implementation forces the frame pointer
7711 in eh_return functions so that the location of LR is fixed and known early.
7712 It also marks the store volatile, so no optimization is permitted to
7713 remove the store. */
7714 rtx
7715 aarch64_eh_return_handler_rtx (void)
7717 rtx tmp = gen_frame_mem (Pmode,
7718 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
7720 /* Mark the store volatile, so no optimization is permitted to remove it. */
7721 MEM_VOLATILE_P (tmp) = true;
7722 return tmp;
7725 /* Output code to add DELTA to the first argument, and then jump
7726 to FUNCTION. Used for C++ multiple inheritance. */
7727 static void
7728 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7729 HOST_WIDE_INT delta,
7730 HOST_WIDE_INT vcall_offset,
7731 tree function)
7733 /* The this pointer is always in x0. Note that this differs from
7734 Arm, where the this pointer may be bumped to r1 if r0 is required
7735 to return a pointer to an aggregate. On AArch64 a result value
7736 pointer will be in x8. */
7737 int this_regno = R0_REGNUM;
7738 rtx this_rtx, temp0, temp1, addr, funexp;
7739 rtx_insn *insn;
7740 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
7742 if (aarch64_bti_enabled ())
7743 emit_insn (gen_bti_c());
7745 reload_completed = 1;
7746 emit_note (NOTE_INSN_PROLOGUE_END);
7748 this_rtx = gen_rtx_REG (Pmode, this_regno);
7749 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
7750 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
7752 if (vcall_offset == 0)
7753 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
7754 else
7756 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
7758 addr = this_rtx;
7759 if (delta != 0)
7761 if (delta >= -256 && delta < 256)
7762 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
7763 plus_constant (Pmode, this_rtx, delta));
7764 else
7765 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
7766 temp1, temp0, false);
7769 if (Pmode == ptr_mode)
7770 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
7771 else
7772 aarch64_emit_move (temp0,
7773 gen_rtx_ZERO_EXTEND (Pmode,
7774 gen_rtx_MEM (ptr_mode, addr)));
7776 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
7777 addr = plus_constant (Pmode, temp0, vcall_offset);
7778 else
7780 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
7781 Pmode);
7782 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
7785 if (Pmode == ptr_mode)
7786 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
7787 else
7788 aarch64_emit_move (temp1,
7789 gen_rtx_SIGN_EXTEND (Pmode,
7790 gen_rtx_MEM (ptr_mode, addr)));
7792 emit_insn (gen_add2_insn (this_rtx, temp1));
7795 /* Generate a tail call to the target function. */
7796 if (!TREE_USED (function))
7798 assemble_external (function);
7799 TREE_USED (function) = 1;
7801 funexp = XEXP (DECL_RTL (function), 0);
7802 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
7803 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
7804 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
7805 SIBLING_CALL_P (insn) = 1;
7807 insn = get_insns ();
7808 shorten_branches (insn);
7810 assemble_start_function (thunk, fnname);
7811 final_start_function (insn, file, 1);
7812 final (insn, file, 1);
7813 final_end_function ();
7814 assemble_end_function (thunk, fnname);
7816 /* Stop pretending to be a post-reload pass. */
7817 reload_completed = 0;
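/* Worked example (illustrative only): for a thunk with DELTA == 16 and
   VCALL_OFFSET == 0, the code above boils down to

	add	x0, x0, #16		// adjust the this pointer
	b	<function>		// sibcall to FUNCTION

   with a BTI C landing pad prepended when -mbranch-protection enables BTI.
   Larger DELTA or a non-zero VCALL_OFFSET use the EP0/EP1 temporaries
   (r12/r13) as scratch registers.  */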
7820 static bool
7821 aarch64_tls_referenced_p (rtx x)
7823 if (!TARGET_HAVE_TLS)
7824 return false;
7825 subrtx_iterator::array_type array;
7826 FOR_EACH_SUBRTX (iter, array, x, ALL)
7828 const_rtx x = *iter;
7829 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
7830 return true;
7831 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7832 TLS offsets, not real symbol references. */
7833 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7834 iter.skip_subrtxes ();
7836 return false;
7840 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7841 a left shift of 0 or 12 bits. */
7842 bool
7843 aarch64_uimm12_shift (HOST_WIDE_INT val)
7845 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
7846 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
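/* Illustrative only (not built as part of GCC): a few sample values for the
   predicate above.  */
#if 0
static void
aarch64_uimm12_shift_examples (void)
{
  gcc_assert (aarch64_uimm12_shift (0xabc));	 /* fits in the low 12 bits */
  gcc_assert (aarch64_uimm12_shift (0xabc000));	 /* 12-bit value, LSL #12 */
  gcc_assert (!aarch64_uimm12_shift (0xabc001)); /* needs bits in both halves */
}
#endif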
7850 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
7851 that can be created with a left shift of 0 or 12. */
7852 static HOST_WIDE_INT
7853 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
7855 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7856 handle correctly. */
7857 gcc_assert ((val & 0xffffff) == val);
7859 if (((val & 0xfff) << 0) == val)
7860 return val;
7862 return val & (0xfff << 12);
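/* Illustrative only (not built as part of GCC): how the clamping above
   behaves for sample 24-bit inputs.  */
#if 0
static void
aarch64_clamp_to_uimm12_shift_examples (void)
{
  /* Already encodable: returned unchanged.  */
  gcc_assert (aarch64_clamp_to_uimm12_shift (0xabc) == 0xabc);
  /* 0x123456 is not encodable; the nearest encodable value keeps only the
     upper half of the 24-bit input, i.e. a 12-bit value shifted left 12.  */
  gcc_assert (aarch64_clamp_to_uimm12_shift (0x123456) == 0x123000);
}
#endif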
7865 /* Return true if val is an immediate that can be loaded into a
7866 register by a MOVZ instruction. */
7867 static bool
7868 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
7870 if (GET_MODE_SIZE (mode) > 4)
7872 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7873 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7874 return 1;
7876 else
7878 /* Ignore sign extension. */
7879 val &= (HOST_WIDE_INT) 0xffffffff;
7881 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7882 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
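/* Illustrative only (not built as part of GCC): sample values for the MOVZ
   test above.  */
#if 0
static void
aarch64_movw_imm_examples (void)
{
  gcc_assert (aarch64_movw_imm (0x1234, DImode));	/* MOVZ #0x1234 */
  gcc_assert (aarch64_movw_imm (0x12340000, DImode));	/* MOVZ #0x1234, LSL #16 */
  gcc_assert (aarch64_movw_imm ((HOST_WIDE_INT) 0xabcd << 32, DImode));
  gcc_assert (!aarch64_movw_imm (0x12345678, DImode));	/* spans two 16-bit chunks */
}
#endif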
7885 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7886 64-bit (DImode) integer. */
7888 static unsigned HOST_WIDE_INT
7889 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7891 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7892 while (size < 64)
7894 val &= (HOST_WIDE_INT_1U << size) - 1;
7895 val |= val << size;
7896 size *= 2;
7898 return val;
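/* Illustrative only (not built as part of GCC): replication of narrow
   values into a 64-bit pattern by the function above.  */
#if 0
static void
aarch64_replicate_bitmask_imm_examples (void)
{
  /* An 8-bit element is repeated eight times...  */
  gcc_assert (aarch64_replicate_bitmask_imm (0xa5, QImode)
	      == HOST_WIDE_INT_UC (0xa5a5a5a5a5a5a5a5));
  /* ...and a 32-bit element twice.  */
  gcc_assert (aarch64_replicate_bitmask_imm (0x12345678, SImode)
	      == HOST_WIDE_INT_UC (0x1234567812345678));
}
#endif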
7901 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7903 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7905 0x0000000100000001ull,
7906 0x0001000100010001ull,
7907 0x0101010101010101ull,
7908 0x1111111111111111ull,
7909 0x5555555555555555ull,
7913 /* Return true if val is a valid bitmask immediate. */
7915 bool
7916 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7918 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7919 int bits;
7921 /* Check for a single sequence of one bits and return quickly if so.
7922 The special cases of all ones and all zeroes return false. */
7923 val = aarch64_replicate_bitmask_imm (val_in, mode);
7924 tmp = val + (val & -val);
7926 if (tmp == (tmp & -tmp))
7927 return (val + 1) > 1;
7929 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7930 if (mode == SImode)
7931 val = (val << 32) | (val & 0xffffffff);
7933 /* Invert if the immediate doesn't start with a zero bit - this means we
7934 only need to search for sequences of one bits. */
7935 if (val & 1)
7936 val = ~val;
7938 /* Find the first set bit and set tmp to val with the first sequence of one
7939 bits removed. Return success if there is a single sequence of ones. */
7940 first_one = val & -val;
7941 tmp = val & (val + first_one);
7943 if (tmp == 0)
7944 return true;
7946 /* Find the next set bit and compute the difference in bit position. */
7947 next_one = tmp & -tmp;
7948 bits = clz_hwi (first_one) - clz_hwi (next_one);
7949 mask = val ^ tmp;
7951 /* Check the bit position difference is a power of 2, and that the first
7952 sequence of one bits fits within 'bits' bits. */
7953 if ((mask >> bits) != 0 || bits != (bits & -bits))
7954 return false;
7956 /* Check the sequence of one bits is repeated 64/bits times. */
7957 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
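/* Illustrative only (not built as part of GCC): bitmask immediates are
   repetitions of a rotated run of ones, as recognised above.  */
#if 0
static void
aarch64_bitmask_imm_examples (void)
{
  /* A single contiguous run of ones.  */
  gcc_assert (aarch64_bitmask_imm (0x3f0, DImode));
  /* A 4-bit run repeated in every byte.  */
  gcc_assert (aarch64_bitmask_imm (HOST_WIDE_INT_UC (0x0f0f0f0f0f0f0f0f),
				   DImode));
  /* Not a repeating pattern across the full 64 bits...  */
  gcc_assert (!aarch64_bitmask_imm (0xff00ff, DImode));
  /* ...but it does repeat when treated as a 32-bit value.  */
  gcc_assert (aarch64_bitmask_imm (0xff00ff, SImode));
}
#endif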
7960 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
7961 Assumed precondition: VAL_IN is not zero. */
7963 unsigned HOST_WIDE_INT
7964 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7966 int lowest_bit_set = ctz_hwi (val_in);
7967 int highest_bit_set = floor_log2 (val_in);
7968 gcc_assert (val_in != 0);
7970 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7971 (HOST_WIDE_INT_1U << lowest_bit_set));
7974 /* Create a constant in which all bits outside the range from the lowest set
7975 bit to the highest set bit of VAL_IN are set to 1. */
7977 unsigned HOST_WIDE_INT
7978 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7980 return val_in | ~aarch64_and_split_imm1 (val_in);
7983 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7985 bool
7986 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7988 scalar_int_mode int_mode;
7989 if (!is_a <scalar_int_mode> (mode, &int_mode))
7990 return false;
7992 if (aarch64_bitmask_imm (val_in, int_mode))
7993 return false;
7995 if (aarch64_move_imm (val_in, int_mode))
7996 return false;
7998 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8000 return aarch64_bitmask_imm (imm2, int_mode);
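/* Illustrative only (not built as part of GCC): splitting an AND with a
   non-encodable mask into two encodable bitmask immediates.  For
   VAL == 0xf000f0 (bits 4-7 and 20-23 set), imm1 is the single run covering
   bits 4-23 and imm2 is VAL with every bit outside that run set; since
   imm1 & imm2 == VAL, "x & VAL" can be implemented as two AND instructions
   with encodable immediates, which is exactly the situation that
   aarch64_and_bitmask_imm reports.  */
#if 0
static void
aarch64_and_bitmask_imm_example (void)
{
  gcc_assert (aarch64_and_split_imm1 (0xf000f0) == 0xfffff0);
  gcc_assert (aarch64_and_split_imm2 (0xf000f0)
	      == HOST_WIDE_INT_UC (0xfffffffffff000ff));
  gcc_assert ((aarch64_and_split_imm1 (0xf000f0)
	       & aarch64_and_split_imm2 (0xf000f0)) == 0xf000f0);
  gcc_assert (aarch64_and_bitmask_imm (0xf000f0, DImode));
}
#endif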
8003 /* Return true if val is an immediate that can be loaded into a
8004 register in a single instruction. */
8005 bool
8006 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8008 scalar_int_mode int_mode;
8009 if (!is_a <scalar_int_mode> (mode, &int_mode))
8010 return false;
8012 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8013 return 1;
8014 return aarch64_bitmask_imm (val, int_mode);
8017 static bool
8018 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8020 rtx base, offset;
8022 if (GET_CODE (x) == HIGH)
8023 return true;
8025 /* There's no way to calculate VL-based values using relocations. */
8026 subrtx_iterator::array_type array;
8027 FOR_EACH_SUBRTX (iter, array, x, ALL)
8028 if (GET_CODE (*iter) == CONST_POLY_INT)
8029 return true;
8031 split_const (x, &base, &offset);
8032 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8034 if (aarch64_classify_symbol (base, INTVAL (offset))
8035 != SYMBOL_FORCE_TO_MEM)
8036 return true;
8037 else
8038 /* Avoid generating a 64-bit relocation in ILP32; leave
8039 it to aarch64_expand_mov_immediate to handle it properly. */
8040 return mode != ptr_mode;
8043 return aarch64_tls_referenced_p (x);
8046 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8047 The expansion for a table switch is quite expensive due to the number
8048 of instructions, the table lookup and the hard-to-predict indirect jump.
8049 When optimizing for speed with -O3 enabled, use the per-core tuning if
8050 set; otherwise use tables for more than 16 cases as a tradeoff between size and
8051 performance. When optimizing for size, use the default setting. */
8053 static unsigned int
8054 aarch64_case_values_threshold (void)
8056 /* Use the specified limit for the number of cases before using jump
8057 tables at higher optimization levels. */
8058 if (optimize > 2
8059 && selected_cpu->tune->max_case_values != 0)
8060 return selected_cpu->tune->max_case_values;
8061 else
8062 return optimize_size ? default_case_values_threshold () : 17;
8065 /* Return true if register REGNO is a valid index register.
8066 STRICT_P is true if REG_OK_STRICT is in effect. */
8068 bool
8069 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8071 if (!HARD_REGISTER_NUM_P (regno))
8073 if (!strict_p)
8074 return true;
8076 if (!reg_renumber)
8077 return false;
8079 regno = reg_renumber[regno];
8081 return GP_REGNUM_P (regno);
8084 /* Return true if register REGNO is a valid base register for mode MODE.
8085 STRICT_P is true if REG_OK_STRICT is in effect. */
8087 bool
8088 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8090 if (!HARD_REGISTER_NUM_P (regno))
8092 if (!strict_p)
8093 return true;
8095 if (!reg_renumber)
8096 return false;
8098 regno = reg_renumber[regno];
8101 /* The fake registers will be eliminated to either the stack or
8102 hard frame pointer, both of which are usually valid base registers.
8103 Reload deals with the cases where the eliminated form isn't valid. */
8104 return (GP_REGNUM_P (regno)
8105 || regno == SP_REGNUM
8106 || regno == FRAME_POINTER_REGNUM
8107 || regno == ARG_POINTER_REGNUM);
8110 /* Return true if X is a valid base register for mode MODE.
8111 STRICT_P is true if REG_OK_STRICT is in effect. */
8113 static bool
8114 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8116 if (!strict_p
8117 && GET_CODE (x) == SUBREG
8118 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8119 x = SUBREG_REG (x);
8121 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8124 /* Return true if address offset is a valid index. If it is, fill in INFO
8125 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8127 static bool
8128 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8129 machine_mode mode, bool strict_p)
8131 enum aarch64_address_type type;
8132 rtx index;
8133 int shift;
8135 /* (reg:P) */
8136 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8137 && GET_MODE (x) == Pmode)
8139 type = ADDRESS_REG_REG;
8140 index = x;
8141 shift = 0;
8143 /* (sign_extend:DI (reg:SI)) */
8144 else if ((GET_CODE (x) == SIGN_EXTEND
8145 || GET_CODE (x) == ZERO_EXTEND)
8146 && GET_MODE (x) == DImode
8147 && GET_MODE (XEXP (x, 0)) == SImode)
8149 type = (GET_CODE (x) == SIGN_EXTEND)
8150 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8151 index = XEXP (x, 0);
8152 shift = 0;
8154 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8155 else if (GET_CODE (x) == MULT
8156 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8157 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8158 && GET_MODE (XEXP (x, 0)) == DImode
8159 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8160 && CONST_INT_P (XEXP (x, 1)))
8162 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8163 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8164 index = XEXP (XEXP (x, 0), 0);
8165 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8167 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8168 else if (GET_CODE (x) == ASHIFT
8169 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8170 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8171 && GET_MODE (XEXP (x, 0)) == DImode
8172 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8173 && CONST_INT_P (XEXP (x, 1)))
8175 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8176 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8177 index = XEXP (XEXP (x, 0), 0);
8178 shift = INTVAL (XEXP (x, 1));
8180 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8181 else if ((GET_CODE (x) == SIGN_EXTRACT
8182 || GET_CODE (x) == ZERO_EXTRACT)
8183 && GET_MODE (x) == DImode
8184 && GET_CODE (XEXP (x, 0)) == MULT
8185 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8186 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8188 type = (GET_CODE (x) == SIGN_EXTRACT)
8189 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8190 index = XEXP (XEXP (x, 0), 0);
8191 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8192 if (INTVAL (XEXP (x, 1)) != 32 + shift
8193 || INTVAL (XEXP (x, 2)) != 0)
8194 shift = -1;
8196 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8197 (const_int 0xffffffff<<shift)) */
8198 else if (GET_CODE (x) == AND
8199 && GET_MODE (x) == DImode
8200 && GET_CODE (XEXP (x, 0)) == MULT
8201 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8202 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8203 && CONST_INT_P (XEXP (x, 1)))
8205 type = ADDRESS_REG_UXTW;
8206 index = XEXP (XEXP (x, 0), 0);
8207 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8208 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8209 shift = -1;
8211 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8212 else if ((GET_CODE (x) == SIGN_EXTRACT
8213 || GET_CODE (x) == ZERO_EXTRACT)
8214 && GET_MODE (x) == DImode
8215 && GET_CODE (XEXP (x, 0)) == ASHIFT
8216 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8217 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8219 type = (GET_CODE (x) == SIGN_EXTRACT)
8220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8221 index = XEXP (XEXP (x, 0), 0);
8222 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8223 if (INTVAL (XEXP (x, 1)) != 32 + shift
8224 || INTVAL (XEXP (x, 2)) != 0)
8225 shift = -1;
8227 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8228 (const_int 0xffffffff<<shift)) */
8229 else if (GET_CODE (x) == AND
8230 && GET_MODE (x) == DImode
8231 && GET_CODE (XEXP (x, 0)) == ASHIFT
8232 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8233 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8234 && CONST_INT_P (XEXP (x, 1)))
8236 type = ADDRESS_REG_UXTW;
8237 index = XEXP (XEXP (x, 0), 0);
8238 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8239 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8240 shift = -1;
8242 /* (mult:P (reg:P) (const_int scale)) */
8243 else if (GET_CODE (x) == MULT
8244 && GET_MODE (x) == Pmode
8245 && GET_MODE (XEXP (x, 0)) == Pmode
8246 && CONST_INT_P (XEXP (x, 1)))
8248 type = ADDRESS_REG_REG;
8249 index = XEXP (x, 0);
8250 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8252 /* (ashift:P (reg:P) (const_int shift)) */
8253 else if (GET_CODE (x) == ASHIFT
8254 && GET_MODE (x) == Pmode
8255 && GET_MODE (XEXP (x, 0)) == Pmode
8256 && CONST_INT_P (XEXP (x, 1)))
8258 type = ADDRESS_REG_REG;
8259 index = XEXP (x, 0);
8260 shift = INTVAL (XEXP (x, 1));
8262 else
8263 return false;
8265 if (!strict_p
8266 && GET_CODE (index) == SUBREG
8267 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8268 index = SUBREG_REG (index);
8270 if (aarch64_sve_data_mode_p (mode))
8272 if (type != ADDRESS_REG_REG
8273 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8274 return false;
8276 else
8278 if (shift != 0
8279 && !(IN_RANGE (shift, 1, 3)
8280 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8281 return false;
8284 if (REG_P (index)
8285 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8287 info->type = type;
8288 info->offset = index;
8289 info->shift = shift;
8290 return true;
8293 return false;
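/* Worked examples (illustrative only; x1/w1 are arbitrary registers) of
   index forms recognised above, with the addressing syntax they provide
   when combined with a base register Xn:

     (reg:DI x1)					-> [Xn, x1]
     (mult (sign_extend:DI (reg:SI w1)) (const_int 4))	-> [Xn, w1, sxtw #2]
							   (4-byte access)
     (ashift (reg:DI x1) (const_int 3))			-> [Xn, x1, lsl #3]
							   (8-byte access)

   A non-zero shift is only accepted when it matches the access size, or the
   element size for SVE data modes.  */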
8296 /* Return true if MODE is one of the modes for which we
8297 support LDP/STP operations. */
8299 static bool
8300 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8302 return mode == SImode || mode == DImode
8303 || mode == SFmode || mode == DFmode
8304 || (aarch64_vector_mode_supported_p (mode)
8305 && (known_eq (GET_MODE_SIZE (mode), 8)
8306 || (known_eq (GET_MODE_SIZE (mode), 16)
8307 && (aarch64_tune_params.extra_tuning_flags
8308 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8311 /* Return true if REGNO is a virtual pointer register, or an eliminable
8312 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8313 include stack_pointer or hard_frame_pointer. */
8314 static bool
8315 virt_or_elim_regno_p (unsigned regno)
8317 return ((regno >= FIRST_VIRTUAL_REGISTER
8318 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8319 || regno == FRAME_POINTER_REGNUM
8320 || regno == ARG_POINTER_REGNUM);
8323 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8324 If it is, fill in INFO appropriately. STRICT_P is true if
8325 REG_OK_STRICT is in effect. */
8327 bool
8328 aarch64_classify_address (struct aarch64_address_info *info,
8329 rtx x, machine_mode mode, bool strict_p,
8330 aarch64_addr_query_type type)
8332 enum rtx_code code = GET_CODE (x);
8333 rtx op0, op1;
8334 poly_int64 offset;
8336 HOST_WIDE_INT const_size;
8338 /* Whether a vector mode is partial doesn't affect address legitimacy.
8339 Partial vectors like VNx8QImode allow the same indexed addressing
8340 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8341 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8342 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8343 vec_flags &= ~VEC_PARTIAL;
8345 /* On BE, we use load/store pair for all large int mode load/stores.
8346 TI/TFmode may also use a load/store pair. */
8347 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8348 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8349 || type == ADDR_QUERY_LDP_STP_N
8350 || mode == TImode
8351 || mode == TFmode
8352 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8354 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8355 corresponds to the actual size of the memory being loaded/stored and the
8356 mode used for the addressing-mode check is half of that.
8357 if (type == ADDR_QUERY_LDP_STP_N
8358 && known_eq (GET_MODE_SIZE (mode), 16))
8359 mode = DFmode;
8361 bool allow_reg_index_p = (!load_store_pair_p
8362 && (known_lt (GET_MODE_SIZE (mode), 16)
8363 || vec_flags == VEC_ADVSIMD
8364 || vec_flags & VEC_SVE_DATA));
8366 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8367 [Rn, #offset, MUL VL]. */
8368 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8369 && (code != REG && code != PLUS))
8370 return false;
8372 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8373 REG addressing. */
8374 if (advsimd_struct_p
8375 && !BYTES_BIG_ENDIAN
8376 && (code != POST_INC && code != REG))
8377 return false;
8379 gcc_checking_assert (GET_MODE (x) == VOIDmode
8380 || SCALAR_INT_MODE_P (GET_MODE (x)));
8382 switch (code)
8384 case REG:
8385 case SUBREG:
8386 info->type = ADDRESS_REG_IMM;
8387 info->base = x;
8388 info->offset = const0_rtx;
8389 info->const_offset = 0;
8390 return aarch64_base_register_rtx_p (x, strict_p);
8392 case PLUS:
8393 op0 = XEXP (x, 0);
8394 op1 = XEXP (x, 1);
8396 if (! strict_p
8397 && REG_P (op0)
8398 && virt_or_elim_regno_p (REGNO (op0))
8399 && poly_int_rtx_p (op1, &offset))
8401 info->type = ADDRESS_REG_IMM;
8402 info->base = op0;
8403 info->offset = op1;
8404 info->const_offset = offset;
8406 return true;
8409 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8410 && aarch64_base_register_rtx_p (op0, strict_p)
8411 && poly_int_rtx_p (op1, &offset))
8413 info->type = ADDRESS_REG_IMM;
8414 info->base = op0;
8415 info->offset = op1;
8416 info->const_offset = offset;
8418 /* TImode and TFmode values are allowed in both pairs of X
8419 registers and individual Q registers. The available
8420 address modes are:
8421 X,X: 7-bit signed scaled offset
8422 Q: 9-bit signed offset
8423 We conservatively require an offset representable in either mode.
8424 When performing the check for pairs of X registers i.e. LDP/STP
8425 pass down DImode since that is the natural size of the LDP/STP
8426 instruction memory accesses. */
8427 if (mode == TImode || mode == TFmode)
8428 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8429 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8430 || offset_12bit_unsigned_scaled_p (mode, offset)));
8432 /* A 7-bit offset check because OImode will emit an ldp/stp
8433 instruction (only big endian will get here).
8434 For ldp/stp instructions, the offset is scaled for the size of a
8435 single element of the pair. */
8436 if (mode == OImode)
8437 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8439 /* Three 9/12-bit offset checks because CImode will emit three
8440 ldr/str instructions (only big endian will get here). */
8441 if (mode == CImode)
8442 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8443 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8444 offset + 32)
8445 || offset_12bit_unsigned_scaled_p (V16QImode,
8446 offset + 32)));
8448 /* Two 7-bit offset checks because XImode will emit two ldp/stp
8449 instructions (only big endian will get here). */
8450 if (mode == XImode)
8451 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8452 && aarch64_offset_7bit_signed_scaled_p (TImode,
8453 offset + 32));
8455 /* Make "m" use the LD1 offset range for SVE data modes, so
8456 that pre-RTL optimizers like ivopts will work to that
8457 instead of the wider LDR/STR range. */
8458 if (vec_flags == VEC_SVE_DATA)
8459 return (type == ADDR_QUERY_M
8460 ? offset_4bit_signed_scaled_p (mode, offset)
8461 : offset_9bit_signed_scaled_p (mode, offset));
8463 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8465 poly_int64 end_offset = (offset
8466 + GET_MODE_SIZE (mode)
8467 - BYTES_PER_SVE_VECTOR);
8468 return (type == ADDR_QUERY_M
8469 ? offset_4bit_signed_scaled_p (mode, offset)
8470 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8471 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8472 end_offset)));
8475 if (vec_flags == VEC_SVE_PRED)
8476 return offset_9bit_signed_scaled_p (mode, offset);
8478 if (load_store_pair_p)
8479 return ((known_eq (GET_MODE_SIZE (mode), 4)
8480 || known_eq (GET_MODE_SIZE (mode), 8)
8481 || known_eq (GET_MODE_SIZE (mode), 16))
8482 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8483 else
8484 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8485 || offset_12bit_unsigned_scaled_p (mode, offset));
8488 if (allow_reg_index_p)
8490 /* Look for base + (scaled/extended) index register. */
8491 if (aarch64_base_register_rtx_p (op0, strict_p)
8492 && aarch64_classify_index (info, op1, mode, strict_p))
8494 info->base = op0;
8495 return true;
8497 if (aarch64_base_register_rtx_p (op1, strict_p)
8498 && aarch64_classify_index (info, op0, mode, strict_p))
8500 info->base = op1;
8501 return true;
8505 return false;
8507 case POST_INC:
8508 case POST_DEC:
8509 case PRE_INC:
8510 case PRE_DEC:
8511 info->type = ADDRESS_REG_WB;
8512 info->base = XEXP (x, 0);
8513 info->offset = NULL_RTX;
8514 return aarch64_base_register_rtx_p (info->base, strict_p);
8516 case POST_MODIFY:
8517 case PRE_MODIFY:
8518 info->type = ADDRESS_REG_WB;
8519 info->base = XEXP (x, 0);
8520 if (GET_CODE (XEXP (x, 1)) == PLUS
8521 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
8522 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
8523 && aarch64_base_register_rtx_p (info->base, strict_p))
8525 info->offset = XEXP (XEXP (x, 1), 1);
8526 info->const_offset = offset;
8528 /* TImode and TFmode values are allowed in both pairs of X
8529 registers and individual Q registers. The available
8530 address modes are:
8531 X,X: 7-bit signed scaled offset
8532 Q: 9-bit signed offset
8533 We conservatively require an offset representable in either mode.  */
8535 if (mode == TImode || mode == TFmode)
8536 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
8537 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
8539 if (load_store_pair_p)
8540 return ((known_eq (GET_MODE_SIZE (mode), 4)
8541 || known_eq (GET_MODE_SIZE (mode), 8)
8542 || known_eq (GET_MODE_SIZE (mode), 16))
8543 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8544 else
8545 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
8547 return false;
8549 case CONST:
8550 case SYMBOL_REF:
8551 case LABEL_REF:
8552 /* load literal: pc-relative constant pool entry. Only supported
8553 for SI mode or larger. */
8554 info->type = ADDRESS_SYMBOLIC;
8556 if (!load_store_pair_p
8557 && GET_MODE_SIZE (mode).is_constant (&const_size)
8558 && const_size >= 4)
8560 rtx sym, addend;
8562 split_const (x, &sym, &addend);
8563 return ((GET_CODE (sym) == LABEL_REF
8564 || (GET_CODE (sym) == SYMBOL_REF
8565 && CONSTANT_POOL_ADDRESS_P (sym)
8566 && aarch64_pcrelative_literal_loads)));
8568 return false;
8570 case LO_SUM:
8571 info->type = ADDRESS_LO_SUM;
8572 info->base = XEXP (x, 0);
8573 info->offset = XEXP (x, 1);
8574 if (allow_reg_index_p
8575 && aarch64_base_register_rtx_p (info->base, strict_p))
8577 rtx sym, offs;
8578 split_const (info->offset, &sym, &offs);
8579 if (GET_CODE (sym) == SYMBOL_REF
8580 && (aarch64_classify_symbol (sym, INTVAL (offs))
8581 == SYMBOL_SMALL_ABSOLUTE))
8583 /* The symbol and offset must be aligned to the access size. */
8584 unsigned int align;
8586 if (CONSTANT_POOL_ADDRESS_P (sym))
8587 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
8588 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
8590 tree exp = SYMBOL_REF_DECL (sym);
8591 align = TYPE_ALIGN (TREE_TYPE (exp));
8592 align = aarch64_constant_alignment (exp, align);
8594 else if (SYMBOL_REF_DECL (sym))
8595 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
8596 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
8597 && SYMBOL_REF_BLOCK (sym) != NULL)
8598 align = SYMBOL_REF_BLOCK (sym)->alignment;
8599 else
8600 align = BITS_PER_UNIT;
8602 poly_int64 ref_size = GET_MODE_SIZE (mode);
8603 if (known_eq (ref_size, 0))
8604 ref_size = GET_MODE_SIZE (DImode);
8606 return (multiple_p (INTVAL (offs), ref_size)
8607 && multiple_p (align / BITS_PER_UNIT, ref_size));
8610 return false;
8612 default:
8613 return false;
8617 /* Return true if the address X is valid for a PRFM instruction.
8618 STRICT_P is true if we should do strict checking with
8619 aarch64_classify_address. */
8621 bool
8622 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
8624 struct aarch64_address_info addr;
8626 /* PRFM accepts the same addresses as DImode... */
8627 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
8628 if (!res)
8629 return false;
8631 /* ... except writeback forms. */
8632 return addr.type != ADDRESS_REG_WB;
8635 bool
8636 aarch64_symbolic_address_p (rtx x)
8638 rtx offset;
8640 split_const (x, &x, &offset);
8641 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
8644 /* Classify the base of symbolic expression X. */
8646 enum aarch64_symbol_type
8647 aarch64_classify_symbolic_expression (rtx x)
8649 rtx offset;
8651 split_const (x, &x, &offset);
8652 return aarch64_classify_symbol (x, INTVAL (offset));
8656 /* Return TRUE if X is a legitimate address for accessing memory in
8657 mode MODE. */
8658 static bool
8659 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
8661 struct aarch64_address_info addr;
8663 return aarch64_classify_address (&addr, x, mode, strict_p);
8666 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8667 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8668 bool
8669 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
8670 aarch64_addr_query_type type)
8672 struct aarch64_address_info addr;
8674 return aarch64_classify_address (&addr, x, mode, strict_p, type);
8677 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8679 static bool
8680 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
8681 poly_int64 orig_offset,
8682 machine_mode mode)
8684 HOST_WIDE_INT size;
8685 if (GET_MODE_SIZE (mode).is_constant (&size))
8687 HOST_WIDE_INT const_offset, second_offset;
8689 /* A general SVE offset is A * VQ + B. Remove the A component from
8690 coefficient 0 in order to get the constant B. */
8691 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
8693 /* Split an out-of-range address displacement into a base and
8694 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8695 range otherwise to increase opportunities for sharing the base
8696 address of different sizes. Unaligned accesses use the signed
8697 9-bit range, TImode/TFmode use the intersection of signed
8698 scaled 7-bit and signed 9-bit offset. */
8699 if (mode == TImode || mode == TFmode)
8700 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
8701 else if ((const_offset & (size - 1)) != 0)
8702 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
8703 else
8704 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
8706 if (second_offset == 0 || known_eq (orig_offset, second_offset))
8707 return false;
8709 /* Split the offset into second_offset and the rest. */
8710 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8711 *offset2 = gen_int_mode (second_offset, Pmode);
8712 return true;
8714 else
8716 /* Get the mode we should use as the basis of the range. For structure
8717 modes this is the mode of one vector. */
8718 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8719 machine_mode step_mode
8720 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
8722 /* Get the "mul vl" multiplier we'd like to use. */
8723 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
8724 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
8725 if (vec_flags & VEC_SVE_DATA)
8726 /* LDR supports a 9-bit range, but the move patterns for
8727 structure modes require all vectors to be in range of the
8728 same base. The simplest way of accommodating that while still
8729 promoting reuse of anchor points between different modes is
8730 to use an 8-bit range unconditionally. */
8731 vnum = ((vnum + 128) & 255) - 128;
8732 else
8733 /* Predicates are only handled singly, so we might as well use
8734 the full range. */
8735 vnum = ((vnum + 256) & 511) - 256;
8736 if (vnum == 0)
8737 return false;
8739 /* Convert the "mul vl" multiplier into a byte offset. */
8740 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
8741 if (known_eq (second_offset, orig_offset))
8742 return false;
8744 /* Split the offset into second_offset and the rest. */
8745 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8746 *offset2 = gen_int_mode (second_offset, Pmode);
8747 return true;
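/* Worked example (illustrative only): a DImode access at constant offset
   0x10008 from a base register cannot be encoded directly, so the code
   above splits it as

     *offset1 = 0x10000	  (added to the base to form a new anchor)
     *offset2 = 0x8	  (0x10008 & 0x3ffc, a valid scaled LDR/STR offset)

   which lets nearby accesses share the 0x10000 anchor.  */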
8751 /* Return the binary representation of floating point constant VALUE in INTVAL.
8752 If the value cannot be converted, return false without setting INTVAL.
8753 The conversion is done in the given MODE. */
8754 bool
8755 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
8758 /* We make a general exception for 0. */
8759 if (aarch64_float_const_zero_rtx_p (value))
8761 *intval = 0;
8762 return true;
8765 scalar_float_mode mode;
8766 if (GET_CODE (value) != CONST_DOUBLE
8767 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
8768 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
8769 /* Only support up to DF mode. */
8770 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
8771 return false;
8773 unsigned HOST_WIDE_INT ival = 0;
8775 long res[2];
8776 real_to_target (res,
8777 CONST_DOUBLE_REAL_VALUE (value),
8778 REAL_MODE_FORMAT (mode));
8780 if (mode == DFmode)
8782 int order = BYTES_BIG_ENDIAN ? 1 : 0;
8783 ival = zext_hwi (res[order], 32);
8784 ival |= (zext_hwi (res[1 - order], 32) << 32);
8786 else
8787 ival = zext_hwi (res[0], 32);
8789 *intval = ival;
8790 return true;
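/* Illustrative only (not built as part of GCC): the raw IEEE encodings the
   function above produces for some familiar constants.  */
#if 0
static void
aarch64_reinterpret_float_as_int_examples (void)
{
  unsigned HOST_WIDE_INT ival;
  /* 1.0 in DFmode is 0x3ff0000000000000.  */
  if (aarch64_reinterpret_float_as_int
	(const_double_from_real_value (dconst1, DFmode), &ival))
    gcc_assert (ival == HOST_WIDE_INT_UC (0x3ff0000000000000));
  /* 1.0 in SFmode is 0x3f800000.  */
  if (aarch64_reinterpret_float_as_int
	(const_double_from_real_value (dconst1, SFmode), &ival))
    gcc_assert (ival == 0x3f800000);
}
#endif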
8793 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8794 single MOV(+MOVK) followed by an FMOV. */
8795 bool
8796 aarch64_float_const_rtx_p (rtx x)
8798 machine_mode mode = GET_MODE (x);
8799 if (mode == VOIDmode)
8800 return false;
8802 /* Determine whether it's cheaper to write float constants as
8803 mov/movk pairs over ldr/adrp pairs. */
8804 unsigned HOST_WIDE_INT ival;
8806 if (GET_CODE (x) == CONST_DOUBLE
8807 && SCALAR_FLOAT_MODE_P (mode)
8808 && aarch64_reinterpret_float_as_int (x, &ival))
8810 scalar_int_mode imode = (mode == HFmode
8811 ? SImode
8812 : int_mode_for_mode (mode).require ());
8813 int num_instr = aarch64_internal_mov_immediate
8814 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8815 return num_instr < 3;
8818 return false;
8821 /* Return TRUE if rtx X is the immediate constant 0.0. */
8822 bool
8823 aarch64_float_const_zero_rtx_p (rtx x)
8825 if (GET_MODE (x) == VOIDmode)
8826 return false;
8828 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
8829 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
8830 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
8833 /* Return TRUE if rtx X is an immediate constant that fits in a single
8834 MOVI immediate operation. */
8835 bool
8836 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
8838 if (!TARGET_SIMD)
8839 return false;
8841 machine_mode vmode;
8842 scalar_int_mode imode;
8843 unsigned HOST_WIDE_INT ival;
8845 if (GET_CODE (x) == CONST_DOUBLE
8846 && SCALAR_FLOAT_MODE_P (mode))
8848 if (!aarch64_reinterpret_float_as_int (x, &ival))
8849 return false;
8851 /* We make a general exception for 0. */
8852 if (aarch64_float_const_zero_rtx_p (x))
8853 return true;
8855 imode = int_mode_for_mode (mode).require ();
8857 else if (GET_CODE (x) == CONST_INT
8858 && is_a <scalar_int_mode> (mode, &imode))
8859 ival = INTVAL (x);
8860 else
8861 return false;
8863 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
8864 use a 128-bit vector mode. */
8865 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
8867 vmode = aarch64_simd_container_mode (imode, width);
8868 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
8870 return aarch64_simd_valid_immediate (v_op, NULL);
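/* For example, a DImode or DFmode immediate is replicated into a V2DImode
   vector (128 bits), while an SImode immediate is replicated into a
   V2SImode vector (64 bits) before the MOVI check.  */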
8874 /* Return the fixed registers used for condition codes. */
8876 static bool
8877 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8879 *p1 = CC_REGNUM;
8880 *p2 = INVALID_REGNUM;
8881 return true;
8884 /* This function is used by the call expanders of the machine description.
8885 RESULT is the register in which the result is returned. It's NULL for
8886 "call" and "sibcall".
8887 MEM is the location of the function call.
8888 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8889 SIBCALL indicates whether this function call is a normal call or a sibling call.
8890 It will generate a different pattern accordingly. */
8892 void
8893 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
8895 rtx call, callee, tmp;
8896 rtvec vec;
8897 machine_mode mode;
8899 gcc_assert (MEM_P (mem));
8900 callee = XEXP (mem, 0);
8901 mode = GET_MODE (callee);
8902 gcc_assert (mode == Pmode);
8904 /* Decide if we should generate indirect calls by loading the
8905 address of the callee into a register before performing
8906 the branch-and-link. */
8907 if (SYMBOL_REF_P (callee)
8908 ? (aarch64_is_long_call_p (callee)
8909 || aarch64_is_noplt_call_p (callee))
8910 : !REG_P (callee))
8911 XEXP (mem, 0) = force_reg (mode, callee);
8913 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8915 if (result != NULL_RTX)
8916 call = gen_rtx_SET (result, call);
8918 if (sibcall)
8919 tmp = ret_rtx;
8920 else
8921 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8923 gcc_assert (CONST_INT_P (callee_abi));
8924 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8925 UNSPEC_CALLEE_ABI);
8927 vec = gen_rtvec (3, call, callee_abi, tmp);
8928 call = gen_rtx_PARALLEL (VOIDmode, vec);
8930 aarch64_emit_call_insn (call);
8933 /* Emit call insn with PAT and do aarch64-specific handling. */
8935 void
8936 aarch64_emit_call_insn (rtx pat)
8938 rtx insn = emit_call_insn (pat);
8940 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
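  /* IP0 and IP1 (x16 and x17) are the intra-procedure-call scratch
     registers; linker-generated veneers and PLT stubs may clobber them,
     so record them as clobbered by every call.  */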
8941 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8942 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8945 machine_mode
8946 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8948 machine_mode mode_x = GET_MODE (x);
8949 rtx_code code_x = GET_CODE (x);
8951 /* All floating point compares return CCFP if it is an equality
8952 comparison, and CCFPE otherwise. */
8953 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8955 switch (code)
8957 case EQ:
8958 case NE:
8959 case UNORDERED:
8960 case ORDERED:
8961 case UNLT:
8962 case UNLE:
8963 case UNGT:
8964 case UNGE:
8965 case UNEQ:
8966 return CCFPmode;
8968 case LT:
8969 case LE:
8970 case GT:
8971 case GE:
8972 case LTGT:
8973 return CCFPEmode;
8975 default:
8976 gcc_unreachable ();
8980 /* Equality comparisons of short modes against zero can be performed
8981 using the TST instruction with the appropriate bitmask. */
8982 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8983 && (code == EQ || code == NE)
8984 && (mode_x == HImode || mode_x == QImode))
8985 return CC_NZmode;
8987 /* Similarly, comparisons of zero_extends from shorter modes can
8988 be performed using an ANDS with an immediate mask. */
8989 if (y == const0_rtx && code_x == ZERO_EXTEND
8990 && (mode_x == SImode || mode_x == DImode)
8991 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8992 && (code == EQ || code == NE))
8993 return CC_NZmode;
8995 if ((mode_x == SImode || mode_x == DImode)
8996 && y == const0_rtx
8997 && (code == EQ || code == NE || code == LT || code == GE)
8998 && (code_x == PLUS || code_x == MINUS || code_x == AND
8999 || code_x == NEG
9000 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9001 && CONST_INT_P (XEXP (x, 2)))))
9002 return CC_NZmode;
9004 /* A compare with a shifted operand. Because of canonicalization,
9005 the comparison will have to be swapped when we emit the assembly
9006 code. */
9007 if ((mode_x == SImode || mode_x == DImode)
9008 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9009 && (code_x == ASHIFT || code_x == ASHIFTRT
9010 || code_x == LSHIFTRT
9011 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9012 return CC_SWPmode;
9014 /* Similarly for a negated operand, but we can only do this for
9015 equalities. */
9016 if ((mode_x == SImode || mode_x == DImode)
9017 && (REG_P (y) || GET_CODE (y) == SUBREG)
9018 && (code == EQ || code == NE)
9019 && code_x == NEG)
9020 return CC_Zmode;
9022 /* A test for unsigned overflow from an addition. */
9023 if ((mode_x == DImode || mode_x == TImode)
9024 && (code == LTU || code == GEU)
9025 && code_x == PLUS
9026 && rtx_equal_p (XEXP (x, 0), y))
9027 return CC_Cmode;
9029 /* A test for unsigned overflow from an add with carry. */
9030 if ((mode_x == DImode || mode_x == TImode)
9031 && (code == LTU || code == GEU)
9032 && code_x == PLUS
9033 && CONST_SCALAR_INT_P (y)
9034 && (rtx_mode_t (y, mode_x)
9035 == (wi::shwi (1, mode_x)
9036 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9037 return CC_ADCmode;
9039 /* A test for signed overflow. */
9040 if ((mode_x == DImode || mode_x == TImode)
9041 && code == NE
9042 && code_x == PLUS
9043 && GET_CODE (y) == SIGN_EXTEND)
9044 return CC_Vmode;
9046 /* For everything else, return CCmode. */
9047 return CCmode;
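/* For example, testing (plus:DI a b) against zero with EQ, NE, LT or GE
   selects CC_NZmode above, since an ADDS sets exactly the N and Z flags
   those tests need; a plain register-register comparison falls through to
   the full CCmode.  */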
9050 static int
9051 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9054 aarch64_get_condition_code (rtx x)
9056 machine_mode mode = GET_MODE (XEXP (x, 0));
9057 enum rtx_code comp_code = GET_CODE (x);
9059 if (GET_MODE_CLASS (mode) != MODE_CC)
9060 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9061 return aarch64_get_condition_code_1 (mode, comp_code);
9064 static int
9065 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9067 switch (mode)
9069 case E_CCFPmode:
9070 case E_CCFPEmode:
9071 switch (comp_code)
9073 case GE: return AARCH64_GE;
9074 case GT: return AARCH64_GT;
9075 case LE: return AARCH64_LS;
9076 case LT: return AARCH64_MI;
9077 case NE: return AARCH64_NE;
9078 case EQ: return AARCH64_EQ;
9079 case ORDERED: return AARCH64_VC;
9080 case UNORDERED: return AARCH64_VS;
9081 case UNLT: return AARCH64_LT;
9082 case UNLE: return AARCH64_LE;
9083 case UNGT: return AARCH64_HI;
9084 case UNGE: return AARCH64_PL;
9085 default: return -1;
9087 break;
9089 case E_CCmode:
9090 switch (comp_code)
9092 case NE: return AARCH64_NE;
9093 case EQ: return AARCH64_EQ;
9094 case GE: return AARCH64_GE;
9095 case GT: return AARCH64_GT;
9096 case LE: return AARCH64_LE;
9097 case LT: return AARCH64_LT;
9098 case GEU: return AARCH64_CS;
9099 case GTU: return AARCH64_HI;
9100 case LEU: return AARCH64_LS;
9101 case LTU: return AARCH64_CC;
9102 default: return -1;
9104 break;
9106 case E_CC_SWPmode:
9107 switch (comp_code)
9109 case NE: return AARCH64_NE;
9110 case EQ: return AARCH64_EQ;
9111 case GE: return AARCH64_LE;
9112 case GT: return AARCH64_LT;
9113 case LE: return AARCH64_GE;
9114 case LT: return AARCH64_GT;
9115 case GEU: return AARCH64_LS;
9116 case GTU: return AARCH64_CC;
9117 case LEU: return AARCH64_CS;
9118 case LTU: return AARCH64_HI;
9119 default: return -1;
9121 break;
9123 case E_CC_NZCmode:
9124 switch (comp_code)
9126 case NE: return AARCH64_NE; /* = any */
9127 case EQ: return AARCH64_EQ; /* = none */
9128 case GE: return AARCH64_PL; /* = nfrst */
9129 case LT: return AARCH64_MI; /* = first */
9130 case GEU: return AARCH64_CS; /* = nlast */
9131 case GTU: return AARCH64_HI; /* = pmore */
9132 case LEU: return AARCH64_LS; /* = plast */
9133 case LTU: return AARCH64_CC; /* = last */
9134 default: return -1;
9136 break;
9138 case E_CC_NZmode:
9139 switch (comp_code)
9141 case NE: return AARCH64_NE;
9142 case EQ: return AARCH64_EQ;
9143 case GE: return AARCH64_PL;
9144 case LT: return AARCH64_MI;
9145 default: return -1;
9147 break;
9149 case E_CC_Zmode:
9150 switch (comp_code)
9152 case NE: return AARCH64_NE;
9153 case EQ: return AARCH64_EQ;
9154 default: return -1;
9156 break;
9158 case E_CC_Cmode:
9159 switch (comp_code)
9161 case LTU: return AARCH64_CS;
9162 case GEU: return AARCH64_CC;
9163 default: return -1;
9165 break;
9167 case E_CC_ADCmode:
9168 switch (comp_code)
9170 case GEU: return AARCH64_CS;
9171 case LTU: return AARCH64_CC;
9172 default: return -1;
9174 break;
9176 case E_CC_Vmode:
9177 switch (comp_code)
9179 case NE: return AARCH64_VS;
9180 case EQ: return AARCH64_VC;
9181 default: return -1;
9183 break;
9185 default:
9186 return -1;
9189 return -1;
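/* For instance, when the comparison operands had to be swapped (the
   CC_SWP case above), a GT test must be emitted as LT, which is why
   E_CC_SWPmode maps GT to AARCH64_LT.  */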
9192 bool
9193 aarch64_const_vec_all_same_in_range_p (rtx x,
9194 HOST_WIDE_INT minval,
9195 HOST_WIDE_INT maxval)
9197 rtx elt;
9198 return (const_vec_duplicate_p (x, &elt)
9199 && CONST_INT_P (elt)
9200 && IN_RANGE (INTVAL (elt), minval, maxval));
9203 bool
9204 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9206 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9209 /* Return true if VEC is a constant in which every element is in the range
9210 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9212 static bool
9213 aarch64_const_vec_all_in_range_p (rtx vec,
9214 HOST_WIDE_INT minval,
9215 HOST_WIDE_INT maxval)
9217 if (GET_CODE (vec) != CONST_VECTOR
9218 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9219 return false;
9221 int nunits;
9222 if (!CONST_VECTOR_STEPPED_P (vec))
9223 nunits = const_vector_encoded_nelts (vec);
9224 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9225 return false;
9227 for (int i = 0; i < nunits; i++)
9229 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9230 if (!CONST_INT_P (vec_elem)
9231 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9232 return false;
9234 return true;
9237 /* N Z C V. */
9238 #define AARCH64_CC_V 1
9239 #define AARCH64_CC_C (1 << 1)
9240 #define AARCH64_CC_Z (1 << 2)
9241 #define AARCH64_CC_N (1 << 3)
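/* So, for example, (AARCH64_CC_N | AARCH64_CC_Z) == 12 describes an NZCV
   immediate with N and Z set and C and V clear.  */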
9243 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9244 static const int aarch64_nzcv_codes[] =
9246 0, /* EQ, Z == 1. */
9247 AARCH64_CC_Z, /* NE, Z == 0. */
9248 0, /* CS, C == 1. */
9249 AARCH64_CC_C, /* CC, C == 0. */
9250 0, /* MI, N == 1. */
9251 AARCH64_CC_N, /* PL, N == 0. */
9252 0, /* VS, V == 1. */
9253 AARCH64_CC_V, /* VC, V == 0. */
9254 0, /* HI, C == 1 && Z == 0. */
9255 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9256 AARCH64_CC_V, /* GE, N == V. */
9257 0, /* LT, N != V. */
9258 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9259 0, /* LE, !(Z == 0 && N == V). */
9260 0, /* AL, Any. */
9261 0 /* NV, Any. */
9264 /* Print floating-point vector immediate operand X to F, negating it
9265 first if NEGATE is true. Return true on success, false if it isn't
9266 a constant we can handle. */
9268 static bool
9269 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9271 rtx elt;
9273 if (!const_vec_duplicate_p (x, &elt))
9274 return false;
9276 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9277 if (negate)
9278 r = real_value_negate (&r);
9280 /* Handle the SVE single-bit immediates specially, since they have a
9281 fixed form in the assembly syntax. */
9282 if (real_equal (&r, &dconst0))
9283 asm_fprintf (f, "0.0");
9284 else if (real_equal (&r, &dconst2))
9285 asm_fprintf (f, "2.0");
9286 else if (real_equal (&r, &dconst1))
9287 asm_fprintf (f, "1.0");
9288 else if (real_equal (&r, &dconsthalf))
9289 asm_fprintf (f, "0.5");
9290 else
9292 const int buf_size = 20;
9293 char float_buf[buf_size] = {'\0'};
9294 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9295 1, GET_MODE (elt));
9296 asm_fprintf (f, "%s", float_buf);
9299 return true;
9302 /* Return the equivalent letter for size. */
9303 static char
9304 sizetochar (int size)
9306 switch (size)
9308 case 64: return 'd';
9309 case 32: return 's';
9310 case 16: return 'h';
9311 case 8 : return 'b';
9312 default: gcc_unreachable ();
9316 /* Print operand X to file F in a target specific manner according to CODE.
9317 The acceptable formatting commands given by CODE are:
9318 'c': An integer or symbol address without a preceding #
9319 sign.
9320 'C': Take the duplicated element in a vector constant
9321 and print it in hex.
9322 'D': Take the duplicated element in a vector constant
9323 and print it as an unsigned integer, in decimal.
9324 'e': Print the sign/zero-extend size as a character 8->b,
9325 16->h, 32->w. Can also be used for masks:
9326 0xff->b, 0xffff->h, 0xffffffff->w.
9327 'I': If the operand is a duplicated vector constant,
9328 replace it with the duplicated scalar. If the
9329 operand is then a floating-point constant, replace
9330 it with the integer bit representation. Print the
9331 transformed constant as a signed decimal number.
9332 'p': Prints N such that 2^N == X (X must be a power of 2 and
9333 a const_int).
9334 'P': Print the number of non-zero bits in X (a const_int).
9335 'H': Print the higher numbered register of a pair (TImode)
9336 of regs.
9337 'm': Print a condition (eq, ne, etc).
9338 'M': Same as 'm', but invert condition.
9339 'N': Take the duplicated element in a vector constant
9340 and print the negative of it in decimal.
9341 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9342 'S/T/U/V': Print a FP/SIMD register name for a register list.
9343 The register printed is the FP/SIMD register name
9344 of X + 0/1/2/3 for S/T/U/V.
9345 'R': Print a scalar Integer/FP/SIMD register name + 1.
9346 'X': Print bottom 16 bits of integer constant in hex.
9347 'w/x': Print a general register name or the zero register
9348 (32-bit or 64-bit).
9349 '0': Print a normal operand; if it's a general register,
9350 then we assume DImode.
9351 'k': Print NZCV for conditional compare instructions.
9352 'A': Output address constant representing the first
9353 argument of X, specifying a relocation offset
9354 if appropriate.
9355 'L': Output constant address specified by X
9356 with a relocation offset if appropriate.
9357 'G': Prints address of X, specifying a PC relative
9358 relocation mode if appropriate.
9359 'y': Output address of LDP or STP - this is used for
9360 some LDP/STPs which don't use a PARALLEL in their
9361 pattern (so the mode needs to be adjusted).
9362 'z': Output address of a typical LDP or STP. */
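/* For illustration only: in an output template such as "add\t%w0, %w1, %w2"
   the 'w' code prints the 32-bit names of register operands 0-2, "%x0"
   prints the 64-bit name, and the constant zero prints as wzr or xzr.  */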
9364 static void
9365 aarch64_print_operand (FILE *f, rtx x, int code)
9367 rtx elt;
9368 switch (code)
9370 case 'c':
9371 switch (GET_CODE (x))
9373 case CONST_INT:
9374 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9375 break;
9377 case SYMBOL_REF:
9378 output_addr_const (f, x);
9379 break;
9381 case CONST:
9382 if (GET_CODE (XEXP (x, 0)) == PLUS
9383 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9385 output_addr_const (f, x);
9386 break;
9388 /* Fall through. */
9390 default:
9391 output_operand_lossage ("unsupported operand for code '%c'", code);
9393 break;
9395 case 'e':
9397 x = unwrap_const_vec_duplicate (x);
9398 if (!CONST_INT_P (x))
9400 output_operand_lossage ("invalid operand for '%%%c'", code);
9401 return;
9404 HOST_WIDE_INT val = INTVAL (x);
9405 if ((val & ~7) == 8 || val == 0xff)
9406 fputc ('b', f);
9407 else if ((val & ~7) == 16 || val == 0xffff)
9408 fputc ('h', f);
9409 else if ((val & ~7) == 32 || val == 0xffffffff)
9410 fputc ('w', f);
9411 else
9413 output_operand_lossage ("invalid operand for '%%%c'", code);
9414 return;
9417 break;
9419 case 'p':
9421 int n;
9423 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9425 output_operand_lossage ("invalid operand for '%%%c'", code);
9426 return;
9429 asm_fprintf (f, "%d", n);
9431 break;
9433 case 'P':
9434 if (!CONST_INT_P (x))
9436 output_operand_lossage ("invalid operand for '%%%c'", code);
9437 return;
9440 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9441 break;
9443 case 'H':
9444 if (x == const0_rtx)
9446 asm_fprintf (f, "xzr");
9447 break;
9450 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9452 output_operand_lossage ("invalid operand for '%%%c'", code);
9453 return;
9456 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9457 break;
9459 case 'I':
9461 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9462 if (CONST_INT_P (x))
9463 asm_fprintf (f, "%wd", INTVAL (x));
9464 else
9466 output_operand_lossage ("invalid operand for '%%%c'", code);
9467 return;
9469 break;
9472 case 'M':
9473 case 'm':
9475 int cond_code;
9476 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9477 if (x == const_true_rtx)
9479 if (code == 'M')
9480 fputs ("nv", f);
9481 return;
9484 if (!COMPARISON_P (x))
9486 output_operand_lossage ("invalid operand for '%%%c'", code);
9487 return;
9490 cond_code = aarch64_get_condition_code (x);
9491 gcc_assert (cond_code >= 0);
9492 if (code == 'M')
9493 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9494 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9495 fputs (aarch64_sve_condition_codes[cond_code], f);
9496 else
9497 fputs (aarch64_condition_codes[cond_code], f);
9499 break;
9501 case 'N':
9502 if (!const_vec_duplicate_p (x, &elt))
9504 output_operand_lossage ("invalid vector constant");
9505 return;
9508 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9509 asm_fprintf (f, "%wd", -INTVAL (elt));
9510 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9511 && aarch64_print_vector_float_operand (f, x, true))
9513 else
9515 output_operand_lossage ("invalid vector constant");
9516 return;
9518 break;
9520 case 'b':
9521 case 'h':
9522 case 's':
9523 case 'd':
9524 case 'q':
9525 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9527 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9528 return;
9530 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
9531 break;
9533 case 'S':
9534 case 'T':
9535 case 'U':
9536 case 'V':
9537 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9539 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9540 return;
9542 asm_fprintf (f, "%c%d",
9543 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
9544 REGNO (x) - V0_REGNUM + (code - 'S'));
9545 break;
9547 case 'R':
9548 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
9549 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
9550 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9551 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
9552 else
9553 output_operand_lossage ("incompatible register operand for '%%%c'",
9554 code);
9555 break;
9557 case 'X':
9558 if (!CONST_INT_P (x))
9560 output_operand_lossage ("invalid operand for '%%%c'", code);
9561 return;
9563 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
9564 break;
9566 case 'C':
9568 /* Print a replicated constant in hex. */
9569 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9571 output_operand_lossage ("invalid operand for '%%%c'", code);
9572 return;
9574 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9575 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9577 break;
9579 case 'D':
9581 /* Print a replicated constant in decimal, treating it as
9582 unsigned. */
9583 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9585 output_operand_lossage ("invalid operand for '%%%c'", code);
9586 return;
9588 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9589 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9591 break;
9593 case 'w':
9594 case 'x':
9595 if (x == const0_rtx
9596 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
9598 asm_fprintf (f, "%czr", code);
9599 break;
9602 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9604 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
9605 break;
9608 if (REG_P (x) && REGNO (x) == SP_REGNUM)
9610 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
9611 break;
9614 /* Fall through */
9616 case 0:
9617 if (x == NULL)
9619 output_operand_lossage ("missing operand");
9620 return;
9623 switch (GET_CODE (x))
9625 case REG:
9626 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9628 if (REG_NREGS (x) == 1)
9629 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
9630 else
9632 char suffix
9633 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
9634 asm_fprintf (f, "{z%d.%c - z%d.%c}",
9635 REGNO (x) - V0_REGNUM, suffix,
9636 END_REGNO (x) - V0_REGNUM - 1, suffix);
9639 else
9640 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
9641 break;
9643 case MEM:
9644 output_address (GET_MODE (x), XEXP (x, 0));
9645 break;
9647 case LABEL_REF:
9648 case SYMBOL_REF:
9649 output_addr_const (asm_out_file, x);
9650 break;
9652 case CONST_INT:
9653 asm_fprintf (f, "%wd", INTVAL (x));
9654 break;
9656 case CONST:
9657 if (!VECTOR_MODE_P (GET_MODE (x)))
9659 output_addr_const (asm_out_file, x);
9660 break;
9662 /* fall through */
9664 case CONST_VECTOR:
9665 if (!const_vec_duplicate_p (x, &elt))
9667 output_operand_lossage ("invalid vector constant");
9668 return;
9671 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9672 asm_fprintf (f, "%wd", INTVAL (elt));
9673 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9674 && aarch64_print_vector_float_operand (f, x, false))
9676 else
9678 output_operand_lossage ("invalid vector constant");
9679 return;
9681 break;
9683 case CONST_DOUBLE:
9684 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9685 be getting CONST_DOUBLEs holding integers. */
9686 gcc_assert (GET_MODE (x) != VOIDmode);
9687 if (aarch64_float_const_zero_rtx_p (x))
9689 fputc ('0', f);
9690 break;
9692 else if (aarch64_float_const_representable_p (x))
9694 #define buf_size 20
9695 char float_buf[buf_size] = {'\0'};
9696 real_to_decimal_for_mode (float_buf,
9697 CONST_DOUBLE_REAL_VALUE (x),
9698 buf_size, buf_size,
9699 1, GET_MODE (x));
9700 asm_fprintf (asm_out_file, "%s", float_buf);
9701 break;
9702 #undef buf_size
9704 output_operand_lossage ("invalid constant");
9705 return;
9706 default:
9707 output_operand_lossage ("invalid operand");
9708 return;
9710 break;
9712 case 'A':
9713 if (GET_CODE (x) == HIGH)
9714 x = XEXP (x, 0);
9716 switch (aarch64_classify_symbolic_expression (x))
9718 case SYMBOL_SMALL_GOT_4G:
9719 asm_fprintf (asm_out_file, ":got:");
9720 break;
9722 case SYMBOL_SMALL_TLSGD:
9723 asm_fprintf (asm_out_file, ":tlsgd:");
9724 break;
9726 case SYMBOL_SMALL_TLSDESC:
9727 asm_fprintf (asm_out_file, ":tlsdesc:");
9728 break;
9730 case SYMBOL_SMALL_TLSIE:
9731 asm_fprintf (asm_out_file, ":gottprel:");
9732 break;
9734 case SYMBOL_TLSLE24:
9735 asm_fprintf (asm_out_file, ":tprel:");
9736 break;
9738 case SYMBOL_TINY_GOT:
9739 gcc_unreachable ();
9740 break;
9742 default:
9743 break;
9745 output_addr_const (asm_out_file, x);
9746 break;
9748 case 'L':
9749 switch (aarch64_classify_symbolic_expression (x))
9751 case SYMBOL_SMALL_GOT_4G:
9752 asm_fprintf (asm_out_file, ":lo12:");
9753 break;
9755 case SYMBOL_SMALL_TLSGD:
9756 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
9757 break;
9759 case SYMBOL_SMALL_TLSDESC:
9760 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
9761 break;
9763 case SYMBOL_SMALL_TLSIE:
9764 asm_fprintf (asm_out_file, ":gottprel_lo12:");
9765 break;
9767 case SYMBOL_TLSLE12:
9768 asm_fprintf (asm_out_file, ":tprel_lo12:");
9769 break;
9771 case SYMBOL_TLSLE24:
9772 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
9773 break;
9775 case SYMBOL_TINY_GOT:
9776 asm_fprintf (asm_out_file, ":got:");
9777 break;
9779 case SYMBOL_TINY_TLSIE:
9780 asm_fprintf (asm_out_file, ":gottprel:");
9781 break;
9783 default:
9784 break;
9786 output_addr_const (asm_out_file, x);
9787 break;
9789 case 'G':
9790 switch (aarch64_classify_symbolic_expression (x))
9792 case SYMBOL_TLSLE24:
9793 asm_fprintf (asm_out_file, ":tprel_hi12:");
9794 break;
9795 default:
9796 break;
9798 output_addr_const (asm_out_file, x);
9799 break;
9801 case 'k':
9803 HOST_WIDE_INT cond_code;
9805 if (!CONST_INT_P (x))
9807 output_operand_lossage ("invalid operand for '%%%c'", code);
9808 return;
9811 cond_code = INTVAL (x);
9812 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
9813 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
9815 break;
9817 case 'y':
9818 case 'z':
9820 machine_mode mode = GET_MODE (x);
9822 if (GET_CODE (x) != MEM
9823 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
9825 output_operand_lossage ("invalid operand for '%%%c'", code);
9826 return;
9829 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
9830 code == 'y'
9831 ? ADDR_QUERY_LDP_STP_N
9832 : ADDR_QUERY_LDP_STP))
9833 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9835 break;
9837 default:
9838 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9839 return;
9843 /* Print address 'x' of a memory access with mode 'mode'.
9844 'type' gives the context required by aarch64_classify_address; it is one of
9845 the aarch64_addr_query_type values, e.g. ADDR_QUERY_LDP_STP for LDP/STP. */
9846 static bool
9847 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
9848 aarch64_addr_query_type type)
9850 struct aarch64_address_info addr;
9851 unsigned int size, vec_flags;
9853 /* Check all addresses are Pmode - including ILP32. */
9854 if (GET_MODE (x) != Pmode
9855 && (!CONST_INT_P (x)
9856 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
9858 output_operand_lossage ("invalid address mode");
9859 return false;
9862 if (aarch64_classify_address (&addr, x, mode, true, type))
9863 switch (addr.type)
9865 case ADDRESS_REG_IMM:
9866 if (known_eq (addr.const_offset, 0))
9868 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
9869 return true;
9872 vec_flags = aarch64_classify_vector_mode (mode);
9873 if (vec_flags & VEC_ANY_SVE)
9875 HOST_WIDE_INT vnum
9876 = exact_div (addr.const_offset,
9877 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
9878 asm_fprintf (f, "[%s, #%wd, mul vl]",
9879 reg_names[REGNO (addr.base)], vnum);
9880 return true;
9883 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
9884 INTVAL (addr.offset));
9885 return true;
9887 case ADDRESS_REG_REG:
9888 if (addr.shift == 0)
9889 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9890 reg_names [REGNO (addr.offset)]);
9891 else
9892 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9893 reg_names [REGNO (addr.offset)], addr.shift);
9894 return true;
9896 case ADDRESS_REG_UXTW:
9897 if (addr.shift == 0)
9898 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9899 REGNO (addr.offset) - R0_REGNUM);
9900 else
9901 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9902 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9903 return true;
9905 case ADDRESS_REG_SXTW:
9906 if (addr.shift == 0)
9907 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9908 REGNO (addr.offset) - R0_REGNUM);
9909 else
9910 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9911 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9912 return true;
9914 case ADDRESS_REG_WB:
9915 /* Writeback is only supported for fixed-width modes. */
9916 size = GET_MODE_SIZE (mode).to_constant ();
9917 switch (GET_CODE (x))
9919 case PRE_INC:
9920 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9921 return true;
9922 case POST_INC:
9923 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9924 return true;
9925 case PRE_DEC:
9926 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9927 return true;
9928 case POST_DEC:
9929 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9930 return true;
9931 case PRE_MODIFY:
9932 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9933 INTVAL (addr.offset));
9934 return true;
9935 case POST_MODIFY:
9936 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9937 INTVAL (addr.offset));
9938 return true;
9939 default:
9940 break;
9942 break;
9944 case ADDRESS_LO_SUM:
9945 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9946 output_addr_const (f, addr.offset);
9947 asm_fprintf (f, "]");
9948 return true;
9950 case ADDRESS_SYMBOLIC:
9951 output_addr_const (f, x);
9952 return true;
9955 return false;
9958 /* Print address 'x' of a memory access with mode 'mode'. */
9959 static void
9960 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9962 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9963 output_addr_const (f, x);
9966 bool
9967 aarch64_label_mentioned_p (rtx x)
9969 const char *fmt;
9970 int i;
9972 if (GET_CODE (x) == LABEL_REF)
9973 return true;
9975 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9976 referencing instruction, but they are constant offsets, not
9977 symbols. */
9978 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9979 return false;
9981 fmt = GET_RTX_FORMAT (GET_CODE (x));
9982 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9984 if (fmt[i] == 'E')
9986 int j;
9988 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9989 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9990 return 1;
9992 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9993 return 1;
9996 return 0;
9999 /* Implement REGNO_REG_CLASS. */
10001 enum reg_class
10002 aarch64_regno_regclass (unsigned regno)
10004 if (GP_REGNUM_P (regno))
10005 return GENERAL_REGS;
10007 if (regno == SP_REGNUM)
10008 return STACK_REG;
10010 if (regno == FRAME_POINTER_REGNUM
10011 || regno == ARG_POINTER_REGNUM)
10012 return POINTER_REGS;
10014 if (FP_REGNUM_P (regno))
10015 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10016 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10018 if (PR_REGNUM_P (regno))
10019 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10021 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10022 return FFR_REGS;
10024 return NO_REGS;
10027 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10028 If OFFSET is out of range, return an offset of an anchor point
10029 that is in range. Return 0 otherwise. */
10031 static HOST_WIDE_INT
10032 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10033 machine_mode mode)
10035 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10036 if (size > 16)
10037 return (offset + 0x400) & ~0x7f0;
10039 /* For offsets that aren't a multiple of the access size, the limit is
10040 -256...255. */
10041 if (offset & (size - 1))
10043 /* BLKmode typically uses LDP of X-registers. */
10044 if (mode == BLKmode)
10045 return (offset + 512) & ~0x3ff;
10046 return (offset + 0x100) & ~0x1ff;
10049 /* Small negative offsets are supported. */
10050 if (IN_RANGE (offset, -256, 0))
10051 return 0;
10053 if (mode == TImode || mode == TFmode)
10054 return (offset + 0x100) & ~0x1ff;
10056 /* Use a 12-bit offset scaled by the access size. */
10057 return offset & (~0xfff * size);
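/* Worked example (illustrative values): for an aligned SImode access
   (SIZE == 4) at offset 0x12344, the mask is ~0x3fff, so the anchor is
   0x10000 and the remaining 0x2344 fits the scaled 12-bit LDR/STR
   immediate range; nearby accesses can then CSE the same anchor.  */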
10060 static rtx
10061 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10063 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10064 where mask is selected by alignment and size of the offset.
10065 We try to pick as large a range for the offset as possible to
10066 maximize the chance of a CSE. However, for aligned addresses
10067 we limit the range to 4k so that structures with different sized
10068 elements are likely to use the same base. We need to be careful
10069 not to split a CONST for some forms of address expression, otherwise
10070 it will generate sub-optimal code. */
10072 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10074 rtx base = XEXP (x, 0);
10075 rtx offset_rtx = XEXP (x, 1);
10076 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10078 if (GET_CODE (base) == PLUS)
10080 rtx op0 = XEXP (base, 0);
10081 rtx op1 = XEXP (base, 1);
10083 /* Force any scaling into a temp for CSE. */
10084 op0 = force_reg (Pmode, op0);
10085 op1 = force_reg (Pmode, op1);
10087 /* Let the pointer register be in op0. */
10088 if (REG_POINTER (op1))
10089 std::swap (op0, op1);
10091 /* If the pointer is virtual or frame related, then we know that
10092 virtual register instantiation or register elimination is going
10093 to apply a second constant. We want the two constants folded
10094 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10095 if (virt_or_elim_regno_p (REGNO (op0)))
10097 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10098 NULL_RTX, true, OPTAB_DIRECT);
10099 return gen_rtx_PLUS (Pmode, base, op1);
10102 /* Otherwise, in order to encourage CSE (and thence loop strength
10103 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
10104 base = expand_binop (Pmode, add_optab, op0, op1,
10105 NULL_RTX, true, OPTAB_DIRECT);
10106 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10109 HOST_WIDE_INT size;
10110 if (GET_MODE_SIZE (mode).is_constant (&size))
10112 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10113 mode);
10114 if (base_offset != 0)
10116 base = plus_constant (Pmode, base, base_offset);
10117 base = force_operand (base, NULL_RTX);
10118 return plus_constant (Pmode, base, offset - base_offset);
10123 return x;
10126 static reg_class_t
10127 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10128 reg_class_t rclass,
10129 machine_mode mode,
10130 secondary_reload_info *sri)
10132 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10133 LDR and STR. See the comment at the head of aarch64-sve.md for
10134 more details about the big-endian handling. */
10135 if (reg_class_subset_p (rclass, FP_REGS)
10136 && !((REG_P (x) && HARD_REGISTER_P (x))
10137 || aarch64_simd_valid_immediate (x, NULL))
10138 && mode != VNx16QImode)
10140 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10141 if ((vec_flags & VEC_SVE_DATA)
10142 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10144 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10145 return NO_REGS;
10149 /* If we have to disable direct literal pool loads and stores because the
10150 function is too big, then we need a scratch register. */
10151 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10152 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10153 || targetm.vector_mode_supported_p (GET_MODE (x)))
10154 && !aarch64_pcrelative_literal_loads)
10156 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10157 return NO_REGS;
10160 /* Without the TARGET_SIMD instructions we cannot move a Q register
10161 to a Q register directly. We need a scratch. */
10162 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10163 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10164 && reg_class_subset_p (rclass, FP_REGS))
10166 sri->icode = code_for_aarch64_reload_mov (mode);
10167 return NO_REGS;
10170 /* A TFmode or TImode memory access should be handled via an FP_REG
10171 because AArch64 has richer addressing modes for LDR/STR instructions
10172 than LDP/STP instructions. */
10173 if (TARGET_FLOAT && rclass == GENERAL_REGS
10174 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10175 return FP_REGS;
10177 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10178 return GENERAL_REGS;
10180 return NO_REGS;
10183 static bool
10184 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10186 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10188 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10189 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10190 if (frame_pointer_needed)
10191 return to == HARD_FRAME_POINTER_REGNUM;
10192 return true;
10195 poly_int64
10196 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10198 if (to == HARD_FRAME_POINTER_REGNUM)
10200 if (from == ARG_POINTER_REGNUM)
10201 return cfun->machine->frame.hard_fp_offset;
10203 if (from == FRAME_POINTER_REGNUM)
10204 return cfun->machine->frame.hard_fp_offset
10205 - cfun->machine->frame.locals_offset;
10208 if (to == STACK_POINTER_REGNUM)
10210 if (from == FRAME_POINTER_REGNUM)
10211 return cfun->machine->frame.frame_size
10212 - cfun->machine->frame.locals_offset;
10215 return cfun->machine->frame.frame_size;
10218 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10219 previous frame. */
10222 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10224 if (count != 0)
10225 return const0_rtx;
10226 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10230 static void
10231 aarch64_asm_trampoline_template (FILE *f)
10233 int offset1 = 16;
10234 int offset2 = 20;
10236 if (aarch64_bti_enabled ())
10238 asm_fprintf (f, "\thint\t34 // bti c\n");
10239 offset1 -= 4;
10240 offset2 -= 4;
10243 if (TARGET_ILP32)
10245 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10246 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10247 offset1);
10249 else
10251 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10252 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10253 offset2);
10255 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10257 /* The trampoline needs an extra padding instruction. If BTI is
10258 enabled, the padding instruction is replaced by the BTI instruction at
10259 the beginning. */
10260 if (!aarch64_bti_enabled ())
10261 assemble_aligned_integer (4, const0_rtx);
10263 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10264 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
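/* The template is therefore 16 bytes of code (optionally starting with a
   BTI c) followed by two pointer-sized data slots, which
   aarch64_trampoline_init below fills with the target function address and
   the static chain value.  */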
10267 static void
10268 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10270 rtx fnaddr, mem, a_tramp;
10271 const int tramp_code_sz = 16;
10273 /* Don't need to copy the trailing D-words, we fill those in below. */
10274 emit_block_move (m_tramp, assemble_trampoline_template (),
10275 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10276 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10277 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10278 if (GET_MODE (fnaddr) != ptr_mode)
10279 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10280 emit_move_insn (mem, fnaddr);
10282 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10283 emit_move_insn (mem, chain_value);
10285 /* XXX We should really define a "clear_cache" pattern and use
10286 gen_clear_cache(). */
10287 a_tramp = XEXP (m_tramp, 0);
10288 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10289 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10290 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10291 ptr_mode);
10294 static unsigned char
10295 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10297 /* ??? Logically we should only need to provide a value when
10298 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10299 can hold MODE, but at the moment we need to handle all modes.
10300 Just ignore any runtime parts for registers that can't store them. */
10301 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10302 unsigned int nregs, vec_flags;
10303 switch (regclass)
10305 case TAILCALL_ADDR_REGS:
10306 case POINTER_REGS:
10307 case GENERAL_REGS:
10308 case ALL_REGS:
10309 case POINTER_AND_FP_REGS:
10310 case FP_REGS:
10311 case FP_LO_REGS:
10312 case FP_LO8_REGS:
10313 vec_flags = aarch64_classify_vector_mode (mode);
10314 if ((vec_flags & VEC_SVE_DATA)
10315 && constant_multiple_p (GET_MODE_SIZE (mode),
10316 aarch64_vl_bytes (mode, vec_flags), &nregs))
10317 return nregs;
10318 return (vec_flags & VEC_ADVSIMD
10319 ? CEIL (lowest_size, UNITS_PER_VREG)
10320 : CEIL (lowest_size, UNITS_PER_WORD));
10321 case STACK_REG:
10322 case PR_REGS:
10323 case PR_LO_REGS:
10324 case PR_HI_REGS:
10325 case FFR_REGS:
10326 case PR_AND_FFR_REGS:
10327 return 1;
10329 case NO_REGS:
10330 return 0;
10332 default:
10333 break;
10335 gcc_unreachable ();
10338 static reg_class_t
10339 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10341 if (regclass == POINTER_REGS)
10342 return GENERAL_REGS;
10344 if (regclass == STACK_REG)
10346 if (REG_P(x)
10347 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10348 return regclass;
10350 return NO_REGS;
10353 /* Register elimination can result in a request for
10354 SP+constant->FP_REGS. We cannot support such operations, which
10355 use SP as the source and an FP_REG as the destination, so reject
10356 them right now. */
10357 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10359 rtx lhs = XEXP (x, 0);
10361 /* Look through a possible SUBREG introduced by ILP32. */
10362 if (GET_CODE (lhs) == SUBREG)
10363 lhs = SUBREG_REG (lhs);
10365 gcc_assert (REG_P (lhs));
10366 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10367 POINTER_REGS));
10368 return NO_REGS;
10371 return regclass;
10374 void
10375 aarch64_asm_output_labelref (FILE* f, const char *name)
10377 asm_fprintf (f, "%U%s", name);
10380 static void
10381 aarch64_elf_asm_constructor (rtx symbol, int priority)
10383 if (priority == DEFAULT_INIT_PRIORITY)
10384 default_ctor_section_asm_out_constructor (symbol, priority);
10385 else
10387 section *s;
10388 /* Although priority is known to be in the range [0, 65535], and so 18
10389 bytes would be enough, the compiler might not know that. To avoid a
10390 -Wformat-truncation false positive, use a larger size. */
10391 char buf[23];
10392 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10393 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10394 switch_to_section (s);
10395 assemble_align (POINTER_SIZE);
10396 assemble_aligned_integer (POINTER_BYTES, symbol);
10400 static void
10401 aarch64_elf_asm_destructor (rtx symbol, int priority)
10403 if (priority == DEFAULT_INIT_PRIORITY)
10404 default_dtor_section_asm_out_destructor (symbol, priority);
10405 else
10407 section *s;
10408 /* Although priority is known to be in the range [0, 65535], and so 18
10409 bytes would be enough, the compiler might not know that. To avoid a
10410 -Wformat-truncation false positive, use a larger size. */
10411 char buf[23];
10412 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10413 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10414 switch_to_section (s);
10415 assemble_align (POINTER_SIZE);
10416 assemble_aligned_integer (POINTER_BYTES, symbol);
10420 const char*
10421 aarch64_output_casesi (rtx *operands)
10423 char buf[100];
10424 char label[100];
10425 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10426 int index;
10427 static const char *const patterns[4][2] =
10430 "ldrb\t%w3, [%0,%w1,uxtw]",
10431 "add\t%3, %4, %w3, sxtb #2"
10434 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10435 "add\t%3, %4, %w3, sxth #2"
10438 "ldr\t%w3, [%0,%w1,uxtw #2]",
10439 "add\t%3, %4, %w3, sxtw #2"
10441 /* We assume that DImode is only generated when not optimizing and
10442 that we don't really need 64-bit address offsets. That would
10443 imply an object file with 8GB of code in a single function! */
10445 "ldr\t%w3, [%0,%w1,uxtw #2]",
10446 "add\t%3, %4, %w3, sxtw #2"
10450 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10452 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10453 index = exact_log2 (GET_MODE_SIZE (mode));
10455 gcc_assert (index >= 0 && index <= 3);
10457 /* Need to implement table size reduction, by changing the code below. */
10458 output_asm_insn (patterns[index][0], operands);
10459 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10460 snprintf (buf, sizeof (buf),
10461 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10462 output_asm_insn (buf, operands);
10463 output_asm_insn (patterns[index][1], operands);
10464 output_asm_insn ("br\t%3", operands);
10465 assemble_label (asm_out_file, label);
10466 return "";
10470 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10471 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10472 operator. */
10475 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10477 if (shift >= 0 && shift <= 3)
10479 int size;
10480 for (size = 8; size <= 32; size *= 2)
10482 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10483 if (mask == bits << shift)
10484 return size;
10487 return 0;
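/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, since
   0x3fc == 0xff << 2, i.e. a byte value scaled by 4 as in
   ADD Xd, Xn, Wm, UXTB #2.  */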
10490 /* Constant pools are per-function only when PC-relative
10491 literal loads are enabled or we are using the large memory
10492 model. */
10494 static inline bool
10495 aarch64_can_use_per_function_literal_pools_p (void)
10497 return (aarch64_pcrelative_literal_loads
10498 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10501 static bool
10502 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
10504 /* We can't use blocks for constants when we're using a per-function
10505 constant pool. */
10506 return !aarch64_can_use_per_function_literal_pools_p ();
10509 /* Select appropriate section for constants depending
10510 on where we place literal pools. */
10512 static section *
10513 aarch64_select_rtx_section (machine_mode mode,
10514 rtx x,
10515 unsigned HOST_WIDE_INT align)
10517 if (aarch64_can_use_per_function_literal_pools_p ())
10518 return function_section (current_function_decl);
10520 return default_elf_select_rtx_section (mode, x, align);
10523 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10524 void
10525 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
10526 HOST_WIDE_INT offset)
10528 /* When using per-function literal pools, we must ensure that any code
10529 section is aligned to the minimal instruction length, lest we get
10530 errors from the assembler re "unaligned instructions". */
10531 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
10532 ASM_OUTPUT_ALIGN (f, 2);
10535 /* Costs. */
10537 /* Helper function for rtx cost calculation. Strip a shift expression
10538 from X. Returns the inner operand if successful, or the original
10539 expression on failure. */
10540 static rtx
10541 aarch64_strip_shift (rtx x)
10543 rtx op = x;
10545 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10546 we can convert both to ROR during final output. */
10547 if ((GET_CODE (op) == ASHIFT
10548 || GET_CODE (op) == ASHIFTRT
10549 || GET_CODE (op) == LSHIFTRT
10550 || GET_CODE (op) == ROTATERT
10551 || GET_CODE (op) == ROTATE)
10552 && CONST_INT_P (XEXP (op, 1)))
10553 return XEXP (op, 0);
10555 if (GET_CODE (op) == MULT
10556 && CONST_INT_P (XEXP (op, 1))
10557 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
10558 return XEXP (op, 0);
10560 return x;
10563 /* Helper function for rtx cost calculation. Strip an extend
10564 expression from X. Returns the inner operand if successful, or the
10565 original expression on failure. We deal with a number of possible
10566 canonicalization variations here. If STRIP_SHIFT is true, then
10567 we can strip off a shift also. */
10568 static rtx
10569 aarch64_strip_extend (rtx x, bool strip_shift)
10571 scalar_int_mode mode;
10572 rtx op = x;
10574 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
10575 return op;
10577 /* Zero and sign extraction of a widened value. */
10578 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
10579 && XEXP (op, 2) == const0_rtx
10580 && GET_CODE (XEXP (op, 0)) == MULT
10581 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
10582 XEXP (op, 1)))
10583 return XEXP (XEXP (op, 0), 0);
10585 /* It can also be represented (for zero-extend) as an AND with an
10586 immediate. */
10587 if (GET_CODE (op) == AND
10588 && GET_CODE (XEXP (op, 0)) == MULT
10589 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
10590 && CONST_INT_P (XEXP (op, 1))
10591 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
10592 INTVAL (XEXP (op, 1))) != 0)
10593 return XEXP (XEXP (op, 0), 0);
10595 /* Now handle extended register, as this may also have an optional
10596 left shift by 1..4. */
10597 if (strip_shift
10598 && GET_CODE (op) == ASHIFT
10599 && CONST_INT_P (XEXP (op, 1))
10600 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
10601 op = XEXP (op, 0);
10603 if (GET_CODE (op) == ZERO_EXTEND
10604 || GET_CODE (op) == SIGN_EXTEND)
10605 op = XEXP (op, 0);
10607 if (op != x)
10608 return op;
10610 return x;
10613 /* Return true iff CODE is a shift supported in combination
10614 with arithmetic instructions. */
10616 static bool
10617 aarch64_shift_p (enum rtx_code code)
10619 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
10623 /* Return true iff X is a cheap shift without a sign extend. */
10625 static bool
10626 aarch64_cheap_mult_shift_p (rtx x)
10628 rtx op0, op1;
10630 op0 = XEXP (x, 0);
10631 op1 = XEXP (x, 1);
10633 if (!(aarch64_tune_params.extra_tuning_flags
10634 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
10635 return false;
10637 if (GET_CODE (op0) == SIGN_EXTEND)
10638 return false;
10640 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
10641 && UINTVAL (op1) <= 4)
10642 return true;
10644 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
10645 return false;
10647 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
10649 if (l2 > 0 && l2 <= 4)
10650 return true;
10652 return false;
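/* For example, on cores that set AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND,
   (mult (reg) (const_int 8)), i.e. an LSL #3, is treated as cheap here,
   while a multiply by 64 (an LSL #6) is not.  */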
10655 /* Helper function for rtx cost calculation. Calculate the cost of
10656 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10657 Return the calculated cost of the expression, recursing manually in to
10658 operands where needed. */
10660 static int
10661 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
10663 rtx op0, op1;
10664 const struct cpu_cost_table *extra_cost
10665 = aarch64_tune_params.insn_extra_cost;
10666 int cost = 0;
10667 bool compound_p = (outer == PLUS || outer == MINUS);
10668 machine_mode mode = GET_MODE (x);
10670 gcc_checking_assert (code == MULT);
10672 op0 = XEXP (x, 0);
10673 op1 = XEXP (x, 1);
10675 if (VECTOR_MODE_P (mode))
10676 mode = GET_MODE_INNER (mode);
10678 /* Integer multiply/fma. */
10679 if (GET_MODE_CLASS (mode) == MODE_INT)
10681 /* The multiply will be canonicalized as a shift, cost it as such. */
10682 if (aarch64_shift_p (GET_CODE (x))
10683 || (CONST_INT_P (op1)
10684 && exact_log2 (INTVAL (op1)) > 0))
10686 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
10687 || GET_CODE (op0) == SIGN_EXTEND;
10688 if (speed)
10690 if (compound_p)
10692 /* If the shift is considered cheap,
10693 then don't add any cost. */
10694 if (aarch64_cheap_mult_shift_p (x))
10696 else if (REG_P (op1))
10697 /* ARITH + shift-by-register. */
10698 cost += extra_cost->alu.arith_shift_reg;
10699 else if (is_extend)
10700 /* ARITH + extended register. We don't have a cost field
10701 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10702 cost += extra_cost->alu.extend_arith;
10703 else
10704 /* ARITH + shift-by-immediate. */
10705 cost += extra_cost->alu.arith_shift;
10707 else
10708 /* LSL (immediate). */
10709 cost += extra_cost->alu.shift;
10712 /* Strip extends as we will have costed them in the case above. */
10713 if (is_extend)
10714 op0 = aarch64_strip_extend (op0, true);
10716 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
10718 return cost;
10721 /* MNEG or [US]MNEGL. Extract the NEG operand, indicate that it's a
10722 compound operation, and let the cases below handle it. After all, MNEG is a
10723 special-case alias of MSUB. */
10724 if (GET_CODE (op0) == NEG)
10726 op0 = XEXP (op0, 0);
10727 compound_p = true;
10730 /* Integer multiplies or FMAs have zero/sign extending variants. */
10731 if ((GET_CODE (op0) == ZERO_EXTEND
10732 && GET_CODE (op1) == ZERO_EXTEND)
10733 || (GET_CODE (op0) == SIGN_EXTEND
10734 && GET_CODE (op1) == SIGN_EXTEND))
10736 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
10737 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
10739 if (speed)
10741 if (compound_p)
10742 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10743 cost += extra_cost->mult[0].extend_add;
10744 else
10745 /* MUL/SMULL/UMULL. */
10746 cost += extra_cost->mult[0].extend;
10749 return cost;
10752 /* This is either an integer multiply or a MADD. In both cases
10753 we want to recurse and cost the operands. */
10754 cost += rtx_cost (op0, mode, MULT, 0, speed);
10755 cost += rtx_cost (op1, mode, MULT, 1, speed);
10757 if (speed)
10759 if (compound_p)
10760 /* MADD/MSUB. */
10761 cost += extra_cost->mult[mode == DImode].add;
10762 else
10763 /* MUL. */
10764 cost += extra_cost->mult[mode == DImode].simple;
10767 return cost;
10769 else
10771 if (speed)
10773 /* Floating-point FMA/FMUL can also support negations of the
10774 operands, unless the rounding mode is upward or downward, in
10775 which case FNMUL is different from FMUL with operand negation. */
10776 bool neg0 = GET_CODE (op0) == NEG;
10777 bool neg1 = GET_CODE (op1) == NEG;
10778 if (compound_p || !flag_rounding_math || (neg0 && neg1))
10780 if (neg0)
10781 op0 = XEXP (op0, 0);
10782 if (neg1)
10783 op1 = XEXP (op1, 0);
10786 if (compound_p)
10787 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10788 cost += extra_cost->fp[mode == DFmode].fma;
10789 else
10790 /* FMUL/FNMUL. */
10791 cost += extra_cost->fp[mode == DFmode].mult;
10794 cost += rtx_cost (op0, mode, MULT, 0, speed);
10795 cost += rtx_cost (op1, mode, MULT, 1, speed);
10796 return cost;
10800 static int
10801 aarch64_address_cost (rtx x,
10802 machine_mode mode,
10803 addr_space_t as ATTRIBUTE_UNUSED,
10804 bool speed)
10806 enum rtx_code c = GET_CODE (x);
10807 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
10808 struct aarch64_address_info info;
10809 int cost = 0;
10810 info.shift = 0;
10812 if (!aarch64_classify_address (&info, x, mode, false))
10814 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
10816 /* This is a CONST or SYMBOL ref which will be split
10817 in a different way depending on the code model in use.
10818 Cost it through the generic infrastructure. */
10819 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
10820 /* Divide through by the cost of one instruction to
10821 bring it to the same units as the address costs. */
10822 cost_symbol_ref /= COSTS_N_INSNS (1);
10823 /* The cost is then the cost of preparing the address,
10824 followed by an immediate (possibly 0) offset. */
10825 return cost_symbol_ref + addr_cost->imm_offset;
10827 else
10829 /* This is most likely a jump table from a case
10830 statement. */
10831 return addr_cost->register_offset;
10835 switch (info.type)
10837 case ADDRESS_LO_SUM:
10838 case ADDRESS_SYMBOLIC:
10839 case ADDRESS_REG_IMM:
10840 cost += addr_cost->imm_offset;
10841 break;
10843 case ADDRESS_REG_WB:
10844 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
10845 cost += addr_cost->pre_modify;
10846 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
10847 cost += addr_cost->post_modify;
10848 else
10849 gcc_unreachable ();
10851 break;
10853 case ADDRESS_REG_REG:
10854 cost += addr_cost->register_offset;
10855 break;
10857 case ADDRESS_REG_SXTW:
10858 cost += addr_cost->register_sextend;
10859 break;
10861 case ADDRESS_REG_UXTW:
10862 cost += addr_cost->register_zextend;
10863 break;
10865 default:
10866 gcc_unreachable ();
10870 if (info.shift > 0)
10872 /* For the sake of calculating the cost of the shifted register
10873 component, we can treat same sized modes in the same way. */
10874 if (known_eq (GET_MODE_BITSIZE (mode), 16))
10875 cost += addr_cost->addr_scale_costs.hi;
10876 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
10877 cost += addr_cost->addr_scale_costs.si;
10878 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
10879 cost += addr_cost->addr_scale_costs.di;
10880 else
10881 /* We can't tell, or this is a 128-bit vector. */
10882 cost += addr_cost->addr_scale_costs.ti;
10885 return cost;
10888 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10889 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10890 to be taken. */
10893 aarch64_branch_cost (bool speed_p, bool predictable_p)
10895 /* When optimizing for speed, use the cost of unpredictable branches. */
10896 const struct cpu_branch_cost *branch_costs =
10897 aarch64_tune_params.branch_costs;
10899 if (!speed_p || predictable_p)
10900 return branch_costs->predictable;
10901 else
10902 return branch_costs->unpredictable;
10905 /* Return true if the RTX X in mode MODE is a zero or sign extract
10906 usable in an ADD or SUB (extended register) instruction. */
10907 static bool
10908 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10910 /* Catch add with a sign extract.
10911 This is add_<optab><mode>_multp2. */
10912 if (GET_CODE (x) == SIGN_EXTRACT
10913 || GET_CODE (x) == ZERO_EXTRACT)
10915 rtx op0 = XEXP (x, 0);
10916 rtx op1 = XEXP (x, 1);
10917 rtx op2 = XEXP (x, 2);
10919 if (GET_CODE (op0) == MULT
10920 && CONST_INT_P (op1)
10921 && op2 == const0_rtx
10922 && CONST_INT_P (XEXP (op0, 1))
10923 && aarch64_is_extend_from_extract (mode,
10924 XEXP (op0, 1),
10925 op1))
10927 return true;
10930 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10931 No shift. */
10932 else if (GET_CODE (x) == SIGN_EXTEND
10933 || GET_CODE (x) == ZERO_EXTEND)
10934 return REG_P (XEXP (x, 0));
10936 return false;
10939 static bool
10940 aarch64_frint_unspec_p (unsigned int u)
10942 switch (u)
10944 case UNSPEC_FRINTZ:
10945 case UNSPEC_FRINTP:
10946 case UNSPEC_FRINTM:
10947 case UNSPEC_FRINTA:
10948 case UNSPEC_FRINTN:
10949 case UNSPEC_FRINTX:
10950 case UNSPEC_FRINTI:
10951 return true;
10953 default:
10954 return false;
10958 /* Return true iff X is an rtx that will match an extr instruction
10959 i.e. as described in the *extr<mode>5_insn family of patterns.
10960 OP0 and OP1 will be set to the operands of the shifts involved
10961 on success and will be NULL_RTX otherwise. */
10963 static bool
10964 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10966 rtx op0, op1;
10967 scalar_int_mode mode;
10968 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10969 return false;
10971 *res_op0 = NULL_RTX;
10972 *res_op1 = NULL_RTX;
10974 if (GET_CODE (x) != IOR)
10975 return false;
10977 op0 = XEXP (x, 0);
10978 op1 = XEXP (x, 1);
10980 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10981 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10983 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10984 if (GET_CODE (op1) == ASHIFT)
10985 std::swap (op0, op1);
10987 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10988 return false;
10990 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10991 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10993 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10994 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10996 *res_op0 = XEXP (op0, 0);
10997 *res_op1 = XEXP (op1, 0);
10998 return true;
11002 return false;
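/* Illustrative example (assumed operands): in DImode,
     (ior:DI (ashift:DI (reg a) (const_int 48))
             (lshiftrt:DI (reg b) (const_int 16)))
   satisfies 48 < 64 and 48 + 16 == 64, so *RES_OP0 and *RES_OP1 are set to
   a and b and the expression can be matched as something like
   EXTR Xd, Xa, Xb, #16.  */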
11005 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11006 storing it in *COST. Result is true if the total cost of the operation
11007 has now been calculated. */
11008 static bool
11009 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11011 rtx inner;
11012 rtx comparator;
11013 enum rtx_code cmpcode;
11015 if (COMPARISON_P (op0))
11017 inner = XEXP (op0, 0);
11018 comparator = XEXP (op0, 1);
11019 cmpcode = GET_CODE (op0);
11021 else
11023 inner = op0;
11024 comparator = const0_rtx;
11025 cmpcode = NE;
11028 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11030 /* Conditional branch. */
11031 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11032 return true;
11033 else
11035 if (cmpcode == NE || cmpcode == EQ)
11037 if (comparator == const0_rtx)
11039 /* TBZ/TBNZ/CBZ/CBNZ. */
11040 if (GET_CODE (inner) == ZERO_EXTRACT)
11041 /* TBZ/TBNZ. */
11042 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11043 ZERO_EXTRACT, 0, speed);
11044 else
11045 /* CBZ/CBNZ. */
11046 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11048 return true;
11051 else if (cmpcode == LT || cmpcode == GE)
11053 /* TBZ/TBNZ. */
11054 if (comparator == const0_rtx)
11055 return true;
11059 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11061 /* CCMP. */
11062 if (GET_CODE (op1) == COMPARE)
11064 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11065 if (XEXP (op1, 1) == const0_rtx)
11066 *cost += 1;
11067 if (speed)
11069 machine_mode mode = GET_MODE (XEXP (op1, 0));
11070 const struct cpu_cost_table *extra_cost
11071 = aarch64_tune_params.insn_extra_cost;
11073 if (GET_MODE_CLASS (mode) == MODE_INT)
11074 *cost += extra_cost->alu.arith;
11075 else
11076 *cost += extra_cost->fp[mode == DFmode].compare;
11078 return true;
11081 /* It's a conditional operation based on the status flags,
11082 so it must be some flavor of CSEL. */
11084 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11085 if (GET_CODE (op1) == NEG
11086 || GET_CODE (op1) == NOT
11087 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11088 op1 = XEXP (op1, 0);
11089 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11091 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11092 op1 = XEXP (op1, 0);
11093 op2 = XEXP (op2, 0);
11096 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11097 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11098 return true;
11101 /* We don't know what this is; cost all operands. */
11102 return false;
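/* Illustrative example: a branch of the form
     (if_then_else (ne (reg x) (const_int 0)) (label_ref L) (pc))
   takes the CBZ/CBNZ path above, so only the cost of the compared operand
   is added, which is zero for a plain register.  */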
11105 /* Check whether X is a bitfield operation of the form shift + extend that
11106 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11107 operand to which the bitfield operation is applied. Otherwise return
11108 NULL_RTX. */
11110 static rtx
11111 aarch64_extend_bitfield_pattern_p (rtx x)
11113 rtx_code outer_code = GET_CODE (x);
11114 machine_mode outer_mode = GET_MODE (x);
11116 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11117 && outer_mode != SImode && outer_mode != DImode)
11118 return NULL_RTX;
11120 rtx inner = XEXP (x, 0);
11121 rtx_code inner_code = GET_CODE (inner);
11122 machine_mode inner_mode = GET_MODE (inner);
11123 rtx op = NULL_RTX;
11125 switch (inner_code)
11127 case ASHIFT:
11128 if (CONST_INT_P (XEXP (inner, 1))
11129 && (inner_mode == QImode || inner_mode == HImode))
11130 op = XEXP (inner, 0);
11131 break;
11132 case LSHIFTRT:
11133 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11134 && (inner_mode == QImode || inner_mode == HImode))
11135 op = XEXP (inner, 0);
11136 break;
11137 case ASHIFTRT:
11138 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11139 && (inner_mode == QImode || inner_mode == HImode))
11140 op = XEXP (inner, 0);
11141 break;
11142 default:
11143 break;
11146 return op;
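/* Illustrative examples of the shapes recognised above (assumed operands):
     (zero_extend:SI (lshiftrt:HI (reg r) (const_int 3)))  -> UBFX
     (sign_extend:SI (ashiftrt:HI (reg r) (const_int 3)))  -> SBFX
     (zero_extend:SI (ashift:HI (reg r) (const_int 3)))    -> UBFIZ
   and in each case the operand returned is (reg r).  */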
11149 /* Return true if the mask and a shift amount from an RTX of the form
11150 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11151 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11153 bool
11154 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11155 rtx shft_amnt)
11157 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11158 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11159 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11160 && (INTVAL (mask)
11161 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
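/* Worked example (assumed values): in SImode with MASK == 0xf0 and
   SHFT_AMNT == 4, MASK >> 4 == 0xf and 0xf + 1 is a power of two, while
   the low four bits of MASK are clear, so (x << 4) & 0xf0 can be emitted
   as a UBFIZ with lsb 4 and width 4.  */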
11164 /* Return true if the masks and a shift amount from an RTX of the form
11165 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11166 a BFI instruction of mode MODE. See *aarch64_bfi patterns. */
11168 bool
11169 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11170 unsigned HOST_WIDE_INT mask1,
11171 unsigned HOST_WIDE_INT shft_amnt,
11172 unsigned HOST_WIDE_INT mask2)
11174 unsigned HOST_WIDE_INT t;
11176 /* Verify that there is no overlap in what bits are set in the two masks. */
11177 if (mask1 != ~mask2)
11178 return false;
11180 /* Verify that mask2 is not all zeros or ones. */
11181 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11182 return false;
11184 /* The shift amount should always be less than the mode size. */
11185 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11187 /* Verify that the mask being shifted is contiguous and would be in the
11188 least significant bits after shifting by shft_amnt. */
11189 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11190 return (t == (t & -t));
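/* Worked example (assumed values): with SHFT_AMNT == 8, MASK2 == 0xff00
   and MASK1 == ~0xff00, t == 0xff00 + 0x100 == 0x10000 is a power of two,
   so ((x & ~0xff00) | ((y << 8) & 0xff00)) can be emitted as a BFI that
   inserts an 8-bit field at bit 8.  */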
11193 /* Calculate the cost of calculating X, storing it in *COST. Result
11194 is true if the total cost of the operation has now been calculated. */
11195 static bool
11196 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11197 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11199 rtx op0, op1, op2;
11200 const struct cpu_cost_table *extra_cost
11201 = aarch64_tune_params.insn_extra_cost;
11202 int code = GET_CODE (x);
11203 scalar_int_mode int_mode;
11205 /* By default, assume that everything has equivalent cost to the
11206 cheapest instruction. Any additional costs are applied as a delta
11207 above this default. */
11208 *cost = COSTS_N_INSNS (1);
11210 switch (code)
11212 case SET:
11213 /* The cost depends entirely on the operands to SET. */
11214 *cost = 0;
11215 op0 = SET_DEST (x);
11216 op1 = SET_SRC (x);
11218 switch (GET_CODE (op0))
11220 case MEM:
11221 if (speed)
11223 rtx address = XEXP (op0, 0);
11224 if (VECTOR_MODE_P (mode))
11225 *cost += extra_cost->ldst.storev;
11226 else if (GET_MODE_CLASS (mode) == MODE_INT)
11227 *cost += extra_cost->ldst.store;
11228 else if (mode == SFmode)
11229 *cost += extra_cost->ldst.storef;
11230 else if (mode == DFmode)
11231 *cost += extra_cost->ldst.stored;
11233 *cost +=
11234 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11235 0, speed));
11238 *cost += rtx_cost (op1, mode, SET, 1, speed);
11239 return true;
11241 case SUBREG:
11242 if (! REG_P (SUBREG_REG (op0)))
11243 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11245 /* Fall through. */
11246 case REG:
11247 /* The cost is one per vector-register copied. */
11248 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11250 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11251 *cost = COSTS_N_INSNS (nregs);
11253 /* const0_rtx is in general free, but we will use an
11254 instruction to set a register to 0. */
11255 else if (REG_P (op1) || op1 == const0_rtx)
11257 /* The cost is 1 per register copied. */
11258 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11259 *cost = COSTS_N_INSNS (nregs);
11261 else
11262 /* Cost is just the cost of the RHS of the set. */
11263 *cost += rtx_cost (op1, mode, SET, 1, speed);
11264 return true;
11266 case ZERO_EXTRACT:
11267 case SIGN_EXTRACT:
11268 /* Bit-field insertion. Strip any redundant widening of
11269 the RHS to meet the width of the target. */
11270 if (GET_CODE (op1) == SUBREG)
11271 op1 = SUBREG_REG (op1);
11272 if ((GET_CODE (op1) == ZERO_EXTEND
11273 || GET_CODE (op1) == SIGN_EXTEND)
11274 && CONST_INT_P (XEXP (op0, 1))
11275 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11276 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11277 op1 = XEXP (op1, 0);
11279 if (CONST_INT_P (op1))
11281 /* MOV immediate is assumed to always be cheap. */
11282 *cost = COSTS_N_INSNS (1);
11284 else
11286 /* BFM. */
11287 if (speed)
11288 *cost += extra_cost->alu.bfi;
11289 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11292 return true;
11294 default:
11295 /* We can't make sense of this; assume default cost. */
11296 *cost = COSTS_N_INSNS (1);
11297 return false;
11299 return false;
11301 case CONST_INT:
11302 /* If an instruction can incorporate a constant within the
11303 instruction, the instruction's expression avoids calling
11304 rtx_cost() on the constant. If rtx_cost() is called on a
11305 constant, then it is usually because the constant must be
11306 moved into a register by one or more instructions.
11308 The exception is constant 0, which can be expressed
11309 as XZR/WZR and is therefore free. The exception to this is
11310 if we have (set (reg) (const0_rtx)) in which case we must cost
11311 the move. However, we can catch that when we cost the SET, so
11312 we don't need to consider that here. */
11313 if (x == const0_rtx)
11314 *cost = 0;
11315 else
11317 /* To an approximation, building any other constant is
11318 proportionally expensive to the number of instructions
11319 required to build that constant. This is true whether we
11320 are compiling for SPEED or otherwise. */
11321 if (!is_a <scalar_int_mode> (mode, &int_mode))
11322 int_mode = word_mode;
11323 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11324 (NULL_RTX, x, false, int_mode));
11326 return true;
11328 case CONST_DOUBLE:
11330 /* First determine number of instructions to do the move
11331 as an integer constant. */
11332 if (!aarch64_float_const_representable_p (x)
11333 && !aarch64_can_const_movi_rtx_p (x, mode)
11334 && aarch64_float_const_rtx_p (x))
11336 unsigned HOST_WIDE_INT ival;
11337 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11338 gcc_assert (succeed);
11340 scalar_int_mode imode = (mode == HFmode
11341 ? SImode
11342 : int_mode_for_mode (mode).require ());
11343 int ncost = aarch64_internal_mov_immediate
11344 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11345 *cost += COSTS_N_INSNS (ncost);
11346 return true;
11349 if (speed)
11351 /* mov[df,sf]_aarch64. */
11352 if (aarch64_float_const_representable_p (x))
11353 /* FMOV (scalar immediate). */
11354 *cost += extra_cost->fp[mode == DFmode].fpconst;
11355 else if (!aarch64_float_const_zero_rtx_p (x))
11357 /* This will be a load from memory. */
11358 if (mode == DFmode)
11359 *cost += extra_cost->ldst.loadd;
11360 else
11361 *cost += extra_cost->ldst.loadf;
11363 else
11364 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11365 or MOV v0.s[0], wzr - neither of which is modeled by the
11366 cost tables. Just use the default cost. */
11371 return true;
11373 case MEM:
11374 if (speed)
11376 /* For loads we want the base cost of a load, plus an
11377 approximation for the additional cost of the addressing
11378 mode. */
11379 rtx address = XEXP (x, 0);
11380 if (VECTOR_MODE_P (mode))
11381 *cost += extra_cost->ldst.loadv;
11382 else if (GET_MODE_CLASS (mode) == MODE_INT)
11383 *cost += extra_cost->ldst.load;
11384 else if (mode == SFmode)
11385 *cost += extra_cost->ldst.loadf;
11386 else if (mode == DFmode)
11387 *cost += extra_cost->ldst.loadd;
11389 *cost +=
11390 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11391 0, speed));
11394 return true;
11396 case NEG:
11397 op0 = XEXP (x, 0);
11399 if (VECTOR_MODE_P (mode))
11401 if (speed)
11403 /* FNEG. */
11404 *cost += extra_cost->vect.alu;
11406 return false;
11409 if (GET_MODE_CLASS (mode) == MODE_INT)
11411 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11412 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11414 /* CSETM. */
11415 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11416 return true;
11419 /* Cost this as SUB wzr, X. */
11420 op0 = CONST0_RTX (mode);
11421 op1 = XEXP (x, 0);
11422 goto cost_minus;
11425 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11427 /* Support (neg(fma...)) as a single instruction only if
11428 sign of zeros is unimportant. This matches the decision
11429 making in aarch64.md. */
11430 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11432 /* FNMADD. */
11433 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11434 return true;
11436 if (GET_CODE (op0) == MULT)
11438 /* FNMUL. */
11439 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11440 return true;
11442 if (speed)
11443 /* FNEG. */
11444 *cost += extra_cost->fp[mode == DFmode].neg;
11445 return false;
11448 return false;
11450 case CLRSB:
11451 case CLZ:
11452 if (speed)
11454 if (VECTOR_MODE_P (mode))
11455 *cost += extra_cost->vect.alu;
11456 else
11457 *cost += extra_cost->alu.clz;
11460 return false;
11462 case COMPARE:
11463 op0 = XEXP (x, 0);
11464 op1 = XEXP (x, 1);
11466 if (op1 == const0_rtx
11467 && GET_CODE (op0) == AND)
11469 x = op0;
11470 mode = GET_MODE (op0);
11471 goto cost_logic;
11474 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11476 /* TODO: A write to the CC flags possibly costs extra, this
11477 needs encoding in the cost tables. */
11479 mode = GET_MODE (op0);
11480 /* ANDS. */
11481 if (GET_CODE (op0) == AND)
11483 x = op0;
11484 goto cost_logic;
11487 if (GET_CODE (op0) == PLUS)
11489 /* ADDS (and CMN alias). */
11490 x = op0;
11491 goto cost_plus;
11494 if (GET_CODE (op0) == MINUS)
11496 /* SUBS. */
11497 x = op0;
11498 goto cost_minus;
11501 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
11502 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
11503 && CONST_INT_P (XEXP (op0, 2)))
11505 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11506 Handle it here directly rather than going to cost_logic
11507 since we know the immediate generated for the TST is valid
11508 so we can avoid creating an intermediate rtx for it only
11509 for costing purposes. */
11510 if (speed)
11511 *cost += extra_cost->alu.logical;
11513 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
11514 ZERO_EXTRACT, 0, speed);
11515 return true;
11518 if (GET_CODE (op1) == NEG)
11520 /* CMN. */
11521 if (speed)
11522 *cost += extra_cost->alu.arith;
11524 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
11525 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
11526 return true;
11529 /* CMP.
11531 Compare can freely swap the order of operands, and
11532 canonicalization puts the more complex operation first.
11533 But the integer MINUS logic expects the shift/extend
11534 operation in op1. */
11535 if (! (REG_P (op0)
11536 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
11538 op0 = XEXP (x, 1);
11539 op1 = XEXP (x, 0);
11541 goto cost_minus;
11544 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
11546 /* FCMP. */
11547 if (speed)
11548 *cost += extra_cost->fp[mode == DFmode].compare;
11550 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
11552 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
11553 /* FCMP supports constant 0.0 for no extra cost. */
11554 return true;
11556 return false;
11559 if (VECTOR_MODE_P (mode))
11561 /* Vector compare. */
11562 if (speed)
11563 *cost += extra_cost->vect.alu;
11565 if (aarch64_float_const_zero_rtx_p (op1))
11567 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11568 cost. */
11569 return true;
11571 return false;
11573 return false;
11575 case MINUS:
11577 op0 = XEXP (x, 0);
11578 op1 = XEXP (x, 1);
11580 cost_minus:
11581 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
11583 /* Detect valid immediates. */
11584 if ((GET_MODE_CLASS (mode) == MODE_INT
11585 || (GET_MODE_CLASS (mode) == MODE_CC
11586 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
11587 && CONST_INT_P (op1)
11588 && aarch64_uimm12_shift (INTVAL (op1)))
11590 if (speed)
11591 /* SUB(S) (immediate). */
11592 *cost += extra_cost->alu.arith;
11593 return true;
11596 /* Look for SUB (extended register). */
11597 if (is_a <scalar_int_mode> (mode, &int_mode)
11598 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
11600 if (speed)
11601 *cost += extra_cost->alu.extend_arith;
11603 op1 = aarch64_strip_extend (op1, true);
11604 *cost += rtx_cost (op1, VOIDmode,
11605 (enum rtx_code) GET_CODE (op1), 0, speed);
11606 return true;
11609 rtx new_op1 = aarch64_strip_extend (op1, false);
11611 /* Cost this as an FMA-alike operation. */
11612 if ((GET_CODE (new_op1) == MULT
11613 || aarch64_shift_p (GET_CODE (new_op1)))
11614 && code != COMPARE)
11616 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
11617 (enum rtx_code) code,
11618 speed);
11619 return true;
11622 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
11624 if (speed)
11626 if (VECTOR_MODE_P (mode))
11628 /* Vector SUB. */
11629 *cost += extra_cost->vect.alu;
11631 else if (GET_MODE_CLASS (mode) == MODE_INT)
11633 /* SUB(S). */
11634 *cost += extra_cost->alu.arith;
11636 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11638 /* FSUB. */
11639 *cost += extra_cost->fp[mode == DFmode].addsub;
11642 return true;
11645 case PLUS:
11647 rtx new_op0;
11649 op0 = XEXP (x, 0);
11650 op1 = XEXP (x, 1);
11652 cost_plus:
11653 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11654 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11656 /* CSINC. */
11657 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
11658 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11659 return true;
11662 if (GET_MODE_CLASS (mode) == MODE_INT
11663 && (aarch64_plus_immediate (op1, mode)
11664 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
11666 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
11668 if (speed)
11669 /* ADD (immediate). */
11670 *cost += extra_cost->alu.arith;
11671 return true;
11674 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11676 /* Look for ADD (extended register). */
11677 if (is_a <scalar_int_mode> (mode, &int_mode)
11678 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
11680 if (speed)
11681 *cost += extra_cost->alu.extend_arith;
11683 op0 = aarch64_strip_extend (op0, true);
11684 *cost += rtx_cost (op0, VOIDmode,
11685 (enum rtx_code) GET_CODE (op0), 0, speed);
11686 return true;
11689 /* Strip any extend, leave shifts behind as we will
11690 cost them through mult_cost. */
11691 new_op0 = aarch64_strip_extend (op0, false);
11693 if (GET_CODE (new_op0) == MULT
11694 || aarch64_shift_p (GET_CODE (new_op0)))
11696 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
11697 speed);
11698 return true;
11701 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
11703 if (speed)
11705 if (VECTOR_MODE_P (mode))
11707 /* Vector ADD. */
11708 *cost += extra_cost->vect.alu;
11710 else if (GET_MODE_CLASS (mode) == MODE_INT)
11712 /* ADD. */
11713 *cost += extra_cost->alu.arith;
11715 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11717 /* FADD. */
11718 *cost += extra_cost->fp[mode == DFmode].addsub;
11721 return true;
11724 case BSWAP:
11725 *cost = COSTS_N_INSNS (1);
11727 if (speed)
11729 if (VECTOR_MODE_P (mode))
11730 *cost += extra_cost->vect.alu;
11731 else
11732 *cost += extra_cost->alu.rev;
11734 return false;
11736 case IOR:
11737 if (aarch_rev16_p (x))
11739 *cost = COSTS_N_INSNS (1);
11741 if (speed)
11743 if (VECTOR_MODE_P (mode))
11744 *cost += extra_cost->vect.alu;
11745 else
11746 *cost += extra_cost->alu.rev;
11748 return true;
11751 if (aarch64_extr_rtx_p (x, &op0, &op1))
11753 *cost += rtx_cost (op0, mode, IOR, 0, speed);
11754 *cost += rtx_cost (op1, mode, IOR, 1, speed);
11755 if (speed)
11756 *cost += extra_cost->alu.shift;
11758 return true;
11760 /* Fall through. */
11761 case XOR:
11762 case AND:
11763 cost_logic:
11764 op0 = XEXP (x, 0);
11765 op1 = XEXP (x, 1);
11767 if (VECTOR_MODE_P (mode))
11769 if (speed)
11770 *cost += extra_cost->vect.alu;
11771 return true;
11774 if (code == AND
11775 && GET_CODE (op0) == MULT
11776 && CONST_INT_P (XEXP (op0, 1))
11777 && CONST_INT_P (op1)
11778 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
11779 INTVAL (op1)) != 0)
11781 /* This is a UBFM/SBFM. */
11782 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
11783 if (speed)
11784 *cost += extra_cost->alu.bfx;
11785 return true;
11788 if (is_int_mode (mode, &int_mode))
11790 if (CONST_INT_P (op1))
11792 /* We have a mask + shift version of a UBFIZ
11793 i.e. the *andim_ashift<mode>_bfiz pattern. */
11794 if (GET_CODE (op0) == ASHIFT
11795 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
11796 XEXP (op0, 1)))
11798 *cost += rtx_cost (XEXP (op0, 0), int_mode,
11799 (enum rtx_code) code, 0, speed);
11800 if (speed)
11801 *cost += extra_cost->alu.bfx;
11803 return true;
11805 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
11807 /* We possibly get the immediate for free; this is not
11808 modelled. */
11809 *cost += rtx_cost (op0, int_mode,
11810 (enum rtx_code) code, 0, speed);
11811 if (speed)
11812 *cost += extra_cost->alu.logical;
11814 return true;
11817 else
11819 rtx new_op0 = op0;
11821 /* Handle ORN, EON, or BIC. */
11822 if (GET_CODE (op0) == NOT)
11823 op0 = XEXP (op0, 0);
11825 new_op0 = aarch64_strip_shift (op0);
11827 /* If we had a shift on op0 then this is a logical-shift-
11828 by-register/immediate operation. Otherwise, this is just
11829 a logical operation. */
11830 if (speed)
11832 if (new_op0 != op0)
11834 /* Shift by immediate. */
11835 if (CONST_INT_P (XEXP (op0, 1)))
11836 *cost += extra_cost->alu.log_shift;
11837 else
11838 *cost += extra_cost->alu.log_shift_reg;
11840 else
11841 *cost += extra_cost->alu.logical;
11844 /* In both cases we want to cost both operands. */
11845 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
11846 0, speed);
11847 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
11848 1, speed);
11850 return true;
11853 return false;
11855 case NOT:
11856 x = XEXP (x, 0);
11857 op0 = aarch64_strip_shift (x);
11859 if (VECTOR_MODE_P (mode))
11861 /* Vector NOT. */
11862 *cost += extra_cost->vect.alu;
11863 return false;
11866 /* MVN-shifted-reg. */
11867 if (op0 != x)
11869 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11871 if (speed)
11872 *cost += extra_cost->alu.log_shift;
11874 return true;
11876 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11877 Handle the second form here taking care that 'a' in the above can
11878 be a shift. */
11879 else if (GET_CODE (op0) == XOR)
11881 rtx newop0 = XEXP (op0, 0);
11882 rtx newop1 = XEXP (op0, 1);
11883 rtx op0_stripped = aarch64_strip_shift (newop0);
11885 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
11886 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
11888 if (speed)
11890 if (op0_stripped != newop0)
11891 *cost += extra_cost->alu.log_shift;
11892 else
11893 *cost += extra_cost->alu.logical;
11896 return true;
11898 /* MVN. */
11899 if (speed)
11900 *cost += extra_cost->alu.logical;
11902 return false;
11904 case ZERO_EXTEND:
11906 op0 = XEXP (x, 0);
11907 /* If a value is written in SI mode, then zero extended to DI
11908 mode, the operation will in general be free as a write to
11909 a 'w' register implicitly zeroes the upper bits of an 'x'
11910 register. However, if this is
11912 (set (reg) (zero_extend (reg)))
11914 we must cost the explicit register move. */
11915 if (mode == DImode
11916 && GET_MODE (op0) == SImode
11917 && outer == SET)
11919 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11921 /* If OP_COST is non-zero, then the cost of the zero extend
11922 is effectively the cost of the inner operation. Otherwise
11923 we have a MOV instruction and we take the cost from the MOV
11924 itself. This is true independently of whether we are
11925 optimizing for space or time. */
11926 if (op_cost)
11927 *cost = op_cost;
11929 return true;
11931 else if (MEM_P (op0))
11933 /* All loads can zero extend to any size for free. */
11934 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11935 return true;
11938 op0 = aarch64_extend_bitfield_pattern_p (x);
11939 if (op0)
11941 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11942 if (speed)
11943 *cost += extra_cost->alu.bfx;
11944 return true;
11947 if (speed)
11949 if (VECTOR_MODE_P (mode))
11951 /* UMOV. */
11952 *cost += extra_cost->vect.alu;
11954 else
11956 /* We generate an AND instead of UXTB/UXTH. */
11957 *cost += extra_cost->alu.logical;
11960 return false;
11962 case SIGN_EXTEND:
11963 if (MEM_P (XEXP (x, 0)))
11965 /* LDRSH. */
11966 if (speed)
11968 rtx address = XEXP (XEXP (x, 0), 0);
11969 *cost += extra_cost->ldst.load_sign_extend;
11971 *cost +=
11972 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11973 0, speed));
11975 return true;
11978 op0 = aarch64_extend_bitfield_pattern_p (x);
11979 if (op0)
11981 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11982 if (speed)
11983 *cost += extra_cost->alu.bfx;
11984 return true;
11987 if (speed)
11989 if (VECTOR_MODE_P (mode))
11990 *cost += extra_cost->vect.alu;
11991 else
11992 *cost += extra_cost->alu.extend;
11994 return false;
11996 case ASHIFT:
11997 op0 = XEXP (x, 0);
11998 op1 = XEXP (x, 1);
12000 if (CONST_INT_P (op1))
12002 if (speed)
12004 if (VECTOR_MODE_P (mode))
12006 /* Vector shift (immediate). */
12007 *cost += extra_cost->vect.alu;
12009 else
12011 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
12012 aliases. */
12013 *cost += extra_cost->alu.shift;
12017 /* We can incorporate zero/sign extend for free. */
12018 if (GET_CODE (op0) == ZERO_EXTEND
12019 || GET_CODE (op0) == SIGN_EXTEND)
12020 op0 = XEXP (op0, 0);
12022 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12023 return true;
12025 else
12027 if (VECTOR_MODE_P (mode))
12029 if (speed)
12030 /* Vector shift (register). */
12031 *cost += extra_cost->vect.alu;
12033 else
12035 if (speed)
12036 /* LSLV. */
12037 *cost += extra_cost->alu.shift_reg;
12039 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12040 && CONST_INT_P (XEXP (op1, 1))
12041 && known_eq (INTVAL (XEXP (op1, 1)),
12042 GET_MODE_BITSIZE (mode) - 1))
12044 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12045 /* We already demanded XEXP (op1, 0) to be REG_P, so
12046 don't recurse into it. */
12047 return true;
12050 return false; /* All arguments need to be in registers. */
12053 case ROTATE:
12054 case ROTATERT:
12055 case LSHIFTRT:
12056 case ASHIFTRT:
12057 op0 = XEXP (x, 0);
12058 op1 = XEXP (x, 1);
12060 if (CONST_INT_P (op1))
12062 /* ASR (immediate) and friends. */
12063 if (speed)
12065 if (VECTOR_MODE_P (mode))
12066 *cost += extra_cost->vect.alu;
12067 else
12068 *cost += extra_cost->alu.shift;
12071 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12072 return true;
12074 else
12076 if (VECTOR_MODE_P (mode))
12078 if (speed)
12079 /* Vector shift (register). */
12080 *cost += extra_cost->vect.alu;
12082 else
12084 if (speed)
12085 /* ASR (register) and friends. */
12086 *cost += extra_cost->alu.shift_reg;
12088 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12089 && CONST_INT_P (XEXP (op1, 1))
12090 && known_eq (INTVAL (XEXP (op1, 1)),
12091 GET_MODE_BITSIZE (mode) - 1))
12093 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12094 /* We already demanded XEXP (op1, 0) to be REG_P, so
12095 don't recurse into it. */
12096 return true;
12099 return false; /* All arguments need to be in registers. */
12102 case SYMBOL_REF:
12104 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12105 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12107 /* LDR. */
12108 if (speed)
12109 *cost += extra_cost->ldst.load;
12111 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12112 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12114 /* ADRP, followed by ADD. */
12115 *cost += COSTS_N_INSNS (1);
12116 if (speed)
12117 *cost += 2 * extra_cost->alu.arith;
12119 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12120 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12122 /* ADR. */
12123 if (speed)
12124 *cost += extra_cost->alu.arith;
12127 if (flag_pic)
12129 /* One extra load instruction, after accessing the GOT. */
12130 *cost += COSTS_N_INSNS (1);
12131 if (speed)
12132 *cost += extra_cost->ldst.load;
12134 return true;
12136 case HIGH:
12137 case LO_SUM:
12138 /* ADRP/ADD (immediate). */
12139 if (speed)
12140 *cost += extra_cost->alu.arith;
12141 return true;
12143 case ZERO_EXTRACT:
12144 case SIGN_EXTRACT:
12145 /* UBFX/SBFX. */
12146 if (speed)
12148 if (VECTOR_MODE_P (mode))
12149 *cost += extra_cost->vect.alu;
12150 else
12151 *cost += extra_cost->alu.bfx;
12154 /* We can trust that the immediates used will be correct (there
12155 are no by-register forms), so we need only cost op0. */
12156 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12157 return true;
12159 case MULT:
12160 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12161 /* aarch64_rtx_mult_cost always handles recursion to its
12162 operands. */
12163 return true;
12165 case MOD:
12166 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12167 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
12168 an unconditional negate. This case should only ever be reached through
12169 the set_smod_pow2_cheap check in expmed.c. */
12170 if (CONST_INT_P (XEXP (x, 1))
12171 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12172 && (mode == SImode || mode == DImode))
12174 /* We expand to 4 instructions. Reset the baseline. */
12175 *cost = COSTS_N_INSNS (4);
12177 if (speed)
12178 *cost += 2 * extra_cost->alu.logical
12179 + 2 * extra_cost->alu.arith;
12181 return true;
12184 /* Fall-through. */
12185 case UMOD:
12186 if (speed)
12188 /* Slightly prefer UMOD over SMOD. */
12189 if (VECTOR_MODE_P (mode))
12190 *cost += extra_cost->vect.alu;
12191 else if (GET_MODE_CLASS (mode) == MODE_INT)
12192 *cost += (extra_cost->mult[mode == DImode].add
12193 + extra_cost->mult[mode == DImode].idiv
12194 + (code == MOD ? 1 : 0));
12196 return false; /* All arguments need to be in registers. */
12198 case DIV:
12199 case UDIV:
12200 case SQRT:
12201 if (speed)
12203 if (VECTOR_MODE_P (mode))
12204 *cost += extra_cost->vect.alu;
12205 else if (GET_MODE_CLASS (mode) == MODE_INT)
12206 /* There is no integer SQRT, so only DIV and UDIV can get
12207 here. */
12208 *cost += (extra_cost->mult[mode == DImode].idiv
12209 /* Slightly prefer UDIV over SDIV. */
12210 + (code == DIV ? 1 : 0));
12211 else
12212 *cost += extra_cost->fp[mode == DFmode].div;
12214 return false; /* All arguments need to be in registers. */
12216 case IF_THEN_ELSE:
12217 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12218 XEXP (x, 2), cost, speed);
12220 case EQ:
12221 case NE:
12222 case GT:
12223 case GTU:
12224 case LT:
12225 case LTU:
12226 case GE:
12227 case GEU:
12228 case LE:
12229 case LEU:
12231 return false; /* All arguments must be in registers. */
12233 case FMA:
12234 op0 = XEXP (x, 0);
12235 op1 = XEXP (x, 1);
12236 op2 = XEXP (x, 2);
12238 if (speed)
12240 if (VECTOR_MODE_P (mode))
12241 *cost += extra_cost->vect.alu;
12242 else
12243 *cost += extra_cost->fp[mode == DFmode].fma;
12246 /* FMSUB, FNMADD, and FNMSUB are free. */
12247 if (GET_CODE (op0) == NEG)
12248 op0 = XEXP (op0, 0);
12250 if (GET_CODE (op2) == NEG)
12251 op2 = XEXP (op2, 0);
12253 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12254 and the by-element operand as operand 0. */
12255 if (GET_CODE (op1) == NEG)
12256 op1 = XEXP (op1, 0);
12258 /* Catch vector-by-element operations. The by-element operand can
12259 either be (vec_duplicate (vec_select (x))) or just
12260 (vec_select (x)), depending on whether we are multiplying by
12261 a vector or a scalar.
12263 Canonicalization is not very good in these cases: FMA4 will put the
12264 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
12265 if (GET_CODE (op0) == VEC_DUPLICATE)
12266 op0 = XEXP (op0, 0);
12267 else if (GET_CODE (op1) == VEC_DUPLICATE)
12268 op1 = XEXP (op1, 0);
12270 if (GET_CODE (op0) == VEC_SELECT)
12271 op0 = XEXP (op0, 0);
12272 else if (GET_CODE (op1) == VEC_SELECT)
12273 op1 = XEXP (op1, 0);
12275 /* If the remaining parameters are not registers,
12276 get the cost to put them into registers. */
12277 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12278 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12279 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12280 return true;
12282 case FLOAT:
12283 case UNSIGNED_FLOAT:
12284 if (speed)
12285 *cost += extra_cost->fp[mode == DFmode].fromint;
12286 return false;
12288 case FLOAT_EXTEND:
12289 if (speed)
12291 if (VECTOR_MODE_P (mode))
12293 /* Vector conversion (widen). */
12294 *cost += extra_cost->vect.alu;
12296 else
12297 *cost += extra_cost->fp[mode == DFmode].widen;
12299 return false;
12301 case FLOAT_TRUNCATE:
12302 if (speed)
12304 if (VECTOR_MODE_P (mode))
12306 /* Vector conversion. */
12307 *cost += extra_cost->vect.alu;
12309 else
12310 *cost += extra_cost->fp[mode == DFmode].narrow;
12312 return false;
12314 case FIX:
12315 case UNSIGNED_FIX:
12316 x = XEXP (x, 0);
12317 /* Strip the rounding part. They will all be implemented
12318 by the fcvt* family of instructions anyway. */
12319 if (GET_CODE (x) == UNSPEC)
12321 unsigned int uns_code = XINT (x, 1);
12323 if (uns_code == UNSPEC_FRINTA
12324 || uns_code == UNSPEC_FRINTM
12325 || uns_code == UNSPEC_FRINTN
12326 || uns_code == UNSPEC_FRINTP
12327 || uns_code == UNSPEC_FRINTZ)
12328 x = XVECEXP (x, 0, 0);
12331 if (speed)
12333 if (VECTOR_MODE_P (mode))
12334 *cost += extra_cost->vect.alu;
12335 else
12336 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12339 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12340 fixed-point fcvt. */
12341 if (GET_CODE (x) == MULT
12342 && ((VECTOR_MODE_P (mode)
12343 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12344 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12346 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12347 0, speed);
12348 return true;
12351 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12352 return true;
12354 case ABS:
12355 if (VECTOR_MODE_P (mode))
12357 /* ABS (vector). */
12358 if (speed)
12359 *cost += extra_cost->vect.alu;
12361 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12363 op0 = XEXP (x, 0);
12365 /* FABD, which is analogous to FADD. */
12366 if (GET_CODE (op0) == MINUS)
12368 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12369 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12370 if (speed)
12371 *cost += extra_cost->fp[mode == DFmode].addsub;
12373 return true;
12375 /* Simple FABS is analogous to FNEG. */
12376 if (speed)
12377 *cost += extra_cost->fp[mode == DFmode].neg;
12379 else
12381 /* Integer ABS will either be split into
12382 two arithmetic instructions, or will be an ABS
12383 (scalar), which we don't model. */
12384 *cost = COSTS_N_INSNS (2);
12385 if (speed)
12386 *cost += 2 * extra_cost->alu.arith;
12388 return false;
12390 case SMAX:
12391 case SMIN:
12392 if (speed)
12394 if (VECTOR_MODE_P (mode))
12395 *cost += extra_cost->vect.alu;
12396 else
12398 /* FMAXNM/FMINNM/FMAX/FMIN.
12399 TODO: This may not be accurate for all implementations, but
12400 we do not model this in the cost tables. */
12401 *cost += extra_cost->fp[mode == DFmode].addsub;
12404 return false;
12406 case UNSPEC:
12407 /* The floating point round to integer frint* instructions. */
12408 if (aarch64_frint_unspec_p (XINT (x, 1)))
12410 if (speed)
12411 *cost += extra_cost->fp[mode == DFmode].roundint;
12413 return false;
12416 if (XINT (x, 1) == UNSPEC_RBIT)
12418 if (speed)
12419 *cost += extra_cost->alu.rev;
12421 return false;
12423 break;
12425 case TRUNCATE:
12427 /* Decompose <su>muldi3_highpart. */
12428 if (/* (truncate:DI */
12429 mode == DImode
12430 /* (lshiftrt:TI */
12431 && GET_MODE (XEXP (x, 0)) == TImode
12432 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12433 /* (mult:TI */
12434 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12435 /* (ANY_EXTEND:TI (reg:DI))
12436 (ANY_EXTEND:TI (reg:DI))) */
12437 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12438 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12439 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12440 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12441 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12442 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12443 /* (const_int 64) */
12444 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12445 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12447 /* UMULH/SMULH. */
12448 if (speed)
12449 *cost += extra_cost->mult[mode == DImode].extend;
12450 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12451 mode, MULT, 0, speed);
12452 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12453 mode, MULT, 1, speed);
12454 return true;
12457 /* Fall through. */
12458 default:
12459 break;
12462 if (dump_file
12463 && flag_aarch64_verbose_cost)
12464 fprintf (dump_file,
12465 "\nFailed to cost RTX. Assuming default cost.\n");
12467 return true;
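/* Illustrative walk-through (assumed RTL, SPEED true): costing
     (set (reg:DI x0) (plus:DI (reg:DI x1) (const_int 16)))
   enters the SET case, recurses into the PLUS, and because 16 is a valid
   add immediate the total comes out as roughly COSTS_N_INSNS (1) plus
   extra_cost->alu.arith, i.e. a single ADD (immediate).  */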
12470 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12471 calculated for X. This cost is stored in *COST. Returns true
12472 if the total cost of X was calculated. */
12473 static bool
12474 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12475 int param, int *cost, bool speed)
12477 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12479 if (dump_file
12480 && flag_aarch64_verbose_cost)
12482 print_rtl_single (dump_file, x);
12483 fprintf (dump_file, "\n%s cost: %d (%s)\n",
12484 speed ? "Hot" : "Cold",
12485 *cost, result ? "final" : "partial");
12488 return result;
12491 static int
12492 aarch64_register_move_cost (machine_mode mode,
12493 reg_class_t from_i, reg_class_t to_i)
12495 enum reg_class from = (enum reg_class) from_i;
12496 enum reg_class to = (enum reg_class) to_i;
12497 const struct cpu_regmove_cost *regmove_cost
12498 = aarch64_tune_params.regmove_cost;
12500 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12501 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
12502 to = GENERAL_REGS;
12504 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
12505 from = GENERAL_REGS;
12507 /* Make RDFFR very expensive. In particular, if we know that the FFR
12508 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12509 as a way of obtaining a PTRUE. */
12510 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
12511 && hard_reg_set_subset_p (reg_class_contents[from_i],
12512 reg_class_contents[FFR_REGS]))
12513 return 80;
12515 /* The cost of moving between a GPR and the stack is the same as GP2GP. */
12516 if ((from == GENERAL_REGS && to == STACK_REG)
12517 || (to == GENERAL_REGS && from == STACK_REG))
12518 return regmove_cost->GP2GP;
12520 /* To/From the stack register, we move via the gprs. */
12521 if (to == STACK_REG || from == STACK_REG)
12522 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
12523 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
12525 if (known_eq (GET_MODE_SIZE (mode), 16))
12527 /* 128-bit operations on general registers require 2 instructions. */
12528 if (from == GENERAL_REGS && to == GENERAL_REGS)
12529 return regmove_cost->GP2GP * 2;
12530 else if (from == GENERAL_REGS)
12531 return regmove_cost->GP2FP * 2;
12532 else if (to == GENERAL_REGS)
12533 return regmove_cost->FP2GP * 2;
12535 /* When AdvSIMD instructions are disabled it is not possible to move
12536 a 128-bit value directly between Q registers. This is handled in
12537 secondary reload. A general register is used as a scratch to move
12538 the upper DI value and the lower DI value is moved directly,
12539 hence the cost is the sum of three moves. */
12540 if (! TARGET_SIMD)
12541 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
12543 return regmove_cost->FP2FP;
12546 if (from == GENERAL_REGS && to == GENERAL_REGS)
12547 return regmove_cost->GP2GP;
12548 else if (from == GENERAL_REGS)
12549 return regmove_cost->GP2FP;
12550 else if (to == GENERAL_REGS)
12551 return regmove_cost->FP2GP;
12553 return regmove_cost->FP2FP;
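/* Illustrative example: a TImode move from GENERAL_REGS to FP_REGS is
   costed above as regmove_cost->GP2FP * 2, reflecting the two 64-bit
   transfers needed, whereas the same move in DImode costs a single
   GP2FP.  */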
12556 static int
12557 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
12558 reg_class_t rclass ATTRIBUTE_UNUSED,
12559 bool in ATTRIBUTE_UNUSED)
12561 return aarch64_tune_params.memmov_cost;
12564 /* Implement TARGET_INIT_BUILTINS. */
12565 static void
12566 aarch64_init_builtins ()
12568 aarch64_general_init_builtins ();
12569 aarch64_sve::init_builtins ();
12572 /* Implement TARGET_FOLD_BUILTIN. */
12573 static tree
12574 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
12576 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12577 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12578 tree type = TREE_TYPE (TREE_TYPE (fndecl));
12579 switch (code & AARCH64_BUILTIN_CLASS)
12581 case AARCH64_BUILTIN_GENERAL:
12582 return aarch64_general_fold_builtin (subcode, type, nargs, args);
12584 case AARCH64_BUILTIN_SVE:
12585 return NULL_TREE;
12587 gcc_unreachable ();
12590 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12591 static bool
12592 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
12594 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
12595 tree fndecl = gimple_call_fndecl (stmt);
12596 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12597 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12598 gimple *new_stmt = NULL;
12599 switch (code & AARCH64_BUILTIN_CLASS)
12601 case AARCH64_BUILTIN_GENERAL:
12602 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
12603 break;
12605 case AARCH64_BUILTIN_SVE:
12606 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
12607 break;
12610 if (!new_stmt)
12611 return false;
12613 gsi_replace (gsi, new_stmt, true);
12614 return true;
12617 /* Implement TARGET_EXPAND_BUILTIN. */
12618 static rtx
12619 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
12621 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12622 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12623 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12624 switch (code & AARCH64_BUILTIN_CLASS)
12626 case AARCH64_BUILTIN_GENERAL:
12627 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
12629 case AARCH64_BUILTIN_SVE:
12630 return aarch64_sve::expand_builtin (subcode, exp, target);
12632 gcc_unreachable ();
12635 /* Implement TARGET_BUILTIN_DECL. */
12636 static tree
12637 aarch64_builtin_decl (unsigned int code, bool initialize_p)
12639 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12640 switch (code & AARCH64_BUILTIN_CLASS)
12642 case AARCH64_BUILTIN_GENERAL:
12643 return aarch64_general_builtin_decl (subcode, initialize_p);
12645 case AARCH64_BUILTIN_SVE:
12646 return aarch64_sve::builtin_decl (subcode, initialize_p);
12648 gcc_unreachable ();
12651 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12652 to optimize 1.0/sqrt. */
12654 static bool
12655 use_rsqrt_p (machine_mode mode)
12657 return (!flag_trapping_math
12658 && flag_unsafe_math_optimizations
12659 && ((aarch64_tune_params.approx_modes->recip_sqrt
12660 & AARCH64_APPROX_MODE (mode))
12661 || flag_mrecip_low_precision_sqrt));
12664 /* Function to decide when to use the approximate reciprocal square root
12665 builtin. */
12667 static tree
12668 aarch64_builtin_reciprocal (tree fndecl)
12670 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
12672 if (!use_rsqrt_p (mode))
12673 return NULL_TREE;
12674 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12675 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12676 switch (code & AARCH64_BUILTIN_CLASS)
12678 case AARCH64_BUILTIN_GENERAL:
12679 return aarch64_general_builtin_rsqrt (subcode);
12681 case AARCH64_BUILTIN_SVE:
12682 return NULL_TREE;
12684 gcc_unreachable ();
12687 /* Emit instruction sequence to compute either the approximate square root
12688 or its approximate reciprocal, depending on the flag RECP, and return
12689 whether the sequence was emitted or not. */
12691 bool
12692 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
12694 machine_mode mode = GET_MODE (dst);
12696 if (GET_MODE_INNER (mode) == HFmode)
12698 gcc_assert (!recp);
12699 return false;
12702 if (!recp)
12704 if (!(flag_mlow_precision_sqrt
12705 || (aarch64_tune_params.approx_modes->sqrt
12706 & AARCH64_APPROX_MODE (mode))))
12707 return false;
12709 if (flag_finite_math_only
12710 || flag_trapping_math
12711 || !flag_unsafe_math_optimizations
12712 || optimize_function_for_size_p (cfun))
12713 return false;
12715 else
12716 /* Caller assumes we cannot fail. */
12717 gcc_assert (use_rsqrt_p (mode));
12719 machine_mode mmsk = (VECTOR_MODE_P (mode)
12720 ? related_int_vector_mode (mode).require ()
12721 : int_mode_for_mode (mode).require ());
12722 rtx xmsk = gen_reg_rtx (mmsk);
12723 if (!recp)
12724 /* When calculating the approximate square root, compare the
12725 argument with 0.0 and create a mask. */
12726 emit_insn (gen_rtx_SET (xmsk,
12727 gen_rtx_NEG (mmsk,
12728 gen_rtx_EQ (mmsk, src,
12729 CONST0_RTX (mode)))));
12731 /* Estimate the approximate reciprocal square root. */
12732 rtx xdst = gen_reg_rtx (mode);
12733 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
12735 /* Iterate over the series twice for SF and thrice for DF. */
12736 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12738 /* Optionally iterate over the series once less for faster performance,
12739 while sacrificing accuracy. */
12740 if ((recp && flag_mrecip_low_precision_sqrt)
12741 || (!recp && flag_mlow_precision_sqrt))
12742 iterations--;
12744 /* Iterate over the series to calculate the approximate reciprocal square
12745 root. */
12746 rtx x1 = gen_reg_rtx (mode);
12747 while (iterations--)
12749 rtx x2 = gen_reg_rtx (mode);
12750 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
12752 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
12754 if (iterations > 0)
12755 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
12758 if (!recp)
12760 /* Qualify the approximate reciprocal square root when the argument is
12761 0.0 by squashing the intermediary result to 0.0. */
12762 rtx xtmp = gen_reg_rtx (mmsk);
12763 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
12764 gen_rtx_SUBREG (mmsk, xdst, 0)));
12765 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
12767 /* Calculate the approximate square root. */
12768 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
12771 /* Finalize the approximation. */
12772 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
12774 return true;
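/* Illustrative note: the loop above is the usual Newton-Raphson refinement
   for 1/sqrt(d).  FRSQRTS computes (3 - d * x * x) / 2, so each iteration
   performs roughly x' = x * (3 - d * x * x) / 2, approximately doubling
   the number of accurate bits per step.  */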
12777 /* Emit the instruction sequence to compute the approximation for the division
12778 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
12780 bool
12781 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
12783 machine_mode mode = GET_MODE (quo);
12785 if (GET_MODE_INNER (mode) == HFmode)
12786 return false;
12788 bool use_approx_division_p = (flag_mlow_precision_div
12789 || (aarch64_tune_params.approx_modes->division
12790 & AARCH64_APPROX_MODE (mode)));
12792 if (!flag_finite_math_only
12793 || flag_trapping_math
12794 || !flag_unsafe_math_optimizations
12795 || optimize_function_for_size_p (cfun)
12796 || !use_approx_division_p)
12797 return false;
12799 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
12800 return false;
12802 /* Estimate the approximate reciprocal. */
12803 rtx xrcp = gen_reg_rtx (mode);
12804 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
12806 /* Iterate over the series twice for SF and thrice for DF. */
12807 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12809 /* Optionally iterate over the series once less for faster performance,
12810 while sacrificing accuracy. */
12811 if (flag_mlow_precision_div)
12812 iterations--;
12814 /* Iterate over the series to calculate the approximate reciprocal. */
12815 rtx xtmp = gen_reg_rtx (mode);
12816 while (iterations--)
12818 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
12820 if (iterations > 0)
12821 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
12824 if (num != CONST1_RTX (mode))
12826 /* As the approximate reciprocal of DEN is already calculated, only
12827 calculate the approximate division when NUM is not 1.0. */
12828 rtx xnum = force_reg (mode, num);
12829 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
12832 /* Finalize the approximation. */
12833 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
12834 return true;
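/* Illustrative note: this is the standard Newton-Raphson reciprocal
   refinement.  FRECPS computes (2 - d * x), so each iteration performs
   roughly x' = x * (2 - d * x), and the estimate of 1/DEN converges
   quadratically before the final multiplication by NUM.  */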
12837 /* Return the number of instructions that can be issued per cycle. */
12838 static int
12839 aarch64_sched_issue_rate (void)
12841 return aarch64_tune_params.issue_rate;
12844 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12845 static int
12846 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
12848 if (DEBUG_INSN_P (insn))
12849 return more;
12851 rtx_code code = GET_CODE (PATTERN (insn));
12852 if (code == USE || code == CLOBBER)
12853 return more;
12855 if (get_attr_type (insn) == TYPE_NO_INSN)
12856 return more;
12858 return more - 1;
12861 static int
12862 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12864 int issue_rate = aarch64_sched_issue_rate ();
12866 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
12870 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12871 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12872 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12874 static int
12875 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
12876 int ready_index)
12878 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
12882 /* Vectorizer cost model target hooks. */
12884 /* Implement targetm.vectorize.builtin_vectorization_cost. */
12885 static int
12886 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
12887 tree vectype,
12888 int misalign ATTRIBUTE_UNUSED)
12890 unsigned elements;
12891 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
12892 bool fp = false;
12894 if (vectype != NULL)
12895 fp = FLOAT_TYPE_P (vectype);
12897 switch (type_of_cost)
12899 case scalar_stmt:
12900 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
12902 case scalar_load:
12903 return costs->scalar_load_cost;
12905 case scalar_store:
12906 return costs->scalar_store_cost;
12908 case vector_stmt:
12909 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12911 case vector_load:
12912 return costs->vec_align_load_cost;
12914 case vector_store:
12915 return costs->vec_store_cost;
12917 case vec_to_scalar:
12918 return costs->vec_to_scalar_cost;
12920 case scalar_to_vec:
12921 return costs->scalar_to_vec_cost;
12923 case unaligned_load:
12924 case vector_gather_load:
12925 return costs->vec_unalign_load_cost;
12927 case unaligned_store:
12928 case vector_scatter_store:
12929 return costs->vec_unalign_store_cost;
12931 case cond_branch_taken:
12932 return costs->cond_taken_branch_cost;
12934 case cond_branch_not_taken:
12935 return costs->cond_not_taken_branch_cost;
12937 case vec_perm:
12938 return costs->vec_permute_cost;
12940 case vec_promote_demote:
12941 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12943 case vec_construct:
12944 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12945 return elements / 2 + 1;
12947 default:
12948 gcc_unreachable ();
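/* Illustrative example (assumed vector type): a vec_construct of a V4SI
   vector returns 4 / 2 + 1 == 3 above, i.e. roughly one instruction per
   pair of elements plus one to combine them.  */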
12952 /* Return true if STMT_INFO extends the result of a load. */
12953 static bool
12954 aarch64_extending_load_p (stmt_vec_info stmt_info)
12956 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
12957 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
12958 return false;
12960 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
12961 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
12962 tree rhs_type = TREE_TYPE (rhs);
12963 if (!INTEGRAL_TYPE_P (lhs_type)
12964 || !INTEGRAL_TYPE_P (rhs_type)
12965 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
12966 return false;
12968 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
12969 return (def_stmt_info
12970 && STMT_VINFO_DATA_REF (def_stmt_info)
12971 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
12974 /* Return true if STMT_INFO is an integer truncation. */
12975 static bool
12976 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
12978 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
12979 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
12980 return false;
12982 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
12983 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
12984 return (INTEGRAL_TYPE_P (lhs_type)
12985 && INTEGRAL_TYPE_P (rhs_type)
12986 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
12989 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
12990 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
12991 for SVE targets. */
12992 static unsigned int
12993 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
12994 unsigned int stmt_cost)
12996 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
12997 vector register size or number of units. Integer promotions of this
12998 type therefore map to SXT[BHW] or UXT[BHW].
13000 Most loads have extending forms that can do the sign or zero extension
13001 on the fly. Optimistically assume that a load followed by an extension
13002 will fold to this form during combine, and that the extension therefore
13003 comes for free. */
13004 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13005 stmt_cost = 0;
13007 /* For similar reasons, vector_stmt integer truncations are a no-op,
13008 because we can just ignore the unused upper bits of the source. */
13009 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13010 stmt_cost = 0;
13012 return stmt_cost;
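/* For illustration (hypothetical loop, not taken from this file): for
   "int64_t *d; int32_t *s; ... d[i] = s[i];" the vectorizer would normally
   count a vector load plus a vector_stmt sign extension.  On SVE the
   extension is expected to fold into an extending load such as LD1SW, so
   aarch64_sve_adjust_stmt_cost costs the extension at 0 above.  */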
13015 /* Implement targetm.vectorize.add_stmt_cost. */
13016 static unsigned
13017 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13018 struct _stmt_vec_info *stmt_info, int misalign,
13019 enum vect_cost_model_location where)
13021 unsigned *cost = (unsigned *) data;
13022 unsigned retval = 0;
13024 if (flag_vect_cost_model)
13026 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13027 int stmt_cost =
13028 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13030 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13031 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
13033 /* Statements in an inner loop relative to the loop being
13034 vectorized are weighted more heavily. The value here is
13035 arbitrary and could potentially be improved with analysis. */
13036 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13037 count *= 50; /* FIXME */
13039 retval = (unsigned) (count * stmt_cost);
13040 cost[where] += retval;
13043 return retval;
13046 static void initialize_aarch64_code_model (struct gcc_options *);
13048 /* Parse the TO_PARSE string and put the architecture struct that it
13049 selects into RES and the architectural features into ISA_FLAGS.
13050 Return an aarch64_parse_opt_result describing the parse result.
13051 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13052 When the TO_PARSE string contains an invalid extension,
13053 a copy of the string is created and stored to INVALID_EXTENSION. */
13055 static enum aarch64_parse_opt_result
13056 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13057 uint64_t *isa_flags, std::string *invalid_extension)
13059 const char *ext;
13060 const struct processor *arch;
13061 size_t len;
13063 ext = strchr (to_parse, '+');
13065 if (ext != NULL)
13066 len = ext - to_parse;
13067 else
13068 len = strlen (to_parse);
13070 if (len == 0)
13071 return AARCH64_PARSE_MISSING_ARG;
13074 /* Loop through the list of supported ARCHes to find a match. */
13075 for (arch = all_architectures; arch->name != NULL; arch++)
13077 if (strlen (arch->name) == len
13078 && strncmp (arch->name, to_parse, len) == 0)
13080 uint64_t isa_temp = arch->flags;
13082 if (ext != NULL)
13084 /* TO_PARSE string contains at least one extension. */
13085 enum aarch64_parse_opt_result ext_res
13086 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13088 if (ext_res != AARCH64_PARSE_OK)
13089 return ext_res;
13091 /* Extension parsing was successful. Confirm the result
13092 arch and ISA flags. */
13093 *res = arch;
13094 *isa_flags = isa_temp;
13095 return AARCH64_PARSE_OK;
13099 /* ARCH name not found in list. */
13100 return AARCH64_PARSE_INVALID_ARG;
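/* For illustration: given "-march=armv8.2-a+sve", the string is split at
   the first '+'; "armv8.2-a" is looked up in all_architectures and the
   remaining "+sve" is passed to aarch64_parse_extension to update the ISA
   flags.  (Example values only; any arch/extension pair is handled the
   same way.)  */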
13103 /* Parse the TO_PARSE string and put the cpu that it selects into RES and the
13104 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13105 describing the parse result. If there is an error parsing, RES and
13106 ISA_FLAGS are left unchanged.
13107 When the TO_PARSE string contains an invalid extension,
13108 a copy of the string is created and stored to INVALID_EXTENSION. */
13110 static enum aarch64_parse_opt_result
13111 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13112 uint64_t *isa_flags, std::string *invalid_extension)
13114 const char *ext;
13115 const struct processor *cpu;
13116 size_t len;
13118 ext = strchr (to_parse, '+');
13120 if (ext != NULL)
13121 len = ext - to_parse;
13122 else
13123 len = strlen (to_parse);
13125 if (len == 0)
13126 return AARCH64_PARSE_MISSING_ARG;
13129 /* Loop through the list of supported CPUs to find a match. */
13130 for (cpu = all_cores; cpu->name != NULL; cpu++)
13132 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13134 uint64_t isa_temp = cpu->flags;
13137 if (ext != NULL)
13139 /* TO_PARSE string contains at least one extension. */
13140 enum aarch64_parse_opt_result ext_res
13141 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13143 if (ext_res != AARCH64_PARSE_OK)
13144 return ext_res;
13146 /* Extension parsing was successful. Confirm the result
13147 cpu and ISA flags. */
13148 *res = cpu;
13149 *isa_flags = isa_temp;
13150 return AARCH64_PARSE_OK;
13154 /* CPU name not found in list. */
13155 return AARCH64_PARSE_INVALID_ARG;
13158 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13159 Return an aarch64_parse_opt_result describing the parse result.
13160 If the parsing fails the RES does not change. */
13162 static enum aarch64_parse_opt_result
13163 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13165 const struct processor *cpu;
13167 /* Loop through the list of supported CPUs to find a match. */
13168 for (cpu = all_cores; cpu->name != NULL; cpu++)
13170 if (strcmp (cpu->name, to_parse) == 0)
13172 *res = cpu;
13173 return AARCH64_PARSE_OK;
13177 /* CPU name not found in list. */
13178 return AARCH64_PARSE_INVALID_ARG;
13181 /* Parse TOKEN, which has length LENGTH to see if it is an option
13182 described in FLAG. If it is, return the index bit for that fusion type.
13183 If not, error (printing OPTION_NAME) and return zero. */
13185 static unsigned int
13186 aarch64_parse_one_option_token (const char *token,
13187 size_t length,
13188 const struct aarch64_flag_desc *flag,
13189 const char *option_name)
13191 for (; flag->name != NULL; flag++)
13193 if (length == strlen (flag->name)
13194 && !strncmp (flag->name, token, length))
13195 return flag->flag;
13198 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13199 return 0;
13202 /* Parse OPTION which is a comma-separated list of flags to enable.
13203 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13204 default state we inherit from the CPU tuning structures. OPTION_NAME
13205 gives the top-level option we are parsing in the -moverride string,
13206 for use in error messages. */
13208 static unsigned int
13209 aarch64_parse_boolean_options (const char *option,
13210 const struct aarch64_flag_desc *flags,
13211 unsigned int initial_state,
13212 const char *option_name)
13214 const char separator = '.';
13215 const char* specs = option;
13216 const char* ntoken = option;
13217 unsigned int found_flags = initial_state;
13219 while ((ntoken = strchr (specs, separator)))
13221 size_t token_length = ntoken - specs;
13222 unsigned token_ops = aarch64_parse_one_option_token (specs,
13223 token_length,
13224 flags,
13225 option_name);
13226 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13227 in the token stream, reset the supported operations. So:
13229 adrp+add.cmp+branch.none.adrp+add
13231 would have the result of turning on only adrp+add fusion. */
13232 if (!token_ops)
13233 found_flags = 0;
13235 found_flags |= token_ops;
13236 specs = ++ntoken;
13239 /* We ended with a trailing separator; print something. */
13240 if (!(*specs))
13242 error ("%s string ill-formed\n", option_name);
13243 return 0;
13246 /* We still have one more token to parse. */
13247 size_t token_length = strlen (specs);
13248 unsigned token_ops = aarch64_parse_one_option_token (specs,
13249 token_length,
13250 flags,
13251 option_name);
13252 if (!token_ops)
13253 found_flags = 0;
13255 found_flags |= token_ops;
13256 return found_flags;
13259 /* Support for overriding instruction fusion. */
13261 static void
13262 aarch64_parse_fuse_string (const char *fuse_string,
13263 struct tune_params *tune)
13265 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13266 aarch64_fusible_pairs,
13267 tune->fusible_ops,
13268 "fuse=");
13271 /* Support for overriding other tuning flags. */
13273 static void
13274 aarch64_parse_tune_string (const char *tune_string,
13275 struct tune_params *tune)
13277 tune->extra_tuning_flags
13278 = aarch64_parse_boolean_options (tune_string,
13279 aarch64_tuning_flags,
13280 tune->extra_tuning_flags,
13281 "tune=");
13284 /* Parse the sve_width tuning moverride string in TUNE_STRING.
13285 Accept the valid SVE vector widths allowed by
13286 aarch64_sve_vector_bits_enum and use it to override sve_width
13287 in TUNE. */
13289 static void
13290 aarch64_parse_sve_width_string (const char *tune_string,
13291 struct tune_params *tune)
13293 int width = -1;
13295 int n = sscanf (tune_string, "%d", &width);
13296 if (n == EOF)
13298 error ("invalid format for sve_width");
13299 return;
13301 switch (width)
13303 case SVE_128:
13304 case SVE_256:
13305 case SVE_512:
13306 case SVE_1024:
13307 case SVE_2048:
13308 break;
13309 default:
13310 error ("invalid sve_width value: %d", width);
13312 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13315 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
13316 we understand. If it is, extract the option string and hand off to
13317 the appropriate function. */
13319 void
13320 aarch64_parse_one_override_token (const char* token,
13321 size_t length,
13322 struct tune_params *tune)
13324 const struct aarch64_tuning_override_function *fn
13325 = aarch64_tuning_override_functions;
13327 const char *option_part = strchr (token, '=');
13328 if (!option_part)
13330 error ("tuning string missing in option (%s)", token);
13331 return;
13334 /* Get the length of the option name. */
13335 length = option_part - token;
13336 /* Skip the '=' to get to the option string. */
13337 option_part++;
13339 for (; fn->name != NULL; fn++)
13341 if (!strncmp (fn->name, token, length))
13343 fn->parse_override (option_part, tune);
13344 return;
13348 error ("unknown tuning option (%s)", token);
13349 return;
13352 /* Validate and clamp the TLS size according to the code model in OPTS. */
13354 static void
13355 initialize_aarch64_tls_size (struct gcc_options *opts)
13357 if (aarch64_tls_size == 0)
13358 aarch64_tls_size = 24;
13360 switch (opts->x_aarch64_cmodel_var)
13362 case AARCH64_CMODEL_TINY:
13363 /* Both the default and maximum TLS size allowed under tiny are 1M, which
13364 needs two instructions to address, so we clamp the size to 24. */
13365 if (aarch64_tls_size > 24)
13366 aarch64_tls_size = 24;
13367 break;
13368 case AARCH64_CMODEL_SMALL:
13369 /* The maximum TLS size allowed under small is 4G. */
13370 if (aarch64_tls_size > 32)
13371 aarch64_tls_size = 32;
13372 break;
13373 case AARCH64_CMODEL_LARGE:
13374 /* The maximum TLS size allowed under large is 16E.
13375 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13376 if (aarch64_tls_size > 48)
13377 aarch64_tls_size = 48;
13378 break;
13379 default:
13380 gcc_unreachable ();
13383 return;
13386 /* Parse STRING looking for options in the format:
13387 string :: option:string
13388 option :: name=substring
13389 name :: {a-z}
13390 substring :: defined by option. */
13392 static void
13393 aarch64_parse_override_string (const char* input_string,
13394 struct tune_params* tune)
13396 const char separator = ':';
13397 size_t string_length = strlen (input_string) + 1;
13398 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13399 char *string = string_root;
13400 strncpy (string, input_string, string_length);
13401 string[string_length - 1] = '\0';
13403 char* ntoken = string;
13405 while ((ntoken = strchr (string, separator)))
13407 size_t token_length = ntoken - string;
13408 /* Make this substring look like a string. */
13409 *ntoken = '\0';
13410 aarch64_parse_one_override_token (string, token_length, tune);
13411 string = ++ntoken;
13414 /* One last option to parse. */
13415 aarch64_parse_one_override_token (string, strlen (string), tune);
13416 free (string_root);
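/* For illustration: an option such as
   "-moverride=fuse=adrp+add.cmp+branch:sve_width=256" (example values
   only) is split at each ':' into "fuse=adrp+add.cmp+branch" and
   "sve_width=256", and aarch64_parse_one_override_token dispatches each
   token to the matching handler above, here aarch64_parse_fuse_string and
   aarch64_parse_sve_width_string.  */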
13420 static void
13421 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13423 if (accepted_branch_protection_string)
13425 opts->x_aarch64_branch_protection_string
13426 = xstrdup (accepted_branch_protection_string);
13429 /* PR 70044: We have to be careful about being called multiple times for the
13430 same function. This means all changes should be repeatable. */
13432 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13433 Disable the frame pointer flag so the mid-end will not use a frame
13434 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13435 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13436 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13437 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
13438 if (opts->x_flag_omit_frame_pointer == 0)
13439 opts->x_flag_omit_frame_pointer = 2;
13441 /* If not optimizing for size, set the default
13442 alignment to what the target wants. */
13443 if (!opts->x_optimize_size)
13445 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
13446 opts->x_str_align_loops = aarch64_tune_params.loop_align;
13447 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
13448 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
13449 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
13450 opts->x_str_align_functions = aarch64_tune_params.function_align;
13453 /* We default to no pc-relative literal loads. */
13455 aarch64_pcrelative_literal_loads = false;
13457 /* If -mpc-relative-literal-loads is set on the command line, this
13458 implies that the user asked for PC relative literal loads. */
13459 if (opts->x_pcrelative_literal_loads == 1)
13460 aarch64_pcrelative_literal_loads = true;
13462 /* In the tiny memory model it makes no sense to disallow PC relative
13463 literal pool loads. */
13464 if (aarch64_cmodel == AARCH64_CMODEL_TINY
13465 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13466 aarch64_pcrelative_literal_loads = true;
13468 /* When enabling the lower precision Newton series for the square root, also
13469 enable it for the reciprocal square root, since the latter is an
13470 intermediary step for the former. */
13471 if (flag_mlow_precision_sqrt)
13472 flag_mrecip_low_precision_sqrt = true;
13475 /* 'Unpack' the internal tuning structs and update the options
13476 in OPTS. The caller must have set up selected_tune and selected_arch
13477 as all the other target-specific codegen decisions are
13478 derived from them. */
13480 void
13481 aarch64_override_options_internal (struct gcc_options *opts)
13483 aarch64_tune_flags = selected_tune->flags;
13484 aarch64_tune = selected_tune->sched_core;
13485 /* Make a copy of the tuning parameters attached to the core, which
13486 we may later overwrite. */
13487 aarch64_tune_params = *(selected_tune->tune);
13488 aarch64_architecture_version = selected_arch->architecture_version;
13490 if (opts->x_aarch64_override_tune_string)
13491 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
13492 &aarch64_tune_params);
13494 /* This target defaults to strict volatile bitfields. */
13495 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
13496 opts->x_flag_strict_volatile_bitfields = 1;
13498 if (aarch64_stack_protector_guard == SSP_GLOBAL
13499 && opts->x_aarch64_stack_protector_guard_offset_str)
13501 error ("incompatible options %<-mstack-protector-guard=global%> and "
13502 "%<-mstack-protector-guard-offset=%s%>",
13503 aarch64_stack_protector_guard_offset_str);
13506 if (aarch64_stack_protector_guard == SSP_SYSREG
13507 && !(opts->x_aarch64_stack_protector_guard_offset_str
13508 && opts->x_aarch64_stack_protector_guard_reg_str))
13510 error ("both %<-mstack-protector-guard-offset%> and "
13511 "%<-mstack-protector-guard-reg%> must be used "
13512 "with %<-mstack-protector-guard=sysreg%>");
13515 if (opts->x_aarch64_stack_protector_guard_reg_str)
13517 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
13518 error ("specify a system register with a small string length.");
13521 if (opts->x_aarch64_stack_protector_guard_offset_str)
13523 char *end;
13524 const char *str = aarch64_stack_protector_guard_offset_str;
13525 errno = 0;
13526 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
13527 if (!*str || *end || errno)
13528 error ("%qs is not a valid offset in %qs", str,
13529 "-mstack-protector-guard-offset=");
13530 aarch64_stack_protector_guard_offset = offs;
13533 initialize_aarch64_code_model (opts);
13534 initialize_aarch64_tls_size (opts);
13536 int queue_depth = 0;
13537 switch (aarch64_tune_params.autoprefetcher_model)
13539 case tune_params::AUTOPREFETCHER_OFF:
13540 queue_depth = -1;
13541 break;
13542 case tune_params::AUTOPREFETCHER_WEAK:
13543 queue_depth = 0;
13544 break;
13545 case tune_params::AUTOPREFETCHER_STRONG:
13546 queue_depth = max_insn_queue_index + 1;
13547 break;
13548 default:
13549 gcc_unreachable ();
13552 /* We don't mind passing in global_options_set here as we don't use
13553 the *options_set structs anyway. */
13554 SET_OPTION_IF_UNSET (opts, &global_options_set,
13555 param_sched_autopref_queue_depth, queue_depth);
13557 /* Set up parameters to be used in prefetching algorithm. Do not
13558 override the defaults unless we are tuning for a core we have
13559 researched values for. */
13560 if (aarch64_tune_params.prefetch->num_slots > 0)
13561 SET_OPTION_IF_UNSET (opts, &global_options_set,
13562 param_simultaneous_prefetches,
13563 aarch64_tune_params.prefetch->num_slots);
13564 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
13565 SET_OPTION_IF_UNSET (opts, &global_options_set,
13566 param_l1_cache_size,
13567 aarch64_tune_params.prefetch->l1_cache_size);
13568 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
13569 SET_OPTION_IF_UNSET (opts, &global_options_set,
13570 param_l1_cache_line_size,
13571 aarch64_tune_params.prefetch->l1_cache_line_size);
13572 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
13573 SET_OPTION_IF_UNSET (opts, &global_options_set,
13574 param_l2_cache_size,
13575 aarch64_tune_params.prefetch->l2_cache_size);
13576 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
13577 SET_OPTION_IF_UNSET (opts, &global_options_set,
13578 param_prefetch_dynamic_strides, 0);
13579 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
13580 SET_OPTION_IF_UNSET (opts, &global_options_set,
13581 param_prefetch_minimum_stride,
13582 aarch64_tune_params.prefetch->minimum_stride);
13584 /* Use the alternative scheduling-pressure algorithm by default. */
13585 SET_OPTION_IF_UNSET (opts, &global_options_set,
13586 param_sched_pressure_algorithm,
13587 SCHED_PRESSURE_MODEL);
13589 /* Validate the guard size. */
13590 int guard_size = param_stack_clash_protection_guard_size;
13592 if (guard_size != 12 && guard_size != 16)
13593 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
13594 "size. Given value %d (%llu KB) is out of range",
13595 guard_size, (1ULL << guard_size) / 1024ULL);
13597 /* Enforce that the probing interval is the same as the guard size so the
13598 mid-end does the right thing. */
13599 SET_OPTION_IF_UNSET (opts, &global_options_set,
13600 param_stack_clash_protection_probe_interval,
13601 guard_size);
13603 /* The maybe_set calls won't update the value if the user has explicitly set
13604 one, which means we need to validate that the probing interval and guard size
13605 are equal. */
13606 int probe_interval
13607 = param_stack_clash_protection_probe_interval;
13608 if (guard_size != probe_interval)
13609 error ("stack clash guard size %<%d%> must be equal to probing interval "
13610 "%<%d%>", guard_size, probe_interval);
13612 /* Enable sw prefetching at specified optimization level for
13613 CPUS that have prefetch. Lower optimization level threshold by 1
13614 when profiling is enabled. */
13615 if (opts->x_flag_prefetch_loop_arrays < 0
13616 && !opts->x_optimize_size
13617 && aarch64_tune_params.prefetch->default_opt_level >= 0
13618 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
13619 opts->x_flag_prefetch_loop_arrays = 1;
13621 if (opts->x_aarch64_arch_string == NULL)
13622 opts->x_aarch64_arch_string = selected_arch->name;
13623 if (opts->x_aarch64_cpu_string == NULL)
13624 opts->x_aarch64_cpu_string = selected_cpu->name;
13625 if (opts->x_aarch64_tune_string == NULL)
13626 opts->x_aarch64_tune_string = selected_tune->name;
13628 aarch64_override_options_after_change_1 (opts);
13631 /* Print a hint with a suggestion for a core or architecture name that
13632 most closely resembles what the user passed in STR. ARCH is true if
13633 the user is asking for an architecture name. ARCH is false if the user
13634 is asking for a core name. */
13636 static void
13637 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
13639 auto_vec<const char *> candidates;
13640 const struct processor *entry = arch ? all_architectures : all_cores;
13641 for (; entry->name != NULL; entry++)
13642 candidates.safe_push (entry->name);
13644 #ifdef HAVE_LOCAL_CPU_DETECT
13645 /* Also add "native" as a possible value. */
13646 if (arch)
13647 candidates.safe_push ("native");
13648 #endif
13650 char *s;
13651 const char *hint = candidates_list_and_hint (str, s, candidates);
13652 if (hint)
13653 inform (input_location, "valid arguments are: %s;"
13654 " did you mean %qs?", s, hint);
13655 else
13656 inform (input_location, "valid arguments are: %s", s);
13658 XDELETEVEC (s);
13661 /* Print a hint with a suggestion for a core name that most closely resembles
13662 what the user passed in STR. */
13664 inline static void
13665 aarch64_print_hint_for_core (const char *str)
13667 aarch64_print_hint_for_core_or_arch (str, false);
13670 /* Print a hint with a suggestion for an architecture name that most closely
13671 resembles what the user passed in STR. */
13673 inline static void
13674 aarch64_print_hint_for_arch (const char *str)
13676 aarch64_print_hint_for_core_or_arch (str, true);
13680 /* Print a hint with a suggestion for an extension name
13681 that most closely resembles what the user passed in STR. */
13683 void
13684 aarch64_print_hint_for_extensions (const std::string &str)
13686 auto_vec<const char *> candidates;
13687 aarch64_get_all_extension_candidates (&candidates);
13688 char *s;
13689 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
13690 if (hint)
13691 inform (input_location, "valid arguments are: %s;"
13692 " did you mean %qs?", s, hint);
13693 else
13694 inform (input_location, "valid arguments are: %s;", s);
13696 XDELETEVEC (s);
13699 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13700 specified in STR and throw errors if appropriate. Put the results, if
13701 they are valid, in RES and ISA_FLAGS. Return whether the option is
13702 valid. */
13704 static bool
13705 aarch64_validate_mcpu (const char *str, const struct processor **res,
13706 uint64_t *isa_flags)
13708 std::string invalid_extension;
13709 enum aarch64_parse_opt_result parse_res
13710 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
13712 if (parse_res == AARCH64_PARSE_OK)
13713 return true;
13715 switch (parse_res)
13717 case AARCH64_PARSE_MISSING_ARG:
13718 error ("missing cpu name in %<-mcpu=%s%>", str);
13719 break;
13720 case AARCH64_PARSE_INVALID_ARG:
13721 error ("unknown value %qs for %<-mcpu%>", str);
13722 aarch64_print_hint_for_core (str);
13723 break;
13724 case AARCH64_PARSE_INVALID_FEATURE:
13725 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13726 invalid_extension.c_str (), str);
13727 aarch64_print_hint_for_extensions (invalid_extension);
13728 break;
13729 default:
13730 gcc_unreachable ();
13733 return false;
13736 /* Parses CONST_STR for branch protection features specified in
13737 aarch64_branch_protect_types, and sets any global variables required. Returns
13738 the parsing result and assigns LAST_STR to the last processed token from
13739 CONST_STR so that it can be used for error reporting. */
13741 static enum
13742 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
13743 char** last_str)
13745 char *str_root = xstrdup (const_str);
13746 char* token_save = NULL;
13747 char *str = strtok_r (str_root, "+", &token_save);
13748 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
13749 if (!str)
13750 res = AARCH64_PARSE_MISSING_ARG;
13751 else
13753 char *next_str = strtok_r (NULL, "+", &token_save);
13754 /* Reset the branch protection features to their defaults. */
13755 aarch64_handle_no_branch_protection (NULL, NULL);
13757 while (str && res == AARCH64_PARSE_OK)
13759 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
13760 bool found = false;
13761 /* Search for this type. */
13762 while (type && type->name && !found && res == AARCH64_PARSE_OK)
13764 if (strcmp (str, type->name) == 0)
13766 found = true;
13767 res = type->handler (str, next_str);
13768 str = next_str;
13769 next_str = strtok_r (NULL, "+", &token_save);
13771 else
13772 type++;
13774 if (found && res == AARCH64_PARSE_OK)
13776 bool found_subtype = true;
13777 /* Loop through each token until we find one that isn't a
13778 subtype. */
13779 while (found_subtype)
13781 found_subtype = false;
13782 const aarch64_branch_protect_type *subtype = type->subtypes;
13783 /* Search for the subtype. */
13784 while (str && subtype && subtype->name && !found_subtype
13785 && res == AARCH64_PARSE_OK)
13787 if (strcmp (str, subtype->name) == 0)
13789 found_subtype = true;
13790 res = subtype->handler (str, next_str);
13791 str = next_str;
13792 next_str = strtok_r (NULL, "+", &token_save);
13794 else
13795 subtype++;
13799 else if (!found)
13800 res = AARCH64_PARSE_INVALID_ARG;
13803 /* Copy the last processed token into the argument to pass it back.
13804 Used by option and attribute validation to print the offending token. */
13805 if (last_str)
13807 if (str) strcpy (*last_str, str);
13808 else *last_str = NULL;
13810 if (res == AARCH64_PARSE_OK)
13812 /* If needed, alloc the accepted string then copy in const_str.
13813 Used by override_option_after_change_1. */
13814 if (!accepted_branch_protection_string)
13815 accepted_branch_protection_string = (char *) xmalloc (
13816 BRANCH_PROTECT_STR_MAX
13817 + 1);
13818 strncpy (accepted_branch_protection_string, const_str,
13819 BRANCH_PROTECT_STR_MAX + 1);
13820 /* Forcibly null-terminate. */
13821 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
13823 return res;
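/* For illustration: "-mbranch-protection=pac-ret+leaf" (example value) is
   tokenized at '+'.  "pac-ret" matches an entry in
   aarch64_branch_protect_types and its handler runs; "leaf" is then
   recognized as a subtype of "pac-ret" and its handler runs in turn.  An
   unrecognized token at the top level yields AARCH64_PARSE_INVALID_ARG.  */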
13826 static bool
13827 aarch64_validate_mbranch_protection (const char *const_str)
13829 char *str = (char *) xmalloc (strlen (const_str) + 1);
13830 enum aarch64_parse_opt_result res =
13831 aarch64_parse_branch_protection (const_str, &str);
13832 if (res == AARCH64_PARSE_INVALID_ARG)
13833 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
13834 else if (res == AARCH64_PARSE_MISSING_ARG)
13835 error ("missing argument for %<-mbranch-protection=%>");
13836 free (str);
13837 return res == AARCH64_PARSE_OK;
13840 /* Validate a command-line -march option. Parse the arch and extensions
13841 (if any) specified in STR and throw errors if appropriate. Put the
13842 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13843 option is valid. */
13845 static bool
13846 aarch64_validate_march (const char *str, const struct processor **res,
13847 uint64_t *isa_flags)
13849 std::string invalid_extension;
13850 enum aarch64_parse_opt_result parse_res
13851 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
13853 if (parse_res == AARCH64_PARSE_OK)
13854 return true;
13856 switch (parse_res)
13858 case AARCH64_PARSE_MISSING_ARG:
13859 error ("missing arch name in %<-march=%s%>", str);
13860 break;
13861 case AARCH64_PARSE_INVALID_ARG:
13862 error ("unknown value %qs for %<-march%>", str);
13863 aarch64_print_hint_for_arch (str);
13864 break;
13865 case AARCH64_PARSE_INVALID_FEATURE:
13866 error ("invalid feature modifier %qs in %<-march=%s%>",
13867 invalid_extension.c_str (), str);
13868 aarch64_print_hint_for_extensions (invalid_extension);
13869 break;
13870 default:
13871 gcc_unreachable ();
13874 return false;
13877 /* Validate a command-line -mtune option. Parse the cpu
13878 specified in STR and throw errors if appropriate. Put the
13879 result, if it is valid, in RES. Return whether the option is
13880 valid. */
13882 static bool
13883 aarch64_validate_mtune (const char *str, const struct processor **res)
13885 enum aarch64_parse_opt_result parse_res
13886 = aarch64_parse_tune (str, res);
13888 if (parse_res == AARCH64_PARSE_OK)
13889 return true;
13891 switch (parse_res)
13893 case AARCH64_PARSE_MISSING_ARG:
13894 error ("missing cpu name in %<-mtune=%s%>", str);
13895 break;
13896 case AARCH64_PARSE_INVALID_ARG:
13897 error ("unknown value %qs for %<-mtune%>", str);
13898 aarch64_print_hint_for_core (str);
13899 break;
13900 default:
13901 gcc_unreachable ();
13903 return false;
13906 /* Return the CPU corresponding to the enum CPU.
13907 If it doesn't specify a cpu, return the default. */
13909 static const struct processor *
13910 aarch64_get_tune_cpu (enum aarch64_processor cpu)
13912 if (cpu != aarch64_none)
13913 return &all_cores[cpu];
13915 /* The & 0x3f is to extract the bottom 6 bits that encode the
13916 default cpu as selected by the --with-cpu GCC configure option
13917 in config.gcc.
13918 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
13919 flags mechanism should be reworked to make it more sane. */
13920 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13923 /* Return the architecture corresponding to the enum ARCH.
13924 If it doesn't specify a valid architecture, return the default. */
13926 static const struct processor *
13927 aarch64_get_arch (enum aarch64_arch arch)
13929 if (arch != aarch64_no_arch)
13930 return &all_architectures[arch];
13932 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13934 return &all_architectures[cpu->arch];
13937 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
13939 static poly_uint16
13940 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
13942 /* 128-bit SVE and Advanced SIMD modes use different register layouts
13943 on big-endian targets, so we would need to forbid subregs that convert
13944 from one to the other. By default a reinterpret sequence would then
13945 involve a store to memory in one mode and a load back in the other.
13946 Even if we optimize that sequence using reverse instructions,
13947 it would still be a significant potential overhead.
13949 For now, it seems better to generate length-agnostic code for that
13950 case instead. */
13951 if (value == SVE_SCALABLE
13952 || (value == SVE_128 && BYTES_BIG_ENDIAN))
13953 return poly_uint16 (2, 2);
13954 else
13955 return (int) value / 64;
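/* For illustration: "-msve-vector-bits=256" gives SVE_256, so this returns
   256 / 64 = 4, i.e. a VG of 4 64-bit granules.  SVE_SCALABLE, and SVE_128
   on big-endian targets, instead return the indeterminate
   poly_uint16 (2, 2).  */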
13958 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
13959 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
13960 tuning structs. In particular it must set selected_tune and
13961 aarch64_isa_flags that define the available ISA features and tuning
13962 decisions. It must also set selected_arch as this will be used to
13963 output the .arch asm tags for each function. */
13965 static void
13966 aarch64_override_options (void)
13968 uint64_t cpu_isa = 0;
13969 uint64_t arch_isa = 0;
13970 aarch64_isa_flags = 0;
13972 bool valid_cpu = true;
13973 bool valid_tune = true;
13974 bool valid_arch = true;
13976 selected_cpu = NULL;
13977 selected_arch = NULL;
13978 selected_tune = NULL;
13980 if (aarch64_branch_protection_string)
13981 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
13983 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13984 If either of -march or -mtune is given, they override their
13985 respective component of -mcpu. */
13986 if (aarch64_cpu_string)
13987 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
13988 &cpu_isa);
13990 if (aarch64_arch_string)
13991 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13992 &arch_isa);
13994 if (aarch64_tune_string)
13995 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
13997 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13998 SUBTARGET_OVERRIDE_OPTIONS;
13999 #endif
14001 /* If the user did not specify a processor, choose the default
14002 one for them. This will be the CPU set during configuration using
14003 --with-cpu, otherwise it is "generic". */
14004 if (!selected_cpu)
14006 if (selected_arch)
14008 selected_cpu = &all_cores[selected_arch->ident];
14009 aarch64_isa_flags = arch_isa;
14010 explicit_arch = selected_arch->arch;
14012 else
14014 /* Get default configure-time CPU. */
14015 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14016 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14019 if (selected_tune)
14020 explicit_tune_core = selected_tune->ident;
14022 /* If both -mcpu and -march are specified check that they are architecturally
14023 compatible, warn if they're not and prefer the -march ISA flags. */
14024 else if (selected_arch)
14026 if (selected_arch->arch != selected_cpu->arch)
14028 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14029 all_architectures[selected_cpu->arch].name,
14030 selected_arch->name);
14032 aarch64_isa_flags = arch_isa;
14033 explicit_arch = selected_arch->arch;
14034 explicit_tune_core = selected_tune ? selected_tune->ident
14035 : selected_cpu->ident;
14037 else
14039 /* -mcpu but no -march. */
14040 aarch64_isa_flags = cpu_isa;
14041 explicit_tune_core = selected_tune ? selected_tune->ident
14042 : selected_cpu->ident;
14043 gcc_assert (selected_cpu);
14044 selected_arch = &all_architectures[selected_cpu->arch];
14045 explicit_arch = selected_arch->arch;
14048 /* Set the arch as well, since we will need it when outputting
14049 the .arch directive in assembly. */
14050 if (!selected_arch)
14052 gcc_assert (selected_cpu);
14053 selected_arch = &all_architectures[selected_cpu->arch];
14056 if (!selected_tune)
14057 selected_tune = selected_cpu;
14059 if (aarch64_enable_bti == 2)
14061 #ifdef TARGET_ENABLE_BTI
14062 aarch64_enable_bti = 1;
14063 #else
14064 aarch64_enable_bti = 0;
14065 #endif
14068 /* Return address signing is currently not supported for ILP32 targets. For
14069 LP64 targets use the configured option in the absence of a command-line
14070 option for -mbranch-protection. */
14071 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14073 #ifdef TARGET_ENABLE_PAC_RET
14074 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14075 #else
14076 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14077 #endif
14080 #ifndef HAVE_AS_MABI_OPTION
14081 /* The compiler may have been configured with 2.23.* binutils, which does
14082 not have support for ILP32. */
14083 if (TARGET_ILP32)
14084 error ("assembler does not support %<-mabi=ilp32%>");
14085 #endif
14087 /* Convert -msve-vector-bits to a VG count. */
14088 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14090 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14091 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14093 /* Make sure we properly set up the explicit options. */
14094 if ((aarch64_cpu_string && valid_cpu)
14095 || (aarch64_tune_string && valid_tune))
14096 gcc_assert (explicit_tune_core != aarch64_none);
14098 if ((aarch64_cpu_string && valid_cpu)
14099 || (aarch64_arch_string && valid_arch))
14100 gcc_assert (explicit_arch != aarch64_no_arch);
14102 /* The pass to insert speculation tracking runs before
14103 shrink-wrapping and the latter does not know how to update the
14104 tracking status. So disable it in this case. */
14105 if (aarch64_track_speculation)
14106 flag_shrink_wrap = 0;
14108 aarch64_override_options_internal (&global_options);
14110 /* Save these options as the default ones in case we push and pop them later
14111 while processing functions with potential target attributes. */
14112 target_option_default_node = target_option_current_node
14113 = build_target_option_node (&global_options);
14116 /* Implement targetm.override_options_after_change. */
14118 static void
14119 aarch64_override_options_after_change (void)
14121 aarch64_override_options_after_change_1 (&global_options);
14124 static struct machine_function *
14125 aarch64_init_machine_status (void)
14127 struct machine_function *machine;
14128 machine = ggc_cleared_alloc<machine_function> ();
14129 return machine;
14132 void
14133 aarch64_init_expanders (void)
14135 init_machine_status = aarch64_init_machine_status;
14138 /* Initialize aarch64_cmodel from the code model selected in OPTS, taking -fpic/-fPIC into account. */
14139 static void
14140 initialize_aarch64_code_model (struct gcc_options *opts)
14142 if (opts->x_flag_pic)
14144 switch (opts->x_aarch64_cmodel_var)
14146 case AARCH64_CMODEL_TINY:
14147 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14148 break;
14149 case AARCH64_CMODEL_SMALL:
14150 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14151 aarch64_cmodel = (flag_pic == 2
14152 ? AARCH64_CMODEL_SMALL_PIC
14153 : AARCH64_CMODEL_SMALL_SPIC);
14154 #else
14155 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14156 #endif
14157 break;
14158 case AARCH64_CMODEL_LARGE:
14159 sorry ("code model %qs with %<-f%s%>", "large",
14160 opts->x_flag_pic > 1 ? "PIC" : "pic");
14161 break;
14162 default:
14163 gcc_unreachable ();
14166 else
14167 aarch64_cmodel = opts->x_aarch64_cmodel_var;
14170 /* Implement TARGET_OPTION_SAVE. */
14172 static void
14173 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14175 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14176 ptr->x_aarch64_branch_protection_string
14177 = opts->x_aarch64_branch_protection_string;
14180 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14181 using the information saved in PTR. */
14183 static void
14184 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14186 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14187 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14188 opts->x_explicit_arch = ptr->x_explicit_arch;
14189 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14190 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14191 opts->x_aarch64_branch_protection_string
14192 = ptr->x_aarch64_branch_protection_string;
14193 if (opts->x_aarch64_branch_protection_string)
14195 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14196 NULL);
14199 aarch64_override_options_internal (opts);
14202 /* Implement TARGET_OPTION_PRINT. */
14204 static void
14205 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14207 const struct processor *cpu
14208 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14209 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14210 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14211 std::string extension
14212 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14214 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14215 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14216 arch->name, extension.c_str ());
14219 static GTY(()) tree aarch64_previous_fndecl;
14221 void
14222 aarch64_reset_previous_fndecl (void)
14224 aarch64_previous_fndecl = NULL;
14227 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14228 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14229 make sure optab availability predicates are recomputed when necessary. */
14231 void
14232 aarch64_save_restore_target_globals (tree new_tree)
14234 if (TREE_TARGET_GLOBALS (new_tree))
14235 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
14236 else if (new_tree == target_option_default_node)
14237 restore_target_globals (&default_target_globals);
14238 else
14239 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14242 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14243 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14244 of the function, if such exists. This function may be called multiple
14245 times on a single function so use aarch64_previous_fndecl to avoid
14246 setting up identical state. */
14248 static void
14249 aarch64_set_current_function (tree fndecl)
14251 if (!fndecl || fndecl == aarch64_previous_fndecl)
14252 return;
14254 tree old_tree = (aarch64_previous_fndecl
14255 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14256 : NULL_TREE);
14258 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14260 /* If the current function has no attributes but the previous one did,
14261 use the default node. */
14262 if (!new_tree && old_tree)
14263 new_tree = target_option_default_node;
14265 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14266 the default have been handled by aarch64_save_restore_target_globals from
14267 aarch64_pragma_target_parse. */
14268 if (old_tree == new_tree)
14269 return;
14271 aarch64_previous_fndecl = fndecl;
14273 /* First set the target options. */
14274 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14276 aarch64_save_restore_target_globals (new_tree);
14279 /* Enum describing the various ways we can handle attributes.
14280 In many cases we can reuse the generic option handling machinery. */
14282 enum aarch64_attr_opt_type
14284 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14285 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14286 aarch64_attr_enum, /* Attribute sets an enum variable. */
14287 aarch64_attr_custom /* Attribute requires a custom handling function. */
14290 /* All the information needed to handle a target attribute.
14291 NAME is the name of the attribute.
14292 ATTR_TYPE specifies the type of behavior of the attribute as described
14293 in the definition of enum aarch64_attr_opt_type.
14294 ALLOW_NEG is true if the attribute supports a "no-" form.
14295 HANDLER is the function that takes the attribute string as an argument.
14296 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14297 OPT_NUM is the enum specifying the option that the attribute modifies.
14298 This is needed for attributes that mirror the behavior of a command-line
14299 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14300 aarch64_attr_enum. */
14302 struct aarch64_attribute_info
14304 const char *name;
14305 enum aarch64_attr_opt_type attr_type;
14306 bool allow_neg;
14307 bool (*handler) (const char *);
14308 enum opt_code opt_num;
14311 /* Handle the ARCH_STR argument to the arch= target attribute. */
14313 static bool
14314 aarch64_handle_attr_arch (const char *str)
14316 const struct processor *tmp_arch = NULL;
14317 std::string invalid_extension;
14318 enum aarch64_parse_opt_result parse_res
14319 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14321 if (parse_res == AARCH64_PARSE_OK)
14323 gcc_assert (tmp_arch);
14324 selected_arch = tmp_arch;
14325 explicit_arch = selected_arch->arch;
14326 return true;
14329 switch (parse_res)
14331 case AARCH64_PARSE_MISSING_ARG:
14332 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14333 break;
14334 case AARCH64_PARSE_INVALID_ARG:
14335 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14336 aarch64_print_hint_for_arch (str);
14337 break;
14338 case AARCH64_PARSE_INVALID_FEATURE:
14339 error ("invalid feature modifier %s of value (\"%s\") in "
14340 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14341 aarch64_print_hint_for_extensions (invalid_extension);
14342 break;
14343 default:
14344 gcc_unreachable ();
14347 return false;
14350 /* Handle the argument CPU_STR to the cpu= target attribute. */
14352 static bool
14353 aarch64_handle_attr_cpu (const char *str)
14355 const struct processor *tmp_cpu = NULL;
14356 std::string invalid_extension;
14357 enum aarch64_parse_opt_result parse_res
14358 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14360 if (parse_res == AARCH64_PARSE_OK)
14362 gcc_assert (tmp_cpu);
14363 selected_tune = tmp_cpu;
14364 explicit_tune_core = selected_tune->ident;
14366 selected_arch = &all_architectures[tmp_cpu->arch];
14367 explicit_arch = selected_arch->arch;
14368 return true;
14371 switch (parse_res)
14373 case AARCH64_PARSE_MISSING_ARG:
14374 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14375 break;
14376 case AARCH64_PARSE_INVALID_ARG:
14377 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14378 aarch64_print_hint_for_core (str);
14379 break;
14380 case AARCH64_PARSE_INVALID_FEATURE:
14381 error ("invalid feature modifier %s of value (\"%s\") in "
14382 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14383 aarch64_print_hint_for_extensions (invalid_extension);
14384 break;
14385 default:
14386 gcc_unreachable ();
14389 return false;
14392 /* Handle the argument STR to the branch-protection= attribute. */
14394 static bool
14395 aarch64_handle_attr_branch_protection (const char* str)
14397 char *err_str = (char *) xmalloc (strlen (str) + 1);
14398 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14399 &err_str);
14400 bool success = false;
14401 switch (res)
14403 case AARCH64_PARSE_MISSING_ARG:
14404 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14405 " attribute");
14406 break;
14407 case AARCH64_PARSE_INVALID_ARG:
14408 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14409 "=\")%> pragma or attribute", err_str);
14410 break;
14411 case AARCH64_PARSE_OK:
14412 success = true;
14413 /* Fall through. */
14414 case AARCH64_PARSE_INVALID_FEATURE:
14415 break;
14416 default:
14417 gcc_unreachable ();
14419 free (err_str);
14420 return success;
14423 /* Handle the argument STR to the tune= target attribute. */
14425 static bool
14426 aarch64_handle_attr_tune (const char *str)
14428 const struct processor *tmp_tune = NULL;
14429 enum aarch64_parse_opt_result parse_res
14430 = aarch64_parse_tune (str, &tmp_tune);
14432 if (parse_res == AARCH64_PARSE_OK)
14434 gcc_assert (tmp_tune);
14435 selected_tune = tmp_tune;
14436 explicit_tune_core = selected_tune->ident;
14437 return true;
14440 switch (parse_res)
14442 case AARCH64_PARSE_INVALID_ARG:
14443 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
14444 aarch64_print_hint_for_core (str);
14445 break;
14446 default:
14447 gcc_unreachable ();
14450 return false;
14453 /* Parse an architecture extensions target attribute string specified in STR.
14454 For example "+fp+nosimd". Show any errors if needed. Return TRUE
14455 if successful. Update aarch64_isa_flags to reflect the ISA features
14456 modified. */
14458 static bool
14459 aarch64_handle_attr_isa_flags (char *str)
14461 enum aarch64_parse_opt_result parse_res;
14462 uint64_t isa_flags = aarch64_isa_flags;
14464 /* We allow "+nothing" in the beginning to clear out all architectural
14465 features if the user wants to handpick specific features. */
14466 if (strncmp ("+nothing", str, 8) == 0)
14468 isa_flags = 0;
14469 str += 8;
14472 std::string invalid_extension;
14473 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
14475 if (parse_res == AARCH64_PARSE_OK)
14477 aarch64_isa_flags = isa_flags;
14478 return true;
14481 switch (parse_res)
14483 case AARCH64_PARSE_MISSING_ARG:
14484 error ("missing value in %<target()%> pragma or attribute");
14485 break;
14487 case AARCH64_PARSE_INVALID_FEATURE:
14488 error ("invalid feature modifier %s of value (\"%s\") in "
14489 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14490 break;
14492 default:
14493 gcc_unreachable ();
14496 return false;
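/* For illustration: an attribute string such as "+nothing+fp+simd"
   (example value) first clears all architectural feature bits because of
   the leading "+nothing", after which aarch64_parse_extension re-enables
   just "+fp+simd".  Without "+nothing", extensions are added to, or with a
   "no" prefix such as "+nosimd" removed from, the current
   aarch64_isa_flags.  */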
14499 /* The target attributes that we support. On top of these we also support just
14500 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14501 handled explicitly in aarch64_process_one_target_attr. */
14503 static const struct aarch64_attribute_info aarch64_attributes[] =
14505 { "general-regs-only", aarch64_attr_mask, false, NULL,
14506 OPT_mgeneral_regs_only },
14507 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
14508 OPT_mfix_cortex_a53_835769 },
14509 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
14510 OPT_mfix_cortex_a53_843419 },
14511 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
14512 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
14513 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
14514 OPT_momit_leaf_frame_pointer },
14515 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
14516 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
14517 OPT_march_ },
14518 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
14519 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
14520 OPT_mtune_ },
14521 { "branch-protection", aarch64_attr_custom, false,
14522 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
14523 { "sign-return-address", aarch64_attr_enum, false, NULL,
14524 OPT_msign_return_address_ },
14525 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
14528 /* Parse ARG_STR which contains the definition of one target attribute.
14529 Show appropriate errors if any or return true if the attribute is valid. */
14531 static bool
14532 aarch64_process_one_target_attr (char *arg_str)
14534 bool invert = false;
14536 size_t len = strlen (arg_str);
14538 if (len == 0)
14540 error ("malformed %<target()%> pragma or attribute");
14541 return false;
14544 char *str_to_check = (char *) alloca (len + 1);
14545 strcpy (str_to_check, arg_str);
14547 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14548 It is easier to detect and handle it explicitly here rather than going
14549 through the machinery for the rest of the target attributes in this
14550 function. */
14551 if (*str_to_check == '+')
14552 return aarch64_handle_attr_isa_flags (str_to_check);
14554 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
14556 invert = true;
14557 str_to_check += 3;
14559 char *arg = strchr (str_to_check, '=');
14561 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14562 and point ARG to "foo". */
14563 if (arg)
14565 *arg = '\0';
14566 arg++;
14568 const struct aarch64_attribute_info *p_attr;
14569 bool found = false;
14570 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
14572 /* If the names don't match up, or the user has given an argument
14573 to an attribute that doesn't accept one, or didn't give an argument
14574 to an attribute that expects one, fail to match. */
14575 if (strcmp (str_to_check, p_attr->name) != 0)
14576 continue;
14578 found = true;
14579 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
14580 || p_attr->attr_type == aarch64_attr_enum;
14582 if (attr_need_arg_p ^ (arg != NULL))
14584 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
14585 return false;
14588 /* If the name matches but the attribute does not allow "no-" versions
14589 then we can't match. */
14590 if (invert && !p_attr->allow_neg)
14592 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
14593 return false;
14596 switch (p_attr->attr_type)
14598 /* Has a custom handler registered.
14599 For example, cpu=, arch=, tune=. */
14600 case aarch64_attr_custom:
14601 gcc_assert (p_attr->handler);
14602 if (!p_attr->handler (arg))
14603 return false;
14604 break;
14606 /* Either set or unset a boolean option. */
14607 case aarch64_attr_bool:
14609 struct cl_decoded_option decoded;
14611 generate_option (p_attr->opt_num, NULL, !invert,
14612 CL_TARGET, &decoded);
14613 aarch64_handle_option (&global_options, &global_options_set,
14614 &decoded, input_location);
14615 break;
14617 /* Set or unset a bit in the target_flags. aarch64_handle_option
14618 should know what mask to apply given the option number. */
14619 case aarch64_attr_mask:
14621 struct cl_decoded_option decoded;
14622 /* We only need to specify the option number.
14623 aarch64_handle_option will know which mask to apply. */
14624 decoded.opt_index = p_attr->opt_num;
14625 decoded.value = !invert;
14626 aarch64_handle_option (&global_options, &global_options_set,
14627 &decoded, input_location);
14628 break;
14630 /* Use the option setting machinery to set an option to an enum. */
14631 case aarch64_attr_enum:
14633 gcc_assert (arg);
14634 bool valid;
14635 int value;
14636 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
14637 &value, CL_TARGET);
14638 if (valid)
14640 set_option (&global_options, NULL, p_attr->opt_num, value,
14641 NULL, DK_UNSPECIFIED, input_location,
14642 global_dc);
14644 else
14646 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
14648 break;
14650 default:
14651 gcc_unreachable ();
14655 /* If we reached here we either have found an attribute and validated
14656 it or didn't match any. If we matched an attribute but its arguments
14657 were malformed we will have returned false already. */
14658 return found;
14661 /* Count how many times the character C appears in
14662 NULL-terminated string STR. */
14664 static unsigned int
14665 num_occurences_in_str (char c, char *str)
14667 unsigned int res = 0;
14668 while (*str != '\0')
14670 if (*str == c)
14671 res++;
14673 str++;
14676 return res;
14679 /* Parse the tree in ARGS that contains the target attribute information
14680 and update the global target options space. */
14682 bool
14683 aarch64_process_target_attr (tree args)
14685 if (TREE_CODE (args) == TREE_LIST)
14689 tree head = TREE_VALUE (args);
14690 if (head)
14692 if (!aarch64_process_target_attr (head))
14693 return false;
14695 args = TREE_CHAIN (args);
14696 } while (args);
14698 return true;
14701 if (TREE_CODE (args) != STRING_CST)
14703 error ("attribute %<target%> argument not a string");
14704 return false;
14707 size_t len = strlen (TREE_STRING_POINTER (args));
14708 char *str_to_check = (char *) alloca (len + 1);
14709 strcpy (str_to_check, TREE_STRING_POINTER (args));
14711 if (len == 0)
14713 error ("malformed %<target()%> pragma or attribute");
14714 return false;
14717 /* Used to catch empty strings between commas, i.e.
14718 attribute ((target ("attr1,,attr2"))). */
14719 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
14721 /* Handle multiple target attributes separated by ','. */
14722 char *token = strtok_r (str_to_check, ",", &str_to_check);
14724 unsigned int num_attrs = 0;
14725 while (token)
14727 num_attrs++;
14728 if (!aarch64_process_one_target_attr (token))
14730 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
14731 return false;
14734 token = strtok_r (NULL, ",", &str_to_check);
14737 if (num_attrs != num_commas + 1)
14739 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
14740 return false;
14743 return true;
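/* For illustration (hypothetical declaration):

     __attribute__ ((target ("arch=armv8.2-a+sve,no-omit-leaf-frame-pointer")))
     void f (void);

   The string is split at ',' into two tokens.  "arch=armv8.2-a+sve" is
   handled by aarch64_handle_attr_arch via the aarch64_attributes table,
   while the "no-" prefix on the second token negates the boolean
   omit-leaf-frame-pointer attribute, which allows negation.  */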
14746 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14747 process attribute ((target ("..."))). */
14749 static bool
14750 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
14752 struct cl_target_option cur_target;
14753 bool ret;
14754 tree old_optimize;
14755 tree new_target, new_optimize;
14756 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14758 /* If what we're processing is the current pragma string then the
14759 target option node is already stored in target_option_current_node
14760 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14761 having to re-parse the string. This is especially useful to keep
14762 arm_neon.h compile times down since that header contains a lot
14763 of intrinsics enclosed in pragmas. */
14764 if (!existing_target && args == current_target_pragma)
14766 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
14767 return true;
14769 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14771 old_optimize = build_optimization_node (&global_options);
14772 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14774 /* If the function changed the optimization levels as well as setting
14775 target options, start with the optimizations specified. */
14776 if (func_optimize && func_optimize != old_optimize)
14777 cl_optimization_restore (&global_options,
14778 TREE_OPTIMIZATION (func_optimize));
14780 /* Save the current target options to restore at the end. */
14781 cl_target_option_save (&cur_target, &global_options);
14783 /* If fndecl already has some target attributes applied to it, unpack
14784 them so that we add this attribute on top of them, rather than
14785 overwriting them. */
14786 if (existing_target)
14788 struct cl_target_option *existing_options
14789 = TREE_TARGET_OPTION (existing_target);
14791 if (existing_options)
14792 cl_target_option_restore (&global_options, existing_options);
14794 else
14795 cl_target_option_restore (&global_options,
14796 TREE_TARGET_OPTION (target_option_current_node));
14798 ret = aarch64_process_target_attr (args);
14800 /* Set up any additional state. */
14801 if (ret)
14803 aarch64_override_options_internal (&global_options);
14804 /* Initialize SIMD builtins if we haven't already.
14805 Set current_target_pragma to NULL for the duration so that
14806 the builtin initialization code doesn't try to tag the functions
14807 being built with the attributes specified by any current pragma, thus
14808 going into an infinite recursion. */
14809 if (TARGET_SIMD)
14811 tree saved_current_target_pragma = current_target_pragma;
14812 current_target_pragma = NULL;
14813 aarch64_init_simd_builtins ();
14814 current_target_pragma = saved_current_target_pragma;
14816 new_target = build_target_option_node (&global_options);
14818 else
14819 new_target = NULL;
14821 new_optimize = build_optimization_node (&global_options);
14823 if (fndecl && ret)
14825 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
14827 if (old_optimize != new_optimize)
14828 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
14831 cl_target_option_restore (&global_options, &cur_target);
14833 if (old_optimize != new_optimize)
14834 cl_optimization_restore (&global_options,
14835 TREE_OPTIMIZATION (old_optimize));
14836 return ret;
14839 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14840 tri-bool options (yes, no, don't care) and the default value is
14841 DEF, determine whether to reject inlining. */
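/* For instance, with DONT_CARE == 2 and DEF == 1 (the values used below for
   -momit-leaf-frame-pointer): caller == 1, callee == 0 rejects inlining,
   while caller == 0, callee == 2 allows it because the callee doesn't care.  */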
14843 static bool
14844 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
14845 int dont_care, int def)
14847 /* If the callee doesn't care, always allow inlining. */
14848 if (callee == dont_care)
14849 return true;
14851 /* If the caller doesn't care, always allow inlining. */
14852 if (caller == dont_care)
14853 return true;
14855 /* Otherwise, allow inlining if either the callee and caller values
14856 agree, or if the callee is using the default value. */
14857 return (callee == caller || callee == def);
14860 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14861 to inline CALLEE into CALLER based on target-specific info.
14862 Make sure that the caller and callee have compatible architectural
14863 features. Then go through the other possible target attributes
14864 and see if they can block inlining. Try not to reject always_inline
14865 callees unless they are incompatible architecturally. */
14867 static bool
14868 aarch64_can_inline_p (tree caller, tree callee)
14870 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
14871 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
14873 struct cl_target_option *caller_opts
14874 = TREE_TARGET_OPTION (caller_tree ? caller_tree
14875 : target_option_default_node);
14877 struct cl_target_option *callee_opts
14878 = TREE_TARGET_OPTION (callee_tree ? callee_tree
14879 : target_option_default_node);
14881 /* Callee's ISA flags should be a subset of the caller's. */
14882 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
14883 != callee_opts->x_aarch64_isa_flags)
14884 return false;
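/* For example, a callee whose target attribute enables only +simd can be
   inlined into a caller that enables +simd and +sve, but an +sve callee
   cannot be inlined into a caller compiled without SVE.  */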
14886 /* Allow non-strict-aligned functions to be inlined into
14887 strict-aligned ones. */
14888 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
14889 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
14890 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
14891 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
14892 return false;
14894 bool always_inline = lookup_attribute ("always_inline",
14895 DECL_ATTRIBUTES (callee));
14897 /* If the architectural features match up and the callee is always_inline
14898 then the other attributes don't matter. */
14899 if (always_inline)
14900 return true;
14902 if (caller_opts->x_aarch64_cmodel_var
14903 != callee_opts->x_aarch64_cmodel_var)
14904 return false;
14906 if (caller_opts->x_aarch64_tls_dialect
14907 != callee_opts->x_aarch64_tls_dialect)
14908 return false;
14910 /* Honour explicit requests to workaround errata. */
14911 if (!aarch64_tribools_ok_for_inlining_p (
14912 caller_opts->x_aarch64_fix_a53_err835769,
14913 callee_opts->x_aarch64_fix_a53_err835769,
14914 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
14915 return false;
14917 if (!aarch64_tribools_ok_for_inlining_p (
14918 caller_opts->x_aarch64_fix_a53_err843419,
14919 callee_opts->x_aarch64_fix_a53_err843419,
14920 2, TARGET_FIX_ERR_A53_843419))
14921 return false;
14923 /* If the user explicitly specified -momit-leaf-frame-pointer for the
14924 caller and callee and they don't match up, reject inlining. */
14925 if (!aarch64_tribools_ok_for_inlining_p (
14926 caller_opts->x_flag_omit_leaf_frame_pointer,
14927 callee_opts->x_flag_omit_leaf_frame_pointer,
14928 2, 1))
14929 return false;
14931 /* If the callee has specific tuning overrides, respect them. */
14932 if (callee_opts->x_aarch64_override_tune_string != NULL
14933 && caller_opts->x_aarch64_override_tune_string == NULL)
14934 return false;
14936 /* If the user specified tuning override strings for the
14937 caller and callee and they don't match up, reject inlining.
14938 We just do a string compare here, we don't analyze the meaning
14939 of the string, as it would be too costly for little gain. */
14940 if (callee_opts->x_aarch64_override_tune_string
14941 && caller_opts->x_aarch64_override_tune_string
14942 && (strcmp (callee_opts->x_aarch64_override_tune_string,
14943 caller_opts->x_aarch64_override_tune_string) != 0))
14944 return false;
14946 return true;
14949 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
14950 been initialized already. */
14952 unsigned int
14953 aarch64_tlsdesc_abi_id ()
14955 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
14956 if (!tlsdesc_abi.initialized_p ())
14958 HARD_REG_SET full_reg_clobbers;
14959 CLEAR_HARD_REG_SET (full_reg_clobbers);
14960 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
14961 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
14962 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
14963 SET_HARD_REG_BIT (full_reg_clobbers, regno);
14964 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
14966 return tlsdesc_abi.id ();
14969 /* Return true if SYMBOL_REF X binds locally. */
14971 static bool
14972 aarch64_symbol_binds_local_p (const_rtx x)
14974 return (SYMBOL_REF_DECL (x)
14975 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
14976 : SYMBOL_REF_LOCAL_P (x));
14979 /* Return true if SYMBOL_REF X is thread-local. */
14980 static bool
14981 aarch64_tls_symbol_p (rtx x)
14983 if (! TARGET_HAVE_TLS)
14984 return false;
14986 if (GET_CODE (x) != SYMBOL_REF)
14987 return false;
14989 return SYMBOL_REF_TLS_MODEL (x) != 0;
14992 /* Classify a TLS symbol into one of the TLS kinds. */
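/* For example (assuming the usual option spellings), a global-dynamic access
   classifies as SYMBOL_SMALL_TLSDESC under the default descriptor-based TLS
   (-mtls-dialect=desc) and as SYMBOL_SMALL_TLSGD under -mtls-dialect=trad.  */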
14993 enum aarch64_symbol_type
14994 aarch64_classify_tls_symbol (rtx x)
14996 enum tls_model tls_kind = tls_symbolic_operand_type (x);
14998 switch (tls_kind)
15000 case TLS_MODEL_GLOBAL_DYNAMIC:
15001 case TLS_MODEL_LOCAL_DYNAMIC:
15002 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15004 case TLS_MODEL_INITIAL_EXEC:
15005 switch (aarch64_cmodel)
15007 case AARCH64_CMODEL_TINY:
15008 case AARCH64_CMODEL_TINY_PIC:
15009 return SYMBOL_TINY_TLSIE;
15010 default:
15011 return SYMBOL_SMALL_TLSIE;
15014 case TLS_MODEL_LOCAL_EXEC:
15015 if (aarch64_tls_size == 12)
15016 return SYMBOL_TLSLE12;
15017 else if (aarch64_tls_size == 24)
15018 return SYMBOL_TLSLE24;
15019 else if (aarch64_tls_size == 32)
15020 return SYMBOL_TLSLE32;
15021 else if (aarch64_tls_size == 48)
15022 return SYMBOL_TLSLE48;
15023 else
15024 gcc_unreachable ();
15026 case TLS_MODEL_EMULATED:
15027 case TLS_MODEL_NONE:
15028 return SYMBOL_FORCE_TO_MEM;
15030 default:
15031 gcc_unreachable ();
15035 /* Return the correct method for accessing X + OFFSET, where X is either
15036 a SYMBOL_REF or LABEL_REF. */
15038 enum aarch64_symbol_type
15039 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15041 if (GET_CODE (x) == LABEL_REF)
15043 switch (aarch64_cmodel)
15045 case AARCH64_CMODEL_LARGE:
15046 return SYMBOL_FORCE_TO_MEM;
15048 case AARCH64_CMODEL_TINY_PIC:
15049 case AARCH64_CMODEL_TINY:
15050 return SYMBOL_TINY_ABSOLUTE;
15052 case AARCH64_CMODEL_SMALL_SPIC:
15053 case AARCH64_CMODEL_SMALL_PIC:
15054 case AARCH64_CMODEL_SMALL:
15055 return SYMBOL_SMALL_ABSOLUTE;
15057 default:
15058 gcc_unreachable ();
15062 if (GET_CODE (x) == SYMBOL_REF)
15064 if (aarch64_tls_symbol_p (x))
15065 return aarch64_classify_tls_symbol (x);
15067 switch (aarch64_cmodel)
15069 case AARCH64_CMODEL_TINY:
15070 /* When we retrieve symbol + offset address, we have to make sure
15071 the offset does not cause overflow of the final address. But
15072 we have no way of knowing the address of symbol at compile time
15073 so we can't accurately say if the distance between the PC and
15074 symbol + offset is outside the addressable range of +/-1MB in the
15075 TINY code model. So we limit the maximum offset to +/-64KB and
15076 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15077 If offset_within_block_p is true we allow larger offsets.
15078 Furthermore force to memory if the symbol is a weak reference to
15079 something that doesn't resolve to a symbol in this module. */
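/* For example (illustrative offsets), symbol + 0x8000 is within the +/-64KB
   window and is classified SYMBOL_TINY_ABSOLUTE, whereas symbol + 0x20000
   is forced to memory unless offset_within_block_p shows that the address
   stays inside the symbol's own object.  */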
15081 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15082 return SYMBOL_FORCE_TO_MEM;
15083 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15084 || offset_within_block_p (x, offset)))
15085 return SYMBOL_FORCE_TO_MEM;
15087 return SYMBOL_TINY_ABSOLUTE;
15089 case AARCH64_CMODEL_SMALL:
15090 /* Same reasoning as the tiny code model, but the offset cap here is
15091 1MB, allowing +/-3.9GB for the offset to the symbol. */
15093 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15094 return SYMBOL_FORCE_TO_MEM;
15095 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15096 || offset_within_block_p (x, offset)))
15097 return SYMBOL_FORCE_TO_MEM;
15099 return SYMBOL_SMALL_ABSOLUTE;
15101 case AARCH64_CMODEL_TINY_PIC:
15102 if (!aarch64_symbol_binds_local_p (x))
15103 return SYMBOL_TINY_GOT;
15104 return SYMBOL_TINY_ABSOLUTE;
15106 case AARCH64_CMODEL_SMALL_SPIC:
15107 case AARCH64_CMODEL_SMALL_PIC:
15108 if (!aarch64_symbol_binds_local_p (x))
15109 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15110 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15111 return SYMBOL_SMALL_ABSOLUTE;
15113 case AARCH64_CMODEL_LARGE:
15114 /* This is alright even in PIC code as the constant
15115 pool reference is always PC relative and within
15116 the same translation unit. */
15117 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15118 return SYMBOL_SMALL_ABSOLUTE;
15119 else
15120 return SYMBOL_FORCE_TO_MEM;
15122 default:
15123 gcc_unreachable ();
15127 /* By default push everything into the constant pool. */
15128 return SYMBOL_FORCE_TO_MEM;
15131 bool
15132 aarch64_constant_address_p (rtx x)
15134 return (CONSTANT_P (x) && memory_address_p (DImode, x));
15137 bool
15138 aarch64_legitimate_pic_operand_p (rtx x)
15140 if (GET_CODE (x) == SYMBOL_REF
15141 || (GET_CODE (x) == CONST
15142 && GET_CODE (XEXP (x, 0)) == PLUS
15143 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15144 return false;
15146 return true;
15149 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15150 that should be rematerialized rather than spilled. */
15152 static bool
15153 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15155 /* Support CSE and rematerialization of common constants. */
15156 if (CONST_INT_P (x)
15157 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15158 || GET_CODE (x) == CONST_VECTOR)
15159 return true;
15161 /* Do not allow vector struct mode constants for Advanced SIMD.
15162 We could support 0 and -1 easily, but they need support in
15163 aarch64-simd.md. */
15164 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15165 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15166 return false;
15168 /* Only accept variable-length vector constants if they can be
15169 handled directly.
15171 ??? It would be possible to handle rematerialization of other
15172 constants via secondary reloads. */
15173 if (vec_flags & VEC_ANY_SVE)
15174 return aarch64_simd_valid_immediate (x, NULL);
15176 if (GET_CODE (x) == HIGH)
15177 x = XEXP (x, 0);
15179 /* Accept polynomial constants that can be calculated by using the
15180 destination of a move as the sole temporary. Constants that
15181 require a second temporary cannot be rematerialized (they can't be
15182 forced to memory and also aren't legitimate constants). */
15183 poly_int64 offset;
15184 if (poly_int_rtx_p (x, &offset))
15185 return aarch64_offset_temporaries (false, offset) <= 1;
15187 /* If an offset is being added to something else, we need to allow the
15188 base to be moved into the destination register, meaning that there
15189 are no free temporaries for the offset. */
15190 x = strip_offset (x, &offset);
15191 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15192 return false;
15194 /* Do not allow const (plus (anchor_symbol, const_int)). */
15195 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15196 return false;
15198 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15199 so spilling them is better than rematerialization. */
15200 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15201 return true;
15203 /* Label references are always constant. */
15204 if (GET_CODE (x) == LABEL_REF)
15205 return true;
15207 return false;
15211 aarch64_load_tp (rtx target)
15213 if (!target
15214 || GET_MODE (target) != Pmode
15215 || !register_operand (target, Pmode))
15216 target = gen_reg_rtx (Pmode);
15218 /* Can return in any reg. */
15219 emit_insn (gen_aarch64_load_tp_hard (target));
15220 return target;
15223 /* On AAPCS systems, this is the "struct __va_list". */
15224 static GTY(()) tree va_list_type;
15226 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15227 Return the type to use as __builtin_va_list.
15229 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15231 struct __va_list
15233 void *__stack;
15234 void *__gr_top;
15235 void *__vr_top;
15236 int __gr_offs;
15237 int __vr_offs;
15238 }; */
15240 static tree
15241 aarch64_build_builtin_va_list (void)
15243 tree va_list_name;
15244 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15246 /* Create the type. */
15247 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15248 /* Give it the required name. */
15249 va_list_name = build_decl (BUILTINS_LOCATION,
15250 TYPE_DECL,
15251 get_identifier ("__va_list"),
15252 va_list_type);
15253 DECL_ARTIFICIAL (va_list_name) = 1;
15254 TYPE_NAME (va_list_type) = va_list_name;
15255 TYPE_STUB_DECL (va_list_type) = va_list_name;
15257 /* Create the fields. */
15258 f_stack = build_decl (BUILTINS_LOCATION,
15259 FIELD_DECL, get_identifier ("__stack"),
15260 ptr_type_node);
15261 f_grtop = build_decl (BUILTINS_LOCATION,
15262 FIELD_DECL, get_identifier ("__gr_top"),
15263 ptr_type_node);
15264 f_vrtop = build_decl (BUILTINS_LOCATION,
15265 FIELD_DECL, get_identifier ("__vr_top"),
15266 ptr_type_node);
15267 f_groff = build_decl (BUILTINS_LOCATION,
15268 FIELD_DECL, get_identifier ("__gr_offs"),
15269 integer_type_node);
15270 f_vroff = build_decl (BUILTINS_LOCATION,
15271 FIELD_DECL, get_identifier ("__vr_offs"),
15272 integer_type_node);
15274 /* Tell the tree-stdarg pass about our internal offset fields.
15275 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15276 purposes, to identify whether the code is updating va_list internal
15277 offset fields in an irregular way. */
15278 va_list_gpr_counter_field = f_groff;
15279 va_list_fpr_counter_field = f_vroff;
15281 DECL_ARTIFICIAL (f_stack) = 1;
15282 DECL_ARTIFICIAL (f_grtop) = 1;
15283 DECL_ARTIFICIAL (f_vrtop) = 1;
15284 DECL_ARTIFICIAL (f_groff) = 1;
15285 DECL_ARTIFICIAL (f_vroff) = 1;
15287 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15288 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15289 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15290 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15291 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15293 TYPE_FIELDS (va_list_type) = f_stack;
15294 DECL_CHAIN (f_stack) = f_grtop;
15295 DECL_CHAIN (f_grtop) = f_vrtop;
15296 DECL_CHAIN (f_vrtop) = f_groff;
15297 DECL_CHAIN (f_groff) = f_vroff;
15299 /* Compute its layout. */
15300 layout_type (va_list_type);
15302 return va_list_type;
15305 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15306 static void
15307 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15309 const CUMULATIVE_ARGS *cum;
15310 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15311 tree stack, grtop, vrtop, groff, vroff;
15312 tree t;
15313 int gr_save_area_size = cfun->va_list_gpr_size;
15314 int vr_save_area_size = cfun->va_list_fpr_size;
15315 int vr_offset;
15317 cum = &crtl->args.info;
15318 if (cfun->va_list_gpr_size)
15319 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15320 cfun->va_list_gpr_size);
15321 if (cfun->va_list_fpr_size)
15322 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15323 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15325 if (!TARGET_FLOAT)
15327 gcc_assert (cum->aapcs_nvrn == 0);
15328 vr_save_area_size = 0;
15331 f_stack = TYPE_FIELDS (va_list_type_node);
15332 f_grtop = DECL_CHAIN (f_stack);
15333 f_vrtop = DECL_CHAIN (f_grtop);
15334 f_groff = DECL_CHAIN (f_vrtop);
15335 f_vroff = DECL_CHAIN (f_groff);
15337 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15338 NULL_TREE);
15339 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15340 NULL_TREE);
15341 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15342 NULL_TREE);
15343 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15344 NULL_TREE);
15345 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15346 NULL_TREE);
15348 /* Emit code to initialize STACK, which points to the next varargs stack
15349 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15350 by named arguments. STACK is 8-byte aligned. */
15351 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15352 if (cum->aapcs_stack_size > 0)
15353 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15354 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15355 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15357 /* Emit code to initialize GRTOP, the top of the GR save area.
15358 virtual_incoming_args_rtx should have been 16 byte aligned. */
15359 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15360 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15361 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15363 /* Emit code to initialize VRTOP, the top of the VR save area.
15364 This address is gr_save_area_bytes below GRTOP, rounded
15365 down to the next 16-byte boundary. */
15366 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15367 vr_offset = ROUND_UP (gr_save_area_size,
15368 STACK_BOUNDARY / BITS_PER_UNIT);
15370 if (vr_offset)
15371 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15372 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15373 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15375 /* Emit code to initialize GROFF, the offset from GRTOP of the
15376 next GPR argument. */
15377 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15378 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15379 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15381 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15382 of the next VR argument. */
15383 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15384 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15385 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15388 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15390 static tree
15391 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15392 gimple_seq *post_p ATTRIBUTE_UNUSED)
15394 tree addr;
15395 bool indirect_p;
15396 bool is_ha; /* is HFA or HVA. */
15397 bool dw_align; /* double-word align. */
15398 machine_mode ag_mode = VOIDmode;
15399 int nregs;
15400 machine_mode mode;
15402 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15403 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15404 HOST_WIDE_INT size, rsize, adjust, align;
15405 tree t, u, cond1, cond2;
15407 indirect_p = pass_va_arg_by_reference (type);
15408 if (indirect_p)
15409 type = build_pointer_type (type);
15411 mode = TYPE_MODE (type);
15413 f_stack = TYPE_FIELDS (va_list_type_node);
15414 f_grtop = DECL_CHAIN (f_stack);
15415 f_vrtop = DECL_CHAIN (f_grtop);
15416 f_groff = DECL_CHAIN (f_vrtop);
15417 f_vroff = DECL_CHAIN (f_groff);
15419 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15420 f_stack, NULL_TREE);
15421 size = int_size_in_bytes (type);
15423 bool abi_break;
15424 align
15425 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15427 dw_align = false;
15428 adjust = 0;
15429 if (aarch64_vfp_is_call_or_return_candidate (mode,
15430 type,
15431 &ag_mode,
15432 &nregs,
15433 &is_ha))
15435 /* No frontends can create types with variable-sized modes, so we
15436 shouldn't be asked to pass or return them. */
15437 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
15439 /* TYPE passed in fp/simd registers. */
15440 if (!TARGET_FLOAT)
15441 aarch64_err_no_fpadvsimd (mode);
15443 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
15444 unshare_expr (valist), f_vrtop, NULL_TREE);
15445 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
15446 unshare_expr (valist), f_vroff, NULL_TREE);
15448 rsize = nregs * UNITS_PER_VREG;
15450 if (is_ha)
15452 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
15453 adjust = UNITS_PER_VREG - ag_size;
15455 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15456 && size < UNITS_PER_VREG)
15458 adjust = UNITS_PER_VREG - size;
15461 else
15463 /* TYPE passed in general registers. */
15464 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
15465 unshare_expr (valist), f_grtop, NULL_TREE);
15466 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
15467 unshare_expr (valist), f_groff, NULL_TREE);
15468 rsize = ROUND_UP (size, UNITS_PER_WORD);
15469 nregs = rsize / UNITS_PER_WORD;
15471 if (align > 8)
15473 if (abi_break && warn_psabi)
15474 inform (input_location, "parameter passing for argument of type "
15475 "%qT changed in GCC 9.1", type);
15476 dw_align = true;
15479 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15480 && size < UNITS_PER_WORD)
15482 adjust = UNITS_PER_WORD - size;
15486 /* Get a local temporary for the field value. */
15487 off = get_initialized_tmp_var (f_off, pre_p, NULL);
15489 /* Emit code to branch if off >= 0. */
15490 t = build2 (GE_EXPR, boolean_type_node, off,
15491 build_int_cst (TREE_TYPE (off), 0));
15492 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
15494 if (dw_align)
15496 /* Emit: offs = (offs + 15) & -16. */
15497 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15498 build_int_cst (TREE_TYPE (off), 15));
15499 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
15500 build_int_cst (TREE_TYPE (off), -16));
15501 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
15503 else
15504 roundup = NULL;
15506 /* Update ap.__[g|v]r_offs */
15507 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15508 build_int_cst (TREE_TYPE (off), rsize));
15509 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
15511 /* String up. */
15512 if (roundup)
15513 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15515 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15516 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
15517 build_int_cst (TREE_TYPE (f_off), 0));
15518 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
15520 /* String up: make sure the assignment happens before the use. */
15521 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
15522 COND_EXPR_ELSE (cond1) = t;
15524 /* Prepare the trees handling the argument that is passed on the stack;
15525 the top-level node will be stored in ON_STACK. */
15526 arg = get_initialized_tmp_var (stack, pre_p, NULL);
15527 if (align > 8)
15529 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15530 t = fold_build_pointer_plus_hwi (arg, 15);
15531 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15532 build_int_cst (TREE_TYPE (t), -16));
15533 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
15535 else
15536 roundup = NULL;
15537 /* Advance ap.__stack */
15538 t = fold_build_pointer_plus_hwi (arg, size + 7);
15539 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15540 build_int_cst (TREE_TYPE (t), -8));
15541 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
15542 /* String up roundup and advance. */
15543 if (roundup)
15544 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15545 /* String up with arg */
15546 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
15547 /* Big-endianness related address adjustment. */
15548 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15549 && size < UNITS_PER_WORD)
15551 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
15552 size_int (UNITS_PER_WORD - size));
15553 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
15556 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
15557 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
15559 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15560 t = off;
15561 if (adjust)
15562 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
15563 build_int_cst (TREE_TYPE (off), adjust));
15565 t = fold_convert (sizetype, t);
15566 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
15568 if (is_ha)
15570 /* type ha; // treat as "struct {ftype field[n];}"
15571 ... [computing offs]
15572 for (i = 0; i < nregs; ++i, offs += 16)
15573 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15574 return ha; */
15575 int i;
15576 tree tmp_ha, field_t, field_ptr_t;
15578 /* Declare a local variable. */
15579 tmp_ha = create_tmp_var_raw (type, "ha");
15580 gimple_add_tmp_var (tmp_ha);
15582 /* Establish the base type. */
15583 switch (ag_mode)
15585 case E_SFmode:
15586 field_t = float_type_node;
15587 field_ptr_t = float_ptr_type_node;
15588 break;
15589 case E_DFmode:
15590 field_t = double_type_node;
15591 field_ptr_t = double_ptr_type_node;
15592 break;
15593 case E_TFmode:
15594 field_t = long_double_type_node;
15595 field_ptr_t = long_double_ptr_type_node;
15596 break;
15597 case E_HFmode:
15598 field_t = aarch64_fp16_type_node;
15599 field_ptr_t = aarch64_fp16_ptr_type_node;
15600 break;
15601 case E_BFmode:
15602 field_t = aarch64_bf16_type_node;
15603 field_ptr_t = aarch64_bf16_ptr_type_node;
15604 break;
15605 case E_V2SImode:
15606 case E_V4SImode:
15608 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
15609 field_t = build_vector_type_for_mode (innertype, ag_mode);
15610 field_ptr_t = build_pointer_type (field_t);
15612 break;
15613 default:
15614 gcc_assert (0);
15617 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
15618 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
15619 addr = t;
15620 t = fold_convert (field_ptr_t, addr);
15621 t = build2 (MODIFY_EXPR, field_t,
15622 build1 (INDIRECT_REF, field_t, tmp_ha),
15623 build1 (INDIRECT_REF, field_t, t));
15625 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15626 for (i = 1; i < nregs; ++i)
15628 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
15629 u = fold_convert (field_ptr_t, addr);
15630 u = build2 (MODIFY_EXPR, field_t,
15631 build2 (MEM_REF, field_t, tmp_ha,
15632 build_int_cst (field_ptr_t,
15633 (i *
15634 int_size_in_bytes (field_t)))),
15635 build1 (INDIRECT_REF, field_t, u));
15636 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
15639 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
15640 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
15643 COND_EXPR_ELSE (cond2) = t;
15644 addr = fold_convert (build_pointer_type (type), cond1);
15645 addr = build_va_arg_indirect_ref (addr);
15647 if (indirect_p)
15648 addr = build_va_arg_indirect_ref (addr);
15650 return addr;
15653 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15655 static void
15656 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
15657 const function_arg_info &arg,
15658 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
15660 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
15661 CUMULATIVE_ARGS local_cum;
15662 int gr_saved = cfun->va_list_gpr_size;
15663 int vr_saved = cfun->va_list_fpr_size;
15665 /* The caller has advanced CUM up to, but not beyond, the last named
15666 argument. Advance a local copy of CUM past the last "real" named
15667 argument, to find out how many registers are left over. */
15668 local_cum = *cum;
15669 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
15671 /* Find out how many registers we need to save.
15672 Honor tree-stdarg analysis results. */
15673 if (cfun->va_list_gpr_size)
15674 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
15675 cfun->va_list_gpr_size / UNITS_PER_WORD);
15676 if (cfun->va_list_fpr_size)
15677 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
15678 cfun->va_list_fpr_size / UNITS_PER_VREG);
15680 if (!TARGET_FLOAT)
15682 gcc_assert (local_cum.aapcs_nvrn == 0);
15683 vr_saved = 0;
15686 if (!no_rtl)
15688 if (gr_saved > 0)
15690 rtx ptr, mem;
15692 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15693 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
15694 - gr_saved * UNITS_PER_WORD);
15695 mem = gen_frame_mem (BLKmode, ptr);
15696 set_mem_alias_set (mem, get_varargs_alias_set ());
15698 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
15699 mem, gr_saved);
15701 if (vr_saved > 0)
15703 /* We can't use move_block_from_reg, because it will use
15704 the wrong mode, storing D regs only. */
15705 machine_mode mode = TImode;
15706 int off, i, vr_start;
15708 /* Set OFF to the offset from virtual_incoming_args_rtx of
15709 the first vector register. The VR save area lies below
15710 the GR one, and is aligned to 16 bytes. */
15711 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
15712 STACK_BOUNDARY / BITS_PER_UNIT);
15713 off -= vr_saved * UNITS_PER_VREG;
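/* Illustrative arithmetic, assuming 8-byte GPR slots and 16-byte vector
   slots: with gr_saved == 3 and vr_saved == 2, OFF is
   -ROUND_UP (24, 16) - 32 == -64, so the two vector registers are saved at
   offsets -64 and -48 from virtual_incoming_args_rtx.  */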
15715 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
15716 for (i = 0; i < vr_saved; ++i)
15718 rtx ptr, mem;
15720 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
15721 mem = gen_frame_mem (mode, ptr);
15722 set_mem_alias_set (mem, get_varargs_alias_set ());
15723 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
15724 off += UNITS_PER_VREG;
15729 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15730 any complication of having crtl->args.pretend_args_size changed. */
15731 cfun->machine->frame.saved_varargs_size
15732 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
15733 STACK_BOUNDARY / BITS_PER_UNIT)
15734 + vr_saved * UNITS_PER_VREG);
15737 static void
15738 aarch64_conditional_register_usage (void)
15740 int i;
15741 if (!TARGET_FLOAT)
15743 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
15745 fixed_regs[i] = 1;
15746 call_used_regs[i] = 1;
15749 if (!TARGET_SVE)
15750 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
15752 fixed_regs[i] = 1;
15753 call_used_regs[i] = 1;
15756 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15757 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
15758 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
15760 /* When tracking speculation, we need a couple of call-clobbered registers
15761 to track the speculation state. It would be nice to just use
15762 IP0 and IP1, but currently there are numerous places that just
15763 assume these registers are free for other uses (eg pointer
15764 authentication). */
15765 if (aarch64_track_speculation)
15767 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
15768 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
15769 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15770 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15774 /* Walk down the type tree of TYPE counting consecutive base elements.
15775 If *MODEP is VOIDmode, then set it to the first valid floating point
15776 type. If a non-floating point type is found, or if a floating point
15777 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15778 otherwise return the count in the sub-tree. */
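/* For example, struct { double x, y, z; } returns 3 with *MODEP set to
   DFmode, while struct { float f; double d; } returns -1 because the
   element modes differ.  */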
15779 static int
15780 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
15782 machine_mode mode;
15783 HOST_WIDE_INT size;
15785 /* SVE types (and types containing SVE types) must be handled
15786 before calling this function. */
15787 gcc_assert (!aarch64_sve::builtin_type_p (type));
15789 switch (TREE_CODE (type))
15791 case REAL_TYPE:
15792 mode = TYPE_MODE (type);
15793 if (mode != DFmode && mode != SFmode
15794 && mode != TFmode && mode != HFmode)
15795 return -1;
15797 if (*modep == VOIDmode)
15798 *modep = mode;
15800 if (*modep == mode)
15801 return 1;
15803 break;
15805 case COMPLEX_TYPE:
15806 mode = TYPE_MODE (TREE_TYPE (type));
15807 if (mode != DFmode && mode != SFmode
15808 && mode != TFmode && mode != HFmode)
15809 return -1;
15811 if (*modep == VOIDmode)
15812 *modep = mode;
15814 if (*modep == mode)
15815 return 2;
15817 break;
15819 case VECTOR_TYPE:
15820 /* Use V2SImode and V4SImode as representatives of all 64-bit
15821 and 128-bit vector types. */
15822 size = int_size_in_bytes (type);
15823 switch (size)
15825 case 8:
15826 mode = V2SImode;
15827 break;
15828 case 16:
15829 mode = V4SImode;
15830 break;
15831 default:
15832 return -1;
15835 if (*modep == VOIDmode)
15836 *modep = mode;
15838 /* Vector modes are considered to be opaque: two vectors are
15839 equivalent for the purposes of being homogeneous aggregates
15840 if they are the same size. */
15841 if (*modep == mode)
15842 return 1;
15844 break;
15846 case ARRAY_TYPE:
15848 int count;
15849 tree index = TYPE_DOMAIN (type);
15851 /* Can't handle incomplete types nor sizes that are not
15852 fixed. */
15853 if (!COMPLETE_TYPE_P (type)
15854 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15855 return -1;
15857 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
15858 if (count == -1
15859 || !index
15860 || !TYPE_MAX_VALUE (index)
15861 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
15862 || !TYPE_MIN_VALUE (index)
15863 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
15864 || count < 0)
15865 return -1;
15867 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
15868 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
15870 /* There must be no padding. */
15871 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15872 count * GET_MODE_BITSIZE (*modep)))
15873 return -1;
15875 return count;
15878 case RECORD_TYPE:
15880 int count = 0;
15881 int sub_count;
15882 tree field;
15884 /* Can't handle incomplete types nor sizes that are not
15885 fixed. */
15886 if (!COMPLETE_TYPE_P (type)
15887 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15888 return -1;
15890 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15892 if (TREE_CODE (field) != FIELD_DECL)
15893 continue;
15895 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15896 if (sub_count < 0)
15897 return -1;
15898 count += sub_count;
15901 /* There must be no padding. */
15902 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15903 count * GET_MODE_BITSIZE (*modep)))
15904 return -1;
15906 return count;
15909 case UNION_TYPE:
15910 case QUAL_UNION_TYPE:
15912 /* These aren't very interesting except in a degenerate case. */
15913 int count = 0;
15914 int sub_count;
15915 tree field;
15917 /* Can't handle incomplete types nor sizes that are not
15918 fixed. */
15919 if (!COMPLETE_TYPE_P (type)
15920 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15921 return -1;
15923 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15925 if (TREE_CODE (field) != FIELD_DECL)
15926 continue;
15928 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15929 if (sub_count < 0)
15930 return -1;
15931 count = count > sub_count ? count : sub_count;
15934 /* There must be no padding. */
15935 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15936 count * GET_MODE_BITSIZE (*modep)))
15937 return -1;
15939 return count;
15942 default:
15943 break;
15946 return -1;
15949 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
15950 type as described in AAPCS64 \S 4.1.2.
15952 See the comment above aarch64_composite_type_p for the notes on MODE. */
15954 static bool
15955 aarch64_short_vector_p (const_tree type,
15956 machine_mode mode)
15958 poly_int64 size = -1;
15960 if (type && aarch64_sve::builtin_type_p (type))
15961 return false;
15963 if (type && TREE_CODE (type) == VECTOR_TYPE)
15964 size = int_size_in_bytes (type);
15965 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
15966 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
15967 size = GET_MODE_SIZE (mode);
15969 return known_eq (size, 8) || known_eq (size, 16);
15972 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
15973 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
15974 array types. The C99 floating-point complex types are also considered
15975 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
15976 types, which are GCC extensions and out of the scope of AAPCS64, are
15977 treated as composite types here as well.
15979 Note that MODE itself is not sufficient in determining whether a type
15980 is such a composite type or not. This is because
15981 stor-layout.c:compute_record_mode may have already changed the MODE
15982 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
15983 structure with only one field may have its MODE set to the mode of the
15984 field. Also an integer mode whose size matches the size of the
15985 RECORD_TYPE type may be used to substitute the original mode
15986 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
15987 solely relied on. */
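/* For instance, a struct containing a single float field may end up with
   TYPE_MODE equal to SFmode, yet it is still a composite type for AAPCS64
   purposes; that is why TYPE is checked in addition to MODE below.  */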
15989 static bool
15990 aarch64_composite_type_p (const_tree type,
15991 machine_mode mode)
15993 if (aarch64_short_vector_p (type, mode))
15994 return false;
15996 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
15997 return true;
15999 if (mode == BLKmode
16000 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16001 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16002 return true;
16004 return false;
16007 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16008 shall be passed or returned in simd/fp register(s) (providing these
16009 parameter passing registers are available).
16011 Upon successful return, *COUNT returns the number of needed registers,
16012 *BASE_MODE returns the mode of the individual register and when IS_HA
16013 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16014 floating-point aggregate or a homogeneous short-vector aggregate. */
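/* For example, struct { float a, b, c, d; } is a homogeneous floating-point
   aggregate: *COUNT is set to 4 and *BASE_MODE to SFmode.  A struct of five
   floats exceeds HA_MAX_NUM_FLDS and is rejected.  */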
16016 static bool
16017 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16018 const_tree type,
16019 machine_mode *base_mode,
16020 int *count,
16021 bool *is_ha)
16023 if (is_ha != NULL) *is_ha = false;
16025 if (type && aarch64_sve::builtin_type_p (type))
16026 return false;
16028 machine_mode new_mode = VOIDmode;
16029 bool composite_p = aarch64_composite_type_p (type, mode);
16031 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16032 || aarch64_short_vector_p (type, mode))
16034 *count = 1;
16035 new_mode = mode;
16037 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16039 if (is_ha != NULL) *is_ha = true;
16040 *count = 2;
16041 new_mode = GET_MODE_INNER (mode);
16043 else if (type && composite_p)
16045 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
16047 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16049 if (is_ha != NULL) *is_ha = true;
16050 *count = ag_count;
16052 else
16053 return false;
16055 else
16056 return false;
16058 *base_mode = new_mode;
16059 return true;
16062 /* Implement TARGET_STRUCT_VALUE_RTX. */
16064 static rtx
16065 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16066 int incoming ATTRIBUTE_UNUSED)
16068 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16071 /* Implements target hook vector_mode_supported_p. */
16072 static bool
16073 aarch64_vector_mode_supported_p (machine_mode mode)
16075 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16076 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16079 /* Return the full-width SVE vector mode for element mode MODE, if one
16080 exists. */
16081 opt_machine_mode
16082 aarch64_full_sve_mode (scalar_mode mode)
16084 switch (mode)
16086 case E_DFmode:
16087 return VNx2DFmode;
16088 case E_SFmode:
16089 return VNx4SFmode;
16090 case E_HFmode:
16091 return VNx8HFmode;
16092 case E_DImode:
16093 return VNx2DImode;
16094 case E_SImode:
16095 return VNx4SImode;
16096 case E_HImode:
16097 return VNx8HImode;
16098 case E_QImode:
16099 return VNx16QImode;
16100 default:
16101 return opt_machine_mode ();
16105 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16106 if it exists. */
16107 opt_machine_mode
16108 aarch64_vq_mode (scalar_mode mode)
16110 switch (mode)
16112 case E_DFmode:
16113 return V2DFmode;
16114 case E_SFmode:
16115 return V4SFmode;
16116 case E_HFmode:
16117 return V8HFmode;
16118 case E_BFmode:
16119 return V8BFmode;
16120 case E_SImode:
16121 return V4SImode;
16122 case E_HImode:
16123 return V8HImode;
16124 case E_QImode:
16125 return V16QImode;
16126 case E_DImode:
16127 return V2DImode;
16128 default:
16129 return opt_machine_mode ();
16133 /* Return appropriate SIMD container
16134 for MODE within a vector of WIDTH bits. */
16135 static machine_mode
16136 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
16138 if (TARGET_SVE
16139 && maybe_ne (width, 128)
16140 && known_eq (width, BITS_PER_SVE_VECTOR))
16141 return aarch64_full_sve_mode (mode).else_mode (word_mode);
16143 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
16144 if (TARGET_SIMD)
16146 if (known_eq (width, 128))
16147 return aarch64_vq_mode (mode).else_mode (word_mode);
16148 else
16149 switch (mode)
16151 case E_SFmode:
16152 return V2SFmode;
16153 case E_HFmode:
16154 return V4HFmode;
16155 case E_BFmode:
16156 return V4BFmode;
16157 case E_SImode:
16158 return V2SImode;
16159 case E_HImode:
16160 return V4HImode;
16161 case E_QImode:
16162 return V8QImode;
16163 default:
16164 break;
16167 return word_mode;
16170 /* Return 128-bit container as the preferred SIMD mode for MODE. */
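/* For example, SImode maps to V4SImode when only Advanced SIMD is available,
   and to VNx4SImode when vectorizing for variable-length SVE.  */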
16171 static machine_mode
16172 aarch64_preferred_simd_mode (scalar_mode mode)
16174 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
16175 return aarch64_simd_container_mode (mode, bits);
16178 /* Return a list of possible vector sizes for the vectorizer
16179 to iterate over. */
16180 static unsigned int
16181 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
16183 static const machine_mode sve_modes[] = {
16184 /* Try using full vectors for all element types. */
16185 VNx16QImode,
16187 /* Try using 16-bit containers for 8-bit elements and full vectors
16188 for wider elements. */
16189 VNx8QImode,
16191 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16192 full vectors for wider elements. */
16193 VNx4QImode,
16195 /* Try using 64-bit containers for all element types. */
16196 VNx2QImode
16199 static const machine_mode advsimd_modes[] = {
16200 /* Try using 128-bit vectors for all element types. */
16201 V16QImode,
16203 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16204 for wider elements. */
16205 V8QImode,
16207 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16208 for wider elements.
16210 TODO: We could support a limited form of V4QImode too, so that
16211 we use 32-bit vectors for 8-bit elements. */
16212 V4HImode,
16214 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16215 for 64-bit elements.
16217 TODO: We could similarly support limited forms of V2QImode and V2HImode
16218 for this case. */
16219 V2SImode
16222 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16223 This is because:
16225 - If we can't use N-byte Advanced SIMD vectors then the placement
16226 doesn't matter; we'll just continue as though the Advanced SIMD
16227 entry didn't exist.
16229 - If an SVE main loop with N bytes ends up being cheaper than an
16230 Advanced SIMD main loop with N bytes then by default we'll replace
16231 the Advanced SIMD version with the SVE one.
16233 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16234 than an SVE main loop with N bytes then by default we'll try to
16235 use the SVE loop to vectorize the epilogue instead. */
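/* For example, when SVE is not available the list is simply V16QImode,
   V8QImode, V4HImode, V2SImode, tried in that order; with SVE enabled the
   SVE entries are merged in according to the NUNITS comparison below.  */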
16236 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
16237 unsigned int advsimd_i = 0;
16238 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
16240 if (sve_i < ARRAY_SIZE (sve_modes)
16241 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
16242 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
16243 modes->safe_push (sve_modes[sve_i++]);
16244 else
16245 modes->safe_push (advsimd_modes[advsimd_i++]);
16247 while (sve_i < ARRAY_SIZE (sve_modes))
16248 modes->safe_push (sve_modes[sve_i++]);
16250 unsigned int flags = 0;
16251 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16252 can compare SVE against Advanced SIMD and so that we can compare
16253 multiple SVE vectorization approaches against each other. There's
16254 not really any point doing this for Advanced SIMD only, since the
16255 first mode that works should always be the best. */
16256 if (TARGET_SVE && aarch64_sve_compare_costs)
16257 flags |= VECT_COMPARE_COSTS;
16258 return flags;
16261 /* Implement TARGET_MANGLE_TYPE. */
16263 static const char *
16264 aarch64_mangle_type (const_tree type)
16266 /* The AArch64 ABI documents say that "__va_list" has to be
16267 mangled as if it is in the "std" namespace. */
16268 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
16269 return "St9__va_list";
16271 /* Half-precision floating point types. */
16272 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
16274 if (TYPE_MODE (type) == BFmode)
16275 return "u6__bf16";
16276 else
16277 return "Dh";
16280 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16281 builtin types. */
16282 if (TYPE_NAME (type) != NULL)
16284 const char *res;
16285 if ((res = aarch64_general_mangle_builtin_type (type))
16286 || (res = aarch64_sve::mangle_builtin_type (type)))
16287 return res;
16290 /* Use the default mangling. */
16291 return NULL;
16294 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16296 static bool
16297 aarch64_verify_type_context (location_t loc, type_context_kind context,
16298 const_tree type, bool silent_p)
16300 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
16303 /* Find the first rtx_insn before insn that will generate an assembly
16304 instruction. */
16306 static rtx_insn *
16307 aarch64_prev_real_insn (rtx_insn *insn)
16309 if (!insn)
16310 return NULL;
16314 insn = prev_real_insn (insn);
16316 while (insn && recog_memoized (insn) < 0);
16318 return insn;
16321 static bool
16322 is_madd_op (enum attr_type t1)
16324 unsigned int i;
16325 /* A number of these may be AArch32 only. */
16326 enum attr_type mlatypes[] = {
16327 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
16328 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
16329 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
16332 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
16334 if (t1 == mlatypes[i])
16335 return true;
16338 return false;
16341 /* Check if there is a register dependency between a load and the insn
16342 for which we hold recog_data. */
16344 static bool
16345 dep_between_memop_and_curr (rtx memop)
16347 rtx load_reg;
16348 int opno;
16350 gcc_assert (GET_CODE (memop) == SET);
16352 if (!REG_P (SET_DEST (memop)))
16353 return false;
16355 load_reg = SET_DEST (memop);
16356 for (opno = 1; opno < recog_data.n_operands; opno++)
16358 rtx operand = recog_data.operand[opno];
16359 if (REG_P (operand)
16360 && reg_overlap_mentioned_p (load_reg, operand))
16361 return true;
16364 return false;
16368 /* When working around the Cortex-A53 erratum 835769,
16369 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16370 instruction and has a preceding memory instruction such that a NOP
16371 should be inserted between them. */
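/* For example, under -mfix-cortex-a53-835769 a load immediately followed by
   a DImode multiply-accumulate whose operands do not overlap the loaded
   register gets a NOP emitted between the two instructions (see
   aarch64_final_prescan_insn below).  */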
16373 bool
16374 aarch64_madd_needs_nop (rtx_insn* insn)
16376 enum attr_type attr_type;
16377 rtx_insn *prev;
16378 rtx body;
16380 if (!TARGET_FIX_ERR_A53_835769)
16381 return false;
16383 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16384 return false;
16386 attr_type = get_attr_type (insn);
16387 if (!is_madd_op (attr_type))
16388 return false;
16390 prev = aarch64_prev_real_insn (insn);
16391 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16392 Restore recog state to INSN to avoid state corruption. */
16393 extract_constrain_insn_cached (insn);
16395 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
16396 return false;
16398 body = single_set (prev);
16400 /* If the previous insn is a memory op and there is no dependency between
16401 it and the DImode madd, emit a NOP between them. If body is NULL then we
16402 have a complex memory operation, probably a load/store pair.
16403 Be conservative for now and emit a NOP. */
16404 if (GET_MODE (recog_data.operand[0]) == DImode
16405 && (!body || !dep_between_memop_and_curr (body)))
16406 return true;
16408 return false;
16413 /* Implement FINAL_PRESCAN_INSN. */
16415 void
16416 aarch64_final_prescan_insn (rtx_insn *insn)
16418 if (aarch64_madd_needs_nop (insn))
16419 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
16423 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16424 instruction. */
16426 bool
16427 aarch64_sve_index_immediate_p (rtx base_or_step)
16429 return (CONST_INT_P (base_or_step)
16430 && IN_RANGE (INTVAL (base_or_step), -16, 15));
16433 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
16434 when applied to mode MODE. Negate X first if NEGATE_P is true. */
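/* For example, #7 and #0x3400 are valid (an unsigned byte, optionally
   shifted left by 8 bits), whereas #0x101 is not.  */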
16436 bool
16437 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
16439 rtx elt = unwrap_const_vec_duplicate (x);
16440 if (!CONST_INT_P (elt))
16441 return false;
16443 HOST_WIDE_INT val = INTVAL (elt);
16444 if (negate_p)
16445 val = -val;
16446 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
16448 if (val & 0xff)
16449 return IN_RANGE (val, 0, 0xff);
16450 return IN_RANGE (val, 0, 0xff00);
16453 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16454 instructions when applied to mode MODE. Negate X first if NEGATE_P
16455 is true. */
16457 bool
16458 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
16460 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
16461 return false;
16463 /* After the optional negation, the immediate must be nonnegative.
16464 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16465 instead of SQADD Zn.B, Zn.B, #129. */
16466 rtx elt = unwrap_const_vec_duplicate (x);
16467 return negate_p == (INTVAL (elt) < 0);
16470 /* Return true if X is a valid immediate operand for an SVE logical
16471 instruction such as AND. */
16473 bool
16474 aarch64_sve_bitmask_immediate_p (rtx x)
16476 rtx elt;
16478 return (const_vec_duplicate_p (x, &elt)
16479 && CONST_INT_P (elt)
16480 && aarch64_bitmask_imm (INTVAL (elt),
16481 GET_MODE_INNER (GET_MODE (x))));
16484 /* Return true if X is a valid immediate for the SVE DUP and CPY
16485 instructions. */
16487 bool
16488 aarch64_sve_dup_immediate_p (rtx x)
16490 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
16491 if (!CONST_INT_P (x))
16492 return false;
16494 HOST_WIDE_INT val = INTVAL (x);
16495 if (val & 0xff)
16496 return IN_RANGE (val, -0x80, 0x7f);
16497 return IN_RANGE (val, -0x8000, 0x7f00);
16500 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16501 SIGNED_P says whether the operand is signed rather than unsigned. */
16503 bool
16504 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
16506 x = unwrap_const_vec_duplicate (x);
16507 return (CONST_INT_P (x)
16508 && (signed_p
16509 ? IN_RANGE (INTVAL (x), -16, 15)
16510 : IN_RANGE (INTVAL (x), 0, 127)));
16513 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16514 instruction. Negate X first if NEGATE_P is true. */
16516 bool
16517 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
16519 rtx elt;
16520 REAL_VALUE_TYPE r;
16522 if (!const_vec_duplicate_p (x, &elt)
16523 || GET_CODE (elt) != CONST_DOUBLE)
16524 return false;
16526 r = *CONST_DOUBLE_REAL_VALUE (elt);
16528 if (negate_p)
16529 r = real_value_negate (&r);
16531 if (real_equal (&r, &dconst1))
16532 return true;
16533 if (real_equal (&r, &dconsthalf))
16534 return true;
16535 return false;
16538 /* Return true if X is a valid immediate operand for an SVE FMUL
16539 instruction. */
16541 bool
16542 aarch64_sve_float_mul_immediate_p (rtx x)
16544 rtx elt;
16546 return (const_vec_duplicate_p (x, &elt)
16547 && GET_CODE (elt) == CONST_DOUBLE
16548 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
16549 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
16552 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16553 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16554 is nonnull, use it to describe valid immediates. */
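/* For example, 0x00ab0000 is accepted as a move with LSL #16, and 0x0000abff
   is accepted as a move with MSL #8, since the bits below the shift are all
   ones.  */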
16555 static bool
16556 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
16557 simd_immediate_info *info,
16558 enum simd_immediate_check which,
16559 simd_immediate_info::insn_type insn)
16561 /* Try a 4-byte immediate with LSL. */
16562 for (unsigned int shift = 0; shift < 32; shift += 8)
16563 if ((val32 & (0xff << shift)) == val32)
16565 if (info)
16566 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16567 simd_immediate_info::LSL, shift);
16568 return true;
16571 /* Try a 2-byte immediate with LSL. */
16572 unsigned int imm16 = val32 & 0xffff;
16573 if (imm16 == (val32 >> 16))
16574 for (unsigned int shift = 0; shift < 16; shift += 8)
16575 if ((imm16 & (0xff << shift)) == imm16)
16577 if (info)
16578 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
16579 simd_immediate_info::LSL, shift);
16580 return true;
16583 /* Try a 4-byte immediate with MSL, except for cases that MVN
16584 can handle. */
16585 if (which == AARCH64_CHECK_MOV)
16586 for (unsigned int shift = 8; shift < 24; shift += 8)
16588 unsigned int low = (1 << shift) - 1;
16589 if (((val32 & (0xff << shift)) | low) == val32)
16591 if (info)
16592 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16593 simd_immediate_info::MSL, shift);
16594 return true;
16598 return false;
16601 /* Return true if replicating VAL64 is a valid immediate for the
16602 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16603 use it to describe valid immediates. */
16604 static bool
16605 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
16606 simd_immediate_info *info,
16607 enum simd_immediate_check which)
16609 unsigned int val32 = val64 & 0xffffffff;
16610 unsigned int val16 = val64 & 0xffff;
16611 unsigned int val8 = val64 & 0xff;
16613 if (val32 == (val64 >> 32))
16615 if ((which & AARCH64_CHECK_ORR) != 0
16616 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
16617 simd_immediate_info::MOV))
16618 return true;
16620 if ((which & AARCH64_CHECK_BIC) != 0
16621 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
16622 simd_immediate_info::MVN))
16623 return true;
16625 /* Try using a replicated byte. */
16626 if (which == AARCH64_CHECK_MOV
16627 && val16 == (val32 >> 16)
16628 && val8 == (val16 >> 8))
16630 if (info)
16631 *info = simd_immediate_info (QImode, val8);
16632 return true;
16636 /* Try using a bit-to-bytemask. */
16637 if (which == AARCH64_CHECK_MOV)
16639 unsigned int i;
16640 for (i = 0; i < 64; i += 8)
16642 unsigned char byte = (val64 >> i) & 0xff;
16643 if (byte != 0 && byte != 0xff)
16644 break;
16646 if (i == 64)
16648 if (info)
16649 *info = simd_immediate_info (DImode, val64);
16650 return true;
16653 return false;
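/* Likewise, a plain-C sketch of the two 64-bit MOV cases above: a single
   replicated byte, and the bytemask form in which every byte is 0x00 or
   0xff.  The name is illustrative; the real code only reaches these cases
   for AARCH64_CHECK_MOV.  */
static bool
advsimd_imm64_sketch (unsigned long long val64)
{
  unsigned int val32 = val64 & 0xffffffff;
  unsigned int val16 = val64 & 0xffff;
  unsigned int val8 = val64 & 0xff;

  /* One byte replicated into all eight byte positions.  */
  if (val32 == (val64 >> 32)
      && val16 == (val32 >> 16)
      && val8 == (val16 >> 8))
    return true;

  /* Every byte either 0x00 or 0xff.  */
  for (unsigned int i = 0; i < 64; i += 8)
    {
      unsigned char byte = (val64 >> i) & 0xff;
      if (byte != 0 && byte != 0xff)
        return false;
    }
  return true;
}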
16656 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16657 instruction. If INFO is nonnull, use it to describe valid immediates. */
16659 static bool
16660 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
16661 simd_immediate_info *info)
16663 scalar_int_mode mode = DImode;
16664 unsigned int val32 = val64 & 0xffffffff;
16665 if (val32 == (val64 >> 32))
16667 mode = SImode;
16668 unsigned int val16 = val32 & 0xffff;
16669 if (val16 == (val32 >> 16))
16671 mode = HImode;
16672 unsigned int val8 = val16 & 0xff;
16673 if (val8 == (val16 >> 8))
16674 mode = QImode;
16677 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
16678 if (IN_RANGE (val, -0x80, 0x7f))
16680 /* DUP with no shift. */
16681 if (info)
16682 *info = simd_immediate_info (mode, val);
16683 return true;
16685 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
16687 /* DUP with LSL #8. */
16688 if (info)
16689 *info = simd_immediate_info (mode, val);
16690 return true;
16692 if (aarch64_bitmask_imm (val64, mode))
16694 /* DUPM. */
16695 if (info)
16696 *info = simd_immediate_info (mode, val);
16697 return true;
16699 return false;
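/* A plain-C sketch of the two DUP range checks above, after VAL64 has been
   narrowed to its smallest replicated element mode: values in [-128, 127]
   can use DUP directly, and values in [-0x8000, 0x7f00] with a zero low
   byte can use DUP ..., LSL #8.  The DUPM bitmask case is omitted and the
   function name is illustrative only.  */
static bool
sve_dup_imm_sketch (long long val)
{
  if (val >= -0x80 && val <= 0x7f)
    return true;				/* DUP #imm  */
  if ((val & 0xff) == 0 && val >= -0x8000 && val <= 0x7f00)
    return true;				/* DUP #imm, LSL #8  */
  return false;
}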
16702 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16704 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16706 where PATTERN is the svpattern as a CONST_INT and where ZERO
16707 is a zero constant of the required PTRUE mode (which can have
16708 fewer elements than X's mode, if zero bits are significant).
16710 If so, and if INFO is nonnull, describe the immediate in INFO. */
16711 bool
16712 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
16714 if (GET_CODE (x) != CONST)
16715 return false;
16717 x = XEXP (x, 0);
16718 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
16719 return false;
16721 if (info)
16723 aarch64_svpattern pattern
16724 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
16725 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
16726 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
16727 *info = simd_immediate_info (int_mode, pattern);
16729 return true;
16732 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16733 it to describe valid immediates. */
16735 static bool
16736 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
16738 if (aarch64_sve_ptrue_svpattern_p (x, info))
16739 return true;
16741 if (x == CONST0_RTX (GET_MODE (x)))
16743 if (info)
16744 *info = simd_immediate_info (DImode, 0);
16745 return true;
16748 /* Analyze the value as a VNx16BImode. This should be relatively
16749 efficient, since rtx_vector_builder has enough built-in capacity
16750 to store all VLA predicate constants without needing the heap. */
16751 rtx_vector_builder builder;
16752 if (!aarch64_get_sve_pred_bits (builder, x))
16753 return false;
16755 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
16756 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
16758 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
16759 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
16760 if (pattern != AARCH64_NUM_SVPATTERNS)
16762 if (info)
16764 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
16765 *info = simd_immediate_info (int_mode, pattern);
16767 return true;
16770 return false;
16773 /* Return true if OP is a valid SIMD immediate for the operation
16774 described by WHICH. If INFO is nonnull, use it to describe valid
16775 immediates. */
16776 bool
16777 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
16778 enum simd_immediate_check which)
16780 machine_mode mode = GET_MODE (op);
16781 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16782 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16783 return false;
16785 if (vec_flags & VEC_SVE_PRED)
16786 return aarch64_sve_pred_valid_immediate (op, info);
16788 scalar_mode elt_mode = GET_MODE_INNER (mode);
16789 rtx base, step;
16790 unsigned int n_elts;
16791 if (GET_CODE (op) == CONST_VECTOR
16792 && CONST_VECTOR_DUPLICATE_P (op))
16793 n_elts = CONST_VECTOR_NPATTERNS (op);
16794 else if ((vec_flags & VEC_SVE_DATA)
16795 && const_vec_series_p (op, &base, &step))
16797 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
16798 if (!aarch64_sve_index_immediate_p (base)
16799 || !aarch64_sve_index_immediate_p (step))
16800 return false;
16802 if (info)
16804 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16805 should yield two integer values per 128-bit block, meaning
16806 that we need to treat it in the same way as V2DI and then
16807 ignore the upper 32 bits of each element. */
16808 elt_mode = aarch64_sve_container_int_mode (mode);
16809 *info = simd_immediate_info (elt_mode, base, step);
16811 return true;
16813 else if (GET_CODE (op) == CONST_VECTOR
16814 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
16815 /* N_ELTS set above. */;
16816 else
16817 return false;
16819 scalar_float_mode elt_float_mode;
16820 if (n_elts == 1
16821 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
16823 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
16824 if (aarch64_float_const_zero_rtx_p (elt)
16825 || aarch64_float_const_representable_p (elt))
16827 if (info)
16828 *info = simd_immediate_info (elt_float_mode, elt);
16829 return true;
16833 /* If all elements in an SVE vector have the same value, we have a free
16834 choice between using the element mode and using the container mode.
16835 Using the element mode means that unused parts of the vector are
16836 duplicates of the used elements, while using the container mode means
16837 that the unused parts are an extension of the used elements. Using the
16838 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16839 for its container mode VNx4SI while 0x00000101 isn't.
16841 If not all elements in an SVE vector have the same value, we need the
16842 transition from one element to the next to occur at container boundaries.
16843 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16844 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16845 scalar_int_mode elt_int_mode;
16846 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
16847 elt_int_mode = aarch64_sve_container_int_mode (mode);
16848 else
16849 elt_int_mode = int_mode_for_mode (elt_mode).require ();
16851 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
16852 if (elt_size > 8)
16853 return false;
16855 /* Expand the vector constant out into a byte vector, with the least
16856 significant byte of the register first. */
16857 auto_vec<unsigned char, 16> bytes;
16858 bytes.reserve (n_elts * elt_size);
16859 for (unsigned int i = 0; i < n_elts; i++)
16861 /* The vector is provided in GCC's endian-neutral fashion.
16862 For aarch64_be Advanced SIMD, it must be laid out in the vector
16863 register in reverse order. */
16864 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
16865 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
16867 if (elt_mode != elt_int_mode)
16868 elt = gen_lowpart (elt_int_mode, elt);
16870 if (!CONST_INT_P (elt))
16871 return false;
16873 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
16874 for (unsigned int byte = 0; byte < elt_size; byte++)
16876 bytes.quick_push (elt_val & 0xff);
16877 elt_val >>= BITS_PER_UNIT;
16881 /* The immediate must repeat every eight bytes. */
16882 unsigned int nbytes = bytes.length ();
16883 for (unsigned i = 8; i < nbytes; ++i)
16884 if (bytes[i] != bytes[i - 8])
16885 return false;
16887 /* Get the repeating 8-byte value as an integer. No endian correction
16888 is needed here because bytes is already in lsb-first order. */
16889 unsigned HOST_WIDE_INT val64 = 0;
16890 for (unsigned int i = 0; i < 8; i++)
16891 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
16892 << (i * BITS_PER_UNIT));
16894 if (vec_flags & VEC_SVE_DATA)
16895 return aarch64_sve_valid_immediate (val64, info);
16896 else
16897 return aarch64_advsimd_valid_immediate (val64, info, which);
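/* A standalone sketch of the byte-expansion step above: spread the element
   values into a least-significant-byte-first array, require the array to
   repeat with period 8, and fold the repeating unit back into one 64-bit
   value.  Plain C types replace rtx; the name is illustrative and the
   big-endian element reversal is omitted.  */
static bool
repeating_u64_sketch (const unsigned long long *elts, unsigned int n_elts,
                      unsigned int elt_size, unsigned long long *val64_out)
{
  unsigned char bytes[64];
  unsigned int nbytes = n_elts * elt_size;
  if (nbytes > sizeof bytes)
    return false;

  for (unsigned int i = 0; i < n_elts; i++)
    {
      unsigned long long elt_val = elts[i];
      for (unsigned int byte = 0; byte < elt_size; byte++)
        {
          bytes[i * elt_size + byte] = elt_val & 0xff;
          elt_val >>= 8;
        }
    }

  /* The immediate must repeat every eight bytes.  */
  for (unsigned int i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Fold the repeating unit into a 64-bit integer, lsb first.  */
  unsigned long long val64 = 0;
  for (unsigned int i = 0; i < 8; i++)
    val64 |= (unsigned long long) bytes[i % nbytes] << (i * 8);
  *val64_out = val64;
  return true;
}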
16900 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
16901 has a step in the range of INDEX. Return the index expression if so,
16902 otherwise return null. */
16903 rtx
16904 aarch64_check_zero_based_sve_index_immediate (rtx x)
16906 rtx base, step;
16907 if (const_vec_series_p (x, &base, &step)
16908 && base == const0_rtx
16909 && aarch64_sve_index_immediate_p (step))
16910 return step;
16911 return NULL_RTX;
16914 /* Check that the immediate shift constant is within range. */
16915 bool
16916 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
16918 x = unwrap_const_vec_duplicate (x);
16919 if (!CONST_INT_P (x))
16920 return false;
16921 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
16922 if (left)
16923 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
16924 else
16925 return IN_RANGE (INTVAL (x), 1, bit_width);
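/* For example, with 16-bit elements the valid left-shift immediates are
   0..15 and the valid right-shift immediates are 1..16.  A plain-C version
   of the same range check (name illustrative):  */
static bool
simd_shift_imm_in_range_sketch (long long imm, int elt_bits, bool left)
{
  return left ? (imm >= 0 && imm <= elt_bits - 1)
              : (imm >= 1 && imm <= elt_bits);
}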
16928 /* Return the bitmask CONST_INT to select the bits required by a zero extract
16929 operation of width WIDTH at bit position POS. */
16931 rtx
16932 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
16934 gcc_assert (CONST_INT_P (width));
16935 gcc_assert (CONST_INT_P (pos));
16937 unsigned HOST_WIDE_INT mask
16938 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
16939 return GEN_INT (mask << UINTVAL (pos));
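/* For example, a zero_extract of width 8 at bit position 4 selects the mask
   ((1 << 8) - 1) << 4 == 0xff0.  A plain-C equivalent, assuming WIDTH is
   less than 64 (name illustrative):  */
static unsigned long long
zextract_mask_sketch (unsigned int width, unsigned int pos)
{
  return ((1ULL << width) - 1) << pos;
}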
16942 bool
16943 aarch64_mov_operand_p (rtx x, machine_mode mode)
16945 if (GET_CODE (x) == HIGH
16946 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
16947 return true;
16949 if (CONST_INT_P (x))
16950 return true;
16952 if (VECTOR_MODE_P (GET_MODE (x)))
16954 /* Require predicate constants to be VNx16BI before RA, so that we
16955 force everything to have a canonical form. */
16956 if (!lra_in_progress
16957 && !reload_completed
16958 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
16959 && GET_MODE (x) != VNx16BImode)
16960 return false;
16962 return aarch64_simd_valid_immediate (x, NULL);
16965 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
16966 return true;
16968 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
16969 return true;
16971 return aarch64_classify_symbolic_expression (x)
16972 == SYMBOL_TINY_ABSOLUTE;
16975 /* Return a const_int vector of VAL. */
16976 rtx
16977 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
16979 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
16980 return gen_const_vec_duplicate (mode, c);
16983 /* Check OP is a legal scalar immediate for the MOVI instruction. */
16985 bool
16986 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
16988 machine_mode vmode;
16990 vmode = aarch64_simd_container_mode (mode, 64);
16991 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
16992 return aarch64_simd_valid_immediate (op_v, NULL);
16995 /* Construct and return a PARALLEL RTX vector with elements numbering the
16996 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
16997 the vector - from the perspective of the architecture. This does not
16998 line up with GCC's perspective on lane numbers, so we end up with
16999 different masks depending on our target endian-ness. The diagram
17000 below may help. We must draw the distinction when building masks
17001 which select one half of the vector. An instruction selecting
17002 architectural low-lanes for a big-endian target must be described using
17003 a mask selecting GCC high-lanes.
17005 Big-Endian Little-Endian
17007 GCC 0 1 2 3 3 2 1 0
17008 | x | x | x | x | | x | x | x | x |
17009 Architecture 3 2 1 0 3 2 1 0
17011 Low Mask: { 2, 3 } { 0, 1 }
17012 High Mask: { 0, 1 } { 2, 3 }
17014 MODE is the mode of the vector and NUNITS is the number of units in it. */
17016 rtx
17017 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17019 rtvec v = rtvec_alloc (nunits / 2);
17020 int high_base = nunits / 2;
17021 int low_base = 0;
17022 int base;
17023 rtx t1;
17024 int i;
17026 if (BYTES_BIG_ENDIAN)
17027 base = high ? low_base : high_base;
17028 else
17029 base = high ? high_base : low_base;
17031 for (i = 0; i < nunits / 2; i++)
17032 RTVEC_ELT (v, i) = GEN_INT (base + i);
17034 t1 = gen_rtx_PARALLEL (mode, v);
17035 return t1;
17038 /* Check OP for validity as a PARALLEL RTX vector with elements
17039 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17040 from the perspective of the architecture. See the diagram above
17041 aarch64_simd_vect_par_cnst_half for more details. */
17043 bool
17044 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17045 bool high)
17047 int nelts;
17048 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17049 return false;
17051 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17052 HOST_WIDE_INT count_op = XVECLEN (op, 0);
17053 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17054 int i = 0;
17056 if (count_op != count_ideal)
17057 return false;
17059 for (i = 0; i < count_ideal; i++)
17061 rtx elt_op = XVECEXP (op, 0, i);
17062 rtx elt_ideal = XVECEXP (ideal, 0, i);
17064 if (!CONST_INT_P (elt_op)
17065 || INTVAL (elt_ideal) != INTVAL (elt_op))
17066 return false;
17068 return true;
17071 /* Return a PARALLEL containing NELTS elements, with element I equal
17072 to BASE + I * STEP. */
17074 rtx
17075 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17077 rtvec vec = rtvec_alloc (nelts);
17078 for (unsigned int i = 0; i < nelts; ++i)
17079 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17080 return gen_rtx_PARALLEL (VOIDmode, vec);
17083 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17084 series with step STEP. */
17086 bool
17087 aarch64_stepped_int_parallel_p (rtx op, int step)
17089 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17090 return false;
17092 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17093 for (int i = 1; i < XVECLEN (op, 0); ++i)
17094 if (!CONST_INT_P (XVECEXP (op, 0, i))
17095 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17096 return false;
17098 return true;
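/* A plain-C sketch of the linear-series test above: element I must equal
   element 0 plus I * STEP for every I (name and types illustrative).  */
static bool
stepped_series_sketch (const long long *vals, unsigned int n, long long step)
{
  for (unsigned int i = 1; i < n; ++i)
    if (vals[i] != vals[0] + (long long) i * step)
      return false;
  return true;
}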
17101 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17102 HIGH (exclusive). */
17103 void
17104 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17105 const_tree exp)
17107 HOST_WIDE_INT lane;
17108 gcc_assert (CONST_INT_P (operand));
17109 lane = INTVAL (operand);
17111 if (lane < low || lane >= high)
17113 if (exp)
17114 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
17115 else
17116 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
17120 /* Perform endian correction on lane number N, which indexes a vector
17121 of mode MODE, and return the result as an SImode rtx. */
17123 rtx
17124 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
17126 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
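/* For example, assuming the usual definition of ENDIAN_LANE_N in aarch64.h,
   lane N of an NUNITS-element vector maps to NUNITS - 1 - N on big-endian
   targets and stays N on little-endian targets, so lane 1 of a V4SI value
   becomes lane 2 when compiling for aarch64_be.  */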
17129 /* Return TRUE if OP is a valid vector addressing mode. */
17131 bool
17132 aarch64_simd_mem_operand_p (rtx op)
17134 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
17135 || REG_P (XEXP (op, 0)));
17138 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17140 bool
17141 aarch64_sve_ld1r_operand_p (rtx op)
17143 struct aarch64_address_info addr;
17144 scalar_mode mode;
17146 return (MEM_P (op)
17147 && is_a <scalar_mode> (GET_MODE (op), &mode)
17148 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
17149 && addr.type == ADDRESS_REG_IMM
17150 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
17153 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
17154 where the size of the read data is specified by `mode` and the size of the
17155 vector elements is specified by `elem_mode`. */
17156 bool
17157 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
17158 scalar_mode elem_mode)
17160 struct aarch64_address_info addr;
17161 if (!MEM_P (op)
17162 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
17163 return false;
17165 if (addr.type == ADDRESS_REG_IMM)
17166 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
17168 if (addr.type == ADDRESS_REG_REG)
17169 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
17171 return false;
17174 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17175 bool
17176 aarch64_sve_ld1rq_operand_p (rtx op)
17178 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
17179 GET_MODE_INNER (GET_MODE (op)));
17182 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
17183 accessing a vector where the element size is specified by `elem_mode`. */
17184 bool
17185 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
17187 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
17190 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17191 bool
17192 aarch64_sve_ldff1_operand_p (rtx op)
17194 if (!MEM_P (op))
17195 return false;
17197 struct aarch64_address_info addr;
17198 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
17199 return false;
17201 if (addr.type == ADDRESS_REG_IMM)
17202 return known_eq (addr.const_offset, 0);
17204 return addr.type == ADDRESS_REG_REG;
17207 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17208 bool
17209 aarch64_sve_ldnf1_operand_p (rtx op)
17211 struct aarch64_address_info addr;
17213 return (MEM_P (op)
17214 && aarch64_classify_address (&addr, XEXP (op, 0),
17215 GET_MODE (op), false)
17216 && addr.type == ADDRESS_REG_IMM);
17219 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17220 The conditions for STR are the same. */
17221 bool
17222 aarch64_sve_ldr_operand_p (rtx op)
17224 struct aarch64_address_info addr;
17226 return (MEM_P (op)
17227 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
17228 false, ADDR_QUERY_ANY)
17229 && addr.type == ADDRESS_REG_IMM);
17232 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17233 addressing memory of mode MODE. */
17234 bool
17235 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
17237 struct aarch64_address_info addr;
17238 if (!aarch64_classify_address (&addr, op, mode, false))
17239 return false;
17241 if (addr.type == ADDRESS_REG_IMM)
17242 return known_eq (addr.const_offset, 0);
17244 return addr.type == ADDRESS_REG_REG;
17247 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17248 We need to be able to access the individual pieces, so the range
17249 is different from LD[234] and ST[234]. */
17250 bool
17251 aarch64_sve_struct_memory_operand_p (rtx op)
17253 if (!MEM_P (op))
17254 return false;
17256 machine_mode mode = GET_MODE (op);
17257 struct aarch64_address_info addr;
17258 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
17259 ADDR_QUERY_ANY)
17260 || addr.type != ADDRESS_REG_IMM)
17261 return false;
17263 poly_int64 first = addr.const_offset;
17264 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
17265 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
17266 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
17269 /* Emit a register copy from operand to operand, taking care not to
17270 early-clobber source registers in the process.
17272 COUNT is the number of components into which the copy needs to be
17273 decomposed. */
17274 void
17275 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
17276 unsigned int count)
17278 unsigned int i;
17279 int rdest = REGNO (operands[0]);
17280 int rsrc = REGNO (operands[1]);
17282 if (!reg_overlap_mentioned_p (operands[0], operands[1])
17283 || rdest < rsrc)
17284 for (i = 0; i < count; i++)
17285 emit_move_insn (gen_rtx_REG (mode, rdest + i),
17286 gen_rtx_REG (mode, rsrc + i));
17287 else
17288 for (i = 0; i < count; i++)
17289 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
17290 gen_rtx_REG (mode, rsrc + count - i - 1));
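/* A standalone sketch of the overlap handling above: when the destination
   range of COUNT consecutive registers overlaps the source range at a higher
   register number, copy from the last register downwards so that no source
   register is overwritten before it has been read.  Plain ints stand in for
   hard registers; the name is illustrative only.  */
static void
copy_reg_range_sketch (int *regs, int rdest, int rsrc, int count)
{
  bool overlap_p = rdest < rsrc + count && rsrc < rdest + count;
  if (!overlap_p || rdest < rsrc)
    for (int i = 0; i < count; i++)
      regs[rdest + i] = regs[rsrc + i];
  else
    for (int i = count - 1; i >= 0; i--)
      regs[rdest + i] = regs[rsrc + i];
}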
17293 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17294 one of the VSTRUCT modes: OI, CI, or XI. */
17295 int
17296 aarch64_simd_attr_length_rglist (machine_mode mode)
17298 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17299 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
17302 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17303 alignment of a vector to 128 bits. SVE predicates have an alignment of
17304 16 bits. */
17305 static HOST_WIDE_INT
17306 aarch64_simd_vector_alignment (const_tree type)
17308 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17309 be set for non-predicate vectors of booleans. Modes are the most
17310 direct way we have of identifying real SVE predicate types. */
17311 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
17312 return 16;
17313 widest_int min_size
17314 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
17315 return wi::umin (min_size, 128).to_uhwi ();
17318 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17319 static poly_uint64
17320 aarch64_vectorize_preferred_vector_alignment (const_tree type)
17322 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
17324 /* If the length of the vector is fixed, try to align to that length,
17325 otherwise don't try to align at all. */
17326 HOST_WIDE_INT result;
17327 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
17328 result = TYPE_ALIGN (TREE_TYPE (type));
17329 return result;
17331 return TYPE_ALIGN (type);
17334 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17335 static bool
17336 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
17338 if (is_packed)
17339 return false;
17341 /* For fixed-length vectors, check that the vectorizer will aim for
17342 full-vector alignment. This isn't true for generic GCC vectors
17343 that are wider than the ABI maximum of 128 bits. */
17344 poly_uint64 preferred_alignment =
17345 aarch64_vectorize_preferred_vector_alignment (type);
17346 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
17347 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
17348 preferred_alignment))
17349 return false;
17351 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17352 return true;
17355 /* Return true if the vector misalignment factor is supported by the
17356 target. */
17357 static bool
17358 aarch64_builtin_support_vector_misalignment (machine_mode mode,
17359 const_tree type, int misalignment,
17360 bool is_packed)
17362 if (TARGET_SIMD && STRICT_ALIGNMENT)
17364 /* Return false if the movmisalign pattern is not supported for this mode. */
17365 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
17366 return false;
17368 /* Misalignment factor is unknown at compile time. */
17369 if (misalignment == -1)
17370 return false;
17372 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17373 is_packed);
17376 /* If VALS is a vector constant that can be loaded into a register
17377 using DUP, generate instructions to do so and return an RTX to
17378 assign to the register. Otherwise return NULL_RTX. */
17379 static rtx
17380 aarch64_simd_dup_constant (rtx vals)
17382 machine_mode mode = GET_MODE (vals);
17383 machine_mode inner_mode = GET_MODE_INNER (mode);
17384 rtx x;
17386 if (!const_vec_duplicate_p (vals, &x))
17387 return NULL_RTX;
17389 /* We can load this constant by using DUP and a constant in a
17390 single ARM register. This will be cheaper than a vector
17391 load. */
17392 x = copy_to_mode_reg (inner_mode, x);
17393 return gen_vec_duplicate (mode, x);
17397 /* Generate code to load VALS, which is a PARALLEL containing only
17398 constants (for vec_init) or CONST_VECTOR, efficiently into a
17399 register. Returns an RTX to copy into the register, or NULL_RTX
17400 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17401 static rtx
17402 aarch64_simd_make_constant (rtx vals)
17404 machine_mode mode = GET_MODE (vals);
17405 rtx const_dup;
17406 rtx const_vec = NULL_RTX;
17407 int n_const = 0;
17408 int i;
17410 if (GET_CODE (vals) == CONST_VECTOR)
17411 const_vec = vals;
17412 else if (GET_CODE (vals) == PARALLEL)
17414 /* A CONST_VECTOR must contain only CONST_INTs and
17415 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17416 Only store valid constants in a CONST_VECTOR. */
17417 int n_elts = XVECLEN (vals, 0);
17418 for (i = 0; i < n_elts; ++i)
17420 rtx x = XVECEXP (vals, 0, i);
17421 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17422 n_const++;
17424 if (n_const == n_elts)
17425 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
17427 else
17428 gcc_unreachable ();
17430 if (const_vec != NULL_RTX
17431 && aarch64_simd_valid_immediate (const_vec, NULL))
17432 /* Load using MOVI/MVNI. */
17433 return const_vec;
17434 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
17435 /* Loaded using DUP. */
17436 return const_dup;
17437 else if (const_vec != NULL_RTX)
17438 /* Load from constant pool. We cannot take advantage of single-cycle
17439 LD1 because we need a PC-relative addressing mode. */
17440 return const_vec;
17441 else
17442 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17443 We cannot construct an initializer. */
17444 return NULL_RTX;
17447 /* Expand a vector initialisation sequence, such that TARGET is
17448 initialised to contain VALS. */
17450 void
17451 aarch64_expand_vector_init (rtx target, rtx vals)
17453 machine_mode mode = GET_MODE (target);
17454 scalar_mode inner_mode = GET_MODE_INNER (mode);
17455 /* The number of vector elements. */
17456 int n_elts = XVECLEN (vals, 0);
17457 /* The number of vector elements which are not constant. */
17458 int n_var = 0;
17459 rtx any_const = NULL_RTX;
17460 /* The first element of vals. */
17461 rtx v0 = XVECEXP (vals, 0, 0);
17462 bool all_same = true;
17464 /* This is a special vec_init<M><N> where N is not an element mode but a
17465 vector mode with half the elements of M. We expect to find two entries
17466 of mode N in VALS and we must put their concatenation into TARGET. */
17467 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
17469 gcc_assert (known_eq (GET_MODE_SIZE (mode),
17470 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
17471 rtx lo = XVECEXP (vals, 0, 0);
17472 rtx hi = XVECEXP (vals, 0, 1);
17473 machine_mode narrow_mode = GET_MODE (lo);
17474 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
17475 gcc_assert (narrow_mode == GET_MODE (hi));
17477 /* When we want to concatenate a half-width vector with zeroes we can
17478 use the aarch64_combinez[_be] patterns. Just make sure that the
17479 zeroes are in the right half. */
17480 if (BYTES_BIG_ENDIAN
17481 && aarch64_simd_imm_zero (lo, narrow_mode)
17482 && general_operand (hi, narrow_mode))
17483 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
17484 else if (!BYTES_BIG_ENDIAN
17485 && aarch64_simd_imm_zero (hi, narrow_mode)
17486 && general_operand (lo, narrow_mode))
17487 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
17488 else
17490 /* Else create the two half-width registers and combine them. */
17491 if (!REG_P (lo))
17492 lo = force_reg (GET_MODE (lo), lo);
17493 if (!REG_P (hi))
17494 hi = force_reg (GET_MODE (hi), hi);
17496 if (BYTES_BIG_ENDIAN)
17497 std::swap (lo, hi);
17498 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
17500 return;
17503 /* Count the number of variable elements to initialise. */
17504 for (int i = 0; i < n_elts; ++i)
17506 rtx x = XVECEXP (vals, 0, i);
17507 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
17508 ++n_var;
17509 else
17510 any_const = x;
17512 all_same &= rtx_equal_p (x, v0);
17515 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17516 how best to handle this. */
17517 if (n_var == 0)
17519 rtx constant = aarch64_simd_make_constant (vals);
17520 if (constant != NULL_RTX)
17522 emit_move_insn (target, constant);
17523 return;
17527 /* Splat a single non-constant element if we can. */
17528 if (all_same)
17530 rtx x = copy_to_mode_reg (inner_mode, v0);
17531 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17532 return;
17535 enum insn_code icode = optab_handler (vec_set_optab, mode);
17536 gcc_assert (icode != CODE_FOR_nothing);
17538 /* If there are only variable elements, try to optimize
17539 the insertion using dup for the most common element
17540 followed by insertions. */
17542 /* The algorithm will fill matches[*][0] with the earliest matching element,
17543 and matches[X][1] with the count of duplicate elements (if X is the
17544 earliest element which has duplicates). */
17546 if (n_var == n_elts && n_elts <= 16)
17548 int matches[16][2] = {0};
17549 for (int i = 0; i < n_elts; i++)
17551 for (int j = 0; j <= i; j++)
17553 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
17555 matches[i][0] = j;
17556 matches[j][1]++;
17557 break;
17561 int maxelement = 0;
17562 int maxv = 0;
17563 for (int i = 0; i < n_elts; i++)
17564 if (matches[i][1] > maxv)
17566 maxelement = i;
17567 maxv = matches[i][1];
17570 /* Create a duplicate of the most common element, unless all elements
17571 are equally useless to us, in which case just immediately set the
17572 vector register using the first element. */
17574 if (maxv == 1)
17576 /* For vectors of two 64-bit elements, we can do even better. */
17577 if (n_elts == 2
17578 && (inner_mode == E_DImode
17579 || inner_mode == E_DFmode))
17582 rtx x0 = XVECEXP (vals, 0, 0);
17583 rtx x1 = XVECEXP (vals, 0, 1);
17584 /* Combine can pick up this case, but handling it directly
17585 here leaves clearer RTL.
17587 This is load_pair_lanes<mode>, and also gives us a clean-up
17588 for store_pair_lanes<mode>. */
17589 if (memory_operand (x0, inner_mode)
17590 && memory_operand (x1, inner_mode)
17591 && !STRICT_ALIGNMENT
17592 && rtx_equal_p (XEXP (x1, 0),
17593 plus_constant (Pmode,
17594 XEXP (x0, 0),
17595 GET_MODE_SIZE (inner_mode))))
17597 rtx t;
17598 if (inner_mode == DFmode)
17599 t = gen_load_pair_lanesdf (target, x0, x1);
17600 else
17601 t = gen_load_pair_lanesdi (target, x0, x1);
17602 emit_insn (t);
17603 return;
17606 /* The subreg-move sequence below will move into lane zero of the
17607 vector register. For big-endian we want that position to hold
17608 the last element of VALS. */
17609 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
17610 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17611 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
17613 else
17615 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17616 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17619 /* Insert the rest. */
17620 for (int i = 0; i < n_elts; i++)
17622 rtx x = XVECEXP (vals, 0, i);
17623 if (matches[i][0] == maxelement)
17624 continue;
17625 x = copy_to_mode_reg (inner_mode, x);
17626 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17628 return;
17631 /* Initialise a vector which is part-variable. We want to first try
17632 to build those lanes which are constant in the most efficient way we
17633 can. */
17634 if (n_var != n_elts)
17636 rtx copy = copy_rtx (vals);
17638 /* Load constant part of vector. We really don't care what goes into the
17639 parts we will overwrite, but we're more likely to be able to load the
17640 constant efficiently if it has fewer, larger, repeating parts
17641 (see aarch64_simd_valid_immediate). */
17642 for (int i = 0; i < n_elts; i++)
17644 rtx x = XVECEXP (vals, 0, i);
17645 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17646 continue;
17647 rtx subst = any_const;
17648 for (int bit = n_elts / 2; bit > 0; bit /= 2)
17650 /* Look in the copied vector, as more elements are const. */
17651 rtx test = XVECEXP (copy, 0, i ^ bit);
17652 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
17654 subst = test;
17655 break;
17658 XVECEXP (copy, 0, i) = subst;
17660 aarch64_expand_vector_init (target, copy);
17663 /* Insert the variable lanes directly. */
17664 for (int i = 0; i < n_elts; i++)
17666 rtx x = XVECEXP (vals, 0, i);
17667 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17668 continue;
17669 x = copy_to_mode_reg (inner_mode, x);
17670 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17674 /* Emit RTL corresponding to:
17675 insr TARGET, ELEM. */
17677 static void
17678 emit_insr (rtx target, rtx elem)
17680 machine_mode mode = GET_MODE (target);
17681 scalar_mode elem_mode = GET_MODE_INNER (mode);
17682 elem = force_reg (elem_mode, elem);
17684 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
17685 gcc_assert (icode != CODE_FOR_nothing);
17686 emit_insn (GEN_FCN (icode) (target, target, elem));
17689 /* Subroutine of aarch64_sve_expand_vector_init for handling
17690 trailing constants.
17691 This function works as follows:
17692 (a) Create a new vector consisting of trailing constants.
17693 (b) Initialize TARGET with the constant vector using emit_move_insn.
17694 (c) Insert remaining elements in TARGET using insr.
17695 NELTS is the total number of elements in the original vector, while
17696 NELTS_REQD is the number of elements that are actually
17697 significant.
17699 ??? The heuristic used is to do the above only if the number of constants
17700 is at least half the total number of elements. May need fine tuning. */
17702 static bool
17703 aarch64_sve_expand_vector_init_handle_trailing_constants
17704 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
17706 machine_mode mode = GET_MODE (target);
17707 scalar_mode elem_mode = GET_MODE_INNER (mode);
17708 int n_trailing_constants = 0;
17710 for (int i = nelts_reqd - 1;
17711 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
17712 i--)
17713 n_trailing_constants++;
17715 if (n_trailing_constants >= nelts_reqd / 2)
17717 rtx_vector_builder v (mode, 1, nelts);
17718 for (int i = 0; i < nelts; i++)
17719 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
17720 rtx const_vec = v.build ();
17721 emit_move_insn (target, const_vec);
17723 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
17724 emit_insr (target, builder.elt (i));
17726 return true;
17729 return false;
17732 /* Subroutine of aarch64_sve_expand_vector_init.
17733 Works as follows:
17734 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17735 (b) Skip trailing elements from BUILDER, which are the same as
17736 element NELTS_REQD - 1.
17737 (c) Insert earlier elements in reverse order in TARGET using insr. */
17739 static void
17740 aarch64_sve_expand_vector_init_insert_elems (rtx target,
17741 const rtx_vector_builder &builder,
17742 int nelts_reqd)
17744 machine_mode mode = GET_MODE (target);
17745 scalar_mode elem_mode = GET_MODE_INNER (mode);
17747 struct expand_operand ops[2];
17748 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
17749 gcc_assert (icode != CODE_FOR_nothing);
17751 create_output_operand (&ops[0], target, mode);
17752 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
17753 expand_insn (icode, 2, ops);
17755 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17756 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
17757 emit_insr (target, builder.elt (i));
17760 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17761 when all trailing elements of builder are same.
17762 This works as follows:
17763 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17764 (b) Insert remaining elements in TARGET using insr.
17766 ??? The heuristic used is to do the above if the number of identical trailing
17767 elements is at least 3/4 of the total number of elements, loosely based on the
17768 heuristic from mostly_zeros_p. May need fine-tuning. */
17770 static bool
17771 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17772 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
17774 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17775 if (ndups >= (3 * nelts_reqd) / 4)
17777 aarch64_sve_expand_vector_init_insert_elems (target, builder,
17778 nelts_reqd - ndups + 1);
17779 return true;
17782 return false;
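/* For example, an 8-element vector { a, b, c, c, c, c, c, c } has six
   trailing copies of c, which meets the 3/4 threshold, so TARGET is built
   as: dup c; insr b; insr a.  */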
17785 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17786 of elements in BUILDER.
17788 The function tries to initialize TARGET from BUILDER if it fits one
17789 of the special cases outlined below.
17791 Failing that, the function divides BUILDER into two sub-vectors:
17792 v_even = even elements of BUILDER;
17793 v_odd = odd elements of BUILDER;
17795 and recursively calls itself with v_even and v_odd.
17797 if (recursive call succeeded for v_even or v_odd)
17798 TARGET = zip (v_even, v_odd)
17800 The function returns true if it managed to build TARGET from BUILDER
17801 with one of the special cases, false otherwise.
17803 Example: {a, 1, b, 2, c, 3, d, 4}
17805 The vector gets divided into:
17806 v_even = {a, b, c, d}
17807 v_odd = {1, 2, 3, 4}
17809 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17810 initializes tmp2 from the constant vector v_odd using emit_move_insn.
17812 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17813 4 elements, so we construct tmp1 from v_even using insr:
17814 tmp1 = dup(d)
17815 insr tmp1, c
17816 insr tmp1, b
17817 insr tmp1, a
17819 And finally:
17820 TARGET = zip (tmp1, tmp2)
17821 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17823 static bool
17824 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
17825 int nelts, int nelts_reqd)
17827 machine_mode mode = GET_MODE (target);
17829 /* Case 1: Vector contains trailing constants. */
17831 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17832 (target, builder, nelts, nelts_reqd))
17833 return true;
17835 /* Case 2: Vector contains leading constants. */
17837 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
17838 for (int i = 0; i < nelts_reqd; i++)
17839 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
17840 rev_builder.finalize ();
17842 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17843 (target, rev_builder, nelts, nelts_reqd))
17845 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17846 return true;
17849 /* Case 3: Vector contains trailing same element. */
17851 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17852 (target, builder, nelts_reqd))
17853 return true;
17855 /* Case 4: Vector contains leading same element. */
17857 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17858 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
17860 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17861 return true;
17864 /* Avoid recursing below 4 elements.
17865 ??? The threshold 4 may need fine-tuning. */
17867 if (nelts_reqd <= 4)
17868 return false;
17870 rtx_vector_builder v_even (mode, 1, nelts);
17871 rtx_vector_builder v_odd (mode, 1, nelts);
17873 for (int i = 0; i < nelts * 2; i += 2)
17875 v_even.quick_push (builder.elt (i));
17876 v_odd.quick_push (builder.elt (i + 1));
17879 v_even.finalize ();
17880 v_odd.finalize ();
17882 rtx tmp1 = gen_reg_rtx (mode);
17883 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
17884 nelts, nelts_reqd / 2);
17886 rtx tmp2 = gen_reg_rtx (mode);
17887 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
17888 nelts, nelts_reqd / 2);
17890 if (!did_even_p && !did_odd_p)
17891 return false;
17893 /* Initialize v_even and v_odd using INSR if it didn't match any of the
17894 special cases and zip v_even, v_odd. */
17896 if (!did_even_p)
17897 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
17899 if (!did_odd_p)
17900 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
17902 rtvec v = gen_rtvec (2, tmp1, tmp2);
17903 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
17904 return true;
17907 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
17909 void
17910 aarch64_sve_expand_vector_init (rtx target, rtx vals)
17912 machine_mode mode = GET_MODE (target);
17913 int nelts = XVECLEN (vals, 0);
17915 rtx_vector_builder v (mode, 1, nelts);
17916 for (int i = 0; i < nelts; i++)
17917 v.quick_push (XVECEXP (vals, 0, i));
17918 v.finalize ();
17920 /* If neither sub-vector of v could be initialized specially,
17921 then use INSR to insert all elements from v into TARGET.
17922 ??? This might not be optimal for vectors with large
17923 initializers like 16-element or above.
17924 For nelts < 4, it probably isn't useful to handle specially. */
17926 if (nelts < 4
17927 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
17928 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
17931 /* Check whether VALUE is a vector constant in which every element
17932 is either a power of 2 or a negated power of 2. If so, return
17933 a constant vector of log2s, and flip CODE between PLUS and MINUS
17934 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
17936 static rtx
17937 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
17939 if (GET_CODE (value) != CONST_VECTOR)
17940 return NULL_RTX;
17942 rtx_vector_builder builder;
17943 if (!builder.new_unary_operation (GET_MODE (value), value, false))
17944 return NULL_RTX;
17946 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
17947 /* 1 if the result of the multiplication must be negated,
17948 0 if it mustn't, or -1 if we don't yet care. */
17949 int negate = -1;
17950 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
17951 for (unsigned int i = 0; i < encoded_nelts; ++i)
17953 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
17954 if (!CONST_SCALAR_INT_P (elt))
17955 return NULL_RTX;
17956 rtx_mode_t val (elt, int_mode);
17957 wide_int pow2 = wi::neg (val);
17958 if (val != pow2)
17960 /* It matters whether we negate or not. Make that choice,
17961 and make sure that it's consistent with previous elements. */
17962 if (negate == !wi::neg_p (val))
17963 return NULL_RTX;
17964 negate = wi::neg_p (val);
17965 if (!negate)
17966 pow2 = val;
17968 /* POW2 is now the value that we want to be a power of 2. */
17969 int shift = wi::exact_log2 (pow2);
17970 if (shift < 0)
17971 return NULL_RTX;
17972 builder.quick_push (gen_int_mode (shift, int_mode));
17974 if (negate == -1)
17975 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
17976 code = PLUS;
17977 else if (negate == 1)
17978 code = code == PLUS ? MINUS : PLUS;
17979 return builder.build ();
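/* A scalar plain-C sketch of the conversion above: for a single multiplier
   element, return its log2 if it is a power of 2, or the log2 of its
   negation (setting *NEGATE_P) if it is a negated power of 2, and -1
   otherwise.  The name is illustrative; the real code also checks that the
   negate decision is consistent across all elements.  */
static int
mult_to_shift_sketch (long long val, bool *negate_p)
{
  unsigned long long pow2
    = val < 0 ? -(unsigned long long) val : (unsigned long long) val;
  if (pow2 == 0 || (pow2 & (pow2 - 1)) != 0)
    return -1;				/* Not a (negated) power of 2.  */
  *negate_p = val < 0;
  int shift = 0;
  while ((pow2 >>= 1) != 0)
    shift++;
  return shift;
}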
17982 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
17983 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
17984 operands array, in the same order as for fma_optab. Return true if
17985 the function emitted all the necessary instructions, false if the caller
17986 should generate the pattern normally with the new OPERANDS array. */
17988 bool
17989 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
17991 machine_mode mode = GET_MODE (operands[0]);
17992 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
17994 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
17995 NULL_RTX, true, OPTAB_DIRECT);
17996 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
17997 operands[3], product, operands[0], true,
17998 OPTAB_DIRECT);
17999 return true;
18001 operands[2] = force_reg (mode, operands[2]);
18002 return false;
18005 /* Likewise, but for a conditional pattern. */
18007 bool
18008 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
18010 machine_mode mode = GET_MODE (operands[0]);
18011 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
18013 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
18014 NULL_RTX, true, OPTAB_DIRECT);
18015 emit_insn (gen_cond (code, mode, operands[0], operands[1],
18016 operands[4], product, operands[5]));
18017 return true;
18019 operands[3] = force_reg (mode, operands[3]);
18020 return false;
18023 static unsigned HOST_WIDE_INT
18024 aarch64_shift_truncation_mask (machine_mode mode)
18026 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18027 return 0;
18028 return GET_MODE_UNIT_BITSIZE (mode) - 1;
18031 /* Select a format to encode pointers in exception handling data. */
18032 int
18033 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18035 int type;
18036 switch (aarch64_cmodel)
18038 case AARCH64_CMODEL_TINY:
18039 case AARCH64_CMODEL_TINY_PIC:
18040 case AARCH64_CMODEL_SMALL:
18041 case AARCH64_CMODEL_SMALL_PIC:
18042 case AARCH64_CMODEL_SMALL_SPIC:
18043 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18044 for everything. */
18045 type = DW_EH_PE_sdata4;
18046 break;
18047 default:
18048 /* No assumptions here. 8-byte relocs required. */
18049 type = DW_EH_PE_sdata8;
18050 break;
18052 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18055 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18057 static void
18058 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18060 if (TREE_CODE (decl) == FUNCTION_DECL)
18062 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18063 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18065 fprintf (stream, "\t.variant_pcs\t");
18066 assemble_name (stream, name);
18067 fprintf (stream, "\n");
18072 /* The last .arch and .tune assembly strings that we printed. */
18073 static std::string aarch64_last_printed_arch_string;
18074 static std::string aarch64_last_printed_tune_string;
18076 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18077 by the function fndecl. */
18079 void
18080 aarch64_declare_function_name (FILE *stream, const char* name,
18081 tree fndecl)
18083 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18085 struct cl_target_option *targ_options;
18086 if (target_parts)
18087 targ_options = TREE_TARGET_OPTION (target_parts);
18088 else
18089 targ_options = TREE_TARGET_OPTION (target_option_current_node);
18090 gcc_assert (targ_options);
18092 const struct processor *this_arch
18093 = aarch64_get_arch (targ_options->x_explicit_arch);
18095 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18096 std::string extension
18097 = aarch64_get_extension_string_for_isa_flags (isa_flags,
18098 this_arch->flags);
18099 /* Only update the assembler .arch string if it is distinct from the last
18100 such string we printed. */
18101 std::string to_print = this_arch->name + extension;
18102 if (to_print != aarch64_last_printed_arch_string)
18104 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
18105 aarch64_last_printed_arch_string = to_print;
18108 /* Print the cpu name we're tuning for in the comments; it might be
18109 useful to readers of the generated asm. Do it only when it changes
18110 from function to function and verbose assembly is requested. */
18111 const struct processor *this_tune
18112 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
18114 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
18116 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
18117 this_tune->name);
18118 aarch64_last_printed_tune_string = this_tune->name;
18121 aarch64_asm_output_variant_pcs (stream, fndecl, name);
18123 /* Don't forget the type directive for ELF. */
18124 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
18125 ASM_OUTPUT_LABEL (stream, name);
18128 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18130 void
18131 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
18133 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
18134 const char *value = IDENTIFIER_POINTER (target);
18135 aarch64_asm_output_variant_pcs (stream, decl, name);
18136 ASM_OUTPUT_DEF (stream, name, value);
18139 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18140 function symbol references. */
18142 void
18143 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
18145 default_elf_asm_output_external (stream, decl, name);
18146 aarch64_asm_output_variant_pcs (stream, decl, name);
18149 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18150 Used to output the .cfi_b_key_frame directive when signing the current
18151 function with the B key. */
18153 void
18154 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
18156 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
18157 && aarch64_ra_sign_key == AARCH64_KEY_B)
18158 asm_fprintf (f, "\t.cfi_b_key_frame\n");
18161 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18163 static void
18164 aarch64_start_file (void)
18166 struct cl_target_option *default_options
18167 = TREE_TARGET_OPTION (target_option_default_node);
18169 const struct processor *default_arch
18170 = aarch64_get_arch (default_options->x_explicit_arch);
18171 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
18172 std::string extension
18173 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
18174 default_arch->flags);
18176 aarch64_last_printed_arch_string = default_arch->name + extension;
18177 aarch64_last_printed_tune_string = "";
18178 asm_fprintf (asm_out_file, "\t.arch %s\n",
18179 aarch64_last_printed_arch_string.c_str ());
18181 default_file_start ();
18184 /* Emit load exclusive. */
18186 static void
18187 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
18188 rtx mem, rtx model_rtx)
18190 if (mode == TImode)
18191 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
18192 gen_highpart (DImode, rval),
18193 mem, model_rtx));
18194 else
18195 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
18198 /* Emit store exclusive. */
18200 static void
18201 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
18202 rtx mem, rtx rval, rtx model_rtx)
18204 if (mode == TImode)
18205 emit_insn (gen_aarch64_store_exclusive_pair
18206 (bval, mem, operand_subword (rval, 0, 0, TImode),
18207 operand_subword (rval, 1, 0, TImode), model_rtx));
18208 else
18209 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
18212 /* Mark the previous jump instruction as unlikely. */
18214 static void
18215 aarch64_emit_unlikely_jump (rtx insn)
18217 rtx_insn *jump = emit_jump_insn (insn);
18218 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
18221 /* We store the names of the various atomic helpers in a 5x4 array.
18222 Return the libcall function given MODE, MODEL and NAMES. */
18224 rtx
18225 aarch64_atomic_ool_func (machine_mode mode, rtx model_rtx,
18226 const atomic_ool_names *names)
18228 memmodel model = memmodel_base (INTVAL (model_rtx));
18229 int mode_idx, model_idx;
18231 switch (mode)
18233 case E_QImode:
18234 mode_idx = 0;
18235 break;
18236 case E_HImode:
18237 mode_idx = 1;
18238 break;
18239 case E_SImode:
18240 mode_idx = 2;
18241 break;
18242 case E_DImode:
18243 mode_idx = 3;
18244 break;
18245 case E_TImode:
18246 mode_idx = 4;
18247 break;
18248 default:
18249 gcc_unreachable ();
18252 switch (model)
18254 case MEMMODEL_RELAXED:
18255 model_idx = 0;
18256 break;
18257 case MEMMODEL_CONSUME:
18258 case MEMMODEL_ACQUIRE:
18259 model_idx = 1;
18260 break;
18261 case MEMMODEL_RELEASE:
18262 model_idx = 2;
18263 break;
18264 case MEMMODEL_ACQ_REL:
18265 case MEMMODEL_SEQ_CST:
18266 model_idx = 3;
18267 break;
18268 default:
18269 gcc_unreachable ();
18272 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
18273 VISIBILITY_HIDDEN);
18276 #define DEF0(B, N) \
18277 { "__aarch64_" #B #N "_relax", \
18278 "__aarch64_" #B #N "_acq", \
18279 "__aarch64_" #B #N "_rel", \
18280 "__aarch64_" #B #N "_acq_rel" }
18282 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18283 { NULL, NULL, NULL, NULL }
18284 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18286 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
18287 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
18288 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
18289 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
18290 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
18291 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
18293 #undef DEF0
18294 #undef DEF4
18295 #undef DEF5
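/* For example, given the tables above, aarch64_atomic_ool_func selects
   mode_idx 2 for E_SImode and model_idx 1 for MEMMODEL_ACQUIRE, so a 32-bit
   acquire compare-and-swap resolves to the "__aarch64_cas4_acq" helper
   (the size-4 entry produced by DEF0 (cas, 4) inside DEF5 (cas)).  */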
18297 /* Expand a compare and swap pattern. */
18299 void
18300 aarch64_expand_compare_and_swap (rtx operands[])
18302 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
18303 machine_mode mode, r_mode;
18305 bval = operands[0];
18306 rval = operands[1];
18307 mem = operands[2];
18308 oldval = operands[3];
18309 newval = operands[4];
18310 is_weak = operands[5];
18311 mod_s = operands[6];
18312 mod_f = operands[7];
18313 mode = GET_MODE (mem);
18315 /* Normally the succ memory model must be stronger than fail, but in the
18316 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18317 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18318 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
18319 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
18320 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
18322 r_mode = mode;
18323 if (mode == QImode || mode == HImode)
18325 r_mode = SImode;
18326 rval = gen_reg_rtx (r_mode);
18329 if (TARGET_LSE)
18331 /* The CAS insn requires oldval and rval to overlap, but we need to
18332 have a copy of oldval saved across the operation to tell if
18333 the operation is successful. */
18334 if (reg_overlap_mentioned_p (rval, oldval))
18335 rval = copy_to_mode_reg (r_mode, oldval);
18336 else
18337 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
18339 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
18340 newval, mod_s));
18341 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18343 else if (TARGET_OUTLINE_ATOMICS)
18345 /* Oldval must satisfy compare afterward. */
18346 if (!aarch64_plus_operand (oldval, mode))
18347 oldval = force_reg (mode, oldval);
18348 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
18349 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
18350 oldval, mode, newval, mode,
18351 XEXP (mem, 0), Pmode);
18352 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18354 else
18356 /* The oldval predicate varies by mode. Test it and force to reg. */
18357 insn_code code = code_for_aarch64_compare_and_swap (mode);
18358 if (!insn_data[code].operand[2].predicate (oldval, mode))
18359 oldval = force_reg (mode, oldval);
18361 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
18362 is_weak, mod_s, mod_f));
18363 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
18366 if (r_mode != mode)
18367 rval = gen_lowpart (mode, rval);
18368 emit_move_insn (operands[1], rval);
18370 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
18371 emit_insn (gen_rtx_SET (bval, x));
18374 /* Emit a barrier appropriate for memory model MODEL at the end of a
18375 sequence implementing an atomic operation. */
18377 static void
18378 aarch64_emit_post_barrier (enum memmodel model)
18380 const enum memmodel base_model = memmodel_base (model);
18382 if (is_mm_sync (model)
18383 && (base_model == MEMMODEL_ACQUIRE
18384 || base_model == MEMMODEL_ACQ_REL
18385 || base_model == MEMMODEL_SEQ_CST))
18387 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
18391 /* Split a compare and swap pattern. */
18393 void
18394 aarch64_split_compare_and_swap (rtx operands[])
18396 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18397 gcc_assert (epilogue_completed);
18399 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
18400 machine_mode mode;
18401 bool is_weak;
18402 rtx_code_label *label1, *label2;
18403 enum memmodel model;
18405 rval = operands[0];
18406 mem = operands[1];
18407 oldval = operands[2];
18408 newval = operands[3];
18409 is_weak = (operands[4] != const0_rtx);
18410 model_rtx = operands[5];
18411 scratch = operands[7];
18412 mode = GET_MODE (mem);
18413 model = memmodel_from_int (INTVAL (model_rtx));
18415 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18416 loop:
18417 .label1:
18418 LD[A]XR rval, [mem]
18419 CBNZ rval, .label2
18420 ST[L]XR scratch, newval, [mem]
18421 CBNZ scratch, .label1
18422 .label2:
18423 CMP rval, 0. */
18424 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
18425 oldval == const0_rtx && mode != TImode);
18427 label1 = NULL;
18428 if (!is_weak)
18430 label1 = gen_label_rtx ();
18431 emit_label (label1);
18433 label2 = gen_label_rtx ();
18435 /* The initial load can be relaxed for a __sync operation since a final
18436 barrier will be emitted to stop code hoisting. */
18437 if (is_mm_sync (model))
18438 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
18439 else
18440 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
18442 if (strong_zero_p)
18443 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
18444 else
18446 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18447 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
18449 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18450 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
18451 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18453 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
18455 if (!is_weak)
18457 if (aarch64_track_speculation)
18459 /* Emit an explicit compare instruction, so that we can correctly
18460 track the condition codes. */
18461 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18462 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18464 else
18465 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
18467 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18468 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
18469 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18471 else
18472 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18474 emit_label (label2);
18476 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
18477 to set the condition flags. If this is not used it will be removed by
18478 later passes. */
18479 if (strong_zero_p)
18480 aarch64_gen_compare_reg (NE, rval, const0_rtx);
18482 /* Emit any final barrier needed for a __sync operation. */
18483 if (is_mm_sync (model))
18484 aarch64_emit_post_barrier (model);
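/* Editor's illustrative aside, not part of aarch64.c: a strong
   compare-and-swap whose expected value is zero -- the shape the tighter
   LD[A]XR/CBNZ loop above is generated for when neither LSE nor the
   outline-atomics helpers are used.  The helper name is made up.  */

static int
try_lock (int *lock)
{
  int expected = 0;
  return __atomic_compare_exchange_n (lock, &expected, 1,
                                      /* weak */ false,
                                      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}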
18487 /* Split an atomic operation. */
18489 void
18490 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
18491 rtx value, rtx model_rtx, rtx cond)
18493 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18494 gcc_assert (epilogue_completed);
18496 machine_mode mode = GET_MODE (mem);
18497 machine_mode wmode = (mode == DImode ? DImode : SImode);
18498 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
18499 const bool is_sync = is_mm_sync (model);
18500 rtx_code_label *label;
18501 rtx x;
18503 /* Split the atomic operation into a sequence. */
18504 label = gen_label_rtx ();
18505 emit_label (label);
18507 if (new_out)
18508 new_out = gen_lowpart (wmode, new_out);
18509 if (old_out)
18510 old_out = gen_lowpart (wmode, old_out);
18511 else
18512 old_out = new_out;
18513 value = simplify_gen_subreg (wmode, value, mode, 0);
18515 /* The initial load can be relaxed for a __sync operation since a final
18516 barrier will be emitted to stop code hoisting. */
18517 if (is_sync)
18518 aarch64_emit_load_exclusive (mode, old_out, mem,
18519 GEN_INT (MEMMODEL_RELAXED));
18520 else
18521 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
18523 switch (code)
18525 case SET:
18526 new_out = value;
18527 break;
18529 case NOT:
18530 x = gen_rtx_AND (wmode, old_out, value);
18531 emit_insn (gen_rtx_SET (new_out, x));
18532 x = gen_rtx_NOT (wmode, new_out);
18533 emit_insn (gen_rtx_SET (new_out, x));
18534 break;
18536 case MINUS:
18537 if (CONST_INT_P (value))
18539 value = GEN_INT (-INTVAL (value));
18540 code = PLUS;
18542 /* Fall through. */
18544 default:
18545 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
18546 emit_insn (gen_rtx_SET (new_out, x));
18547 break;
18550 aarch64_emit_store_exclusive (mode, cond, mem,
18551 gen_lowpart (mode, new_out), model_rtx);
18553 if (aarch64_track_speculation)
18555 /* Emit an explicit compare instruction, so that we can correctly
18556 track the condition codes. */
18557 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
18558 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18560 else
18561 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
18563 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18564 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
18565 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18567 /* Emit any final barrier needed for a __sync operation. */
18568 if (is_sync)
18569 aarch64_emit_post_barrier (model);
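/* Editor's illustrative aside, not part of aarch64.c: an atomic subtraction
   of a constant.  Per the MINUS case above, the splitter rewrites it as an
   addition of the negated constant so the LL/SC loop body can use an
   immediate-form ADD.  The helper name is made up.  */

static long
drain (long *counter)
{
  return __atomic_fetch_sub (counter, 16, __ATOMIC_RELAXED);
}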
18572 static void
18573 aarch64_init_libfuncs (void)
18575 /* Half-precision float operations. The compiler handles all operations
18576 with NULL libfuncs by converting to SFmode. */
18578 /* Conversions. */
18579 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
18580 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
18582 /* Arithmetic. */
18583 set_optab_libfunc (add_optab, HFmode, NULL);
18584 set_optab_libfunc (sdiv_optab, HFmode, NULL);
18585 set_optab_libfunc (smul_optab, HFmode, NULL);
18586 set_optab_libfunc (neg_optab, HFmode, NULL);
18587 set_optab_libfunc (sub_optab, HFmode, NULL);
18589 /* Comparisons. */
18590 set_optab_libfunc (eq_optab, HFmode, NULL);
18591 set_optab_libfunc (ne_optab, HFmode, NULL);
18592 set_optab_libfunc (lt_optab, HFmode, NULL);
18593 set_optab_libfunc (le_optab, HFmode, NULL);
18594 set_optab_libfunc (ge_optab, HFmode, NULL);
18595 set_optab_libfunc (gt_optab, HFmode, NULL);
18596 set_optab_libfunc (unord_optab, HFmode, NULL);
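/* Editor's illustrative aside, not part of aarch64.c: with the libfuncs
   above left NULL, half-precision arithmetic is done by widening to SFmode,
   so without the FP16 extension this addition becomes __gnu_h2f_ieee on each
   operand, a float add, then __gnu_f2h_ieee on the result.  The function
   name is made up; __fp16 is the ACLE half-precision type.  */

__fp16
add_half (__fp16 a, __fp16 b)
{
  return a + b;
}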
18599 /* Target hook for c_mode_for_suffix. */
18600 static machine_mode
18601 aarch64_c_mode_for_suffix (char suffix)
18603 if (suffix == 'q')
18604 return TFmode;
18606 return VOIDmode;
18609 /* We can only represent floating point constants which will fit in
18610 "quarter-precision" values. These values are characterised by
18611 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
18614 (-1)^s * (n/16) * 2^r
18616 Where:
18617 's' is the sign bit.
18618 'n' is an integer in the range 16 <= n <= 31.
18619 'r' is an integer in the range -3 <= r <= 4. */
18621 /* Return true iff X can be represented by a quarter-precision
18622 floating point immediate operand X. Note, we cannot represent 0.0. */
18623 bool
18624 aarch64_float_const_representable_p (rtx x)
18626 /* This represents our current view of how many bits
18627 make up the mantissa. */
18628 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
18629 int exponent;
18630 unsigned HOST_WIDE_INT mantissa, mask;
18631 REAL_VALUE_TYPE r, m;
18632 bool fail;
18634 x = unwrap_const_vec_duplicate (x);
18635 if (!CONST_DOUBLE_P (x))
18636 return false;
18638 if (GET_MODE (x) == VOIDmode
18639 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
18640 return false;
18642 r = *CONST_DOUBLE_REAL_VALUE (x);
18644 /* We cannot represent infinities, NaNs or +/-zero. We won't
18645 know if we have +zero until we analyse the mantissa, but we
18646 can reject the other invalid values. */
18647 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
18648 || REAL_VALUE_MINUS_ZERO (r))
18649 return false;
18651 /* Extract exponent. */
18652 r = real_value_abs (&r);
18653 exponent = REAL_EXP (&r);
18655 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
18656 highest (sign) bit, with a fixed binary point at bit point_pos.
18657 The low element of the wide_int W holds the low part of the mantissa, the high element the high part.
18658 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18659 bits for the mantissa, this can fail (low bits will be lost). */
18660 real_ldexp (&m, &r, point_pos - exponent);
18661 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
18663 /* If the low part of the mantissa has bits set we cannot represent
18664 the value. */
18665 if (w.ulow () != 0)
18666 return false;
18667 /* We have rejected the lower HOST_WIDE_INT, so update our
18668 understanding of how many bits lie in the mantissa and
18669 look only at the high HOST_WIDE_INT. */
18670 mantissa = w.elt (1);
18671 point_pos -= HOST_BITS_PER_WIDE_INT;
18673 /* We can only represent values with a mantissa of the form 1.xxxx. */
18674 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
18675 if ((mantissa & mask) != 0)
18676 return false;
18678 /* Having filtered unrepresentable values, we may now remove all
18679 but the highest 5 bits. */
18680 mantissa >>= point_pos - 5;
18682 /* We cannot represent the value 0.0, so reject it. This is handled
18683 elsewhere. */
18684 if (mantissa == 0)
18685 return false;
18687 /* Then, as bit 4 is always set, we can mask it off, leaving
18688 the mantissa in the range [0, 15]. */
18689 mantissa &= ~(1 << 4);
18690 gcc_assert (mantissa <= 15);
18692 /* GCC internally does not use IEEE754-like encoding (where normalized
18693 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
18694 Our mantissa values are shifted 4 places to the left relative to
18695 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18696 by 5 places to correct for GCC's representation. */
18697 exponent = 5 - exponent;
18699 return (exponent >= 0 && exponent <= 7);
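/* Editor's illustrative aside, not part of aarch64.c: a standalone
   enumeration of the constants the check above accepts, straight from the
   formula (-1)^s * (n/16) * 2^r with 16 <= n <= 31 and -3 <= r <= 4.  It
   prints all 2 * 16 * 8 = 256 values, e.g. 0.25 = (16/16) * 2^-2 and
   31.0 = (31/16) * 2^4.  */

#include <stdio.h>

int
main (void)
{
  int count = 0;
  for (int s = 0; s <= 1; s++)
    for (int n = 16; n <= 31; n++)
      for (int r = -3; r <= 4; r++)
        {
          double v = (s ? -1.0 : 1.0) * ((double) n / 16.0);
          v = r >= 0 ? v * (double) (1 << r) : v / (double) (1 << -r);
          printf ("%.10g\n", v);
          count++;
        }
  printf ("total: %d\n", count);  /* 256 */
  return 0;
}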
18702 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
18703 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
18704 output MOVI/MVNI, ORR or BIC immediate. */
18705 char*
18706 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
18707 enum simd_immediate_check which)
18709 bool is_valid;
18710 static char templ[40];
18711 const char *mnemonic;
18712 const char *shift_op;
18713 unsigned int lane_count = 0;
18714 char element_char;
18716 struct simd_immediate_info info;
18718 /* This will return true to show const_vector is legal for use as either
18719 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18720 It will also update INFO to show how the immediate should be generated.
18721 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18722 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
18723 gcc_assert (is_valid);
18725 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18726 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
18728 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18730 gcc_assert (info.insn == simd_immediate_info::MOV
18731 && info.u.mov.shift == 0);
18732 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18733 move immediate path. */
18734 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18735 info.u.mov.value = GEN_INT (0);
18736 else
18738 const unsigned int buf_size = 20;
18739 char float_buf[buf_size] = {'\0'};
18740 real_to_decimal_for_mode (float_buf,
18741 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18742 buf_size, buf_size, 1, info.elt_mode);
18744 if (lane_count == 1)
18745 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
18746 else
18747 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
18748 lane_count, element_char, float_buf);
18749 return templ;
18753 gcc_assert (CONST_INT_P (info.u.mov.value));
18755 if (which == AARCH64_CHECK_MOV)
18757 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
18758 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
18759 ? "msl" : "lsl");
18760 if (lane_count == 1)
18761 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
18762 mnemonic, UINTVAL (info.u.mov.value));
18763 else if (info.u.mov.shift)
18764 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18765 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
18766 element_char, UINTVAL (info.u.mov.value), shift_op,
18767 info.u.mov.shift);
18768 else
18769 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18770 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
18771 element_char, UINTVAL (info.u.mov.value));
18773 else
18775 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18776 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
18777 if (info.u.mov.shift)
18778 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18779 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
18780 element_char, UINTVAL (info.u.mov.value), "lsl",
18781 info.u.mov.shift);
18782 else
18783 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18784 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
18785 element_char, UINTVAL (info.u.mov.value));
18787 return templ;
18790 char*
18791 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
18794 /* If a floating point number was passed and we desire to use it in an
18795 integer mode do the conversion to integer. */
18796 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
18798 unsigned HOST_WIDE_INT ival;
18799 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
18800 gcc_unreachable ();
18801 immediate = gen_int_mode (ival, mode);
18804 machine_mode vmode;
18805 /* Use a 64-bit container mode for everything except DI/DF mode, where we
18806 use a 128-bit vector mode. */
18807 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
18809 vmode = aarch64_simd_container_mode (mode, width);
18810 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
18811 return aarch64_output_simd_mov_immediate (v_op, width);
18814 /* Return the output string to use for moving immediate CONST_VECTOR
18815 into an SVE register. */
18817 char *
18818 aarch64_output_sve_mov_immediate (rtx const_vector)
18820 static char templ[40];
18821 struct simd_immediate_info info;
18822 char element_char;
18824 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
18825 gcc_assert (is_valid);
18827 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18829 machine_mode vec_mode = GET_MODE (const_vector);
18830 if (aarch64_sve_pred_mode_p (vec_mode))
18832 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
18833 if (info.insn == simd_immediate_info::MOV)
18835 gcc_assert (info.u.mov.value == const0_rtx);
18836 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
18838 else
18840 gcc_assert (info.insn == simd_immediate_info::PTRUE);
18841 unsigned int total_bytes;
18842 if (info.u.pattern == AARCH64_SV_ALL
18843 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
18844 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
18845 total_bytes / GET_MODE_SIZE (info.elt_mode));
18846 else
18847 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
18848 svpattern_token (info.u.pattern));
18850 return buf;
18853 if (info.insn == simd_immediate_info::INDEX)
18855 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
18856 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
18857 element_char, INTVAL (info.u.index.base),
18858 INTVAL (info.u.index.step));
18859 return templ;
18862 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18864 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18865 info.u.mov.value = GEN_INT (0);
18866 else
18868 const int buf_size = 20;
18869 char float_buf[buf_size] = {};
18870 real_to_decimal_for_mode (float_buf,
18871 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18872 buf_size, buf_size, 1, info.elt_mode);
18874 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
18875 element_char, float_buf);
18876 return templ;
18880 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
18881 element_char, INTVAL (info.u.mov.value));
18882 return templ;
18885 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
18886 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
18887 pattern. */
18889 char *
18890 aarch64_output_sve_ptrues (rtx const_unspec)
18892 static char templ[40];
18894 struct simd_immediate_info info;
18895 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
18896 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
18898 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18899 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
18900 svpattern_token (info.u.pattern));
18901 return templ;
18904 /* Split operands into moves from op[1] + op[2] into op[0]. */
18906 void
18907 aarch64_split_combinev16qi (rtx operands[3])
18909 unsigned int dest = REGNO (operands[0]);
18910 unsigned int src1 = REGNO (operands[1]);
18911 unsigned int src2 = REGNO (operands[2]);
18912 machine_mode halfmode = GET_MODE (operands[1]);
18913 unsigned int halfregs = REG_NREGS (operands[1]);
18914 rtx destlo, desthi;
18916 gcc_assert (halfmode == V16QImode);
18918 if (src1 == dest && src2 == dest + halfregs)
18920 /* No-op move. Can't split to nothing; emit something. */
18921 emit_note (NOTE_INSN_DELETED);
18922 return;
18925 /* Preserve register attributes for variable tracking. */
18926 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
18927 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
18928 GET_MODE_SIZE (halfmode));
18930 /* Special case of reversed high/low parts. */
18931 if (reg_overlap_mentioned_p (operands[2], destlo)
18932 && reg_overlap_mentioned_p (operands[1], desthi))
18934 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18935 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
18936 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18938 else if (!reg_overlap_mentioned_p (operands[2], destlo))
18940 /* Try to avoid unnecessary moves if part of the result
18941 is in the right place already. */
18942 if (src1 != dest)
18943 emit_move_insn (destlo, operands[1]);
18944 if (src2 != dest + halfregs)
18945 emit_move_insn (desthi, operands[2]);
18947 else
18949 if (src2 != dest + halfregs)
18950 emit_move_insn (desthi, operands[2]);
18951 if (src1 != dest)
18952 emit_move_insn (destlo, operands[1]);
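/* Editor's illustrative aside, not part of aarch64.c: the reversed-halves
   case above swaps two registers in place with three XORs, avoiding a
   scratch register.  The same identity on scalars (the two objects must be
   distinct, just as the overlap checks above guarantee):  */

static void
xor_swap (unsigned *a, unsigned *b)
{
  *a ^= *b;  /* a holds a ^ b */
  *b ^= *a;  /* b holds b ^ (a ^ b) == original a */
  *a ^= *b;  /* a holds (a ^ b) ^ a == original b */
}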
18956 /* vec_perm support. */
18958 struct expand_vec_perm_d
18960 rtx target, op0, op1;
18961 vec_perm_indices perm;
18962 machine_mode vmode;
18963 unsigned int vec_flags;
18964 bool one_vector_p;
18965 bool testing_p;
18968 /* Generate a variable permutation. */
18970 static void
18971 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
18973 machine_mode vmode = GET_MODE (target);
18974 bool one_vector_p = rtx_equal_p (op0, op1);
18976 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
18977 gcc_checking_assert (GET_MODE (op0) == vmode);
18978 gcc_checking_assert (GET_MODE (op1) == vmode);
18979 gcc_checking_assert (GET_MODE (sel) == vmode);
18980 gcc_checking_assert (TARGET_SIMD);
18982 if (one_vector_p)
18984 if (vmode == V8QImode)
18986 /* Expand the argument to a V16QI mode by duplicating it. */
18987 rtx pair = gen_reg_rtx (V16QImode);
18988 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
18989 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18991 else
18993 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
18996 else
18998 rtx pair;
19000 if (vmode == V8QImode)
19002 pair = gen_reg_rtx (V16QImode);
19003 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
19004 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19006 else
19008 pair = gen_reg_rtx (OImode);
19009 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
19010 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
19015 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19016 NELT is the number of elements in the vector. */
19018 void
19019 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
19020 unsigned int nelt)
19022 machine_mode vmode = GET_MODE (target);
19023 bool one_vector_p = rtx_equal_p (op0, op1);
19024 rtx mask;
19026 /* The TBL instruction does not use a modulo index, so we must take care
19027 of that ourselves. */
19028 mask = aarch64_simd_gen_const_vector_dup (vmode,
19029 one_vector_p ? nelt - 1 : 2 * nelt - 1);
19030 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19032 /* For big-endian, we also need to reverse the index within the vector
19033 (but not which vector). */
19034 if (BYTES_BIG_ENDIAN)
19036 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19037 if (!one_vector_p)
19038 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19039 sel = expand_simple_binop (vmode, XOR, sel, mask,
19040 NULL, 0, OPTAB_LIB_WIDEN);
19042 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
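/* Editor's illustrative aside, not part of aarch64.c: a scalar model of the
   index fixups above for a two-input permute of NELT elements (NELT a power
   of two).  TBL does not wrap out-of-range indices, so they are reduced with
   an AND; on big-endian the lane order within each input is then flipped
   with an XOR.  The helper name is made up.  */

static unsigned
fixup_index (unsigned idx, unsigned nelt, int big_endian)
{
  idx &= 2 * nelt - 1;   /* emulate vec_perm's modulo semantics */
  if (big_endian)
    idx ^= nelt - 1;     /* reverse the index within each input vector */
  return idx;
}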
19045 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19047 static void
19048 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19050 emit_insn (gen_rtx_SET (target,
19051 gen_rtx_UNSPEC (GET_MODE (target),
19052 gen_rtvec (2, op0, op1), code)));
19055 /* Expand an SVE vec_perm with the given operands. */
19057 void
19058 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19060 machine_mode data_mode = GET_MODE (target);
19061 machine_mode sel_mode = GET_MODE (sel);
19062 /* Enforced by the pattern condition. */
19063 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19065 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19066 size of the two value vectors, i.e. the upper bits of the indices
19067 are effectively ignored. SVE TBL instead produces 0 for any
19068 out-of-range indices, so we need to modulo all the vec_perm indices
19069 to ensure they are all in range. */
19070 rtx sel_reg = force_reg (sel_mode, sel);
19072 /* Check if the sel only references the first values vector. */
19073 if (GET_CODE (sel) == CONST_VECTOR
19074 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
19076 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
19077 return;
19080 /* Check if the two values vectors are the same. */
19081 if (rtx_equal_p (op0, op1))
19083 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
19084 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19085 NULL, 0, OPTAB_DIRECT);
19086 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
19087 return;
19090 /* Run TBL on each value vector and combine the results. */
19092 rtx res0 = gen_reg_rtx (data_mode);
19093 rtx res1 = gen_reg_rtx (data_mode);
19094 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
19095 if (GET_CODE (sel) != CONST_VECTOR
19096 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
19098 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
19099 2 * nunits - 1);
19100 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19101 NULL, 0, OPTAB_DIRECT);
19103 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
19104 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
19105 NULL, 0, OPTAB_DIRECT);
19106 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
19107 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
19108 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
19109 else
19110 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
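/* Editor's illustrative aside, not part of aarch64.c: a scalar model of the
   general two-input SVE permute above.  SVE TBL yields zero for an
   out-of-range index, so looking up the second input with (index - nunits)
   and OR-ing the two results implements vec_perm.  Helper names are made
   up.  */

static unsigned char
sve_tbl_model (const unsigned char *in, unsigned nunits, unsigned idx)
{
  return idx < nunits ? in[idx] : 0;   /* out-of-range index => 0 */
}

static unsigned char
two_input_perm (const unsigned char *op0, const unsigned char *op1,
                unsigned nunits, unsigned idx)
{
  idx &= 2 * nunits - 1;               /* wrap as vec_perm requires */
  return sve_tbl_model (op0, nunits, idx)
         | sve_tbl_model (op1, nunits, idx - nunits);
}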
19113 /* Recognize patterns suitable for the TRN instructions. */
19114 static bool
19115 aarch64_evpc_trn (struct expand_vec_perm_d *d)
19117 HOST_WIDE_INT odd;
19118 poly_uint64 nelt = d->perm.length ();
19119 rtx out, in0, in1, x;
19120 machine_mode vmode = d->vmode;
19122 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19123 return false;
19125 /* Note that these are little-endian tests.
19126 We correct for big-endian later. */
19127 if (!d->perm[0].is_constant (&odd)
19128 || (odd != 0 && odd != 1)
19129 || !d->perm.series_p (0, 2, odd, 2)
19130 || !d->perm.series_p (1, 2, nelt + odd, 2))
19131 return false;
19133 /* Success! */
19134 if (d->testing_p)
19135 return true;
19137 in0 = d->op0;
19138 in1 = d->op1;
19139 /* We don't need a big-endian lane correction for SVE; see the comment
19140 at the head of aarch64-sve.md for details. */
19141 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19143 x = in0, in0 = in1, in1 = x;
19144 odd = !odd;
19146 out = d->target;
19148 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19149 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
19150 return true;
19153 /* Recognize patterns suitable for the UZP instructions. */
19154 static bool
19155 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
19157 HOST_WIDE_INT odd;
19158 rtx out, in0, in1, x;
19159 machine_mode vmode = d->vmode;
19161 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19162 return false;
19164 /* Note that these are little-endian tests.
19165 We correct for big-endian later. */
19166 if (!d->perm[0].is_constant (&odd)
19167 || (odd != 0 && odd != 1)
19168 || !d->perm.series_p (0, 1, odd, 2))
19169 return false;
19171 /* Success! */
19172 if (d->testing_p)
19173 return true;
19175 in0 = d->op0;
19176 in1 = d->op1;
19177 /* We don't need a big-endian lane correction for SVE; see the comment
19178 at the head of aarch64-sve.md for details. */
19179 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19181 x = in0, in0 = in1, in1 = x;
19182 odd = !odd;
19184 out = d->target;
19186 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19187 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
19188 return true;
19191 /* Recognize patterns suitable for the ZIP instructions. */
19192 static bool
19193 aarch64_evpc_zip (struct expand_vec_perm_d *d)
19195 unsigned int high;
19196 poly_uint64 nelt = d->perm.length ();
19197 rtx out, in0, in1, x;
19198 machine_mode vmode = d->vmode;
19200 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19201 return false;
19203 /* Note that these are little-endian tests.
19204 We correct for big-endian later. */
19205 poly_uint64 first = d->perm[0];
19206 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
19207 || !d->perm.series_p (0, 2, first, 1)
19208 || !d->perm.series_p (1, 2, first + nelt, 1))
19209 return false;
19210 high = maybe_ne (first, 0U);
19212 /* Success! */
19213 if (d->testing_p)
19214 return true;
19216 in0 = d->op0;
19217 in1 = d->op1;
19218 /* We don't need a big-endian lane correction for SVE; see the comment
19219 at the head of aarch64-sve.md for details. */
19220 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19222 x = in0, in0 = in1, in1 = x;
19223 high = !high;
19225 out = d->target;
19227 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19228 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
19229 return true;
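/* Editor's illustrative aside, not part of aarch64.c: the little-endian
   index sequences the TRN, UZP and ZIP recognisers above accept, written out
   for 4-element inputs (indices 4..7 select from the second input).  */

static const unsigned trn1[4] = { 0, 4, 2, 6 };  /* perm[0] == 0, step 2 */
static const unsigned trn2[4] = { 1, 5, 3, 7 };  /* perm[0] == 1, step 2 */
static const unsigned uzp1[4] = { 0, 2, 4, 6 };  /* even elements of both */
static const unsigned uzp2[4] = { 1, 3, 5, 7 };  /* odd elements of both */
static const unsigned zip1[4] = { 0, 4, 1, 5 };  /* interleave low halves */
static const unsigned zip2[4] = { 2, 6, 3, 7 };  /* interleave high halves */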
19232 /* Recognize patterns for the EXT insn. */
19234 static bool
19235 aarch64_evpc_ext (struct expand_vec_perm_d *d)
19237 HOST_WIDE_INT location;
19238 rtx offset;
19240 /* The first element always refers to the first vector.
19241 Check if the extracted indices are increasing by one. */
19242 if (d->vec_flags == VEC_SVE_PRED
19243 || !d->perm[0].is_constant (&location)
19244 || !d->perm.series_p (0, 1, location, 1))
19245 return false;
19247 /* Success! */
19248 if (d->testing_p)
19249 return true;
19251 /* The case where (location == 0) is a no-op for both big- and little-endian,
19252 and is removed by the mid-end at optimization levels -O1 and higher.
19254 We don't need a big-endian lane correction for SVE; see the comment
19255 at the head of aarch64-sve.md for details. */
19256 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
19258 /* After setup, we want the high elements of the first vector (stored
19259 at the LSB end of the register), and the low elements of the second
19260 vector (stored at the MSB end of the register). So swap. */
19261 std::swap (d->op0, d->op1);
19262 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19263 to_constant () is safe since this is restricted to Advanced SIMD
19264 vectors. */
19265 location = d->perm.length ().to_constant () - location;
19268 offset = GEN_INT (location);
19269 emit_set_insn (d->target,
19270 gen_rtx_UNSPEC (d->vmode,
19271 gen_rtvec (3, d->op0, d->op1, offset),
19272 UNSPEC_EXT));
19273 return true;
19276 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19277 within each 64-bit, 32-bit or 16-bit granule. */
19279 static bool
19280 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
19282 HOST_WIDE_INT diff;
19283 unsigned int i, size, unspec;
19284 machine_mode pred_mode;
19286 if (d->vec_flags == VEC_SVE_PRED
19287 || !d->one_vector_p
19288 || !d->perm[0].is_constant (&diff))
19289 return false;
19291 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
19292 if (size == 8)
19294 unspec = UNSPEC_REV64;
19295 pred_mode = VNx2BImode;
19297 else if (size == 4)
19299 unspec = UNSPEC_REV32;
19300 pred_mode = VNx4BImode;
19302 else if (size == 2)
19304 unspec = UNSPEC_REV16;
19305 pred_mode = VNx8BImode;
19307 else
19308 return false;
19310 unsigned int step = diff + 1;
19311 for (i = 0; i < step; ++i)
19312 if (!d->perm.series_p (i, step, diff - i, step))
19313 return false;
19315 /* Success! */
19316 if (d->testing_p)
19317 return true;
19319 if (d->vec_flags == VEC_SVE_DATA)
19321 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
19322 rtx target = gen_reg_rtx (int_mode);
19323 if (BYTES_BIG_ENDIAN)
19324 /* The act of taking a subreg between INT_MODE and d->vmode
19325 is itself a reversing operation on big-endian targets;
19326 see the comment at the head of aarch64-sve.md for details.
19327 First reinterpret OP0 as INT_MODE without using a subreg
19328 and without changing the contents. */
19329 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
19330 else
19332 /* For SVE we use REV[BHW] unspecs derived from the element size
19333 of d->vmode and vector modes whose elements have SIZE bytes.
19334 This ensures that the vector modes match the predicate modes. */
19335 int unspec = aarch64_sve_rev_unspec (d->vmode);
19336 rtx pred = aarch64_ptrue_reg (pred_mode);
19337 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
19338 gen_lowpart (int_mode, d->op0)));
19340 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19341 return true;
19343 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
19344 emit_set_insn (d->target, src);
19345 return true;
19348 /* Recognize patterns for the REV insn, which reverses elements within
19349 a full vector. */
19351 static bool
19352 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
19354 poly_uint64 nelt = d->perm.length ();
19356 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
19357 return false;
19359 if (!d->perm.series_p (0, 1, nelt - 1, -1))
19360 return false;
19362 /* Success! */
19363 if (d->testing_p)
19364 return true;
19366 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
19367 emit_set_insn (d->target, src);
19368 return true;
19371 static bool
19372 aarch64_evpc_dup (struct expand_vec_perm_d *d)
19374 rtx out = d->target;
19375 rtx in0;
19376 HOST_WIDE_INT elt;
19377 machine_mode vmode = d->vmode;
19378 rtx lane;
19380 if (d->vec_flags == VEC_SVE_PRED
19381 || d->perm.encoding ().encoded_nelts () != 1
19382 || !d->perm[0].is_constant (&elt))
19383 return false;
19385 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
19386 return false;
19388 /* Success! */
19389 if (d->testing_p)
19390 return true;
19392 /* The generic preparation in aarch64_expand_vec_perm_const_1
19393 swaps the operand order and the permute indices if it finds
19394 d->perm[0] to be in the second operand. Thus, we can always
19395 use d->op0 and need not do any extra arithmetic to get the
19396 correct lane number. */
19397 in0 = d->op0;
19398 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
19400 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
19401 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
19402 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
19403 return true;
19406 static bool
19407 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
19409 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
19410 machine_mode vmode = d->vmode;
19412 /* Make sure that the indices are constant. */
19413 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
19414 for (unsigned int i = 0; i < encoded_nelts; ++i)
19415 if (!d->perm[i].is_constant ())
19416 return false;
19418 if (d->testing_p)
19419 return true;
19421 /* Generic code will try constant permutation twice: once with the
19422 original mode and again with the elements lowered to QImode.
19423 So wait and don't do the selector expansion ourselves. */
19424 if (vmode != V8QImode && vmode != V16QImode)
19425 return false;
19427 /* to_constant is safe since this routine is specific to Advanced SIMD
19428 vectors. */
19429 unsigned int nelt = d->perm.length ().to_constant ();
19430 for (unsigned int i = 0; i < nelt; ++i)
19431 /* If big-endian and two vectors we end up with a weird mixed-endian
19432 mode on NEON. Reverse the index within each word but not the word
19433 itself. to_constant is safe because we checked is_constant above. */
19434 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
19435 ? d->perm[i].to_constant () ^ (nelt - 1)
19436 : d->perm[i].to_constant ());
19438 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19439 sel = force_reg (vmode, sel);
19441 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
19442 return true;
19445 /* Try to implement D using an SVE TBL instruction. */
19447 static bool
19448 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
19450 unsigned HOST_WIDE_INT nelt;
19452 /* Permuting two variable-length vectors could overflow the
19453 index range. */
19454 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
19455 return false;
19457 if (d->testing_p)
19458 return true;
19460 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
19461 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
19462 if (d->one_vector_p)
19463 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
19464 else
19465 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
19466 return true;
19469 /* Try to implement D using SVE SEL instruction. */
19471 static bool
19472 aarch64_evpc_sel (struct expand_vec_perm_d *d)
19474 machine_mode vmode = d->vmode;
19475 int unit_size = GET_MODE_UNIT_SIZE (vmode);
19477 if (d->vec_flags != VEC_SVE_DATA
19478 || unit_size > 8)
19479 return false;
19481 int n_patterns = d->perm.encoding ().npatterns ();
19482 poly_int64 vec_len = d->perm.length ();
19484 for (int i = 0; i < n_patterns; ++i)
19485 if (!known_eq (d->perm[i], i)
19486 && !known_eq (d->perm[i], vec_len + i))
19487 return false;
19489 for (int i = n_patterns; i < n_patterns * 2; i++)
19490 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
19491 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
19492 return false;
19494 if (d->testing_p)
19495 return true;
19497 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
19499 /* Build a predicate that is true when op0 elements should be used. */
19500 rtx_vector_builder builder (pred_mode, n_patterns, 2);
19501 for (int i = 0; i < n_patterns * 2; i++)
19503 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
19504 : CONST0_RTX (BImode);
19505 builder.quick_push (elem);
19508 rtx const_vec = builder.build ();
19509 rtx pred = force_reg (pred_mode, const_vec);
19510 /* TARGET = PRED ? OP0 : OP1. */
19511 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
19512 return true;
19515 static bool
19516 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19518 /* The pattern matching functions above are written to look for a small
19519 number to begin the sequence (0, 1, N/2). If we begin with an index
19520 from the second operand, we can swap the operands. */
19521 poly_int64 nelt = d->perm.length ();
19522 if (known_ge (d->perm[0], nelt))
19524 d->perm.rotate_inputs (1);
19525 std::swap (d->op0, d->op1);
19528 if ((d->vec_flags == VEC_ADVSIMD
19529 || d->vec_flags == VEC_SVE_DATA
19530 || d->vec_flags == VEC_SVE_PRED)
19531 && known_gt (nelt, 1))
19533 if (aarch64_evpc_rev_local (d))
19534 return true;
19535 else if (aarch64_evpc_rev_global (d))
19536 return true;
19537 else if (aarch64_evpc_ext (d))
19538 return true;
19539 else if (aarch64_evpc_dup (d))
19540 return true;
19541 else if (aarch64_evpc_zip (d))
19542 return true;
19543 else if (aarch64_evpc_uzp (d))
19544 return true;
19545 else if (aarch64_evpc_trn (d))
19546 return true;
19547 else if (aarch64_evpc_sel (d))
19548 return true;
19549 if (d->vec_flags == VEC_SVE_DATA)
19550 return aarch64_evpc_sve_tbl (d);
19551 else if (d->vec_flags == VEC_ADVSIMD)
19552 return aarch64_evpc_tbl (d);
19554 return false;
19557 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19559 static bool
19560 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19561 rtx op1, const vec_perm_indices &sel)
19563 struct expand_vec_perm_d d;
19565 /* Check whether the mask can be applied to a single vector. */
19566 if (sel.ninputs () == 1
19567 || (op0 && rtx_equal_p (op0, op1)))
19568 d.one_vector_p = true;
19569 else if (sel.all_from_input_p (0))
19571 d.one_vector_p = true;
19572 op1 = op0;
19574 else if (sel.all_from_input_p (1))
19576 d.one_vector_p = true;
19577 op0 = op1;
19579 else
19580 d.one_vector_p = false;
19582 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
19583 sel.nelts_per_input ());
19584 d.vmode = vmode;
19585 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
19586 d.target = target;
19587 d.op0 = op0;
19588 d.op1 = op1;
19589 d.testing_p = !target;
19591 if (!d.testing_p)
19592 return aarch64_expand_vec_perm_const_1 (&d);
19594 rtx_insn *last = get_last_insn ();
19595 bool ret = aarch64_expand_vec_perm_const_1 (&d);
19596 gcc_assert (last == get_last_insn ());
19598 return ret;
19601 /* Generate a byte permute mask for a register of mode MODE,
19602 which has NUNITS units. */
19604 rtx
19605 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
19607 /* We have to reverse each vector because we don't have
19608 a permuted load that can reverse-load according to ABI rules. */
19609 rtx mask;
19610 rtvec v = rtvec_alloc (16);
19611 unsigned int i, j;
19612 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
19614 gcc_assert (BYTES_BIG_ENDIAN);
19615 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
19617 for (i = 0; i < nunits; i++)
19618 for (j = 0; j < usize; j++)
19619 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
19620 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
19621 return force_reg (V16QImode, mask);
19624 /* Expand an SVE integer comparison using the SVE equivalent of:
19626 (set TARGET (CODE OP0 OP1)). */
19628 void
19629 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
19631 machine_mode pred_mode = GET_MODE (target);
19632 machine_mode data_mode = GET_MODE (op0);
19633 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
19634 op0, op1);
19635 if (!rtx_equal_p (target, res))
19636 emit_move_insn (target, res);
19639 /* Return the UNSPEC_COND_* code for comparison CODE. */
19641 static unsigned int
19642 aarch64_unspec_cond_code (rtx_code code)
19644 switch (code)
19646 case NE:
19647 return UNSPEC_COND_FCMNE;
19648 case EQ:
19649 return UNSPEC_COND_FCMEQ;
19650 case LT:
19651 return UNSPEC_COND_FCMLT;
19652 case GT:
19653 return UNSPEC_COND_FCMGT;
19654 case LE:
19655 return UNSPEC_COND_FCMLE;
19656 case GE:
19657 return UNSPEC_COND_FCMGE;
19658 case UNORDERED:
19659 return UNSPEC_COND_FCMUO;
19660 default:
19661 gcc_unreachable ();
19665 /* Emit:
19667 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19669 where <X> is the operation associated with comparison CODE.
19670 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19672 static void
19673 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
19674 bool known_ptrue_p, rtx op0, rtx op1)
19676 rtx flag = gen_int_mode (known_ptrue_p, SImode);
19677 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
19678 gen_rtvec (4, pred, flag, op0, op1),
19679 aarch64_unspec_cond_code (code));
19680 emit_set_insn (target, unspec);
19683 /* Emit the SVE equivalent of:
19685 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19686 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19687 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19689 where <Xi> is the operation associated with comparison CODEi.
19690 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19692 static void
19693 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
19694 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
19696 machine_mode pred_mode = GET_MODE (pred);
19697 rtx tmp1 = gen_reg_rtx (pred_mode);
19698 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
19699 rtx tmp2 = gen_reg_rtx (pred_mode);
19700 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
19701 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
19704 /* Emit the SVE equivalent of:
19706 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19707 (set TARGET (not TMP))
19709 where <X> is the operation associated with comparison CODE.
19710 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19712 static void
19713 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
19714 bool known_ptrue_p, rtx op0, rtx op1)
19716 machine_mode pred_mode = GET_MODE (pred);
19717 rtx tmp = gen_reg_rtx (pred_mode);
19718 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
19719 aarch64_emit_unop (target, one_cmpl_optab, tmp);
19722 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19724 (set TARGET (CODE OP0 OP1))
19726 If CAN_INVERT_P is true, the caller can also handle inverted results;
19727 return true if the result is in fact inverted. */
19729 bool
19730 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
19731 rtx op0, rtx op1, bool can_invert_p)
19733 machine_mode pred_mode = GET_MODE (target);
19734 machine_mode data_mode = GET_MODE (op0);
19736 rtx ptrue = aarch64_ptrue_reg (pred_mode);
19737 switch (code)
19739 case UNORDERED:
19740 /* UNORDERED has no immediate form. */
19741 op1 = force_reg (data_mode, op1);
19742 /* fall through */
19743 case LT:
19744 case LE:
19745 case GT:
19746 case GE:
19747 case EQ:
19748 case NE:
19750 /* There is native support for the comparison. */
19751 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19752 return false;
19755 case LTGT:
19756 /* This is a trapping operation (LT or GT). */
19757 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
19758 return false;
19760 case UNEQ:
19761 if (!flag_trapping_math)
19763 /* This would trap for signaling NaNs. */
19764 op1 = force_reg (data_mode, op1);
19765 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
19766 ptrue, true, op0, op1);
19767 return false;
19769 /* fall through */
19770 case UNLT:
19771 case UNLE:
19772 case UNGT:
19773 case UNGE:
19774 if (flag_trapping_math)
19776 /* Work out which elements are ordered. */
19777 rtx ordered = gen_reg_rtx (pred_mode);
19778 op1 = force_reg (data_mode, op1);
19779 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
19780 ptrue, true, op0, op1);
19782 /* Test the opposite condition for the ordered elements,
19783 then invert the result. */
19784 if (code == UNEQ)
19785 code = NE;
19786 else
19787 code = reverse_condition_maybe_unordered (code);
19788 if (can_invert_p)
19790 aarch64_emit_sve_fp_cond (target, code,
19791 ordered, false, op0, op1);
19792 return true;
19794 aarch64_emit_sve_invert_fp_cond (target, code,
19795 ordered, false, op0, op1);
19796 return false;
19798 break;
19800 case ORDERED:
19801 /* ORDERED has no immediate form. */
19802 op1 = force_reg (data_mode, op1);
19803 break;
19805 default:
19806 gcc_unreachable ();
19809 /* There is native support for the inverse comparison. */
19810 code = reverse_condition_maybe_unordered (code);
19811 if (can_invert_p)
19813 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19814 return true;
19816 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
19817 return false;
19820 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19821 of the data being selected and CMP_MODE is the mode of the values being
19822 compared. */
19824 void
19825 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
19826 rtx *ops)
19828 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
19829 rtx pred = gen_reg_rtx (pred_mode);
19830 if (FLOAT_MODE_P (cmp_mode))
19832 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
19833 ops[4], ops[5], true))
19834 std::swap (ops[1], ops[2]);
19836 else
19837 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
19839 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
19840 ops[1] = force_reg (data_mode, ops[1]);
19841 /* The "false" value can only be zero if the "true" value is a constant. */
19842 if (register_operand (ops[1], data_mode)
19843 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
19844 ops[2] = force_reg (data_mode, ops[2]);
19846 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
19847 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
19850 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
19851 true. However, due to issues with register allocation it is preferable
19852 to avoid tying integer scalar and FP scalar modes. Executing integer
19853 operations in general registers is better than treating them as scalar
19854 vector operations. This reduces latency and avoids redundant int<->FP
19855 moves. So tie modes if they are either the same class, or vector modes
19856 with other vector modes, vector structs or any scalar mode. */
19858 static bool
19859 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
19861 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
19862 return true;
19864 /* We specifically want to allow elements of "structure" modes to
19865 be tieable to the structure. This more general condition allows
19866 other rarer situations too. The reason we don't extend this to
19867 predicate modes is that there are no predicate structure modes
19868 nor any specific instructions for extracting part of a predicate
19869 register. */
19870 if (aarch64_vector_data_mode_p (mode1)
19871 && aarch64_vector_data_mode_p (mode2))
19872 return true;
19874 /* Also allow any scalar modes with vectors. */
19875 if (aarch64_vector_mode_supported_p (mode1)
19876 || aarch64_vector_mode_supported_p (mode2))
19877 return true;
19879 return false;
19882 /* Return a new RTX holding the result of moving POINTER forward by
19883 AMOUNT bytes. */
19885 static rtx
19886 aarch64_move_pointer (rtx pointer, poly_int64 amount)
19888 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
19890 return adjust_automodify_address (pointer, GET_MODE (pointer),
19891 next, amount);
19894 /* Return a new RTX holding the result of moving POINTER forward by the
19895 size of the mode it points to. */
19897 static rtx
19898 aarch64_progress_pointer (rtx pointer)
19900 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
19903 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
19904 MODE bytes. */
19906 static void
19907 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
19908 machine_mode mode)
19910 rtx reg = gen_reg_rtx (mode);
19912 /* "Cast" the pointers to the correct mode. */
19913 *src = adjust_address (*src, mode, 0);
19914 *dst = adjust_address (*dst, mode, 0);
19915 /* Emit the memcpy. */
19916 emit_move_insn (reg, *src);
19917 emit_move_insn (*dst, reg);
19918 /* Move the pointers forward. */
19919 *src = aarch64_progress_pointer (*src);
19920 *dst = aarch64_progress_pointer (*dst);
19923 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
19924 we succeed, otherwise return false. */
19926 bool
19927 aarch64_expand_cpymem (rtx *operands)
19929 int n, mode_bits;
19930 rtx dst = operands[0];
19931 rtx src = operands[1];
19932 rtx base;
19933 machine_mode cur_mode = BLKmode, next_mode;
19934 bool speed_p = !optimize_function_for_size_p (cfun);
19936 /* When optimizing for size, give a better estimate of the length of a
19937 memcpy call, but use the default otherwise. Moves larger than 8 bytes
19938 will always require an even number of instructions to do, and each
19939 operation requires both a load and a store, so divide the max number by 2. */
19940 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
19942 /* We can't do anything smart if the amount to copy is not constant. */
19943 if (!CONST_INT_P (operands[2]))
19944 return false;
19946 n = INTVAL (operands[2]);
19948 /* Try to keep the number of instructions low. For all cases we will do at
19949 most two moves for the residual amount, since we'll always overlap the
19950 remainder. */
19951 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
19952 return false;
19954 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
19955 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
19957 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
19958 src = adjust_automodify_address (src, VOIDmode, base, 0);
19960 /* Convert n to bits to make the rest of the code simpler. */
19961 n = n * BITS_PER_UNIT;
19963 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
19964 larger than TImode, but we should not use them for loads/stores here. */
19965 const int copy_limit = GET_MODE_BITSIZE (TImode);
19967 while (n > 0)
19969 /* Find the largest mode in which to do the copy without over-reading
19970 or over-writing. */
19971 opt_scalar_int_mode mode_iter;
19972 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
19973 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
19974 cur_mode = mode_iter.require ();
19976 gcc_assert (cur_mode != BLKmode);
19978 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
19979 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
19981 n -= mode_bits;
19983 /* Do certain trailing copies as overlapping if it's going to be
19984 cheaper, i.e. fewer instructions. For instance, for a 15-byte
19985 copy it's more efficient to do two overlapping 8-byte copies than
19986 copies of 8 + 4 + 2 + 1 bytes. */
19987 if (n > 0 && n <= 8 * BITS_PER_UNIT)
19989 next_mode = smallest_mode_for_size (n, MODE_INT);
19990 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
19991 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
19992 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
19993 n = n_bits;
19997 return true;
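/* Editor's illustrative aside, not part of aarch64.c: a scalar model of the
   overlapping trailing copy above for a known 15-byte length -- one 8-byte
   chunk, then a second 8-byte chunk positioned to end exactly at byte 15,
   instead of descending 8/4/2/1 chunks.  The helper name is made up.  */

#include <string.h>

static void
copy15 (char *dst, const char *src)
{
  memcpy (dst, src, 8);          /* bytes 0..7 */
  memcpy (dst + 7, src + 7, 8);  /* bytes 7..14, overlapping one byte */
}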
20000 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
20001 SImode stores. Handle the case when the constant has identical
20002 bottom and top halves. This is beneficial when the two stores can be
20003 merged into an STP and we avoid synthesising potentially expensive
20004 immediates twice. Return true if such a split is possible. */
20006 bool
20007 aarch64_split_dimode_const_store (rtx dst, rtx src)
20009 rtx lo = gen_lowpart (SImode, src);
20010 rtx hi = gen_highpart_mode (SImode, DImode, src);
20012 bool size_p = optimize_function_for_size_p (cfun);
20014 if (!rtx_equal_p (lo, hi))
20015 return false;
20017 unsigned int orig_cost
20018 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
20019 unsigned int lo_cost
20020 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
20022 /* We want to transform:
20023 MOV x1, 49370
20024 MOVK x1, 0x140, lsl 16
20025 MOVK x1, 0xc0da, lsl 32
20026 MOVK x1, 0x140, lsl 48
20027 STR x1, [x0]
20028 into:
20029 MOV w1, 49370
20030 MOVK w1, 0x140, lsl 16
20031 STP w1, w1, [x0]
20032 So we want to perform this only when we save two instructions
20033 or more. When optimizing for size, however, accept any code size
20034 savings we can. */
20035 if (size_p && orig_cost <= lo_cost)
20036 return false;
20038 if (!size_p
20039 && (orig_cost <= lo_cost + 1))
20040 return false;
20042 rtx mem_lo = adjust_address (dst, SImode, 0);
20043 if (!aarch64_mem_pair_operand (mem_lo, SImode))
20044 return false;
20046 rtx tmp_reg = gen_reg_rtx (SImode);
20047 aarch64_expand_mov_immediate (tmp_reg, lo);
20048 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20049 /* Don't emit an explicit store pair as this may not always be profitable.
20050 Let the sched-fusion logic decide whether to merge them. */
20051 emit_move_insn (mem_lo, tmp_reg);
20052 emit_move_insn (mem_hi, tmp_reg);
20054 return true;
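/* Editor's illustrative aside, not part of aarch64.c: a store the split
   above applies to.  The 64-bit constant has identical 32-bit halves, so it
   can be built once in a W register with MOV/MOVK and stored as an STP of
   that register twice.  The function name is made up.  */

void
store_repeated (unsigned long long *p)
{
  *p = 0x0140c0da0140c0daULL;   /* low half == high half == 0x0140c0da */
}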
20057 /* Generate RTL for a conditional branch with rtx comparison CODE in
20058 mode CC_MODE. The destination of the unlikely conditional branch
20059 is LABEL_REF. */
20061 void
20062 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20063 rtx label_ref)
20065 rtx x;
20066 x = gen_rtx_fmt_ee (code, VOIDmode,
20067 gen_rtx_REG (cc_mode, CC_REGNUM),
20068 const0_rtx);
20070 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20071 gen_rtx_LABEL_REF (VOIDmode, label_ref),
20072 pc_rtx);
20073 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20076 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20078 OP1 represents the TImode destination operand 1
20079 OP2 represents the TImode destination operand 2
20080 LOW_DEST represents the low half (DImode) of TImode operand 0
20081 LOW_IN1 represents the low half (DImode) of TImode operand 1
20082 LOW_IN2 represents the low half (DImode) of TImode operand 2
20083 HIGH_DEST represents the high half (DImode) of TImode operand 0
20084 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20085 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20087 void
20088 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20089 rtx *low_in1, rtx *low_in2,
20090 rtx *high_dest, rtx *high_in1,
20091 rtx *high_in2)
20093 *low_dest = gen_reg_rtx (DImode);
20094 *low_in1 = gen_lowpart (DImode, op1);
20095 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20096 subreg_lowpart_offset (DImode, TImode));
20097 *high_dest = gen_reg_rtx (DImode);
20098 *high_in1 = gen_highpart (DImode, op1);
20099 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20100 subreg_highpart_offset (DImode, TImode));
20103 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20105 This function differs from 'aarch64_addti_scratch_regs' in that
20106 OP1 can be an immediate constant (zero). We must call
20107 subreg_highpart_offset with DImode and TImode arguments, otherwise
20108 VOIDmode will be used for the const_int which generates an internal
20109 error from subreg_size_highpart_offset which does not expect a size of zero.
20111 OP1 represents the TImode destination operand 1
20112 OP2 represents the TImode destination operand 2
20113 LOW_DEST represents the low half (DImode) of TImode operand 0
20114 LOW_IN1 represents the low half (DImode) of TImode operand 1
20115 LOW_IN2 represents the low half (DImode) of TImode operand 2
20116 HIGH_DEST represents the high half (DImode) of TImode operand 0
20117 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20118 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20121 void
20122 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20123 rtx *low_in1, rtx *low_in2,
20124 rtx *high_dest, rtx *high_in1,
20125 rtx *high_in2)
20127 *low_dest = gen_reg_rtx (DImode);
20128 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
20129 subreg_lowpart_offset (DImode, TImode));
20131 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20132 subreg_lowpart_offset (DImode, TImode));
20133 *high_dest = gen_reg_rtx (DImode);
20135 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
20136 subreg_highpart_offset (DImode, TImode));
20137 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20138 subreg_highpart_offset (DImode, TImode));
20141 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20143 OP0 represents the TImode destination operand 0
20144 LOW_DEST represents the low half (DImode) of TImode operand 0
20145 LOW_IN1 represents the low half (DImode) of TImode operand 1
20146 LOW_IN2 represents the low half (DImode) of TImode operand 2
20147 HIGH_DEST represents the high half (DImode) of TImode operand 0
20148 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20149 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20150 UNSIGNED_P is true if the operation is being performed on unsigned
20151 values. */
20152 void
20153 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
20154 rtx low_in2, rtx high_dest, rtx high_in1,
20155 rtx high_in2, bool unsigned_p)
20157 if (low_in2 == const0_rtx)
20159 low_dest = low_in1;
20160 high_in2 = force_reg (DImode, high_in2);
20161 if (unsigned_p)
20162 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
20163 else
20164 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
20166 else
20168 if (CONST_INT_P (low_in2))
20170 high_in2 = force_reg (DImode, high_in2);
20171 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
20172 GEN_INT (-INTVAL (low_in2))));
20174 else
20175 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
20177 if (unsigned_p)
20178 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
20179 else
20180 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
20183 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
20184 emit_move_insn (gen_highpart (DImode, op0), high_dest);
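
/* Illustration only (register choices are hypothetical): a full TImode
   subtraction, with no special-cased zero low part, is expected to lower
   to a borrow chain along the lines of

       subs  x0, x2, x4        // low halves, sets the carry flag
       sbcs  x1, x3, x5        // high halves, consumes carry, sets C/V

   after which the caller of aarch64_expand_subvti tests the carry flag
   (unsigned) or the overflow flag (signed).  */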
20188 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
20190 static unsigned HOST_WIDE_INT
20191 aarch64_asan_shadow_offset (void)
20193 if (TARGET_ILP32)
20194 return (HOST_WIDE_INT_1 << 29);
20195 else
20196 return (HOST_WIDE_INT_1 << 36);
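
/* As a reminder of how this offset is used (the standard AddressSanitizer
   mapping, not anything specific to this hook): shadow_address is
   (address >> 3) + aarch64_asan_shadow_offset (), i.e. + (1 << 36) for
   LP64 and + (1 << 29) for ILP32.  */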
20199 static rtx
20200 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
20201 int code, tree treeop0, tree treeop1)
20203 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20204 rtx op0, op1;
20205 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20206 insn_code icode;
20207 struct expand_operand ops[4];
20209 start_sequence ();
20210 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20212 op_mode = GET_MODE (op0);
20213 if (op_mode == VOIDmode)
20214 op_mode = GET_MODE (op1);
20216 switch (op_mode)
20218 case E_QImode:
20219 case E_HImode:
20220 case E_SImode:
20221 cmp_mode = SImode;
20222 icode = CODE_FOR_cmpsi;
20223 break;
20225 case E_DImode:
20226 cmp_mode = DImode;
20227 icode = CODE_FOR_cmpdi;
20228 break;
20230 case E_SFmode:
20231 cmp_mode = SFmode;
20232 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20233 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
20234 break;
20236 case E_DFmode:
20237 cmp_mode = DFmode;
20238 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20239 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
20240 break;
20242 default:
20243 end_sequence ();
20244 return NULL_RTX;
20247 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
20248 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
20249 if (!op0 || !op1)
20251 end_sequence ();
20252 return NULL_RTX;
20254 *prep_seq = get_insns ();
20255 end_sequence ();
20257 create_fixed_operand (&ops[0], op0);
20258 create_fixed_operand (&ops[1], op1);
20260 start_sequence ();
20261 if (!maybe_expand_insn (icode, 2, ops))
20263 end_sequence ();
20264 return NULL_RTX;
20266 *gen_seq = get_insns ();
20267 end_sequence ();
20269 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
20270 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
20273 static rtx
20274 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
20275 int cmp_code, tree treeop0, tree treeop1, int bit_code)
20277 rtx op0, op1, target;
20278 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20279 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20280 insn_code icode;
20281 struct expand_operand ops[6];
20282 int aarch64_cond;
20284 push_to_sequence (*prep_seq);
20285 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20287 op_mode = GET_MODE (op0);
20288 if (op_mode == VOIDmode)
20289 op_mode = GET_MODE (op1);
20291 switch (op_mode)
20293 case E_QImode:
20294 case E_HImode:
20295 case E_SImode:
20296 cmp_mode = SImode;
20297 break;
20299 case E_DImode:
20300 cmp_mode = DImode;
20301 break;
20303 case E_SFmode:
20304 cmp_mode = SFmode;
20305 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20306 break;
20308 case E_DFmode:
20309 cmp_mode = DFmode;
20310 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20311 break;
20313 default:
20314 end_sequence ();
20315 return NULL_RTX;
20318 icode = code_for_ccmp (cc_mode, cmp_mode);
20320 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
20321 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
20322 if (!op0 || !op1)
20324 end_sequence ();
20325 return NULL_RTX;
20327 *prep_seq = get_insns ();
20328 end_sequence ();
20330 target = gen_rtx_REG (cc_mode, CC_REGNUM);
20331 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
20333 if (bit_code != AND)
20335 /* Treat the ccmp patterns as canonical and use them where possible,
20336 but fall back to ccmp_rev patterns if there's no other option. */
20337 rtx_code prev_code = GET_CODE (prev);
20338 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
20339 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
20340 && !(prev_code == EQ
20341 || prev_code == NE
20342 || prev_code == ORDERED
20343 || prev_code == UNORDERED))
20344 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
20345 else
20347 rtx_code code = reverse_condition (prev_code);
20348 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
20350 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
20353 create_fixed_operand (&ops[0], XEXP (prev, 0));
20354 create_fixed_operand (&ops[1], target);
20355 create_fixed_operand (&ops[2], op0);
20356 create_fixed_operand (&ops[3], op1);
20357 create_fixed_operand (&ops[4], prev);
20358 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
20360 push_to_sequence (*gen_seq);
20361 if (!maybe_expand_insn (icode, 6, ops))
20363 end_sequence ();
20364 return NULL_RTX;
20367 *gen_seq = get_insns ();
20368 end_sequence ();
20370 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
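
/* Illustration only: for a source condition such as (a == 0 && b == 5) the
   two hooks above cooperate to build a flag-setting sequence along the
   lines of

       cmp   w0, #0            // first compare, from aarch64_gen_ccmp_first
       ccmp  w1, #5, #0, eq    // compare b only if EQ held, else NZCV = 0
       b.eq  .Ltaken

   where the #0 NZCV immediate makes the final EQ test fail whenever the
   first comparison already failed.  */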
20373 #undef TARGET_GEN_CCMP_FIRST
20374 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
20376 #undef TARGET_GEN_CCMP_NEXT
20377 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
20379 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
20380 instruction fusion of some sort. */
20382 static bool
20383 aarch64_macro_fusion_p (void)
20385 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
20389 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20390 should be kept together during scheduling. */
20392 static bool
20393 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
20395 rtx set_dest;
20396 rtx prev_set = single_set (prev);
20397 rtx curr_set = single_set (curr);
20398 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
20399 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
20401 if (!aarch64_macro_fusion_p ())
20402 return false;
20404 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
20406 /* We are trying to match:
20407 prev (mov) == (set (reg r0) (const_int imm16))
20408 curr (movk) == (set (zero_extract (reg r0)
20409 (const_int 16)
20410 (const_int 16))
20411 (const_int imm16_1)) */
20413 set_dest = SET_DEST (curr_set);
20415 if (GET_CODE (set_dest) == ZERO_EXTRACT
20416 && CONST_INT_P (SET_SRC (curr_set))
20417 && CONST_INT_P (SET_SRC (prev_set))
20418 && CONST_INT_P (XEXP (set_dest, 2))
20419 && INTVAL (XEXP (set_dest, 2)) == 16
20420 && REG_P (XEXP (set_dest, 0))
20421 && REG_P (SET_DEST (prev_set))
20422 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
20424 return true;
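
  /* In assembly terms this matches immediate-building pairs such as

	 mov   w0, #0x1234
	 movk  w0, #0x5678, lsl #16

     (example values only).  */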
20428 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
20431 /* We're trying to match:
20432 prev (adrp) == (set (reg r1)
20433 (high (symbol_ref ("SYM"))))
20434 curr (add) == (set (reg r0)
20435 (lo_sum (reg r1)
20436 (symbol_ref ("SYM"))))
20437 Note that r0 need not necessarily be the same as r1, especially
20438 during pre-regalloc scheduling. */
20440 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20441 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20443 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
20444 && REG_P (XEXP (SET_SRC (curr_set), 0))
20445 && REGNO (XEXP (SET_SRC (curr_set), 0))
20446 == REGNO (SET_DEST (prev_set))
20447 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
20448 XEXP (SET_SRC (curr_set), 1)))
20449 return true;
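
  /* In assembly terms this matches address-forming pairs such as

	 adrp  x1, sym
	 add   x0, x1, :lo12:sym

     (example symbol and registers only).  */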
20453 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
20456 /* We're trying to match:
20457 prev (movk) == (set (zero_extract (reg r0)
20458 (const_int 16)
20459 (const_int 32))
20460 (const_int imm16_1))
20461 curr (movk) == (set (zero_extract (reg r0)
20462 (const_int 16)
20463 (const_int 48))
20464 (const_int imm16_2)) */
20466 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
20467 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
20468 && REG_P (XEXP (SET_DEST (prev_set), 0))
20469 && REG_P (XEXP (SET_DEST (curr_set), 0))
20470 && REGNO (XEXP (SET_DEST (prev_set), 0))
20471 == REGNO (XEXP (SET_DEST (curr_set), 0))
20472 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
20473 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
20474 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
20475 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
20476 && CONST_INT_P (SET_SRC (prev_set))
20477 && CONST_INT_P (SET_SRC (curr_set)))
20478 return true;
20481 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
20483 /* We're trying to match:
20484 prev (adrp) == (set (reg r0)
20485 (high (symbol_ref ("SYM"))))
20486 curr (ldr) == (set (reg r1)
20487 (mem (lo_sum (reg r0)
20488 (symbol_ref ("SYM")))))
20490 curr (ldr) == (set (reg r1)
20491 (zero_extend (mem
20492 (lo_sum (reg r0)
20493 (symbol_ref ("SYM")))))) */
20494 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20495 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20497 rtx curr_src = SET_SRC (curr_set);
20499 if (GET_CODE (curr_src) == ZERO_EXTEND)
20500 curr_src = XEXP (curr_src, 0);
20502 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
20503 && REG_P (XEXP (XEXP (curr_src, 0), 0))
20504 && REGNO (XEXP (XEXP (curr_src, 0), 0))
20505 == REGNO (SET_DEST (prev_set))
20506 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
20507 XEXP (SET_SRC (prev_set), 0)))
20508 return true;
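
  /* In assembly terms this matches pairs such as

	 adrp  x0, sym
	 ldr   w1, [x0, :lo12:sym]

     (example symbol and registers only).  */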
20512 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
20513 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
20514 && prev_set && curr_set && any_condjump_p (curr)
20515 && GET_CODE (SET_SRC (prev_set)) == COMPARE
20516 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
20517 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
20518 return true;
20520 /* Fuse flag-setting ALU instructions and conditional branch. */
20521 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
20522 && any_condjump_p (curr))
20524 unsigned int condreg1, condreg2;
20525 rtx cc_reg_1;
20526 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
20527 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
20529 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
20530 && prev
20531 && modified_in_p (cc_reg_1, prev))
20533 enum attr_type prev_type = get_attr_type (prev);
20535	  /* FIXME: this misses some instructions which are considered simple
20536	     arithmetic for ThunderX.  Simple shifts are missed here.  */
20537 if (prev_type == TYPE_ALUS_SREG
20538 || prev_type == TYPE_ALUS_IMM
20539 || prev_type == TYPE_LOGICS_REG
20540 || prev_type == TYPE_LOGICS_IMM)
20541 return true;
20545 /* Fuse ALU instructions and CBZ/CBNZ. */
20546 if (prev_set
20547 && curr_set
20548 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
20549 && any_condjump_p (curr))
20551 /* We're trying to match:
20552 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20553 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20554 (const_int 0))
20555 (label_ref ("SYM"))
20556 (pc)) */
20557 if (SET_DEST (curr_set) == (pc_rtx)
20558 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
20559 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
20560 && REG_P (SET_DEST (prev_set))
20561 && REGNO (SET_DEST (prev_set))
20562 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
20564 /* Fuse ALU operations followed by conditional branch instruction. */
20565 switch (get_attr_type (prev))
20567 case TYPE_ALU_IMM:
20568 case TYPE_ALU_SREG:
20569 case TYPE_ADC_REG:
20570 case TYPE_ADC_IMM:
20571 case TYPE_ADCS_REG:
20572 case TYPE_ADCS_IMM:
20573 case TYPE_LOGIC_REG:
20574 case TYPE_LOGIC_IMM:
20575 case TYPE_CSEL:
20576 case TYPE_ADR:
20577 case TYPE_MOV_IMM:
20578 case TYPE_SHIFT_REG:
20579 case TYPE_SHIFT_IMM:
20580 case TYPE_BFM:
20581 case TYPE_RBIT:
20582 case TYPE_REV:
20583 case TYPE_EXTEND:
20584 return true;
20586 default:;
20591 return false;
20594 /* Return true iff the instruction fusion described by OP is enabled. */
20596 bool
20597 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
20599 return (aarch64_tune_params.fusible_ops & op) != 0;
20602 /* If MEM is in the form of [base+offset], extract the two parts of the
20603    address and store them in BASE and OFFSET; otherwise return false
20604    after clearing BASE and OFFSET.  */
20606 bool
20607 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
20609 rtx addr;
20611 gcc_assert (MEM_P (mem));
20613 addr = XEXP (mem, 0);
20615 if (REG_P (addr))
20617 *base = addr;
20618 *offset = const0_rtx;
20619 return true;
20622 if (GET_CODE (addr) == PLUS
20623 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
20625 *base = XEXP (addr, 0);
20626 *offset = XEXP (addr, 1);
20627 return true;
20630 *base = NULL_RTX;
20631 *offset = NULL_RTX;
20633 return false;
20636 /* Types for scheduling fusion. */
20637 enum sched_fusion_type
20639 SCHED_FUSION_NONE = 0,
20640 SCHED_FUSION_LD_SIGN_EXTEND,
20641 SCHED_FUSION_LD_ZERO_EXTEND,
20642 SCHED_FUSION_LD,
20643 SCHED_FUSION_ST,
20644 SCHED_FUSION_NUM
20647 /* If INSN is a load or store whose address is in the form of [base+offset],
20648    extract the two parts and store them in BASE and OFFSET.  Return the
20649    scheduling fusion type of this INSN.  */
20651 static enum sched_fusion_type
20652 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
20654 rtx x, dest, src;
20655 enum sched_fusion_type fusion = SCHED_FUSION_LD;
20657 gcc_assert (INSN_P (insn));
20658 x = PATTERN (insn);
20659 if (GET_CODE (x) != SET)
20660 return SCHED_FUSION_NONE;
20662 src = SET_SRC (x);
20663 dest = SET_DEST (x);
20665 machine_mode dest_mode = GET_MODE (dest);
20667 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
20668 return SCHED_FUSION_NONE;
20670 if (GET_CODE (src) == SIGN_EXTEND)
20672 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
20673 src = XEXP (src, 0);
20674 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20675 return SCHED_FUSION_NONE;
20677 else if (GET_CODE (src) == ZERO_EXTEND)
20679 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
20680 src = XEXP (src, 0);
20681 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20682 return SCHED_FUSION_NONE;
20685 if (GET_CODE (src) == MEM && REG_P (dest))
20686 extract_base_offset_in_addr (src, base, offset);
20687 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
20689 fusion = SCHED_FUSION_ST;
20690 extract_base_offset_in_addr (dest, base, offset);
20692 else
20693 return SCHED_FUSION_NONE;
20695 if (*base == NULL_RTX || *offset == NULL_RTX)
20696 fusion = SCHED_FUSION_NONE;
20698 return fusion;
20701 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20703    Currently we only support fusing ldr and str instructions, so FUSION_PRI
20704    and PRI are only calculated for these instructions.  For other instructions,
20705    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
20706    types of instruction fusion can be added by returning different priorities.
20708 It's important that irrelevant instructions get the largest FUSION_PRI. */
20710 static void
20711 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
20712 int *fusion_pri, int *pri)
20714 int tmp, off_val;
20715 rtx base, offset;
20716 enum sched_fusion_type fusion;
20718 gcc_assert (INSN_P (insn));
20720 tmp = max_pri - 1;
20721 fusion = fusion_load_store (insn, &base, &offset);
20722 if (fusion == SCHED_FUSION_NONE)
20724 *pri = tmp;
20725 *fusion_pri = tmp;
20726 return;
20729 /* Set FUSION_PRI according to fusion type and base register. */
20730 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
20732 /* Calculate PRI. */
20733 tmp /= 2;
20735   /* The INSN with the smaller offset goes first.  */
20736 off_val = (int)(INTVAL (offset));
20737 if (off_val >= 0)
20738 tmp -= (off_val & 0xfffff);
20739 else
20740 tmp += ((- off_val) & 0xfffff);
20742 *pri = tmp;
20743 return;
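
/* Worked example (values are illustrative): with MAX_PRI == 100, the two
   loads "ldr w0, [x1, 4]" and "ldr w2, [x1, 8]" both get FUSION_PRI
   99 - SCHED_FUSION_LD * FIRST_PSEUDO_REGISTER - REGNO of the base, so
   they are grouped together, while their PRI values 49 - 4 == 45 and
   49 - 8 == 41 give the smaller offset the higher priority.  */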
20746 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20747 Adjust priority of sha1h instructions so they are scheduled before
20748 other SHA1 instructions. */
20750 static int
20751 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
20753 rtx x = PATTERN (insn);
20755 if (GET_CODE (x) == SET)
20757 x = SET_SRC (x);
20759 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
20760 return priority + 10;
20763 return priority;
20766 /* Given OPERANDS of consecutive load/store, check if we can merge
20767 them into ldp/stp. LOAD is true if they are load instructions.
20768 MODE is the mode of memory operands. */
20770 bool
20771 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
20772 machine_mode mode)
20774 HOST_WIDE_INT offval_1, offval_2, msize;
20775 enum reg_class rclass_1, rclass_2;
20776 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
20778 if (load)
20780 mem_1 = operands[1];
20781 mem_2 = operands[3];
20782 reg_1 = operands[0];
20783 reg_2 = operands[2];
20784 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
20785 if (REGNO (reg_1) == REGNO (reg_2))
20786 return false;
20788 else
20790 mem_1 = operands[0];
20791 mem_2 = operands[2];
20792 reg_1 = operands[1];
20793 reg_2 = operands[3];
20796 /* The mems cannot be volatile. */
20797 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
20798 return false;
20800 /* If we have SImode and slow unaligned ldp,
20801      check that the alignment is at least 8 bytes.  */
20802 if (mode == SImode
20803 && (aarch64_tune_params.extra_tuning_flags
20804 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20805 && !optimize_size
20806 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
20807 return false;
20809 /* Check if the addresses are in the form of [base+offset]. */
20810 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20811 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
20812 return false;
20813 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20814 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
20815 return false;
20817   /* Check if the bases are the same.  */
20818 if (!rtx_equal_p (base_1, base_2))
20819 return false;
20821 /* The operands must be of the same size. */
20822 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
20823 GET_MODE_SIZE (GET_MODE (mem_2))));
20825 offval_1 = INTVAL (offset_1);
20826 offval_2 = INTVAL (offset_2);
20827 /* We should only be trying this for fixed-sized modes. There is no
20828 SVE LDP/STP instruction. */
20829 msize = GET_MODE_SIZE (mode).to_constant ();
20830 /* Check if the offsets are consecutive. */
20831 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
20832 return false;
20834 /* Check if the addresses are clobbered by load. */
20835 if (load)
20837 if (reg_mentioned_p (reg_1, mem_1))
20838 return false;
20840 /* In increasing order, the last load can clobber the address. */
20841 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
20842 return false;
20845 /* One of the memory accesses must be a mempair operand.
20846 If it is not the first one, they need to be swapped by the
20847 peephole. */
20848 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
20849 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
20850 return false;
20852 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
20853 rclass_1 = FP_REGS;
20854 else
20855 rclass_1 = GENERAL_REGS;
20857 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
20858 rclass_2 = FP_REGS;
20859 else
20860 rclass_2 = GENERAL_REGS;
20862   /* Check if the registers are of the same class.  */
20863 if (rclass_1 != rclass_2)
20864 return false;
20866 return true;
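
/* For example (registers and offsets are illustrative), a successful check
   here lets the peepholes turn

       ldr   w0, [x2]
       ldr   w1, [x2, 4]

   into a single "ldp w0, w1, [x2]".  */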
20869 /* Given OPERANDS of consecutive load/store that can be merged,
20870 swap them if they are not in ascending order. */
20871 void
20872 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
20874 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
20875 HOST_WIDE_INT offval_1, offval_2;
20877 if (load)
20879 mem_1 = operands[1];
20880 mem_2 = operands[3];
20882 else
20884 mem_1 = operands[0];
20885 mem_2 = operands[2];
20888 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20889 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20891 offval_1 = INTVAL (offset_1);
20892 offval_2 = INTVAL (offset_2);
20894 if (offval_1 > offval_2)
20896 /* Irrespective of whether this is a load or a store,
20897 we do the same swap. */
20898 std::swap (operands[0], operands[2]);
20899 std::swap (operands[1], operands[3]);
20903 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
20904 comparison between the two. */
20906 aarch64_host_wide_int_compare (const void *x, const void *y)
20908 return wi::cmps (* ((const HOST_WIDE_INT *) x),
20909 * ((const HOST_WIDE_INT *) y));
20912 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
20913 other pointing to a REG rtx containing an offset, compare the offsets
20914 of the two pairs.
20916 Return:
20918 1 iff offset (X) > offset (Y)
20919 0 iff offset (X) == offset (Y)
20920 -1 iff offset (X) < offset (Y) */
20922 aarch64_ldrstr_offset_compare (const void *x, const void *y)
20924 const rtx * operands_1 = (const rtx *) x;
20925 const rtx * operands_2 = (const rtx *) y;
20926 rtx mem_1, mem_2, base, offset_1, offset_2;
20928 if (MEM_P (operands_1[0]))
20929 mem_1 = operands_1[0];
20930 else
20931 mem_1 = operands_1[1];
20933 if (MEM_P (operands_2[0]))
20934 mem_2 = operands_2[0];
20935 else
20936 mem_2 = operands_2[1];
20938 /* Extract the offsets. */
20939 extract_base_offset_in_addr (mem_1, &base, &offset_1);
20940 extract_base_offset_in_addr (mem_2, &base, &offset_2);
20942 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
20944 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
20947 /* Given OPERANDS of consecutive load/store, check if we can merge
20948 them into ldp/stp by adjusting the offset. LOAD is true if they
20949 are load instructions. MODE is the mode of memory operands.
20951 Given below consecutive stores:
20953 str w1, [xb, 0x100]
20954 str w1, [xb, 0x104]
20955 str w1, [xb, 0x108]
20956 str w1, [xb, 0x10c]
20958 Though the offsets are out of the range supported by stp, we can
20959 still pair them after adjusting the offset, like:
20961 add scratch, xb, 0x100
20962 stp w1, w1, [scratch]
20963 stp w1, w1, [scratch, 0x8]
20965 The peephole patterns detecting this opportunity should guarantee
20966    the scratch register is available.  */
20968 bool
20969 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
20970 scalar_mode mode)
20972 const int num_insns = 4;
20973 enum reg_class rclass;
20974 HOST_WIDE_INT offvals[num_insns], msize;
20975 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
20977 if (load)
20979 for (int i = 0; i < num_insns; i++)
20981 reg[i] = operands[2 * i];
20982 mem[i] = operands[2 * i + 1];
20984 gcc_assert (REG_P (reg[i]));
20987 /* Do not attempt to merge the loads if the loads clobber each other. */
20988 for (int i = 0; i < 8; i += 2)
20989 for (int j = i + 2; j < 8; j += 2)
20990 if (reg_overlap_mentioned_p (operands[i], operands[j]))
20991 return false;
20993 else
20994 for (int i = 0; i < num_insns; i++)
20996 mem[i] = operands[2 * i];
20997 reg[i] = operands[2 * i + 1];
21000   /* Skip if the memory operand is by itself already valid for ldp/stp.  */
21001 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
21002 return false;
21004 for (int i = 0; i < num_insns; i++)
21006 /* The mems cannot be volatile. */
21007 if (MEM_VOLATILE_P (mem[i]))
21008 return false;
21010 /* Check if the addresses are in the form of [base+offset]. */
21011 extract_base_offset_in_addr (mem[i], base + i, offset + i);
21012 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
21013 return false;
21016   /* Check if the registers are of the same class.  */
21017 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
21018 ? FP_REGS : GENERAL_REGS;
21020 for (int i = 1; i < num_insns; i++)
21021 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
21023 if (rclass != FP_REGS)
21024 return false;
21026 else
21028 if (rclass != GENERAL_REGS)
21029 return false;
21032 /* Only the last register in the order in which they occur
21033 may be clobbered by the load. */
21034 if (rclass == GENERAL_REGS && load)
21035 for (int i = 0; i < num_insns - 1; i++)
21036 if (reg_mentioned_p (reg[i], mem[i]))
21037 return false;
21039   /* Check if the bases are the same.  */
21040 for (int i = 0; i < num_insns - 1; i++)
21041 if (!rtx_equal_p (base[i], base[i + 1]))
21042 return false;
21044 for (int i = 0; i < num_insns; i++)
21045 offvals[i] = INTVAL (offset[i]);
21047 msize = GET_MODE_SIZE (mode);
21049 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21050 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21051 aarch64_host_wide_int_compare);
21053 if (!(offvals[1] == offvals[0] + msize
21054 && offvals[3] == offvals[2] + msize))
21055 return false;
21057 /* Check that offsets are within range of each other. The ldp/stp
21058      instructions have 7-bit immediate offsets, so use 0x80.  */
21059 if (offvals[2] - offvals[0] >= msize * 0x80)
21060 return false;
21062 /* The offsets must be aligned with respect to each other. */
21063 if (offvals[0] % msize != offvals[2] % msize)
21064 return false;
21066 /* If we have SImode and slow unaligned ldp,
21067      check that the alignment is at least 8 bytes.  */
21068 if (mode == SImode
21069 && (aarch64_tune_params.extra_tuning_flags
21070 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21071 && !optimize_size
21072 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
21073 return false;
21075 return true;
21078 /* Given OPERANDS of consecutive load/store, this function pairs them
21079 into LDP/STP after adjusting the offset. It depends on the fact
21080 that the operands can be sorted so the offsets are correct for STP.
21081 MODE is the mode of memory operands. CODE is the rtl operator
21082 which should be applied to all memory operands, it's SIGN_EXTEND,
21083 ZERO_EXTEND or UNKNOWN. */
21085 bool
21086 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
21087 scalar_mode mode, RTX_CODE code)
21089 rtx base, offset_1, offset_3, t1, t2;
21090 rtx mem_1, mem_2, mem_3, mem_4;
21091 rtx temp_operands[8];
21092 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
21093 stp_off_upper_limit, stp_off_lower_limit, msize;
21095 /* We make changes on a copy as we may still bail out. */
21096 for (int i = 0; i < 8; i ++)
21097 temp_operands[i] = operands[i];
21099 /* Sort the operands. */
21100 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
21102 /* Copy the memory operands so that if we have to bail for some
21103 reason the original addresses are unchanged. */
21104 if (load)
21106 mem_1 = copy_rtx (temp_operands[1]);
21107 mem_2 = copy_rtx (temp_operands[3]);
21108 mem_3 = copy_rtx (temp_operands[5]);
21109 mem_4 = copy_rtx (temp_operands[7]);
21111 else
21113 mem_1 = copy_rtx (temp_operands[0]);
21114 mem_2 = copy_rtx (temp_operands[2]);
21115 mem_3 = copy_rtx (temp_operands[4]);
21116 mem_4 = copy_rtx (temp_operands[6]);
21117 gcc_assert (code == UNKNOWN);
21120 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21121 extract_base_offset_in_addr (mem_3, &base, &offset_3);
21122 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
21123 && offset_3 != NULL_RTX);
21125 /* Adjust offset so it can fit in LDP/STP instruction. */
21126 msize = GET_MODE_SIZE (mode);
21127 stp_off_upper_limit = msize * (0x40 - 1);
21128 stp_off_lower_limit = - msize * 0x40;
21130 off_val_1 = INTVAL (offset_1);
21131 off_val_3 = INTVAL (offset_3);
21133 /* The base offset is optimally half way between the two STP/LDP offsets. */
21134 if (msize <= 4)
21135 base_off = (off_val_1 + off_val_3) / 2;
21136 else
21137 /* However, due to issues with negative LDP/STP offset generation for
21138        larger modes (DF, DI and vector modes), we must not use negative
21139 addresses smaller than 9 signed unadjusted bits can store. This
21140 provides the most range in this case. */
21141 base_off = off_val_1;
21143 /* Adjust the base so that it is aligned with the addresses but still
21144 optimal. */
21145 if (base_off % msize != off_val_1 % msize)
21146 /* Fix the offset, bearing in mind we want to make it bigger not
21147 smaller. */
21148 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21149 else if (msize <= 4)
21150 /* The negative range of LDP/STP is one larger than the positive range. */
21151 base_off += msize;
21153 /* Check if base offset is too big or too small. We can attempt to resolve
21154 this issue by setting it to the maximum value and seeing if the offsets
21155 still fit. */
21156 if (base_off >= 0x1000)
21158 base_off = 0x1000 - 1;
21159 /* We must still make sure that the base offset is aligned with respect
21160	 to the address, but it may not be made any bigger.  */
21161 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21164 /* Likewise for the case where the base is too small. */
21165 if (base_off <= -0x1000)
21167 base_off = -0x1000 + 1;
21168 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21171 /* Offset of the first STP/LDP. */
21172 new_off_1 = off_val_1 - base_off;
21174 /* Offset of the second STP/LDP. */
21175 new_off_3 = off_val_3 - base_off;
21177 /* The offsets must be within the range of the LDP/STP instructions. */
21178 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
21179 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
21180 return false;
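
  /* Worked example of the offset adjustment above (illustrative values):
     for four SImode stores at xb+0x100 .. xb+0x10c, msize == 4,
     off_val_1 == 0x100 and off_val_3 == 0x108, so base_off becomes
     (0x100 + 0x108) / 2 == 0x104, is already aligned, and is then bumped
     to 0x108; the resulting new_off_1 == -8 and new_off_3 == 0 both fit
     the LDP/STP range [-0x100, 0xfc] checked above.  */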
21182 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
21183 new_off_1), true);
21184 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
21185 new_off_1 + msize), true);
21186 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
21187 new_off_3), true);
21188 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
21189 new_off_3 + msize), true);
21191 if (!aarch64_mem_pair_operand (mem_1, mode)
21192 || !aarch64_mem_pair_operand (mem_3, mode))
21193 return false;
21195 if (code == ZERO_EXTEND)
21197 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
21198 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
21199 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
21200 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
21202 else if (code == SIGN_EXTEND)
21204 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
21205 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
21206 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
21207 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
21210 if (load)
21212 operands[0] = temp_operands[0];
21213 operands[1] = mem_1;
21214 operands[2] = temp_operands[2];
21215 operands[3] = mem_2;
21216 operands[4] = temp_operands[4];
21217 operands[5] = mem_3;
21218 operands[6] = temp_operands[6];
21219 operands[7] = mem_4;
21221 else
21223 operands[0] = mem_1;
21224 operands[1] = temp_operands[1];
21225 operands[2] = mem_2;
21226 operands[3] = temp_operands[3];
21227 operands[4] = mem_3;
21228 operands[5] = temp_operands[5];
21229 operands[6] = mem_4;
21230 operands[7] = temp_operands[7];
21233 /* Emit adjusting instruction. */
21234 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
21235 /* Emit ldp/stp instructions. */
21236 t1 = gen_rtx_SET (operands[0], operands[1]);
21237 t2 = gen_rtx_SET (operands[2], operands[3]);
21238 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21239 t1 = gen_rtx_SET (operands[4], operands[5]);
21240 t2 = gen_rtx_SET (operands[6], operands[7]);
21241 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21242 return true;
21245 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21246 it isn't worth branching around empty masked ops (including masked
21247 stores). */
21249 static bool
21250 aarch64_empty_mask_is_expensive (unsigned)
21252 return false;
21255 /* Return true if a pseudo register should be created and used to hold
21256    the GOT address for PIC code.  */
21258 bool
21259 aarch64_use_pseudo_pic_reg (void)
21261 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
21264 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21266 static int
21267 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
21269 switch (XINT (x, 1))
21271 case UNSPEC_GOTSMALLPIC:
21272 case UNSPEC_GOTSMALLPIC28K:
21273 case UNSPEC_GOTTINYPIC:
21274 return 0;
21275 default:
21276 break;
21279 return default_unspec_may_trap_p (x, flags);
21283 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
21284    return the log2 of that value.  Otherwise return -1.  */
21287 aarch64_fpconst_pow_of_2 (rtx x)
21289 const REAL_VALUE_TYPE *r;
21291 if (!CONST_DOUBLE_P (x))
21292 return -1;
21294 r = CONST_DOUBLE_REAL_VALUE (x);
21296 if (REAL_VALUE_NEGATIVE (*r)
21297 || REAL_VALUE_ISNAN (*r)
21298 || REAL_VALUE_ISINF (*r)
21299 || !real_isinteger (r, DFmode))
21300 return -1;
21302 return exact_log2 (real_to_integer (r));
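
/* For example, 8.0 yields 3 and 1.0 yields 0, while 3.0, 0.5 and -4.0 all
   yield -1 (not a positive integral power of 2).  */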
21305 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
21306    power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
21307    x == 1/2^n return n.  Otherwise return -1.  */
21310 aarch64_fpconst_pow2_recip (rtx x)
21312 REAL_VALUE_TYPE r0;
21314 if (!CONST_DOUBLE_P (x))
21315 return -1;
21317 r0 = *CONST_DOUBLE_REAL_VALUE (x);
21318 if (exact_real_inverse (DFmode, &r0)
21319 && !REAL_VALUE_NEGATIVE (r0))
21321 int ret = exact_log2 (real_to_integer (&r0));
21322 if (ret >= 1 && ret <= 32)
21323 return ret;
21325 return -1;
21328 /* If X is a vector of equal CONST_DOUBLE values and that value is
21329 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21332 aarch64_vec_fpconst_pow_of_2 (rtx x)
21334 int nelts;
21335 if (GET_CODE (x) != CONST_VECTOR
21336 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
21337 return -1;
21339 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
21340 return -1;
21342 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
21343 if (firstval <= 0)
21344 return -1;
21346 for (int i = 1; i < nelts; i++)
21347 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
21348 return -1;
21350 return firstval;
21353 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21354 to float.
21356 __fp16 always promotes through this hook.
21357 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21358 through the generic excess precision logic rather than here. */
21360 static tree
21361 aarch64_promoted_type (const_tree t)
21363 if (SCALAR_FLOAT_TYPE_P (t)
21364 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
21365 return float_type_node;
21367 return NULL_TREE;
21370 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
21372 static bool
21373 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
21374 optimization_type opt_type)
21376 switch (op)
21378 case rsqrt_optab:
21379 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
21381 default:
21382 return true;
21386 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
21388 static unsigned int
21389 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
21390 int *offset)
21392 /* Polynomial invariant 1 == (VG / 2) - 1. */
21393 gcc_assert (i == 1);
21394 *factor = 2;
21395 *offset = 1;
21396 return AARCH64_DWARF_VG;
21399 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
21400 if MODE is HFmode, and punt to the generic implementation otherwise. */
21402 static bool
21403 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
21405 return (mode == HFmode
21406 ? true
21407 : default_libgcc_floating_mode_supported_p (mode));
21410 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21411 if MODE is HFmode, and punt to the generic implementation otherwise. */
21413 static bool
21414 aarch64_scalar_mode_supported_p (scalar_mode mode)
21416 return (mode == HFmode
21417 ? true
21418 : default_scalar_mode_supported_p (mode));
21421 /* Set the value of FLT_EVAL_METHOD.
21422 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21424 0: evaluate all operations and constants, whose semantic type has at
21425 most the range and precision of type float, to the range and
21426 precision of float; evaluate all other operations and constants to
21427 the range and precision of the semantic type;
21429 N, where _FloatN is a supported interchange floating type
21430 evaluate all operations and constants, whose semantic type has at
21431 most the range and precision of _FloatN type, to the range and
21432 precision of the _FloatN type; evaluate all other operations and
21433 constants to the range and precision of the semantic type;
21435 If we have the ARMv8.2-A extensions then we support _Float16 in native
21436 precision, so we should set this to 16. Otherwise, we support the type,
21437 but want to evaluate expressions in float precision, so set this to
21438 0. */
21440 static enum flt_eval_method
21441 aarch64_excess_precision (enum excess_precision_type type)
21443 switch (type)
21445 case EXCESS_PRECISION_TYPE_FAST:
21446 case EXCESS_PRECISION_TYPE_STANDARD:
21447 /* We can calculate either in 16-bit range and precision or
21448 32-bit range and precision. Make that decision based on whether
21449 we have native support for the ARMv8.2-A 16-bit floating-point
21450 instructions or not. */
21451 return (TARGET_FP_F16INST
21452 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21453 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
21454 case EXCESS_PRECISION_TYPE_IMPLICIT:
21455 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
21456 default:
21457 gcc_unreachable ();
21459 return FLT_EVAL_METHOD_UNPREDICTABLE;
21462 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21463 scheduled for speculative execution. Reject the long-running division
21464 and square-root instructions. */
21466 static bool
21467 aarch64_sched_can_speculate_insn (rtx_insn *insn)
21469 switch (get_attr_type (insn))
21471 case TYPE_SDIV:
21472 case TYPE_UDIV:
21473 case TYPE_FDIVS:
21474 case TYPE_FDIVD:
21475 case TYPE_FSQRTS:
21476 case TYPE_FSQRTD:
21477 case TYPE_NEON_FP_SQRT_S:
21478 case TYPE_NEON_FP_SQRT_D:
21479 case TYPE_NEON_FP_SQRT_S_Q:
21480 case TYPE_NEON_FP_SQRT_D_Q:
21481 case TYPE_NEON_FP_DIV_S:
21482 case TYPE_NEON_FP_DIV_D:
21483 case TYPE_NEON_FP_DIV_S_Q:
21484 case TYPE_NEON_FP_DIV_D_Q:
21485 return false;
21486 default:
21487 return true;
21491 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21493 static int
21494 aarch64_compute_pressure_classes (reg_class *classes)
21496 int i = 0;
21497 classes[i++] = GENERAL_REGS;
21498 classes[i++] = FP_REGS;
21499 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21500 registers need to go in PR_LO_REGS at some point during their
21501 lifetime. Splitting it into two halves has the effect of making
21502 all predicates count against PR_LO_REGS, so that we try whenever
21503 possible to restrict the number of live predicates to 8. This
21504 greatly reduces the amount of spilling in certain loops. */
21505 classes[i++] = PR_LO_REGS;
21506 classes[i++] = PR_HI_REGS;
21507 return i;
21510 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21512 static bool
21513 aarch64_can_change_mode_class (machine_mode from,
21514 machine_mode to, reg_class_t)
21516 unsigned int from_flags = aarch64_classify_vector_mode (from);
21517 unsigned int to_flags = aarch64_classify_vector_mode (to);
21519 bool from_sve_p = (from_flags & VEC_ANY_SVE);
21520 bool to_sve_p = (to_flags & VEC_ANY_SVE);
21522 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
21523 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
21525 /* Don't allow changes between partial SVE modes and other modes.
21526 The contents of partial SVE modes are distributed evenly across
21527 the register, whereas GCC expects them to be clustered together. */
21528 if (from_partial_sve_p != to_partial_sve_p)
21529 return false;
21531 /* Similarly reject changes between partial SVE modes that have
21532 different patterns of significant and insignificant bits. */
21533 if (from_partial_sve_p
21534 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
21535 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
21536 return false;
21538 if (BYTES_BIG_ENDIAN)
21540 /* Don't allow changes between SVE data modes and non-SVE modes.
21541 See the comment at the head of aarch64-sve.md for details. */
21542 if (from_sve_p != to_sve_p)
21543 return false;
21545 /* Don't allow changes in element size: lane 0 of the new vector
21546 would not then be lane 0 of the old vector. See the comment
21547 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21548 description.
21550 In the worst case, this forces a register to be spilled in
21551 one mode and reloaded in the other, which handles the
21552 endianness correctly. */
21553 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
21554 return false;
21556 return true;
21559 /* Implement TARGET_EARLY_REMAT_MODES. */
21561 static void
21562 aarch64_select_early_remat_modes (sbitmap modes)
21564 /* SVE values are not normally live across a call, so it should be
21565 worth doing early rematerialization even in VL-specific mode. */
21566 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
21567 if (aarch64_sve_mode_p ((machine_mode) i))
21568 bitmap_set_bit (modes, i);
21571 /* Override the default target speculation_safe_value. */
21572 static rtx
21573 aarch64_speculation_safe_value (machine_mode mode,
21574 rtx result, rtx val, rtx failval)
21576 /* Maybe we should warn if falling back to hard barriers. They are
21577      likely to be noticeably more expensive than the alternative below.  */
21578 if (!aarch64_track_speculation)
21579 return default_speculation_safe_value (mode, result, val, failval);
21581 if (!REG_P (val))
21582 val = copy_to_mode_reg (mode, val);
21584 if (!aarch64_reg_or_zero (failval, mode))
21585 failval = copy_to_mode_reg (mode, failval);
21587 emit_insn (gen_despeculate_copy (mode, result, val, failval));
21588 return result;
21591 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21592 Look into the tuning structure for an estimate.
21593 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21594 Advanced SIMD 128 bits. */
21596 static HOST_WIDE_INT
21597 aarch64_estimated_poly_value (poly_int64 val)
21599 enum aarch64_sve_vector_bits_enum width_source
21600 = aarch64_tune_params.sve_width;
21602 /* If we still don't have an estimate, use the default. */
21603 if (width_source == SVE_SCALABLE)
21604 return default_estimated_poly_value (val);
21606 HOST_WIDE_INT over_128 = width_source - 128;
21607 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
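
/* Worked example: if the tuning structure reports sve_width == SVE_256, a
   VL-dependent quantity of 16 + 16x bytes (x counting the 128-bit chunks
   beyond the first) is estimated as 16 + 16 * (256 - 128) / 128 == 32
   bytes.  */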
21611 /* Return true for types that could be supported as SIMD return or
21612 argument types. */
21614 static bool
21615 supported_simd_type (tree t)
21617 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
21619 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
21620 return s == 1 || s == 2 || s == 4 || s == 8;
21622 return false;
21625 /* Return true for types that currently are supported as SIMD return
21626 or argument types. */
21628 static bool
21629 currently_supported_simd_type (tree t, tree b)
21631 if (COMPLEX_FLOAT_TYPE_P (t))
21632 return false;
21634 if (TYPE_SIZE (t) != TYPE_SIZE (b))
21635 return false;
21637 return supported_simd_type (t);
21640 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
21642 static int
21643 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
21644 struct cgraph_simd_clone *clonei,
21645 tree base_type, int num)
21647 tree t, ret_type, arg_type;
21648 unsigned int elt_bits, vec_bits, count;
21650 if (!TARGET_SIMD)
21651 return 0;
21653 if (clonei->simdlen
21654 && (clonei->simdlen < 2
21655 || clonei->simdlen > 1024
21656 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
21658 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21659 "unsupported simdlen %d", clonei->simdlen);
21660 return 0;
21663 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
21664 if (TREE_CODE (ret_type) != VOID_TYPE
21665 && !currently_supported_simd_type (ret_type, base_type))
21667 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
21668 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21669 "GCC does not currently support mixed size types "
21670 "for %<simd%> functions");
21671 else if (supported_simd_type (ret_type))
21672 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21673 "GCC does not currently support return type %qT "
21674 "for %<simd%> functions", ret_type);
21675 else
21676 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21677 "unsupported return type %qT for %<simd%> functions",
21678 ret_type);
21679 return 0;
21682 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
21684 arg_type = TREE_TYPE (t);
21686 if (!currently_supported_simd_type (arg_type, base_type))
21688 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
21689 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21690 "GCC does not currently support mixed size types "
21691 "for %<simd%> functions");
21692 else
21693 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21694 "GCC does not currently support argument type %qT "
21695 "for %<simd%> functions", arg_type);
21696 return 0;
21700 clonei->vecsize_mangle = 'n';
21701 clonei->mask_mode = VOIDmode;
21702 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
21703 if (clonei->simdlen == 0)
21705 count = 2;
21706 vec_bits = (num == 0 ? 64 : 128);
21707 clonei->simdlen = vec_bits / elt_bits;
21709 else
21711 count = 1;
21712 vec_bits = clonei->simdlen * elt_bits;
21713 if (vec_bits != 64 && vec_bits != 128)
21715 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21716 "GCC does not currently support simdlen %d for type %qT",
21717 clonei->simdlen, base_type);
21718 return 0;
21721 clonei->vecsize_int = vec_bits;
21722 clonei->vecsize_float = vec_bits;
21723 return count;
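
/* For instance, a function over float arguments marked with
   "#pragma omp declare simd" and no explicit simdlen gets two Advanced
   SIMD clones here: one with simdlen 64 / 32 == 2 and one with simdlen
   128 / 32 == 4.  */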
21726 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21728 static void
21729 aarch64_simd_clone_adjust (struct cgraph_node *node)
21731 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21732 use the correct ABI. */
21734 tree t = TREE_TYPE (node->decl);
21735 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
21736 TYPE_ATTRIBUTES (t));
21739 /* Implement TARGET_SIMD_CLONE_USABLE. */
21741 static int
21742 aarch64_simd_clone_usable (struct cgraph_node *node)
21744 switch (node->simdclone->vecsize_mangle)
21746 case 'n':
21747 if (!TARGET_SIMD)
21748 return -1;
21749 return 0;
21750 default:
21751 gcc_unreachable ();
21755 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
21757 static int
21758 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
21760 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
21761 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
21762 return 0;
21763 return 1;
21766 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
21768 static const char *
21769 aarch64_get_multilib_abi_name (void)
21771 if (TARGET_BIG_END)
21772 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
21773 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
21776 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
21777    global-variable-based guard, use the default; otherwise
21778    return a null tree.  */
21779 static tree
21780 aarch64_stack_protect_guard (void)
21782 if (aarch64_stack_protector_guard == SSP_GLOBAL)
21783 return default_stack_protect_guard ();
21785 return NULL_TREE;
21788 /* Return the diagnostic message string if conversion from FROMTYPE to
21789 TOTYPE is not allowed, NULL otherwise. */
21791 static const char *
21792 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
21794 if (element_mode (fromtype) != element_mode (totype))
21796     /* Do not allow conversions to/from BFmode scalar types.  */
21797 if (TYPE_MODE (fromtype) == BFmode)
21798 return N_("invalid conversion from type %<bfloat16_t%>");
21799 if (TYPE_MODE (totype) == BFmode)
21800 return N_("invalid conversion to type %<bfloat16_t%>");
21803 /* Conversion allowed. */
21804 return NULL;
21807 /* Return the diagnostic message string if the unary operation OP is
21808 not permitted on TYPE, NULL otherwise. */
21810 static const char *
21811 aarch64_invalid_unary_op (int op, const_tree type)
21813 /* Reject all single-operand operations on BFmode except for &. */
21814 if (element_mode (type) == BFmode && op != ADDR_EXPR)
21815 return N_("operation not permitted on type %<bfloat16_t%>");
21817 /* Operation allowed. */
21818 return NULL;
21821 /* Return the diagnostic message string if the binary operation OP is
21822 not permitted on TYPE1 and TYPE2, NULL otherwise. */
21824 static const char *
21825 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
21826 const_tree type2)
21828 /* Reject all 2-operand operations on BFmode. */
21829 if (element_mode (type1) == BFmode
21830 || element_mode (type2) == BFmode)
21831 return N_("operation not permitted on type %<bfloat16_t%>");
21833 /* Operation allowed. */
21834 return NULL;
21837 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21838 section at the end if needed. */
21839 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21840 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21841 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
21842 void
21843 aarch64_file_end_indicate_exec_stack ()
21845 file_end_indicate_exec_stack ();
21847 unsigned feature_1_and = 0;
21848 if (aarch64_bti_enabled ())
21849 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
21851 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
21852 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
21854 if (feature_1_and)
21856 /* Generate .note.gnu.property section. */
21857 switch_to_section (get_section (".note.gnu.property",
21858 SECTION_NOTYPE, NULL));
21860 /* PT_NOTE header: namesz, descsz, type.
21861 namesz = 4 ("GNU\0")
21862 descsz = 16 (Size of the program property array)
21863 [(12 + padding) * Number of array elements]
21864 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
21865 assemble_align (POINTER_SIZE);
21866 assemble_integer (GEN_INT (4), 4, 32, 1);
21867 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
21868 assemble_integer (GEN_INT (5), 4, 32, 1);
21870 /* PT_NOTE name. */
21871 assemble_string ("GNU", 4);
21873 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
21874 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
21875 datasz = 4
21876 data = feature_1_and. */
21877 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
21878 assemble_integer (GEN_INT (4), 4, 32, 1);
21879 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
21881 /* Pad the size of the note to the required alignment. */
21882 assemble_align (POINTER_SIZE);
21885 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
21886 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
21887 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
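
/* For reference, with BTI enabled the function above is expected to emit
   roughly the following (illustrative; exact directives depend on the
   assembler):

       .section .note.gnu.property,"a"
       .p2align 3
       .word   4                  // namesz
       .word   16                 // descsz, ROUND_UP (12, 8)
       .word   5                  // NT_GNU_PROPERTY_TYPE_0
       .asciz  "GNU"
       .word   0xc0000000         // GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word   4                  // datasz
       .word   1                  // GNU_PROPERTY_AARCH64_FEATURE_1_BTI
       .p2align 3  */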
21889 /* Target-specific selftests. */
21891 #if CHECKING_P
21893 namespace selftest {
21895 /* Selftest for the RTL loader.
21896 Verify that the RTL loader copes with a dump from
21897 print_rtx_function. This is essentially just a test that class
21898 function_reader can handle a real dump, but it also verifies
21899 that lookup_reg_by_dump_name correctly handles hard regs.
21900 The presence of hard reg names in the dump means that the test is
21901 target-specific, hence it is in this file. */
21903 static void
21904 aarch64_test_loading_full_dump ()
21906 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
21908 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
21910 rtx_insn *insn_1 = get_insn_by_uid (1);
21911 ASSERT_EQ (NOTE, GET_CODE (insn_1));
21913 rtx_insn *insn_15 = get_insn_by_uid (15);
21914 ASSERT_EQ (INSN, GET_CODE (insn_15));
21915 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
21917 /* Verify crtl->return_rtx. */
21918 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
21919 ASSERT_EQ (0, REGNO (crtl->return_rtx));
21920 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
21923 /* Run all target-specific selftests. */
21925 static void
21926 aarch64_run_selftests (void)
21928 aarch64_test_loading_full_dump ();
21931 } // namespace selftest
21933 #endif /* #if CHECKING_P */
21935 #undef TARGET_STACK_PROTECT_GUARD
21936 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
21938 #undef TARGET_ADDRESS_COST
21939 #define TARGET_ADDRESS_COST aarch64_address_cost
21941 /* This hook determines whether unnamed bitfields affect the alignment
21942 of the containing structure. The hook returns true if the structure
21943 should inherit the alignment requirements of an unnamed bitfield's
21944 type. */
21945 #undef TARGET_ALIGN_ANON_BITFIELD
21946 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
21948 #undef TARGET_ASM_ALIGNED_DI_OP
21949 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
21951 #undef TARGET_ASM_ALIGNED_HI_OP
21952 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
21954 #undef TARGET_ASM_ALIGNED_SI_OP
21955 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
21957 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
21958 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
21959 hook_bool_const_tree_hwi_hwi_const_tree_true
21961 #undef TARGET_ASM_FILE_START
21962 #define TARGET_ASM_FILE_START aarch64_start_file
21964 #undef TARGET_ASM_OUTPUT_MI_THUNK
21965 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
21967 #undef TARGET_ASM_SELECT_RTX_SECTION
21968 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
21970 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
21971 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
21973 #undef TARGET_BUILD_BUILTIN_VA_LIST
21974 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
21976 #undef TARGET_CALLEE_COPIES
21977 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
21979 #undef TARGET_CAN_ELIMINATE
21980 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
21982 #undef TARGET_CAN_INLINE_P
21983 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
21985 #undef TARGET_CANNOT_FORCE_CONST_MEM
21986 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
21988 #undef TARGET_CASE_VALUES_THRESHOLD
21989 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
21991 #undef TARGET_CONDITIONAL_REGISTER_USAGE
21992 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
21994 /* Only the least significant bit is used for initialization guard
21995 variables. */
21996 #undef TARGET_CXX_GUARD_MASK_BIT
21997 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
21999 #undef TARGET_C_MODE_FOR_SUFFIX
22000 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
22002 #ifdef TARGET_BIG_ENDIAN_DEFAULT
22003 #undef TARGET_DEFAULT_TARGET_FLAGS
22004 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
22005 #endif
22007 #undef TARGET_CLASS_MAX_NREGS
22008 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
22010 #undef TARGET_BUILTIN_DECL
22011 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
22013 #undef TARGET_BUILTIN_RECIPROCAL
22014 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
22016 #undef TARGET_C_EXCESS_PRECISION
22017 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
22019 #undef TARGET_EXPAND_BUILTIN
22020 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
22022 #undef TARGET_EXPAND_BUILTIN_VA_START
22023 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
22025 #undef TARGET_FOLD_BUILTIN
22026 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
22028 #undef TARGET_FUNCTION_ARG
22029 #define TARGET_FUNCTION_ARG aarch64_function_arg
22031 #undef TARGET_FUNCTION_ARG_ADVANCE
22032 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
22034 #undef TARGET_FUNCTION_ARG_BOUNDARY
22035 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
22037 #undef TARGET_FUNCTION_ARG_PADDING
22038 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
22040 #undef TARGET_GET_RAW_RESULT_MODE
22041 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
22042 #undef TARGET_GET_RAW_ARG_MODE
22043 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
22045 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22046 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
22048 #undef TARGET_FUNCTION_VALUE
22049 #define TARGET_FUNCTION_VALUE aarch64_function_value
22051 #undef TARGET_FUNCTION_VALUE_REGNO_P
22052 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
22054 #undef TARGET_GIMPLE_FOLD_BUILTIN
22055 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
22057 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22058 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
22060 #undef TARGET_INIT_BUILTINS
22061 #define TARGET_INIT_BUILTINS aarch64_init_builtins
22063 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
22064 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
22065 aarch64_ira_change_pseudo_allocno_class
22067 #undef TARGET_LEGITIMATE_ADDRESS_P
22068 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
22070 #undef TARGET_LEGITIMATE_CONSTANT_P
22071 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
22073 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
22074 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
22075 aarch64_legitimize_address_displacement
22077 #undef TARGET_LIBGCC_CMP_RETURN_MODE
22078 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
22080 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
22081 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
22082 aarch64_libgcc_floating_mode_supported_p
22084 #undef TARGET_MANGLE_TYPE
22085 #define TARGET_MANGLE_TYPE aarch64_mangle_type
22087 #undef TARGET_INVALID_CONVERSION
22088 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
22090 #undef TARGET_INVALID_UNARY_OP
22091 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
22093 #undef TARGET_INVALID_BINARY_OP
22094 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
22096 #undef TARGET_VERIFY_TYPE_CONTEXT
22097 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22099 #undef TARGET_MEMORY_MOVE_COST
22100 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22102 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22103 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22105 #undef TARGET_MUST_PASS_IN_STACK
22106 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22108 /* This target hook should return true if accesses to volatile bitfields
22109 should use the narrowest mode possible. It should return false if these
22110 accesses should use the bitfield container type. */
22111 #undef TARGET_NARROW_VOLATILE_BITFIELD
22112 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
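/* Illustrative example of the distinction (assumed typical behaviour, not
   a claim about specific generated code): given

     struct regs { volatile unsigned int flag : 1; } *r;
     ... = r->flag;

   returning false means the read uses the declared container type (a
   32-bit access here) rather than the narrowest mode that covers the
   bit-field.  */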
22114 #undef TARGET_OPTION_OVERRIDE
22115 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22117 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22118 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22119 aarch64_override_options_after_change
22121 #undef TARGET_OPTION_SAVE
22122 #define TARGET_OPTION_SAVE aarch64_option_save
22124 #undef TARGET_OPTION_RESTORE
22125 #define TARGET_OPTION_RESTORE aarch64_option_restore
22127 #undef TARGET_OPTION_PRINT
22128 #define TARGET_OPTION_PRINT aarch64_option_print
22130 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22131 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22133 #undef TARGET_SET_CURRENT_FUNCTION
22134 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22136 #undef TARGET_PASS_BY_REFERENCE
22137 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22139 #undef TARGET_PREFERRED_RELOAD_CLASS
22140 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22142 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22143 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22145 #undef TARGET_PROMOTED_TYPE
22146 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22148 #undef TARGET_SECONDARY_RELOAD
22149 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22151 #undef TARGET_SHIFT_TRUNCATION_MASK
22152 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22154 #undef TARGET_SETUP_INCOMING_VARARGS
22155 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22157 #undef TARGET_STRUCT_VALUE_RTX
22158 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22160 #undef TARGET_REGISTER_MOVE_COST
22161 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22163 #undef TARGET_RETURN_IN_MEMORY
22164 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22166 #undef TARGET_RETURN_IN_MSB
22167 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22169 #undef TARGET_RTX_COSTS
22170 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22172 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22173 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22175 #undef TARGET_SCHED_ISSUE_RATE
22176 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22178 #undef TARGET_SCHED_VARIABLE_ISSUE
22179 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22181 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22182 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22183 aarch64_sched_first_cycle_multipass_dfa_lookahead
22185 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22186 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22187 aarch64_first_cycle_multipass_dfa_lookahead_guard
22189 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22190 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22191 aarch64_get_separate_components
22193 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22194 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22195 aarch64_components_for_bb
22197 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22198 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22199 aarch64_disqualify_components
22201 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22202 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22203 aarch64_emit_prologue_components
22205 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22206 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22207 aarch64_emit_epilogue_components
22209 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22210 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22211 aarch64_set_handled_components
22213 #undef TARGET_TRAMPOLINE_INIT
22214 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22216 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22217 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22219 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22220 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22222 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
22223 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
22225 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22226 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22227 aarch64_builtin_support_vector_misalignment
22229 #undef TARGET_ARRAY_MODE
22230 #define TARGET_ARRAY_MODE aarch64_array_mode
22232 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22233 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22235 #undef TARGET_VECTORIZE_ADD_STMT_COST
22236 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22238 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22239 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22240 aarch64_builtin_vectorization_cost
22242 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22243 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22245 #undef TARGET_VECTORIZE_BUILTINS
22246 #define TARGET_VECTORIZE_BUILTINS
22248 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22249 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22250 aarch64_builtin_vectorized_function
22252 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22253 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22254 aarch64_autovectorize_vector_modes
22256 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22257 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22258 aarch64_atomic_assign_expand_fenv
22260 /* Section anchor support. */
22262 #undef TARGET_MIN_ANCHOR_OFFSET
22263 #define TARGET_MIN_ANCHOR_OFFSET -256
22265 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22266 byte offset; we can do much more for larger data types, but have no way
22267 to determine the size of the access. We assume accesses are aligned. */
22268 #undef TARGET_MAX_ANCHOR_OFFSET
22269 #define TARGET_MAX_ANCHOR_OFFSET 4095
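/* Rough illustration of what these limits allow (the assembly is
   schematic, not verbatim compiler output): with -fsection-anchors,
   nearby globals can share a single base address and be reached as
   anchor + offset, e.g.

     adrp  x0, .LANCHOR0             // materialize the anchor once
     add   x0, x0, :lo12:.LANCHOR0
     ldr   w1, [x0, 8]               // global at anchor + 8
     ldr   w2, [x0, 12]              // global at anchor + 12

   Objects whose offset from the anchor falls outside [-256, 4095] are
   instead addressed independently.  */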
22271 #undef TARGET_VECTOR_ALIGNMENT
22272 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22274 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22275 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22276 aarch64_vectorize_preferred_vector_alignment
22277 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22278 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22279 aarch64_simd_vector_alignment_reachable
22281 /* vec_perm support. */
22283 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22284 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22285 aarch64_vectorize_vec_perm_const
22287 #undef TARGET_VECTORIZE_RELATED_MODE
22288 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22289 #undef TARGET_VECTORIZE_GET_MASK_MODE
22290 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22291 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22292 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22293 aarch64_empty_mask_is_expensive
22294 #undef TARGET_PREFERRED_ELSE_VALUE
22295 #define TARGET_PREFERRED_ELSE_VALUE \
22296 aarch64_preferred_else_value
22298 #undef TARGET_INIT_LIBFUNCS
22299 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22301 #undef TARGET_FIXED_CONDITION_CODE_REGS
22302 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22304 #undef TARGET_FLAGS_REGNUM
22305 #define TARGET_FLAGS_REGNUM CC_REGNUM
22307 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22308 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22310 #undef TARGET_ASAN_SHADOW_OFFSET
22311 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
22313 #undef TARGET_LEGITIMIZE_ADDRESS
22314 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
22316 #undef TARGET_SCHED_CAN_SPECULATE_INSN
22317 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
22319 #undef TARGET_CAN_USE_DOLOOP_P
22320 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
22322 #undef TARGET_SCHED_ADJUST_PRIORITY
22323 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
22325 #undef TARGET_SCHED_MACRO_FUSION_P
22326 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
22328 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
22329 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
22331 #undef TARGET_SCHED_FUSION_PRIORITY
22332 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
22334 #undef TARGET_UNSPEC_MAY_TRAP_P
22335 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
22337 #undef TARGET_USE_PSEUDO_PIC_REG
22338 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
22340 #undef TARGET_PRINT_OPERAND
22341 #define TARGET_PRINT_OPERAND aarch64_print_operand
22343 #undef TARGET_PRINT_OPERAND_ADDRESS
22344 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
22346 #undef TARGET_OPTAB_SUPPORTED_P
22347 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
22349 #undef TARGET_OMIT_STRUCT_RETURN_REG
22350 #define TARGET_OMIT_STRUCT_RETURN_REG true
22352 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
22353 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
22354 aarch64_dwarf_poly_indeterminate_value
22356 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
22357 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
22358 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
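/* Illustrative note (a sketch of the convention, not of code emitted
   here): code addresses are at least 4-byte aligned, so bits 0 and 1 of a
   function pointer are always zero; an indirect call site built with
   custom descriptors can therefore test bit 2 to tell a plain code
   address from a descriptor address, roughly:

     if (fnaddr & 4)
       ... load the real entry point and static chain from the descriptor ...
     else
       ... call fnaddr directly ...  */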
22360 #undef TARGET_HARD_REGNO_NREGS
22361 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
22362 #undef TARGET_HARD_REGNO_MODE_OK
22363 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
22365 #undef TARGET_MODES_TIEABLE_P
22366 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
22368 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
22369 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
22370 aarch64_hard_regno_call_part_clobbered
22372 #undef TARGET_INSN_CALLEE_ABI
22373 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
22375 #undef TARGET_CONSTANT_ALIGNMENT
22376 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
22378 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
22379 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
22380 aarch64_stack_clash_protection_alloca_probe_range
22382 #undef TARGET_COMPUTE_PRESSURE_CLASSES
22383 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
22385 #undef TARGET_CAN_CHANGE_MODE_CLASS
22386 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
22388 #undef TARGET_SELECT_EARLY_REMAT_MODES
22389 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
22391 #undef TARGET_SPECULATION_SAFE_VALUE
22392 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
22394 #undef TARGET_ESTIMATED_POLY_VALUE
22395 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
22397 #undef TARGET_ATTRIBUTE_TABLE
22398 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
22400 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
22401 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
22402 aarch64_simd_clone_compute_vecsize_and_simdlen
22404 #undef TARGET_SIMD_CLONE_ADJUST
22405 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
22407 #undef TARGET_SIMD_CLONE_USABLE
22408 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
22410 #undef TARGET_COMP_TYPE_ATTRIBUTES
22411 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
22413 #undef TARGET_GET_MULTILIB_ABI_NAME
22414 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
22416 #undef TARGET_FNTYPE_ABI
22417 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
22419 #if CHECKING_P
22420 #undef TARGET_RUN_TARGET_SELFTESTS
22421 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
22422 #endif /* #if CHECKING_P */
22424 #undef TARGET_ASM_POST_CFI_STARTPROC
22425 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
22427 #undef TARGET_STRICT_ARGUMENT_NAMING
22428 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
22430 #undef TARGET_MD_ASM_ADJUST
22431 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
22433 struct gcc_target targetm = TARGET_INITIALIZER;
22435 #include "gt-aarch64.h"