[AArch64] Handle more SVE predicate constants
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 27a9a5fd28dc0a87fa44699da4a5aeed878d9a0f
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
77 /* This file should be included last. */
78 #include "target-def.h"
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
103 union
105 /* For MOV and MVN. */
106 struct
108 /* The value of each element. */
109 rtx value;
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
117 /* For INDEX. */
118 struct
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
162 u.index.base = base_in;
163 u.index.step = step_in;
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
173 u.pattern = pattern_in;
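/* Illustrative sketch, not part of the original file: the PTRUE form above
   is how an SVE predicate constant can be described for later printing.
   A validation routine (for example aarch64_simd_valid_immediate) might
   record an all-true predicate over byte elements like this; the variable
   name "info" and the exact call site are assumptions.  */
#if 0
  if (info)
    *info = simd_immediate_info (QImode, AARCH64_SV_ALL);
#endif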
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
231 const char* name;
232 unsigned int flag;
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
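/* Expansion sketch (illustration only): assuming aarch64-fusion-pairs.def
   contains an entry such as AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD),
   the #include above contributes { "adrp+add", AARCH64_FUSE_ADRP_ADD } to
   the table, so each fusion name accepted on the command line maps to its
   internal AARCH64_FUSE_* flag.  */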
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 /* Tuning parameters. */
257 static const struct cpu_addrcost_table generic_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
353 static const struct cpu_regmove_cost generic_regmove_cost =
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
499 static const struct cpu_vector_cost tsv110_vector_cost =
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
538 static const struct cpu_vector_cost exynosm1_vector_cost =
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
600 1, /* Predictable. */
601 3 /* Unpredictable. */
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
717 static const struct tune_params generic_tunings =
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
743 static const struct tune_params cortexa35_tunings =
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
770 static const struct tune_params cortexa53_tunings =
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
797 static const struct tune_params cortexa57_tunings =
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
824 static const struct tune_params cortexa72_tunings =
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
851 static const struct tune_params cortexa73_tunings =
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
880 static const struct tune_params exynosm1_tunings =
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
906 static const struct tune_params thunderxt88_tunings =
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
932 static const struct tune_params thunderx_tunings =
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
959 static const struct tune_params tsv110_tunings =
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
986 static const struct tune_params xgene1_tunings =
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1012 static const struct tune_params emag_tunings =
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1038 static const struct tune_params qdf24xx_tunings =
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1094 static const struct tune_params thunderx2t99_tunings =
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1121 static const struct tune_params neoversen1_tunings =
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1167 /* A processor implementing AArch64. */
1168 struct processor
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1232 typedef enum aarch64_cond_code
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 aarch64_cc;
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
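/* Worked example (illustration only): the codes above are ordered so that
   each condition and its inverse differ only in bit 0, which is what the
   XOR in the macro relies on:
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_HI) == AARCH64_LS  */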
1242 struct aarch64_branch_protect_type
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251      * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1255 own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1272 return AARCH64_PARSE_OK;
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1286 return AARCH64_PARSE_OK;
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 /* Return the assembly token for svpattern value VALUE. */
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1356 switch (pattern)
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1364 gcc_unreachable ();
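/* Illustrative sketch, not part of the original file: each CASE expansion
   above has the shape 'case AARCH64_SV_ALL: return "all";', so the
   function maps an svpattern value to its assembly mnemonic, e.g.:  */
#if 0
  const char *tok = svpattern_token (AARCH64_SV_ALL);	/* "all" */
#endif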
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
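/* Illustrative sketch, not part of the original file: an insn output
   template might use this to turn a short-range conditional branch into
   an inverted branch over an unconditional "b" to the far destination.
   The operand position, label prefix and mnemonic below are hypothetical.  */
#if 0
  /* Far variant of "cbz x0, <label>":  */
  return aarch64_gen_far_branch (operands, 1, "Lfb", "cbnz\t%x0, ");
#endif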
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1429 machine_mode mode;
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1486 static bool
1487 aarch64_advsimd_struct_mode_p (machine_mode mode)
1489 return (TARGET_SIMD
1490 && (mode == OImode || mode == CImode || mode == XImode));
1493 /* Return true if MODE is an SVE predicate mode. */
1494 static bool
1495 aarch64_sve_pred_mode_p (machine_mode mode)
1497 return (TARGET_SVE
1498 && (mode == VNx16BImode
1499 || mode == VNx8BImode
1500 || mode == VNx4BImode
1501 || mode == VNx2BImode));
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD = 1;
1506 const unsigned int VEC_SVE_DATA = 2;
1507 const unsigned int VEC_SVE_PRED = 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT = 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1513 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
1517 static unsigned int
1518 aarch64_classify_vector_mode (machine_mode mode)
1520 if (aarch64_advsimd_struct_mode_p (mode))
1521 return VEC_ADVSIMD | VEC_STRUCT;
1523 if (aarch64_sve_pred_mode_p (mode))
1524 return VEC_SVE_PRED;
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1529 switch (mode)
1531 /* Single SVE vectors. */
1532 case E_VNx16QImode:
1533 case E_VNx8HImode:
1534 case E_VNx4SImode:
1535 case E_VNx2DImode:
1536 case E_VNx8HFmode:
1537 case E_VNx4SFmode:
1538 case E_VNx2DFmode:
1539 return TARGET_SVE ? VEC_SVE_DATA : 0;
1541 /* x2 SVE vectors. */
1542 case E_VNx32QImode:
1543 case E_VNx16HImode:
1544 case E_VNx8SImode:
1545 case E_VNx4DImode:
1546 case E_VNx16HFmode:
1547 case E_VNx8SFmode:
1548 case E_VNx4DFmode:
1549 /* x3 SVE vectors. */
1550 case E_VNx48QImode:
1551 case E_VNx24HImode:
1552 case E_VNx12SImode:
1553 case E_VNx6DImode:
1554 case E_VNx24HFmode:
1555 case E_VNx12SFmode:
1556 case E_VNx6DFmode:
1557 /* x4 SVE vectors. */
1558 case E_VNx64QImode:
1559 case E_VNx32HImode:
1560 case E_VNx16SImode:
1561 case E_VNx8DImode:
1562 case E_VNx32HFmode:
1563 case E_VNx16SFmode:
1564 case E_VNx8DFmode:
1565 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1567 /* 64-bit Advanced SIMD vectors. */
1568 case E_V8QImode:
1569 case E_V4HImode:
1570 case E_V2SImode:
1571 /* ...E_V1DImode doesn't exist. */
1572 case E_V4HFmode:
1573 case E_V2SFmode:
1574 case E_V1DFmode:
1575 /* 128-bit Advanced SIMD vectors. */
1576 case E_V16QImode:
1577 case E_V8HImode:
1578 case E_V4SImode:
1579 case E_V2DImode:
1580 case E_V8HFmode:
1581 case E_V4SFmode:
1582 case E_V2DFmode:
1583 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1585 default:
1586 return 0;
1590 /* Return true if MODE is any of the data vector modes, including
1591 structure modes. */
1592 static bool
1593 aarch64_vector_data_mode_p (machine_mode mode)
1595 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1600 static bool
1601 aarch64_sve_data_mode_p (machine_mode mode)
1603 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1606 /* Implement target hook TARGET_ARRAY_MODE. */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1610 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1611 && IN_RANGE (nelems, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode),
1613 GET_MODE_NUNITS (mode) * nelems);
1615 return opt_machine_mode ();
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1619 static bool
1620 aarch64_array_mode_supported_p (machine_mode mode,
1621 unsigned HOST_WIDE_INT nelems)
1623 if (TARGET_SIMD
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1626 && (nelems >= 2 && nelems <= 4))
1627 return true;
1629 return false;
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
1635 opt_machine_mode
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1638 if (TARGET_SVE)
1640 if (elem_nbytes == 1)
1641 return VNx16BImode;
1642 if (elem_nbytes == 2)
1643 return VNx8BImode;
1644 if (elem_nbytes == 4)
1645 return VNx4BImode;
1646 if (elem_nbytes == 8)
1647 return VNx2BImode;
1649 return opt_machine_mode ();
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1657 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1659 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1660 machine_mode pred_mode;
1661 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1662 return pred_mode;
1665 return default_get_mask_mode (nunits, nbytes);
1668 /* Return the integer element mode associated with SVE mode MODE. */
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode)
1673 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1674 GET_MODE_NUNITS (mode));
1675 return int_mode_for_size (elt_bits, 0).require ();
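/* Worked example (illustration only): for VNx4SFmode the element size is
   BITS_PER_SVE_VECTOR / GET_MODE_NUNITS == 32 bits, so the result is
   SImode; likewise VNx8HFmode maps to HImode and VNx2DFmode to DImode.  */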
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1684 natural. */
1686 static tree
1687 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1689 return nops == 3 ? ops[2] : ops[0];
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
1694 static unsigned int
1695 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1702 switch (aarch64_regno_regclass (regno))
1704 case FP_REGS:
1705 case FP_LO_REGS:
1706 case FP_LO8_REGS:
1707 if (aarch64_sve_data_mode_p (mode))
1708 return exact_div (GET_MODE_SIZE (mode),
1709 BYTES_PER_SVE_VECTOR).to_constant ();
1710 return CEIL (lowest_size, UNITS_PER_VREG);
1711 case PR_REGS:
1712 case PR_LO_REGS:
1713 case PR_HI_REGS:
1714 return 1;
1715 default:
1716 return CEIL (lowest_size, UNITS_PER_WORD);
1718 gcc_unreachable ();
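/* Worked examples (illustration only): a single SVE vector such as
   VNx4SImode occupies exactly one FP/SVE register (its size divides
   evenly by BYTES_PER_SVE_VECTOR), and a x3 tuple such as VNx12SImode
   occupies three; the Advanced SIMD structure mode OImode (32 bytes)
   needs CEIL (32, UNITS_PER_VREG) == 2 FP registers; TImode in general
   registers needs CEIL (16, UNITS_PER_WORD) == 2.  */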
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1723 static bool
1724 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1726 if (GET_MODE_CLASS (mode) == MODE_CC)
1727 return regno == CC_REGNUM;
1729 if (regno == VG_REGNUM)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode == DImode;
1733 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1734 if (vec_flags & VEC_SVE_PRED)
1735 return PR_REGNUM_P (regno);
1737 if (PR_REGNUM_P (regno))
1738 return 0;
1740 if (regno == SP_REGNUM)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode == Pmode || mode == ptr_mode;
1746 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1747 return mode == Pmode;
1749 if (GP_REGNUM_P (regno))
1751 if (known_le (GET_MODE_SIZE (mode), 8))
1752 return true;
1753 else if (known_le (GET_MODE_SIZE (mode), 16))
1754 return (regno & 1) == 0;
1756 else if (FP_REGNUM_P (regno))
1758 if (vec_flags & VEC_STRUCT)
1759 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1760 else
1761 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1764 return false;
1767 /* Return true if this is a definition of a vectorized simd function. */
1769 static bool
1770 aarch64_simd_decl_p (tree fndecl)
1772 tree fntype;
1774 if (fndecl == NULL)
1775 return false;
1776 fntype = TREE_TYPE (fndecl);
1777 if (fntype == NULL)
1778 return false;
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1782 return true;
1784 return false;
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
1790 SIMD functions. */
1792 static machine_mode
1793 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1795 return GP_REGNUM_P (regno)
1796 ? E_DImode
1797 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
1802 the function. */
1804 static bool
1805 aarch64_simd_call_p (rtx_insn *insn)
1807 rtx symbol;
1808 rtx call;
1809 tree fndecl;
1811 gcc_assert (CALL_P (insn));
1812 call = get_call_rtx_from (insn);
1813 symbol = XEXP (XEXP (call, 0), 0);
1814 if (GET_CODE (symbol) != SYMBOL_REF)
1815 return false;
1816 fndecl = SYMBOL_REF_DECL (symbol);
1817 if (!fndecl)
1818 return false;
1820 return aarch64_simd_decl_p (fndecl);
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
1827 void
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1829 HARD_REG_SET *return_set)
1831 if (aarch64_simd_call_p (insn))
1833 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno))
1835 CLEAR_HARD_REG_BIT (*return_set, regno);
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
1843 static bool
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1845 machine_mode mode)
1847 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1848 return FP_REGNUM_P (regno)
1849 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
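/* Worked example (illustration only): for a TFmode value (16 bytes) in an
   FP register the hook returns false when INSN calls a known
   aarch64_vector_pcs function, since maybe_gt (16, 16) is false, but true
   for an ordinary call, where only the low 8 bytes are preserved.  */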
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1854 rtx_insn *
1855 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1857 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1859 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1860 return call_1;
1861 else
1862 return call_2;
1865 /* Implement REGMODE_NATURAL_SIZE. */
1866 poly_uint64
1867 aarch64_regmode_natural_size (machine_mode mode)
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg.is_constant ())
1878 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1879 if (vec_flags & VEC_SVE_PRED)
1880 return BYTES_PER_SVE_PRED;
1881 if (vec_flags & VEC_SVE_DATA)
1882 return BYTES_PER_SVE_VECTOR;
1884 return UNITS_PER_WORD;
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1888 machine_mode
1889 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1890 machine_mode mode)
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno))
1897 return mode;
1898 if (known_ge (GET_MODE_SIZE (mode), 4))
1899 return mode;
1900 else
1901 return SImode;
1904 /* Return true if I's bits are consecutive ones from the MSB. */
1905 bool
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1908 return exact_log2 (-i) != HOST_WIDE_INT_M1;
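/* For example, 0xffffffffffff0000 (-0x10000) passes this test because
   -i == 0x10000 is a power of two, so the value is 48 leading ones
   followed by zeros.  0xffff00000000ffff fails: -i is not a power of
   two, so exact_log2 returns -1 and the function returns false.  */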
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1917 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1918 return MAX (align, BITS_PER_WORD);
1919 return align;
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (i.e. called via a register). */
1924 static bool
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1927 return false;
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (i.e. called via a register). */
1932 bool
1933 aarch64_is_long_call_p (rtx sym)
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1938 /* Return true if calls to symbol-ref SYM should not go through
1939 plt stubs. */
1941 bool
1942 aarch64_is_noplt_call_p (rtx sym)
1944 const_tree decl = SYMBOL_REF_DECL (sym);
1946 if (flag_pic
1947 && decl
1948 && (!flag_plt
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1950 && !targetm.binds_local_p (decl))
1951 return true;
1953 return false;
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the parameters from
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1961 bool
1962 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1963 rtx extract_imm)
1965 HOST_WIDE_INT mult_val, extract_val;
1967 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1968 return false;
1970 mult_val = INTVAL (mult_imm);
1971 extract_val = INTVAL (extract_imm);
1973 if (extract_val > 8
1974 && extract_val < GET_MODE_BITSIZE (mode)
1975 && exact_log2 (extract_val & ~7) > 0
1976 && (extract_val & 7) <= 4
1977 && mult_val == (1 << (extract_val & 7)))
1978 return true;
1980 return false;
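/* A worked example: with MODE == DImode, EXTRACT_IMM == 34 and
   MULT_IMM == 4, we have 34 & ~7 == 32 (a power of two), 34 & 7 == 2
   and 4 == 1 << 2, so the function returns true.  The extract then
   describes, roughly, a 32-bit value that has been scaled by 4 and
   extended to 64 bits, i.e. an extend-and-shift operand.  */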
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn *
1986 emit_set_insn (rtx x, rtx y)
1988 return emit_insn (gen_rtx_SET (x, y));
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
1993 rtx
1994 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1996 machine_mode mode = SELECT_CC_MODE (code, x, y);
1997 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1999 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2000 return cc_reg;
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2005 static rtx
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2007 machine_mode y_mode)
2009 if (y_mode == E_QImode || y_mode == E_HImode)
2011 if (CONST_INT_P (y))
2012 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2013 else
2015 rtx t, cc_reg;
2016 machine_mode cc_mode;
2018 t = gen_rtx_ZERO_EXTEND (SImode, y);
2019 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2020 cc_mode = CC_SWPmode;
2021 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2022 emit_set_insn (cc_reg, t);
2023 return cc_reg;
2027 return aarch64_gen_compare_reg (code, x, y);
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
2032 static GTY(()) rtx tls_get_addr_libfunc;
2034 rtx
2035 aarch64_tls_get_addr (void)
2037 if (!tls_get_addr_libfunc)
2038 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc;
2042 /* Return the TLS model to use for ADDR. */
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr)
2047 enum tls_model tls_kind = TLS_MODEL_NONE;
2048 if (GET_CODE (addr) == CONST)
2050 poly_int64 addend;
2051 rtx sym = strip_offset (addr, &addend);
2052 if (GET_CODE (sym) == SYMBOL_REF)
2053 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2055 else if (GET_CODE (addr) == SYMBOL_REF)
2056 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2058 return tls_kind;
2061 /* We'll allow lo_sum's in addresses in our legitimate addresses
2062 so that combine would take care of combining addresses where
2063 necessary, but for generation purposes, we'll generate the address
2064 as :
2065 RTL Absolute
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2070 PIC TLS
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2073 bl __tls_get_addr
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2081 bl __tls_get_addr
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2087 blr tmp
2088 mrs tp, tpidr_el0
2089 add dest, dest, tp
2091 Initial Exec:
2092 mrs tp, tpidr_el0
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2095 add dest, dest, tp
2097 Local Exec:
2098 mrs tp, tpidr_el0
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
2103 static void
2104 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2105 enum aarch64_symbol_type type)
2107 switch (type)
2109 case SYMBOL_SMALL_ABSOLUTE:
2111 /* In ILP32, the mode of dest can be either SImode or DImode. */
2112 rtx tmp_reg = dest;
2113 machine_mode mode = GET_MODE (dest);
2115 gcc_assert (mode == Pmode || mode == ptr_mode);
2117 if (can_create_pseudo_p ())
2118 tmp_reg = gen_reg_rtx (mode);
2120 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2121 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2122 return;
2125 case SYMBOL_TINY_ABSOLUTE:
2126 emit_insn (gen_rtx_SET (dest, imm));
2127 return;
2129 case SYMBOL_SMALL_GOT_28K:
2131 machine_mode mode = GET_MODE (dest);
2132 rtx gp_rtx = pic_offset_table_rtx;
2133 rtx insn;
2134 rtx mem;
2136 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2137 here before RTL expansion: the tree IVOPTs pass generates RTL
2138 patterns to compute rtx costs, in which case pic_offset_table_rtx
2139 is not initialized. There is then no need to generate the first
2140 adrp instruction, as the final cost of a global variable access
2141 is one instruction. */
2142 if (gp_rtx != NULL)
2144 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
2145 use the page base as the GOT base, the first page may be wasted;
2146 in the worst case only 28K of space is left for the GOT).
2148 The instruction sequence generated for accessing a global variable is:
2151 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2153 Only one instruction is needed, but we must initialize
2154 pic_offset_table_rtx properly. We generate the initialization insn
2155 for every global access and rely on CSE to remove the redundant ones.
2157 The final instruction sequence will look like the following
2158 for multiple global variable accesses.
2160 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2162 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2163 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2164 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2165 ... */
2167 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2168 crtl->uses_pic_offset_table = 1;
2169 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2171 if (mode != GET_MODE (gp_rtx))
2172 gp_rtx = gen_lowpart (mode, gp_rtx);
2176 if (mode == ptr_mode)
2178 if (mode == DImode)
2179 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2180 else
2181 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2183 mem = XVECEXP (SET_SRC (insn), 0, 0);
2185 else
2187 gcc_assert (mode == Pmode);
2189 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2190 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2193 /* The operand is expected to be a MEM. Whenever the related insn
2194 pattern changes, the code above that calculates MEM should be
2195 updated. */
2196 gcc_assert (GET_CODE (mem) == MEM);
2197 MEM_READONLY_P (mem) = 1;
2198 MEM_NOTRAP_P (mem) = 1;
2199 emit_insn (insn);
2200 return;
2203 case SYMBOL_SMALL_GOT_4G:
2205 /* In ILP32, the mode of dest can be either SImode or DImode,
2206 while the got entry is always of SImode size. The mode of
2207 dest depends on how dest is used: if dest is assigned to a
2208 pointer (e.g. in the memory), it has SImode; it may have
2209 DImode if dest is dereferenced to access the memory.
2210 This is why we have to handle three different ldr_got_small
2211 patterns here (two patterns for ILP32). */
2213 rtx insn;
2214 rtx mem;
2215 rtx tmp_reg = dest;
2216 machine_mode mode = GET_MODE (dest);
2218 if (can_create_pseudo_p ())
2219 tmp_reg = gen_reg_rtx (mode);
2221 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2222 if (mode == ptr_mode)
2224 if (mode == DImode)
2225 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2226 else
2227 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2229 mem = XVECEXP (SET_SRC (insn), 0, 0);
2231 else
2233 gcc_assert (mode == Pmode);
2235 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2236 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2239 gcc_assert (GET_CODE (mem) == MEM);
2240 MEM_READONLY_P (mem) = 1;
2241 MEM_NOTRAP_P (mem) = 1;
2242 emit_insn (insn);
2243 return;
2246 case SYMBOL_SMALL_TLSGD:
2248 rtx_insn *insns;
2249 machine_mode mode = GET_MODE (dest);
2250 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2252 start_sequence ();
2253 if (TARGET_ILP32)
2254 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2255 else
2256 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2257 insns = get_insns ();
2258 end_sequence ();
2260 RTL_CONST_CALL_P (insns) = 1;
2261 emit_libcall_block (insns, dest, result, imm);
2262 return;
2265 case SYMBOL_SMALL_TLSDESC:
2267 machine_mode mode = GET_MODE (dest);
2268 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2269 rtx tp;
2271 gcc_assert (mode == Pmode || mode == ptr_mode);
2273 /* In ILP32, the got entry is always of SImode size. Unlike
2274 small GOT, the dest is fixed at reg 0. */
2275 if (TARGET_ILP32)
2276 emit_insn (gen_tlsdesc_small_si (imm));
2277 else
2278 emit_insn (gen_tlsdesc_small_di (imm));
2279 tp = aarch64_load_tp (NULL);
2281 if (mode != Pmode)
2282 tp = gen_lowpart (mode, tp);
2284 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2285 if (REG_P (dest))
2286 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2287 return;
2290 case SYMBOL_SMALL_TLSIE:
2292 /* In ILP32, the mode of dest can be either SImode or DImode,
2293 while the got entry is always of SImode size. The mode of
2294 dest depends on how dest is used: if dest is assigned to a
2295 pointer (e.g. in the memory), it has SImode; it may have
2296 DImode if dest is dereferenced to access the memory.
2297 This is why we have to handle three different tlsie_small
2298 patterns here (two patterns for ILP32). */
2299 machine_mode mode = GET_MODE (dest);
2300 rtx tmp_reg = gen_reg_rtx (mode);
2301 rtx tp = aarch64_load_tp (NULL);
2303 if (mode == ptr_mode)
2305 if (mode == DImode)
2306 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2307 else
2309 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2310 tp = gen_lowpart (mode, tp);
2313 else
2315 gcc_assert (mode == Pmode);
2316 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2319 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2320 if (REG_P (dest))
2321 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2322 return;
2325 case SYMBOL_TLSLE12:
2326 case SYMBOL_TLSLE24:
2327 case SYMBOL_TLSLE32:
2328 case SYMBOL_TLSLE48:
2330 machine_mode mode = GET_MODE (dest);
2331 rtx tp = aarch64_load_tp (NULL);
2333 if (mode != Pmode)
2334 tp = gen_lowpart (mode, tp);
2336 switch (type)
2338 case SYMBOL_TLSLE12:
2339 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2340 (dest, tp, imm));
2341 break;
2342 case SYMBOL_TLSLE24:
2343 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2344 (dest, tp, imm));
2345 break;
2346 case SYMBOL_TLSLE32:
2347 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2348 (dest, imm));
2349 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2350 (dest, dest, tp));
2351 break;
2352 case SYMBOL_TLSLE48:
2353 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2354 (dest, imm));
2355 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2356 (dest, dest, tp));
2357 break;
2358 default:
2359 gcc_unreachable ();
2362 if (REG_P (dest))
2363 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2364 return;
2367 case SYMBOL_TINY_GOT:
2368 emit_insn (gen_ldr_got_tiny (dest, imm));
2369 return;
2371 case SYMBOL_TINY_TLSIE:
2373 machine_mode mode = GET_MODE (dest);
2374 rtx tp = aarch64_load_tp (NULL);
2376 if (mode == ptr_mode)
2378 if (mode == DImode)
2379 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2380 else
2382 tp = gen_lowpart (mode, tp);
2383 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2386 else
2388 gcc_assert (mode == Pmode);
2389 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2392 if (REG_P (dest))
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2394 return;
2397 default:
2398 gcc_unreachable ();
2402 /* Emit a move from SRC to DEST. Assume that the move expanders can
2403 handle all moves if !can_create_pseudo_p (). The distinction is
2404 important because, unlike emit_move_insn, the move expanders know
2405 how to force Pmode objects into the constant pool even when the
2406 constant pool address is not itself legitimate. */
2407 static rtx
2408 aarch64_emit_move (rtx dest, rtx src)
2410 return (can_create_pseudo_p ()
2411 ? emit_move_insn (dest, src)
2412 : emit_move_insn_1 (dest, src));
2415 /* Apply UNOPTAB to OP and store the result in DEST. */
2417 static void
2418 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2420 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2421 if (dest != tmp)
2422 emit_move_insn (dest, tmp);
2425 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2427 static void
2428 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2430 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2431 OPTAB_DIRECT);
2432 if (dest != tmp)
2433 emit_move_insn (dest, tmp);
2436 /* Split a 128-bit move operation into two 64-bit move operations,
2437 taking care to handle partial overlap of register to register
2438 copies. Special cases are needed when moving between GP regs and
2439 FP regs. SRC can be a register, constant or memory; DST a register
2440 or memory. If either operand is memory it must not have any side
2441 effects. */
2442 void
2443 aarch64_split_128bit_move (rtx dst, rtx src)
2445 rtx dst_lo, dst_hi;
2446 rtx src_lo, src_hi;
2448 machine_mode mode = GET_MODE (dst);
2450 gcc_assert (mode == TImode || mode == TFmode);
2451 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2452 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2454 if (REG_P (dst) && REG_P (src))
2456 int src_regno = REGNO (src);
2457 int dst_regno = REGNO (dst);
2459 /* Handle FP <-> GP regs. */
2460 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2462 src_lo = gen_lowpart (word_mode, src);
2463 src_hi = gen_highpart (word_mode, src);
2465 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2466 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2467 return;
2469 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2471 dst_lo = gen_lowpart (word_mode, dst);
2472 dst_hi = gen_highpart (word_mode, dst);
2474 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2475 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2476 return;
2480 dst_lo = gen_lowpart (word_mode, dst);
2481 dst_hi = gen_highpart (word_mode, dst);
2482 src_lo = gen_lowpart (word_mode, src);
2483 src_hi = gen_highpart_mode (word_mode, mode, src);
2485 /* At most one pairing may overlap. */
2486 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2488 aarch64_emit_move (dst_hi, src_hi);
2489 aarch64_emit_move (dst_lo, src_lo);
2491 else
2493 aarch64_emit_move (dst_lo, src_lo);
2494 aarch64_emit_move (dst_hi, src_hi);
2498 bool
2499 aarch64_split_128bit_move_p (rtx dst, rtx src)
2501 return (! REG_P (src)
2502 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2505 /* Split a complex SIMD combine. */
2507 void
2508 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2510 machine_mode src_mode = GET_MODE (src1);
2511 machine_mode dst_mode = GET_MODE (dst);
2513 gcc_assert (VECTOR_MODE_P (dst_mode));
2514 gcc_assert (register_operand (dst, dst_mode)
2515 && register_operand (src1, src_mode)
2516 && register_operand (src2, src_mode));
2518 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2519 return;
2522 /* Split a complex SIMD move. */
2524 void
2525 aarch64_split_simd_move (rtx dst, rtx src)
2527 machine_mode src_mode = GET_MODE (src);
2528 machine_mode dst_mode = GET_MODE (dst);
2530 gcc_assert (VECTOR_MODE_P (dst_mode));
2532 if (REG_P (dst) && REG_P (src))
2534 gcc_assert (VECTOR_MODE_P (src_mode));
2535 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2539 bool
2540 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2541 machine_mode ymode, rtx y)
2543 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2544 gcc_assert (r != NULL);
2545 return rtx_equal_p (x, r);
2549 /* Return TARGET if it is nonnull and a register of mode MODE.
2550 Otherwise, return a fresh register of mode MODE if we can,
2551 or TARGET reinterpreted as MODE if we can't. */
2553 static rtx
2554 aarch64_target_reg (rtx target, machine_mode mode)
2556 if (target && REG_P (target) && GET_MODE (target) == mode)
2557 return target;
2558 if (!can_create_pseudo_p ())
2560 gcc_assert (target);
2561 return gen_lowpart (mode, target);
2563 return gen_reg_rtx (mode);
2566 /* Return a register that contains the constant in BUILDER, given that
2567 the constant is a legitimate move operand. Use TARGET as the register
2568 if it is nonnull and convenient. */
2570 static rtx
2571 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2573 rtx src = builder.build ();
2574 target = aarch64_target_reg (target, GET_MODE (src));
2575 emit_insn (gen_rtx_SET (target, src));
2576 return target;
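/* Return a register that contains VALUE.  If we can still create pseudo
   registers, force VALUE into a (possibly new) register of mode MODE.
   Otherwise move VALUE into the existing register X, which must be
   nonnull, and return X.  */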
2579 static rtx
2580 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2582 if (can_create_pseudo_p ())
2583 return force_reg (mode, value);
2584 else
2586 gcc_assert (x);
2587 aarch64_emit_move (x, value);
2588 return x;
2592 /* Return true if predicate value X is a constant in which every element
2593 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2594 value, i.e. as a predicate in which all bits are significant. */
2596 static bool
2597 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2599 if (GET_CODE (x) != CONST_VECTOR)
2600 return false;
2602 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2603 GET_MODE_NUNITS (GET_MODE (x)));
2604 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2605 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2606 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2608 unsigned int nelts = const_vector_encoded_nelts (x);
2609 for (unsigned int i = 0; i < nelts; ++i)
2611 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2612 if (!CONST_INT_P (elt))
2613 return false;
2615 builder.quick_push (elt);
2616 for (unsigned int j = 1; j < factor; ++j)
2617 builder.quick_push (const0_rtx);
2619 builder.finalize ();
2620 return true;
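/* For example, if X has mode VNx4BImode then FACTOR above is 4, so each
   element of X contributes four elements to the VNx16BImode encoding:
   the original element followed by three explicit zeros.  This makes
   every bit of the wider predicate value significant, as required.  */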
2623 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2624 widest predicate element size it can have (that is, the largest size
2625 for which each element would still be 0 or 1). */
2627 unsigned int
2628 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2630 /* Start with the most optimistic assumption: that we only need
2631 one bit per pattern. This is what we will use if only the first
2632 bit in each pattern is ever set. */
2633 unsigned int mask = GET_MODE_SIZE (DImode);
2634 mask |= builder.npatterns ();
2636 /* Look for set bits. */
2637 unsigned int nelts = builder.encoded_nelts ();
2638 for (unsigned int i = 1; i < nelts; ++i)
2639 if (INTVAL (builder.elt (i)) != 0)
2641 if (i & 1)
2642 return 1;
2643 mask |= i;
2645 return mask & -mask;
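/* For example, an all-true VNx8BImode predicate, once rewritten as a
   VNx16BI constant by aarch64_get_sve_pred_bits, is encoded as two
   patterns of one element each: a set bit followed by a clear bit.
   No set bit occurs at an odd index, so MASK becomes 8 | 2 and the
   function returns 2, i.e. a 16-bit predicate element size.  If any
   odd-indexed bit had been set, the result would have been 1.  */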
2648 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2649 that the constant would have with predicate element size ELT_SIZE
2650 (ignoring the upper bits in each element) and return:
2652 * -1 if all bits are set
2653 * N if the predicate has N leading set bits followed by all clear bits
2654 * 0 if the predicate does not have any of these forms. */
2656 int
2657 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2658 unsigned int elt_size)
2660 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2661 followed by set bits. */
2662 if (builder.nelts_per_pattern () == 3)
2663 return 0;
2665 /* Skip over leading set bits. */
2666 unsigned int nelts = builder.encoded_nelts ();
2667 unsigned int i = 0;
2668 for (; i < nelts; i += elt_size)
2669 if (INTVAL (builder.elt (i)) == 0)
2670 break;
2671 unsigned int vl = i / elt_size;
2673 /* Check for the all-true case. */
2674 if (i == nelts)
2675 return -1;
2677 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2678 repeating pattern of set bits followed by clear bits. */
2679 if (builder.nelts_per_pattern () != 2)
2680 return 0;
2682 /* We have a "foreground" value and a duplicated "background" value.
2683 If the background might repeat and the last set bit belongs to it,
2684 we might have set bits followed by clear bits followed by set bits. */
2685 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2686 return 0;
2688 /* Make sure that the rest are all clear. */
2689 for (; i < nelts; i += elt_size)
2690 if (INTVAL (builder.elt (i)) != 0)
2691 return 0;
2693 return vl;
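/* For example, when viewed with ELT_SIZE == 2, a predicate whose first
   three 16-bit elements are set and whose remaining elements are clear
   yields 3; an all-true predicate yields -1; and a predicate in which
   set elements reappear after the leading run yields 0.  */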
2696 /* See if there is an svpattern that encodes an SVE predicate of mode
2697 PRED_MODE in which the first VL bits are set and the rest are clear.
2698 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2699 A VL of -1 indicates an all-true vector. */
2701 aarch64_svpattern
2702 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2704 if (vl < 0)
2705 return AARCH64_SV_ALL;
2707 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2708 return AARCH64_NUM_SVPATTERNS;
2710 if (vl >= 1 && vl <= 8)
2711 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2713 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2714 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2716 int max_vl;
2717 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2719 if (vl == (max_vl / 3) * 3)
2720 return AARCH64_SV_MUL3;
2721 /* These would only trigger for non-power-of-2 lengths. */
2722 if (vl == (max_vl & -4))
2723 return AARCH64_SV_MUL4;
2724 if (vl == (1 << floor_log2 (max_vl)))
2725 return AARCH64_SV_POW2;
2726 if (vl == max_vl)
2727 return AARCH64_SV_ALL;
2729 return AARCH64_NUM_SVPATTERNS;
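/* For example, VL == -1 gives AARCH64_SV_ALL, VL values of 1-8 map to
   AARCH64_SV_VL1 through AARCH64_SV_VL8, and power-of-two VL values of
   16-256 map to AARCH64_SV_VL16 through AARCH64_SV_VL256.  If the number
   of elements in PRED_MODE is a compile-time constant, say 32, then a
   VL of 30 additionally matches AARCH64_SV_MUL3, since 30 == (32 / 3) * 3.
   Anything else yields AARCH64_NUM_SVPATTERNS.  */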
2732 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2733 bits has the lowest bit set and the upper bits clear. This is the
2734 VNx16BImode equivalent of a PTRUE for controlling elements of
2735 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2736 all bits are significant, even the upper zeros. */
2738 rtx
2739 aarch64_ptrue_all (unsigned int elt_size)
2741 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2742 builder.quick_push (const1_rtx);
2743 for (unsigned int i = 1; i < elt_size; ++i)
2744 builder.quick_push (const0_rtx);
2745 return builder.build ();
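/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BImode
   sequence { 1, 0, 0, 0, ... }, which acts as a PTRUE for 32-bit (.S)
   elements with the padding bits of each element explicitly zero.  */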
2748 /* Return an all-true predicate register of mode MODE. */
2750 rtx
2751 aarch64_ptrue_reg (machine_mode mode)
2753 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2754 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2755 return gen_lowpart (mode, reg);
2758 /* Return an all-false predicate register of mode MODE. */
2760 rtx
2761 aarch64_pfalse_reg (machine_mode mode)
2763 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2764 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2765 return gen_lowpart (mode, reg);
2768 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2769 true, or alternatively if we know that the operation predicated by
2770 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2771 aarch64_sve_gp_strictness operand that describes the operation
2772 predicated by PRED1[0]. */
2774 bool
2775 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2777 machine_mode mode = GET_MODE (pred2);
2778 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2779 && mode == GET_MODE (pred1[0])
2780 && aarch64_sve_gp_strictness (pred1[1], SImode));
2781 return (pred1[0] == CONSTM1_RTX (mode)
2782 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2783 || rtx_equal_p (pred1[0], pred2));
2786 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2787 for it. PRED2[0] is the predicate for the instruction whose result
2788 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2789 for it. Return true if we can prove that the two predicates are
2790 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2791 with PRED1[0] without changing behavior. */
2793 bool
2794 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2796 machine_mode mode = GET_MODE (pred1[0]);
2797 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2798 && mode == GET_MODE (pred2[0])
2799 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2800 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2802 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2803 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2804 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2805 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2806 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2809 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2810 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2811 Use TARGET as the target register if nonnull and convenient. */
2813 static rtx
2814 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2815 machine_mode data_mode, rtx op1, rtx op2)
2817 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2818 expand_operand ops[5];
2819 create_output_operand (&ops[0], target, pred_mode);
2820 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2821 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2822 create_input_operand (&ops[3], op1, data_mode);
2823 create_input_operand (&ops[4], op2, data_mode);
2824 expand_insn (icode, 5, ops);
2825 return ops[0].value;
2828 /* Use a comparison to convert integer vector SRC into MODE, which is
2829 the corresponding SVE predicate mode. Use TARGET for the result
2830 if it's nonnull and convenient. */
2832 static rtx
2833 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2835 machine_mode src_mode = GET_MODE (src);
2836 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2837 src, CONST0_RTX (src_mode));
2840 /* Return true if we can move VALUE into a register using a single
2841 CNT[BHWD] instruction. */
2843 static bool
2844 aarch64_sve_cnt_immediate_p (poly_int64 value)
2846 HOST_WIDE_INT factor = value.coeffs[0];
2847 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2848 return (value.coeffs[1] == factor
2849 && IN_RANGE (factor, 2, 16 * 16)
2850 && (factor & 1) == 0
2851 && factor <= 16 * (factor & -factor));
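/* For example, the number of bytes in a full SVE vector is the
   poly_int64 (16, 16), so its FACTOR is 16 and it can be loaded with a
   single CNTB.  A FACTOR of 6 is also fine (CNTD with a multiplier of
   3), but a FACTOR of 34 is not, because 34 exceeds 16 times its lowest
   set bit (2).  */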
2854 /* Likewise for rtx X. */
2856 bool
2857 aarch64_sve_cnt_immediate_p (rtx x)
2859 poly_int64 value;
2860 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2863 /* Return the asm string for an instruction with a CNT-like vector size
2864 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2865 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2866 first part of the operands template (the part that comes before the
2867 vector size itself). FACTOR is the number of quadwords.
2868 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2869 If it is zero, we can use any element size. */
2871 static char *
2872 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2873 unsigned int factor,
2874 unsigned int nelts_per_vq)
2876 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2878 if (nelts_per_vq == 0)
2879 /* There is some overlap in the ranges of the four CNT instructions.
2880 Here we always use the smallest possible element size, so that the
2881 multiplier is 1 wherever possible. */
2882 nelts_per_vq = factor & -factor;
2883 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2884 gcc_assert (IN_RANGE (shift, 1, 4));
2885 char suffix = "dwhb"[shift - 1];
2887 factor >>= shift;
2888 unsigned int written;
2889 if (factor == 1)
2890 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2891 prefix, suffix, operands);
2892 else
2893 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2894 prefix, suffix, operands, factor);
2895 gcc_assert (written < sizeof (buffer));
2896 return buffer;
2899 /* Return the asm string for an instruction with a CNT-like vector size
2900 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2901 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2902 first part of the operands template (the part that comes before the
2903 vector size itself). X is the value of the vector size operand,
2904 as a polynomial integer rtx. */
2906 char *
2907 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2908 rtx x)
2910 poly_int64 value = rtx_to_poly_int64 (x);
2911 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2912 return aarch64_output_sve_cnt_immediate (prefix, operands,
2913 value.coeffs[1], 0);
2916 /* Return true if we can add VALUE to a register using a single ADDVL
2917 or ADDPL instruction. */
2919 static bool
2920 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2922 HOST_WIDE_INT factor = value.coeffs[0];
2923 if (factor == 0 || value.coeffs[1] != factor)
2924 return false;
2925 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2926 and a value of 16 is one vector width. */
2927 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2928 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
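/* For example, a FACTOR of 16 corresponds to ADDVL #1 and a FACTOR of 2
   to ADDPL #1.  ADDVL covers multiples of 16 from -32 * 16 to 31 * 16
   and ADDPL covers even factors from -32 * 2 to 31 * 2, matching the
   6-bit signed immediate field of both instructions.  */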
2931 /* Likewise for rtx X. */
2933 bool
2934 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2936 poly_int64 value;
2937 return (poly_int_rtx_p (x, &value)
2938 && aarch64_sve_addvl_addpl_immediate_p (value));
2941 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2942 and storing the result in operand 0. */
2944 char *
2945 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2947 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2948 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2949 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2951 /* Use INC or DEC if possible. */
2952 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2954 if (aarch64_sve_cnt_immediate_p (offset_value))
2955 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2956 offset_value.coeffs[1], 0);
2957 if (aarch64_sve_cnt_immediate_p (-offset_value))
2958 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2959 -offset_value.coeffs[1], 0);
2962 int factor = offset_value.coeffs[1];
2963 if ((factor & 15) == 0)
2964 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2965 else
2966 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2967 return buffer;
2970 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2971 instruction. If it is, store the number of elements in each vector
2972 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2973 factor in *FACTOR_OUT (if nonnull). */
2975 bool
2976 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2977 unsigned int *nelts_per_vq_out)
2979 rtx elt;
2980 poly_int64 value;
2982 if (!const_vec_duplicate_p (x, &elt)
2983 || !poly_int_rtx_p (elt, &value))
2984 return false;
2986 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2987 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2988 /* There's no vector INCB. */
2989 return false;
2991 HOST_WIDE_INT factor = value.coeffs[0];
2992 if (value.coeffs[1] != factor)
2993 return false;
2995 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2996 if ((factor % nelts_per_vq) != 0
2997 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2998 return false;
3000 if (factor_out)
3001 *factor_out = factor;
3002 if (nelts_per_vq_out)
3003 *nelts_per_vq_out = nelts_per_vq;
3004 return true;
3007 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3008 instruction. */
3010 bool
3011 aarch64_sve_inc_dec_immediate_p (rtx x)
3013 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
3016 /* Return the asm template for an SVE vector INC or DEC instruction.
3017 OPERANDS gives the operands before the vector count and X is the
3018 value of the vector count operand itself. */
3020 char *
3021 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
3023 int factor;
3024 unsigned int nelts_per_vq;
3025 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3026 gcc_unreachable ();
3027 if (factor < 0)
3028 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3029 nelts_per_vq);
3030 else
3031 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3032 nelts_per_vq);
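/* For example, a VNx4SImode constant in which every element is the
   poly_int64 (8, 8) has NELTS_PER_VQ == 4 and FACTOR == 8, so the
   routine above prints "incw\t<operands>, all, mul #2"; the negated
   constant would print "decw" instead.  */

/* Move the CONST_INT IMM, which has mode MODE, into DEST, provided that
   GENERATE is true; in either case return the number of instructions
   that the move requires.  */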
3035 static int
3036 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3037 scalar_int_mode mode)
3039 int i;
3040 unsigned HOST_WIDE_INT val, val2, mask;
3041 int one_match, zero_match;
3042 int num_insns;
3044 val = INTVAL (imm);
3046 if (aarch64_move_imm (val, mode))
3048 if (generate)
3049 emit_insn (gen_rtx_SET (dest, imm));
3050 return 1;
3053 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3054 (with XXXX non-zero). In that case check to see if the move can be done in
3055 a smaller mode. */
3056 val2 = val & 0xffffffff;
3057 if (mode == DImode
3058 && aarch64_move_imm (val2, SImode)
3059 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3061 if (generate)
3062 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3064 /* Check if we have to emit a second instruction by checking to see
3065 if any of the upper 32 bits of the original DI mode value is set. */
3066 if (val == val2)
3067 return 1;
3069 i = (val >> 48) ? 48 : 32;
3071 if (generate)
3072 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3073 GEN_INT ((val >> i) & 0xffff)));
3075 return 2;
3078 if ((val >> 32) == 0 || mode == SImode)
3080 if (generate)
3082 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3083 if (mode == SImode)
3084 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3085 GEN_INT ((val >> 16) & 0xffff)));
3086 else
3087 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3088 GEN_INT ((val >> 16) & 0xffff)));
3090 return 2;
3093 /* Remaining cases are all for DImode. */
3095 mask = 0xffff;
3096 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3097 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3098 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3099 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3101 if (zero_match != 2 && one_match != 2)
3103 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3104 For a 64-bit bitmask try whether changing 16 bits to all ones or
3105 zeroes creates a valid bitmask. To check any repeated bitmask,
3106 try using 16 bits from the other 32-bit half of val. */
3108 for (i = 0; i < 64; i += 16, mask <<= 16)
3110 val2 = val & ~mask;
3111 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3112 break;
3113 val2 = val | mask;
3114 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3115 break;
3116 val2 = val2 & ~mask;
3117 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3118 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3119 break;
3121 if (i != 64)
3123 if (generate)
3125 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3126 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3127 GEN_INT ((val >> i) & 0xffff)));
3129 return 2;
3133 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3134 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3135 otherwise skip zero bits. */
3137 num_insns = 1;
3138 mask = 0xffff;
3139 val2 = one_match > zero_match ? ~val : val;
3140 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3142 if (generate)
3143 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3144 ? (val | ~(mask << i))
3145 : (val & (mask << i)))));
3146 for (i += 16; i < 64; i += 16)
3148 if ((val2 & (mask << i)) == 0)
3149 continue;
3150 if (generate)
3151 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3152 GEN_INT ((val >> i) & 0xffff)));
3153 num_insns ++;
3156 return num_insns;
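/* For example, 0x1234000000005678 takes the DImode path above that
   moves the low 32 bits first: the full value is not a valid move
   immediate, but 0x5678 is, and bits [32, 47] of the value are zero,
   so we emit a MOV of 0x5678 followed by a MOVK of 0x1234 into bits
   [48, 63], for a total of two instructions.  */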
3159 /* Return whether imm is a 128-bit immediate which is simple enough to
3160 expand inline. */
3161 bool
3162 aarch64_mov128_immediate (rtx imm)
3164 if (GET_CODE (imm) == CONST_INT)
3165 return true;
3167 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3169 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3170 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3172 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3173 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3177 /* Return the number of temporary registers that aarch64_add_offset_1
3178 would need to add OFFSET to a register. */
3180 static unsigned int
3181 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3183 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3186 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3187 a non-polynomial OFFSET. MODE is the mode of the addition.
3188 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3189 be set and CFA adjustments added to the generated instructions.
3191 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3192 temporary if register allocation is already complete. This temporary
3193 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3194 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3195 the immediate again.
3197 Since this function may be used to adjust the stack pointer, we must
3198 ensure that it cannot cause transient stack deallocation (for example
3199 by first incrementing SP and then decrementing when adjusting by a
3200 large immediate). */
3202 static void
3203 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3204 rtx src, HOST_WIDE_INT offset, rtx temp1,
3205 bool frame_related_p, bool emit_move_imm)
3207 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3208 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3210 HOST_WIDE_INT moffset = abs_hwi (offset);
3211 rtx_insn *insn;
3213 if (!moffset)
3215 if (!rtx_equal_p (dest, src))
3217 insn = emit_insn (gen_rtx_SET (dest, src));
3218 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3220 return;
3223 /* Single instruction adjustment. */
3224 if (aarch64_uimm12_shift (moffset))
3226 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3227 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3228 return;
3231 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3232 and either:
3234 a) the offset cannot be loaded by a 16-bit move or
3235 b) there is no spare register into which we can move it. */
3236 if (moffset < 0x1000000
3237 && ((!temp1 && !can_create_pseudo_p ())
3238 || !aarch64_move_imm (moffset, mode)))
3240 HOST_WIDE_INT low_off = moffset & 0xfff;
3242 low_off = offset < 0 ? -low_off : low_off;
3243 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3244 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3245 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3246 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3247 return;
3250 /* Emit a move immediate if required and an addition/subtraction. */
3251 if (emit_move_imm)
3253 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3254 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3256 insn = emit_insn (offset < 0
3257 ? gen_sub3_insn (dest, src, temp1)
3258 : gen_add3_insn (dest, src, temp1));
3259 if (frame_related_p)
3261 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3262 rtx adj = plus_constant (mode, src, offset);
3263 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3267 /* Return the number of temporary registers that aarch64_add_offset
3268 would need to move OFFSET into a register or add OFFSET to a register;
3269 ADD_P is true if we want the latter rather than the former. */
3271 static unsigned int
3272 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3274 /* This follows the same structure as aarch64_add_offset. */
3275 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3276 return 0;
3278 unsigned int count = 0;
3279 HOST_WIDE_INT factor = offset.coeffs[1];
3280 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3281 poly_int64 poly_offset (factor, factor);
3282 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3283 /* Need one register for the ADDVL/ADDPL result. */
3284 count += 1;
3285 else if (factor != 0)
3287 factor = abs (factor);
3288 if (factor > 16 * (factor & -factor))
3289 /* Need one register for the CNT result and one for the multiplication
3290 factor. If necessary, the second temporary can be reused for the
3291 constant part of the offset. */
3292 return 2;
3293 /* Need one register for the CNT result (which might then
3294 be shifted). */
3295 count += 1;
3297 return count + aarch64_add_offset_1_temporaries (constant);
3300 /* If X can be represented as a poly_int64, return the number
3301 of temporaries that are required to add it to a register.
3302 Return -1 otherwise. */
3304 int
3305 aarch64_add_offset_temporaries (rtx x)
3307 poly_int64 offset;
3308 if (!poly_int_rtx_p (x, &offset))
3309 return -1;
3310 return aarch64_offset_temporaries (true, offset);
3313 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3314 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3315 be set and CFA adjustments added to the generated instructions.
3317 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3318 temporary if register allocation is already complete. This temporary
3319 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3320 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3321 false to avoid emitting the immediate again.
3323 TEMP2, if nonnull, is a second temporary register that doesn't
3324 overlap either DEST or SRC.
3326 Since this function may be used to adjust the stack pointer, we must
3327 ensure that it cannot cause transient stack deallocation (for example
3328 by first incrementing SP and then decrementing when adjusting by a
3329 large immediate). */
3331 static void
3332 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3333 poly_int64 offset, rtx temp1, rtx temp2,
3334 bool frame_related_p, bool emit_move_imm = true)
3336 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3337 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3338 gcc_assert (temp1 == NULL_RTX
3339 || !frame_related_p
3340 || !reg_overlap_mentioned_p (temp1, dest));
3341 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3343 /* Try using ADDVL or ADDPL to add the whole value. */
3344 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3346 rtx offset_rtx = gen_int_mode (offset, mode);
3347 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3348 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3349 return;
3352 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3353 SVE vector register, over and above the minimum size of 128 bits.
3354 This is equivalent to half the value returned by CNTD with a
3355 vector shape of ALL. */
3356 HOST_WIDE_INT factor = offset.coeffs[1];
3357 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3359 /* Try using ADDVL or ADDPL to add the VG-based part. */
3360 poly_int64 poly_offset (factor, factor);
3361 if (src != const0_rtx
3362 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3364 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3365 if (frame_related_p)
3367 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3368 RTX_FRAME_RELATED_P (insn) = true;
3369 src = dest;
3371 else
3373 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3374 src = aarch64_force_temporary (mode, temp1, addr);
3375 temp1 = temp2;
3376 temp2 = NULL_RTX;
3379 /* Otherwise use a CNT-based sequence. */
3380 else if (factor != 0)
3382 /* Use a subtraction if we have a negative factor. */
3383 rtx_code code = PLUS;
3384 if (factor < 0)
3386 factor = -factor;
3387 code = MINUS;
3390 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3391 into the multiplication. */
3392 rtx val;
3393 int shift = 0;
3394 if (factor & 1)
3395 /* Use a right shift by 1. */
3396 shift = -1;
3397 else
3398 factor /= 2;
3399 HOST_WIDE_INT low_bit = factor & -factor;
3400 if (factor <= 16 * low_bit)
3402 if (factor > 16 * 8)
3404 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3405 the value with the minimum multiplier and shift it into
3406 position. */
3407 int extra_shift = exact_log2 (low_bit);
3408 shift += extra_shift;
3409 factor >>= extra_shift;
3411 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3413 else
3415 /* Use CNTD, then multiply it by FACTOR. */
3416 val = gen_int_mode (poly_int64 (2, 2), mode);
3417 val = aarch64_force_temporary (mode, temp1, val);
3419 /* Go back to using a negative multiplication factor if we have
3420 no register from which to subtract. */
3421 if (code == MINUS && src == const0_rtx)
3423 factor = -factor;
3424 code = PLUS;
3426 rtx coeff1 = gen_int_mode (factor, mode);
3427 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3428 val = gen_rtx_MULT (mode, val, coeff1);
3431 if (shift > 0)
3433 /* Multiply by 1 << SHIFT. */
3434 val = aarch64_force_temporary (mode, temp1, val);
3435 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3437 else if (shift == -1)
3439 /* Divide by 2. */
3440 val = aarch64_force_temporary (mode, temp1, val);
3441 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3444 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3445 if (src != const0_rtx)
3447 val = aarch64_force_temporary (mode, temp1, val);
3448 val = gen_rtx_fmt_ee (code, mode, src, val);
3450 else if (code == MINUS)
3452 val = aarch64_force_temporary (mode, temp1, val);
3453 val = gen_rtx_NEG (mode, val);
3456 if (constant == 0 || frame_related_p)
3458 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3459 if (frame_related_p)
3461 RTX_FRAME_RELATED_P (insn) = true;
3462 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3463 gen_rtx_SET (dest, plus_constant (Pmode, src,
3464 poly_offset)));
3466 src = dest;
3467 if (constant == 0)
3468 return;
3470 else
3472 src = aarch64_force_temporary (mode, temp1, val);
3473 temp1 = temp2;
3474 temp2 = NULL_RTX;
3477 emit_move_imm = true;
3480 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3481 frame_related_p, emit_move_imm);
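/* For example, adding the poly_int64 offset (28, 16), i.e. one full
   vector's worth of bytes plus 12, uses the ADDVL path above for the
   (16, 16) part and aarch64_add_offset_1 for the remaining constant,
   giving an ADDVL #1 followed by an ADD #12 (with the exact choice of
   destination and temporary registers depending on FRAME_RELATED_P).  */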
3484 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3485 than a poly_int64. */
3487 void
3488 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3489 rtx offset_rtx, rtx temp1, rtx temp2)
3491 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3492 temp1, temp2, false);
3495 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3496 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3497 if TEMP1 already contains abs (DELTA). */
3499 static inline void
3500 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3502 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3503 temp1, temp2, true, emit_move_imm);
3506 /* Subtract DELTA from the stack pointer, marking the instructions
3507 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3508 if nonnull. */
3510 static inline void
3511 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3512 bool emit_move_imm = true)
3514 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3515 temp1, temp2, frame_related_p, emit_move_imm);
3518 /* Set DEST to (vec_series BASE STEP). */
3520 static void
3521 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3523 machine_mode mode = GET_MODE (dest);
3524 scalar_mode inner = GET_MODE_INNER (mode);
3526 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3527 if (!aarch64_sve_index_immediate_p (base))
3528 base = force_reg (inner, base);
3529 if (!aarch64_sve_index_immediate_p (step))
3530 step = force_reg (inner, step);
3532 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3535 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3536 register of mode MODE. Use TARGET for the result if it's nonnull
3537 and convenient.
3539 The two vector modes must have the same element mode. The behavior
3540 is to duplicate architectural lane N of SRC into architectural lanes
3541 N + I * STEP of the result. On big-endian targets, architectural
3542 lane 0 of an Advanced SIMD vector is the last element of the vector
3543 in memory layout, so for big-endian targets this operation has the
3544 effect of reversing SRC before duplicating it. Callers need to
3545 account for this. */
3547 rtx
3548 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3550 machine_mode src_mode = GET_MODE (src);
3551 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3552 insn_code icode = (BYTES_BIG_ENDIAN
3553 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3554 : code_for_aarch64_vec_duplicate_vq_le (mode));
3556 unsigned int i = 0;
3557 expand_operand ops[3];
3558 create_output_operand (&ops[i++], target, mode);
3559 create_output_operand (&ops[i++], src, src_mode);
3560 if (BYTES_BIG_ENDIAN)
3562 /* Create a PARALLEL describing the reversal of SRC. */
3563 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3564 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3565 nelts_per_vq - 1, -1);
3566 create_fixed_operand (&ops[i++], sel);
3568 expand_insn (icode, i, ops);
3569 return ops[0].value;
3572 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3573 the memory image into DEST. Return true on success. */
3575 static bool
3576 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3578 src = force_const_mem (GET_MODE (src), src);
3579 if (!src)
3580 return false;
3582 /* Make sure that the address is legitimate. */
3583 if (!aarch64_sve_ld1rq_operand_p (src))
3585 rtx addr = force_reg (Pmode, XEXP (src, 0));
3586 src = replace_equiv_address (src, addr);
3589 machine_mode mode = GET_MODE (dest);
3590 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3591 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3592 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3593 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3594 return true;
3597 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3598 SVE data mode and isn't a legitimate constant. Use TARGET for the
3599 result if convenient.
3601 The returned register can have whatever mode seems most natural
3602 given the contents of SRC. */
3604 static rtx
3605 aarch64_expand_sve_const_vector (rtx target, rtx src)
3607 machine_mode mode = GET_MODE (src);
3608 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3609 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3610 scalar_mode elt_mode = GET_MODE_INNER (mode);
3611 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3612 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3614 if (nelts_per_pattern == 1 && encoded_bits == 128)
3616 /* The constant is a duplicated quadword but can't be narrowed
3617 beyond a quadword. Get the memory image of the first quadword
3618 as a 128-bit vector and try using LD1RQ to load it from memory.
3620 The effect for both endiannesses is to load memory lane N into
3621 architectural lanes N + I * STEP of the result. On big-endian
3622 targets, the layout of the 128-bit vector in an Advanced SIMD
3623 register would be different from its layout in an SVE register,
3624 but this 128-bit vector is a memory value only. */
3625 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3626 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3627 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3628 return target;
3631 if (nelts_per_pattern == 1 && encoded_bits < 128)
3633 /* The vector is a repeating sequence of 64 bits or fewer.
3634 See if we can load them using an Advanced SIMD move and then
3635 duplicate it to fill a vector. This is better than using a GPR
3636 move because it keeps everything in the same register file. */
3637 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3638 rtx_vector_builder builder (vq_mode, npatterns, 1);
3639 for (unsigned int i = 0; i < npatterns; ++i)
3641 /* We want memory lane N to go into architectural lane N,
3642 so reverse for big-endian targets. The DUP .Q pattern
3643 has a compensating reverse built-in. */
3644 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3645 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3647 rtx vq_src = builder.build ();
3648 if (aarch64_simd_valid_immediate (vq_src, NULL))
3650 vq_src = force_reg (vq_mode, vq_src);
3651 return aarch64_expand_sve_dupq (target, mode, vq_src);
3654 /* Get an integer representation of the repeating part of Advanced
3655 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3656 which for big-endian targets is lane-swapped wrt a normal
3657 Advanced SIMD vector. This means that for both endiannesses,
3658 memory lane N of SVE vector SRC corresponds to architectural
3659 lane N of a register holding VQ_SRC. This in turn means that
3660 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3661 as a single 128-bit value) and thus that memory lane 0 of SRC is
3662 in the lsb of the integer. Duplicating the integer therefore
3663 ensures that memory lane N of SRC goes into architectural lane
3664 N + I * INDEX of the SVE register. */
3665 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3666 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3667 if (elt_value)
3669 /* Pretend that we had a vector of INT_MODE to start with. */
3670 elt_mode = int_mode;
3671 mode = aarch64_full_sve_mode (int_mode).require ();
3673 /* If the integer can be moved into a general register by a
3674 single instruction, do that and duplicate the result. */
3675 if (CONST_INT_P (elt_value)
3676 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3678 elt_value = force_reg (elt_mode, elt_value);
3679 return expand_vector_broadcast (mode, elt_value);
3682 else if (npatterns == 1)
3683 /* We're duplicating a single value, but can't do better than
3684 force it to memory and load from there. This handles things
3685 like symbolic constants. */
3686 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3688 if (elt_value)
3690 /* Load the element from memory if we can, otherwise move it into
3691 a register and use a DUP. */
3692 rtx op = force_const_mem (elt_mode, elt_value);
3693 if (!op)
3694 op = force_reg (elt_mode, elt_value);
3695 return expand_vector_broadcast (mode, op);
3699 /* Try using INDEX. */
3700 rtx base, step;
3701 if (const_vec_series_p (src, &base, &step))
3703 aarch64_expand_vec_series (target, base, step);
3704 return target;
3707 /* From here on, it's better to force the whole constant to memory
3708 if we can. */
3709 if (GET_MODE_NUNITS (mode).is_constant ())
3710 return NULL_RTX;
3712 /* Expand each pattern individually. */
3713 gcc_assert (npatterns > 1);
3714 rtx_vector_builder builder;
3715 auto_vec<rtx, 16> vectors (npatterns);
3716 for (unsigned int i = 0; i < npatterns; ++i)
3718 builder.new_vector (mode, 1, nelts_per_pattern);
3719 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3720 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3721 vectors.quick_push (force_reg (mode, builder.build ()));
3724 /* Use permutes to interleave the separate vectors. */
3725 while (npatterns > 1)
3727 npatterns /= 2;
3728 for (unsigned int i = 0; i < npatterns; ++i)
3730 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3731 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3732 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3733 vectors[i] = tmp;
3736 gcc_assert (vectors[0] == target);
3737 return target;
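/* A rough standalone illustration of the ZIP1 reduction above, modelled
   on plain integer arrays rather than RTL (sketch only; sketch_zip1 and
   sketch_interleave are hypothetical names, not GCC interfaces).  It
   shows why halving NPATTERNS and zipping vector I with vector
   I + NPATTERNS reconstructs the original element order; four patterns
   end up combined as ZIP1 (ZIP1 (v0, v2), ZIP1 (v1, v3)).  */

static void
sketch_zip1 (const int *a, const int *b, int *out, unsigned int n)
{
  /* ZIP1 interleaves the low halves of its inputs:
     out = { a[0], b[0], a[1], b[1], ... }.  */
  for (unsigned int i = 0; i < n / 2; ++i)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

static void
sketch_interleave (int **vectors, unsigned int npatterns,
                   unsigned int n, int *scratch)
{
  /* Mirror of the permute loop above: vectors[i] initially holds the
     elements of pattern I repeated to fill a vector of N elements, and
     after the loop vectors[0] holds the original constant.  */
  while (npatterns > 1)
    {
      npatterns /= 2;
      for (unsigned int i = 0; i < npatterns; ++i)
        {
          sketch_zip1 (vectors[i], vectors[i + npatterns], scratch, n);
          for (unsigned int j = 0; j < n; ++j)
            vectors[i][j] = scratch[j];
        }
    }
}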
3740 /* Use WHILE to set a predicate register of mode MODE in which the first
3741 VL bits are set and the rest are clear. Use TARGET for the register
3742 if it's nonnull and convenient. */
3744 static rtx
3745 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3746 unsigned int vl)
3748 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3749 target = aarch64_target_reg (target, mode);
3750 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3751 return target;
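/* A rough scalar model of the WHILELO-based move above (sketch only;
   sketch_while_ult is a hypothetical name, not a GCC interface): lane I
   of the resulting predicate is set iff I < VL, which is exactly the
   "first VL bits set" shape described in the comment.  */

static void
sketch_while_ult (unsigned char *pred, unsigned int nelts, unsigned int vl)
{
  for (unsigned int i = 0; i < nelts; ++i)
    pred[i] = (i < vl);
}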
3754 static rtx
3755 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3757 /* BUILDER is a constant predicate in which the index of every set bit
3758 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3759 by inverting every element at a multiple of ELT_SIZE and EORing the
3760 result with an ELT_SIZE PTRUE.
3762 Return a register that contains the constant on success, otherwise
3763 return null. Use TARGET as the register if it is nonnull and
3764 convenient. */
3766 static rtx
3767 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3768 unsigned int elt_size)
3770 /* Invert every element at a multiple of ELT_SIZE, keeping the
3771 other bits zero. */
3772 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3773 builder.nelts_per_pattern ());
3774 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3775 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3776 inv_builder.quick_push (const1_rtx);
3777 else
3778 inv_builder.quick_push (const0_rtx);
3779 inv_builder.finalize ();
3781 /* See if we can load the constant cheaply. */
3782 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3783 if (!inv)
3784 return NULL_RTX;
3786 /* EOR the result with an ELT_SIZE PTRUE. */
3787 rtx mask = aarch64_ptrue_all (elt_size);
3788 mask = force_reg (VNx16BImode, mask);
3789 target = aarch64_target_reg (target, VNx16BImode);
3790 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3791 return target;
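/* A rough array model of the EOR trick above (sketch only;
   sketch_pred_eor is a hypothetical name, not a GCC interface).
   PRED[i] is the predicate bit for byte lane I and, as in the comment
   above, set bits only occur at multiples of ELT_SIZE.  Inverting the
   significant lanes and EORing with an ELT_SIZE PTRUE recovers the
   original predicate.  */

static void
sketch_pred_eor (const unsigned char *pred, unsigned char *out,
                 unsigned int nelts, unsigned int elt_size)
{
  for (unsigned int i = 0; i < nelts; ++i)
    {
      /* PTRUE with ELT_SIZE elements: a bit at every multiple of
         ELT_SIZE.  */
      unsigned char ptrue = (i % elt_size == 0);
      /* The inverted constant: flip the significant lanes, keep the
         rest zero.  */
      unsigned char inv = (i % elt_size == 0) ? !pred[i] : 0;
      /* EOR recovers the original predicate.  */
      out[i] = ptrue ^ inv;
    }
}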
3794 /* BUILDER is a constant predicate in which the index of every set bit
3795 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3796 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3797 register on success, otherwise return null. Use TARGET as the register
3798 if nonnull and convenient. */
3800 static rtx
3801 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3802 unsigned int elt_size,
3803 unsigned int permute_size)
3805 /* We're going to split the constant into two new constants A and B,
3806 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3807 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3809 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3810 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3812 where _ indicates elements that will be discarded by the permute.
3814 First calculate the ELT_SIZEs for A and B. */
3815 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3816 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3817 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3818 if (INTVAL (builder.elt (i)) != 0)
3820 if (i & permute_size)
3821 b_elt_size |= i - permute_size;
3822 else
3823 a_elt_size |= i;
3825 a_elt_size &= -a_elt_size;
3826 b_elt_size &= -b_elt_size;
3828 /* Now construct the vectors themselves. */
3829 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3830 builder.nelts_per_pattern ());
3831 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3832 builder.nelts_per_pattern ());
3833 unsigned int nelts = builder.encoded_nelts ();
3834 for (unsigned int i = 0; i < nelts; ++i)
3835 if (i & (elt_size - 1))
3837 a_builder.quick_push (const0_rtx);
3838 b_builder.quick_push (const0_rtx);
3840 else if ((i & permute_size) == 0)
3842 /* The A and B elements are significant. */
3843 a_builder.quick_push (builder.elt (i));
3844 b_builder.quick_push (builder.elt (i + permute_size));
3846 else
3848 /* The A and B elements are going to be discarded, so pick whatever
3849 is likely to give a nice constant. We are targeting element
3850 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3851 with the aim of each being a sequence of ones followed by
3852 a sequence of zeros. So:
3854 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3855 duplicate the last X_ELT_SIZE element, to extend the
3856 current sequence of ones or zeros.
3858 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3859 zero, so that the constant really does have X_ELT_SIZE and
3860 not a smaller size. */
3861 if (a_elt_size > permute_size)
3862 a_builder.quick_push (const0_rtx);
3863 else
3864 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3865 if (b_elt_size > permute_size)
3866 b_builder.quick_push (const0_rtx);
3867 else
3868 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3870 a_builder.finalize ();
3871 b_builder.finalize ();
3873 /* Try loading A into a register. */
3874 rtx_insn *last = get_last_insn ();
3875 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3876 if (!a)
3877 return NULL_RTX;
3879 /* Try loading B into a register. */
3880 rtx b = a;
3881 if (a_builder != b_builder)
3883 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3884 if (!b)
3886 delete_insns_since (last);
3887 return NULL_RTX;
3891 /* Emit the TRN1 itself. */
3892 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3893 target = aarch64_target_reg (target, mode);
3894 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3895 gen_lowpart (mode, a),
3896 gen_lowpart (mode, b)));
3897 return target;
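/* A rough byte-level model of the TRN1 permute used above (sketch only;
   sketch_trn1 is a hypothetical name, not a GCC interface).  TRN1 takes
   the even-numbered PERMUTE_SIZE elements of A and B and interleaves
   them, which is why the routine above only fills in those element
   positions and treats the odd ones as don't-cares.  */

static void
sketch_trn1 (const unsigned char *a, const unsigned char *b,
             unsigned char *out, unsigned int nbytes,
             unsigned int permute_size)
{
  for (unsigned int i = 0; i < nbytes; ++i)
    {
      unsigned int elt = i / permute_size;
      unsigned int lane = i % permute_size;
      unsigned int src_elt = elt & ~1U;   /* Corresponding even element.  */
      unsigned int src = src_elt * permute_size + lane;
      /* Even result elements come from A, odd ones from B.  */
      out[i] = (elt & 1) ? b[src] : a[src];
    }
}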
3900 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3901 constant in BUILDER into an SVE predicate register. Return the register
3902 on success, otherwise return null. Use TARGET for the register if
3903 nonnull and convenient.
3905 ALLOW_RECURSE_P is true if we can use methods that would call this
3906 function recursively. */
3908 static rtx
3909 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3910 bool allow_recurse_p)
3912 if (builder.encoded_nelts () == 1)
3913 /* A PFALSE or a PTRUE .B ALL. */
3914 return aarch64_emit_set_immediate (target, builder);
3916 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3917 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3919 /* If we can load the constant using PTRUE, use it as-is. */
3920 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3921 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3922 return aarch64_emit_set_immediate (target, builder);
3924 /* Otherwise use WHILE to set the first VL bits. */
3925 return aarch64_sve_move_pred_via_while (target, mode, vl);
3928 if (!allow_recurse_p)
3929 return NULL_RTX;
3931 /* Try inverting the vector in element size ELT_SIZE and then EORing
3932 the result with an ELT_SIZE PTRUE. */
3933 if (INTVAL (builder.elt (0)) == 0)
3934 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
3935 elt_size))
3936 return res;
3938 /* Try using TRN1 to permute two simpler constants. */
3939 for (unsigned int i = elt_size; i <= 8; i *= 2)
3940 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
3941 elt_size, i))
3942 return res;
3944 return NULL_RTX;
3947 /* Return an SVE predicate register that contains the VNx16BImode
3948 constant in BUILDER, without going through the move expanders.
3950 The returned register can have whatever mode seems most natural
3951 given the contents of BUILDER. Use TARGET for the result if
3952 convenient. */
3954 static rtx
3955 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3957 /* Try loading the constant using pure predicate operations. */
3958 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
3959 return res;
3961 /* Try forcing the constant to memory. */
3962 if (builder.full_nelts ().is_constant ())
3963 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
3965 target = aarch64_target_reg (target, VNx16BImode);
3966 emit_move_insn (target, mem);
3967 return target;
3970 /* The last resort is to load the constant as an integer and then
3971 compare it against zero. Use -1 for set bits in order to increase
3972 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
3973 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
3974 builder.nelts_per_pattern ());
3975 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3976 int_builder.quick_push (INTVAL (builder.elt (i))
3977 ? constm1_rtx : const0_rtx);
3978 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
3979 int_builder.build ());
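/* A rough array model of the last-resort path above (sketch only;
   sketch_pred_via_data is a hypothetical name, not a GCC interface):
   build a data vector with -1 for each set predicate element, then
   recover the predicate by comparing that vector against zero.  */

static void
sketch_pred_via_data (const unsigned char *pred, signed char *bytes,
                      unsigned char *pred_out, unsigned int n)
{
  for (unsigned int i = 0; i < n; ++i)
    bytes[i] = pred[i] ? -1 : 0;        /* DUPM-friendly data constant.  */
  for (unsigned int i = 0; i < n; ++i)
    pred_out[i] = bytes[i] != 0;        /* Compare against zero.  */
}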
3982 /* Set DEST to immediate IMM. */
3984 void
3985 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3987 machine_mode mode = GET_MODE (dest);
3989 /* Check on what type of symbol it is. */
3990 scalar_int_mode int_mode;
3991 if ((GET_CODE (imm) == SYMBOL_REF
3992 || GET_CODE (imm) == LABEL_REF
3993 || GET_CODE (imm) == CONST
3994 || GET_CODE (imm) == CONST_POLY_INT)
3995 && is_a <scalar_int_mode> (mode, &int_mode))
3997 rtx mem;
3998 poly_int64 offset;
3999 HOST_WIDE_INT const_offset;
4000 enum aarch64_symbol_type sty;
4002 /* If we have (const (plus symbol offset)), separate out the offset
4003 before we start classifying the symbol. */
4004 rtx base = strip_offset (imm, &offset);
4006 /* We must always add an offset involving VL separately, rather than
4007 folding it into the relocation. */
4008 if (!offset.is_constant (&const_offset))
4010 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4011 emit_insn (gen_rtx_SET (dest, imm));
4012 else
4014 /* Do arithmetic on 32-bit values if the result is smaller
4015 than that. */
4016 if (partial_subreg_p (int_mode, SImode))
4018 /* It is invalid to do symbol calculations in modes
4019 narrower than SImode. */
4020 gcc_assert (base == const0_rtx);
4021 dest = gen_lowpart (SImode, dest);
4022 int_mode = SImode;
4024 if (base != const0_rtx)
4026 base = aarch64_force_temporary (int_mode, dest, base);
4027 aarch64_add_offset (int_mode, dest, base, offset,
4028 NULL_RTX, NULL_RTX, false);
4030 else
4031 aarch64_add_offset (int_mode, dest, base, offset,
4032 dest, NULL_RTX, false);
4034 return;
4037 sty = aarch64_classify_symbol (base, const_offset);
4038 switch (sty)
4040 case SYMBOL_FORCE_TO_MEM:
4041 if (const_offset != 0
4042 && targetm.cannot_force_const_mem (int_mode, imm))
4044 gcc_assert (can_create_pseudo_p ());
4045 base = aarch64_force_temporary (int_mode, dest, base);
4046 aarch64_add_offset (int_mode, dest, base, const_offset,
4047 NULL_RTX, NULL_RTX, false);
4048 return;
4051 mem = force_const_mem (ptr_mode, imm);
4052 gcc_assert (mem);
4054 /* If we aren't generating PC relative literals, then
4055 we need to expand the literal pool access carefully.
4056 This is something that needs to be done in a number
4057 of places, so could well live as a separate function. */
4058 if (!aarch64_pcrelative_literal_loads)
4060 gcc_assert (can_create_pseudo_p ());
4061 base = gen_reg_rtx (ptr_mode);
4062 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4063 if (ptr_mode != Pmode)
4064 base = convert_memory_address (Pmode, base);
4065 mem = gen_rtx_MEM (ptr_mode, base);
4068 if (int_mode != ptr_mode)
4069 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4071 emit_insn (gen_rtx_SET (dest, mem));
4073 return;
4075 case SYMBOL_SMALL_TLSGD:
4076 case SYMBOL_SMALL_TLSDESC:
4077 case SYMBOL_SMALL_TLSIE:
4078 case SYMBOL_SMALL_GOT_28K:
4079 case SYMBOL_SMALL_GOT_4G:
4080 case SYMBOL_TINY_GOT:
4081 case SYMBOL_TINY_TLSIE:
4082 if (const_offset != 0)
4084 gcc_assert (can_create_pseudo_p ());
4085 base = aarch64_force_temporary (int_mode, dest, base);
4086 aarch64_add_offset (int_mode, dest, base, const_offset,
4087 NULL_RTX, NULL_RTX, false);
4088 return;
4090 /* FALLTHRU */
4092 case SYMBOL_SMALL_ABSOLUTE:
4093 case SYMBOL_TINY_ABSOLUTE:
4094 case SYMBOL_TLSLE12:
4095 case SYMBOL_TLSLE24:
4096 case SYMBOL_TLSLE32:
4097 case SYMBOL_TLSLE48:
4098 aarch64_load_symref_appropriately (dest, imm, sty);
4099 return;
4101 default:
4102 gcc_unreachable ();
4106 if (!CONST_INT_P (imm))
4108 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4110 /* Only the low bit of each .H, .S and .D element is defined,
4111 so we can set the upper bits to whatever we like. If the
4112 predicate is all-true in MODE, prefer to set all the undefined
4113 bits as well, so that we can share a single .B predicate for
4114 all modes. */
4115 if (imm == CONSTM1_RTX (mode))
4116 imm = CONSTM1_RTX (VNx16BImode);
4118 /* All methods for constructing predicate modes wider than VNx16BI
4119 will set the upper bits of each element to zero. Expose this
4120 by moving such constants as a VNx16BI, so that all bits are
4121 significant and so that constants for different modes can be
4122 shared. The wider constant will still be available as a
4123 REG_EQUAL note. */
4124 rtx_vector_builder builder;
4125 if (aarch64_get_sve_pred_bits (builder, imm))
4127 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4128 if (dest != res)
4129 emit_move_insn (dest, gen_lowpart (mode, res));
4130 return;
4134 if (GET_CODE (imm) == HIGH
4135 || aarch64_simd_valid_immediate (imm, NULL))
4137 emit_insn (gen_rtx_SET (dest, imm));
4138 return;
4141 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4142 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4144 if (dest != res)
4145 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4146 return;
4149 rtx mem = force_const_mem (mode, imm);
4150 gcc_assert (mem);
4151 emit_move_insn (dest, mem);
4152 return;
4155 aarch64_internal_mov_immediate (dest, imm, true,
4156 as_a <scalar_int_mode> (mode));
4159 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4160 that is known to contain PTRUE. */
4162 void
4163 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4165 expand_operand ops[3];
4166 machine_mode mode = GET_MODE (dest);
4167 create_output_operand (&ops[0], dest, mode);
4168 create_input_operand (&ops[1], pred, GET_MODE (pred));
4169 create_input_operand (&ops[2], src, mode);
4170 temporary_volatile_ok v (true);
4171 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4174 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4175 operand is in memory. In this case we need to use the predicated LD1
4176 and ST1 instead of LDR and STR, both for correctness on big-endian
4177 targets and because LD1 and ST1 support a wider range of addressing modes.
4178 PRED_MODE is the mode of the predicate.
4180 See the comment at the head of aarch64-sve.md for details about the
4181 big-endian handling. */
4183 void
4184 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4186 machine_mode mode = GET_MODE (dest);
4187 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4188 if (!register_operand (src, mode)
4189 && !register_operand (dest, mode))
4191 rtx tmp = gen_reg_rtx (mode);
4192 if (MEM_P (src))
4193 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4194 else
4195 emit_move_insn (tmp, src);
4196 src = tmp;
4198 aarch64_emit_sve_pred_move (dest, ptrue, src);
4201 /* Called only on big-endian targets. See whether an SVE vector move
4202 from SRC to DEST is effectively a REV[BHW] instruction, because at
4203 least one operand is a subreg of an SVE vector that has wider or
4204 narrower elements. Return true and emit the instruction if so.
4206 For example:
4208 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4210 represents a VIEW_CONVERT between the following vectors, viewed
4211 in memory order:
4213 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4214 R1: { [0], [1], [2], [3], ... }
4216 The high part of lane X in R2 should therefore correspond to lane X*2
4217 of R1, but the register representations are:
4219 msb lsb
4220 R2: ...... [1].high [1].low [0].high [0].low
4221 R1: ...... [3] [2] [1] [0]
4223 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4224 We therefore need a reverse operation to swap the high and low values
4225 around.
4227 This is purely an optimization. Without it we would spill the
4228 subreg operand to the stack in one mode and reload it in the
4229 other mode, which has the same effect as the REV. */
4231 bool
4232 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4234 gcc_assert (BYTES_BIG_ENDIAN);
4235 if (GET_CODE (dest) == SUBREG)
4236 dest = SUBREG_REG (dest);
4237 if (GET_CODE (src) == SUBREG)
4238 src = SUBREG_REG (src);
4240 /* The optimization handles two single SVE REGs with different element
4241 sizes. */
4242 if (!REG_P (dest)
4243 || !REG_P (src)
4244 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4245 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4246 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4247 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4248 return false;
4250 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4251 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4252 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4253 UNSPEC_REV_SUBREG);
4254 emit_insn (gen_rtx_SET (dest, unspec));
4255 return true;
4258 /* Return a copy of X with mode MODE, without changing its other
4259 attributes. Unlike gen_lowpart, this doesn't care whether the
4260 mode change is valid. */
4262 static rtx
4263 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4265 if (GET_MODE (x) == mode)
4266 return x;
4268 x = shallow_copy_rtx (x);
4269 set_mode_and_regno (x, mode, REGNO (x));
4270 return x;
4273 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4274 operands. */
4276 void
4277 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4279 /* Decide which REV operation we need. The mode with narrower elements
4280 determines the mode of the operands and the mode with the wider
4281 elements determines the reverse width. */
4282 machine_mode mode_with_wider_elts = GET_MODE (dest);
4283 machine_mode mode_with_narrower_elts = GET_MODE (src);
4284 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4285 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4286 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4288 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4289 unsigned int unspec;
4290 if (wider_bytes == 8)
4291 unspec = UNSPEC_REV64;
4292 else if (wider_bytes == 4)
4293 unspec = UNSPEC_REV32;
4294 else if (wider_bytes == 2)
4295 unspec = UNSPEC_REV16;
4296 else
4297 gcc_unreachable ();
4298 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4300 /* Emit:
4302 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)] UNSPEC_PRED_X))
4304 with the appropriate modes. */
4305 ptrue = gen_lowpart (pred_mode, ptrue);
4306 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
4307 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
4308 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
4309 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
4310 UNSPEC_PRED_X);
4311 emit_insn (gen_rtx_SET (dest, src));
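/* A rough byte-level model of the REV<nn> chosen above (sketch only;
   sketch_rev_elements is a hypothetical name, not a GCC interface).
   Each WIDER_BYTES container has its NARROWER_BYTES sub-elements
   reversed in place; e.g. WIDER_BYTES == 8 and NARROWER_BYTES == 2
   models REV64 acting on .H elements.  */

static void
sketch_rev_elements (unsigned char *bytes, unsigned int nbytes,
                     unsigned int wider_bytes, unsigned int narrower_bytes)
{
  for (unsigned int base = 0; base < nbytes; base += wider_bytes)
    for (unsigned int i = 0; i < wider_bytes / 2; i += narrower_bytes)
      for (unsigned int j = 0; j < narrower_bytes; ++j)
        {
          unsigned int lo = base + i + j;
          unsigned int hi = base + wider_bytes - narrower_bytes - i + j;
          unsigned char tmp = bytes[lo];
          bytes[lo] = bytes[hi];
          bytes[hi] = tmp;
        }
}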
4314 static bool
4315 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4316 tree exp ATTRIBUTE_UNUSED)
4318 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4319 return false;
4321 return true;
4324 /* Implement TARGET_PASS_BY_REFERENCE. */
4326 static bool
4327 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4328 machine_mode mode,
4329 const_tree type,
4330 bool named ATTRIBUTE_UNUSED)
4332 HOST_WIDE_INT size;
4333 machine_mode dummymode;
4334 int nregs;
4336 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4337 if (mode == BLKmode && type)
4338 size = int_size_in_bytes (type);
4339 else
4340 /* No frontends can create types with variable-sized modes, so we
4341 shouldn't be asked to pass or return them. */
4342 size = GET_MODE_SIZE (mode).to_constant ();
4344 /* Aggregates are passed by reference based on their size. */
4345 if (type && AGGREGATE_TYPE_P (type))
4347 size = int_size_in_bytes (type);
4350 /* Variable-sized arguments are always passed by reference. */
4351 if (size < 0)
4352 return true;
4354 /* Can this be a candidate to be passed in fp/simd register(s)? */
4355 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4356 &dummymode, &nregs,
4357 NULL))
4358 return false;
4360 /* Arguments which are variable sized or larger than 2 registers are
4361 passed by reference unless they are a homogeneous floating-point
4362 aggregate. */
4363 return size > 2 * UNITS_PER_WORD;
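/* A rough model of the size rule above for arguments that are not
   SIMD/FP candidates (sketch only; sketch_passed_by_reference is a
   hypothetical name, not a GCC interface).  SIZE_IN_BYTES < 0 stands
   for a variable-sized type.  */

static int
sketch_passed_by_reference (long long size_in_bytes, int is_vfp_candidate)
{
  if (size_in_bytes < 0)
    return 1;                        /* Variable-sized: by reference.  */
  if (is_vfp_candidate)
    return 0;                        /* HFA/HVA: SIMD/FP registers.  */
  return size_in_bytes > 2 * 8;      /* More than two X registers.  */
}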
4366 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4367 static bool
4368 aarch64_return_in_msb (const_tree valtype)
4370 machine_mode dummy_mode;
4371 int dummy_int;
4373 /* Never happens in little-endian mode. */
4374 if (!BYTES_BIG_ENDIAN)
4375 return false;
4377 /* Only composite types smaller than or equal to 16 bytes can
4378 be potentially returned in registers. */
4379 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4380 || int_size_in_bytes (valtype) <= 0
4381 || int_size_in_bytes (valtype) > 16)
4382 return false;
4384 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4385 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4386 is always passed/returned in the least significant bits of fp/simd
4387 register(s). */
4388 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4389 &dummy_mode, &dummy_int, NULL))
4390 return false;
4392 return true;
4395 /* Implement TARGET_FUNCTION_VALUE.
4396 Define how to find the value returned by a function. */
4398 static rtx
4399 aarch64_function_value (const_tree type, const_tree func,
4400 bool outgoing ATTRIBUTE_UNUSED)
4402 machine_mode mode;
4403 int unsignedp;
4404 int count;
4405 machine_mode ag_mode;
4407 mode = TYPE_MODE (type);
4408 if (INTEGRAL_TYPE_P (type))
4409 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4411 if (aarch64_return_in_msb (type))
4413 HOST_WIDE_INT size = int_size_in_bytes (type);
4415 if (size % UNITS_PER_WORD != 0)
4417 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4418 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4422 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4423 &ag_mode, &count, NULL))
4425 if (!aarch64_composite_type_p (type, mode))
4427 gcc_assert (count == 1 && mode == ag_mode);
4428 return gen_rtx_REG (mode, V0_REGNUM);
4430 else
4432 int i;
4433 rtx par;
4435 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4436 for (i = 0; i < count; i++)
4438 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4439 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4440 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4441 XVECEXP (par, 0, i) = tmp;
4443 return par;
4446 else
4447 return gen_rtx_REG (mode, R0_REGNUM);
4450 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4451 Return true if REGNO is the number of a hard register in which the values
4452 of called function may come back. */
4454 static bool
4455 aarch64_function_value_regno_p (const unsigned int regno)
4457 /* Maximum of 16 bytes can be returned in the general registers. Examples
4458 of 16-byte return values are: 128-bit integers and 16-byte small
4459 structures (excluding homogeneous floating-point aggregates). */
4460 if (regno == R0_REGNUM || regno == R1_REGNUM)
4461 return true;
4463 /* Up to four fp/simd registers can return a function value, e.g. a
4464 homogeneous floating-point aggregate having four members. */
4465 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4466 return TARGET_FLOAT;
4468 return false;
4471 /* Implement TARGET_RETURN_IN_MEMORY.
4473 If the type T of the result of a function is such that
4474 void func (T arg)
4475 would require that arg be passed as a value in a register (or set of
4476 registers) according to the parameter passing rules, then the result
4477 is returned in the same registers as would be used for such an
4478 argument. */
4480 static bool
4481 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4483 HOST_WIDE_INT size;
4484 machine_mode ag_mode;
4485 int count;
4487 if (!AGGREGATE_TYPE_P (type)
4488 && TREE_CODE (type) != COMPLEX_TYPE
4489 && TREE_CODE (type) != VECTOR_TYPE)
4490 /* Simple scalar types are always returned in registers. */
4491 return false;
4493 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4494 type,
4495 &ag_mode,
4496 &count,
4497 NULL))
4498 return false;
4500 /* Types larger than 2 registers are returned in memory. */
4501 size = int_size_in_bytes (type);
4502 return (size < 0 || size > 2 * UNITS_PER_WORD);
4505 static bool
4506 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4507 const_tree type, int *nregs)
4509 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4510 return aarch64_vfp_is_call_or_return_candidate (mode,
4511 type,
4512 &pcum->aapcs_vfp_rmode,
4513 nregs,
4514 NULL);
4517 /* Given MODE and TYPE of a function argument, return the alignment in
4518 bits. The idea is to suppress any stronger alignment requested by
4519 the user and opt for the natural alignment (specified in AAPCS64 \S
4520 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4521 calculated in versions of GCC prior to GCC-9. This is a helper
4522 function for local use only. */
4524 static unsigned int
4525 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4526 bool *abi_break)
4528 *abi_break = false;
4529 if (!type)
4530 return GET_MODE_ALIGNMENT (mode);
4532 if (integer_zerop (TYPE_SIZE (type)))
4533 return 0;
4535 gcc_assert (TYPE_MODE (type) == mode);
4537 if (!AGGREGATE_TYPE_P (type))
4538 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4540 if (TREE_CODE (type) == ARRAY_TYPE)
4541 return TYPE_ALIGN (TREE_TYPE (type));
4543 unsigned int alignment = 0;
4544 unsigned int bitfield_alignment = 0;
4545 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4546 if (TREE_CODE (field) == FIELD_DECL)
4548 alignment = std::max (alignment, DECL_ALIGN (field));
4549 if (DECL_BIT_FIELD_TYPE (field))
4550 bitfield_alignment
4551 = std::max (bitfield_alignment,
4552 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4555 if (bitfield_alignment > alignment)
4557 *abi_break = true;
4558 return bitfield_alignment;
4561 return alignment;
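/* A rough model of the aggregate case above (sketch only;
   sketch_arg_alignment is a hypothetical name, not a GCC interface).
   FIELD_ALIGN[i] is the declared alignment of member I in bits and
   BITFIELD_TYPE_ALIGN[i] is the alignment of its declared bit-field
   type, or 0 if member I is not a bit-field.  The over-aligned
   bit-field case is the one that sets *ABI_BREAK and triggers the
   GCC 9.1 psABI note.  */

static unsigned int
sketch_arg_alignment (const unsigned int *field_align,
                      const unsigned int *bitfield_type_align,
                      unsigned int nfields, int *abi_break)
{
  unsigned int alignment = 0, bitfield_alignment = 0;
  *abi_break = 0;
  for (unsigned int i = 0; i < nfields; ++i)
    {
      if (field_align[i] > alignment)
        alignment = field_align[i];
      if (bitfield_type_align[i] > bitfield_alignment)
        bitfield_alignment = bitfield_type_align[i];
    }
  if (bitfield_alignment > alignment)
    {
      *abi_break = 1;
      return bitfield_alignment;
    }
  return alignment;
}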
4564 /* Layout a function argument according to the AAPCS64 rules. The rule
4565 numbers refer to the rule numbers in the AAPCS64. */
4567 static void
4568 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4569 const_tree type,
4570 bool named ATTRIBUTE_UNUSED)
4572 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4573 int ncrn, nvrn, nregs;
4574 bool allocate_ncrn, allocate_nvrn;
4575 HOST_WIDE_INT size;
4576 bool abi_break;
4578 /* We need to do this once per argument. */
4579 if (pcum->aapcs_arg_processed)
4580 return;
4582 pcum->aapcs_arg_processed = true;
4584 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4585 if (type)
4586 size = int_size_in_bytes (type);
4587 else
4588 /* No frontends can create types with variable-sized modes, so we
4589 shouldn't be asked to pass or return them. */
4590 size = GET_MODE_SIZE (mode).to_constant ();
4591 size = ROUND_UP (size, UNITS_PER_WORD);
4593 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4594 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4595 mode,
4596 type,
4597 &nregs);
4599 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4600 The following code thus handles passing by SIMD/FP registers first. */
4602 nvrn = pcum->aapcs_nvrn;
4604 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4605 and homogeneous short-vector aggregates (HVA). */
4606 if (allocate_nvrn)
4608 if (!TARGET_FLOAT)
4609 aarch64_err_no_fpadvsimd (mode);
4611 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4613 pcum->aapcs_nextnvrn = nvrn + nregs;
4614 if (!aarch64_composite_type_p (type, mode))
4616 gcc_assert (nregs == 1);
4617 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4619 else
4621 rtx par;
4622 int i;
4623 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4624 for (i = 0; i < nregs; i++)
4626 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4627 V0_REGNUM + nvrn + i);
4628 rtx offset = gen_int_mode
4629 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4630 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4631 XVECEXP (par, 0, i) = tmp;
4633 pcum->aapcs_reg = par;
4635 return;
4637 else
4639 /* C.3 NSRN is set to 8. */
4640 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4641 goto on_stack;
4645 ncrn = pcum->aapcs_ncrn;
4646 nregs = size / UNITS_PER_WORD;
4648 /* C6 - C9, though the sign and zero extension semantics are
4649 handled elsewhere. This is the case where the argument fits
4650 entirely in general registers. */
4651 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4653 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4655 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
4656 rounded up to the next even number. */
4657 if (nregs == 2
4658 && ncrn % 2
4659 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4660 comparison is there because for > 16 * BITS_PER_UNIT
4661 alignment nregs should be > 2 and therefore it should be
4662 passed by reference rather than value. */
4663 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4664 == 16 * BITS_PER_UNIT))
4666 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4667 inform (input_location, "parameter passing for argument of type "
4668 "%qT changed in GCC 9.1", type);
4669 ++ncrn;
4670 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4673 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4674 A reg is still generated for it, but the caller should be smart
4675 enough not to use it. */
4676 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4677 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4678 else
4680 rtx par;
4681 int i;
4683 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4684 for (i = 0; i < nregs; i++)
4686 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4687 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4688 GEN_INT (i * UNITS_PER_WORD));
4689 XVECEXP (par, 0, i) = tmp;
4691 pcum->aapcs_reg = par;
4694 pcum->aapcs_nextncrn = ncrn + nregs;
4695 return;
4698 /* C.11 */
4699 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4701 /* The argument is passed on stack; record the needed number of words for
4702 this argument and align the total size if necessary. */
4703 on_stack:
4704 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4706 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4707 == 16 * BITS_PER_UNIT)
4709 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4710 if (pcum->aapcs_stack_size != new_size)
4712 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4713 inform (input_location, "parameter passing for argument of type "
4714 "%qT changed in GCC 9.1", type);
4715 pcum->aapcs_stack_size = new_size;
4718 return;
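/* A rough model of rule C.8 above (sketch only; sketch_round_ngrn is a
   hypothetical name, not a GCC interface).  The rounding only applies
   when the argument needs two general registers, the next candidate
   register number is odd and the alignment is exactly 16 bytes.  For
   example, an __int128 argument following a single int goes in x2/x3,
   leaving x1 unused.  */

static unsigned int
sketch_round_ngrn (unsigned int ncrn, unsigned int nregs,
                   unsigned int align_bits)
{
  if (nregs == 2 && (ncrn % 2) != 0 && align_bits == 16 * 8)
    ++ncrn;
  return ncrn;
}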
4721 /* Implement TARGET_FUNCTION_ARG. */
4723 static rtx
4724 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4725 const_tree type, bool named)
4727 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4728 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4730 if (mode == VOIDmode)
4731 return NULL_RTX;
4733 aarch64_layout_arg (pcum_v, mode, type, named);
4734 return pcum->aapcs_reg;
4737 void
4738 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4739 const_tree fntype ATTRIBUTE_UNUSED,
4740 rtx libname ATTRIBUTE_UNUSED,
4741 const_tree fndecl ATTRIBUTE_UNUSED,
4742 unsigned n_named ATTRIBUTE_UNUSED)
4744 pcum->aapcs_ncrn = 0;
4745 pcum->aapcs_nvrn = 0;
4746 pcum->aapcs_nextncrn = 0;
4747 pcum->aapcs_nextnvrn = 0;
4748 pcum->pcs_variant = ARM_PCS_AAPCS64;
4749 pcum->aapcs_reg = NULL_RTX;
4750 pcum->aapcs_arg_processed = false;
4751 pcum->aapcs_stack_words = 0;
4752 pcum->aapcs_stack_size = 0;
4754 if (!TARGET_FLOAT
4755 && fndecl && TREE_PUBLIC (fndecl)
4756 && fntype && fntype != error_mark_node)
4758 const_tree type = TREE_TYPE (fntype);
4759 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4760 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4761 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4762 &mode, &nregs, NULL))
4763 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4765 return;
4768 static void
4769 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4770 machine_mode mode,
4771 const_tree type,
4772 bool named)
4774 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4775 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4777 aarch64_layout_arg (pcum_v, mode, type, named);
4778 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4779 != (pcum->aapcs_stack_words != 0));
4780 pcum->aapcs_arg_processed = false;
4781 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4782 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4783 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4784 pcum->aapcs_stack_words = 0;
4785 pcum->aapcs_reg = NULL_RTX;
4789 bool
4790 aarch64_function_arg_regno_p (unsigned regno)
4792 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4793 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4796 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4797 PARM_BOUNDARY bits of alignment, but will be given anything up
4798 to STACK_BOUNDARY bits if the type requires it. This makes sure
4799 that both before and after the layout of each argument, the Next
4800 Stacked Argument Address (NSAA) will have a minimum alignment of
4801 8 bytes. */
4803 static unsigned int
4804 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4806 bool abi_break;
4807 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4808 &abi_break);
4809 if (abi_break && warn_psabi)
4810 inform (input_location, "parameter passing for argument of type "
4811 "%qT changed in GCC 9.1", type);
4813 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4816 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4818 static fixed_size_mode
4819 aarch64_get_reg_raw_mode (int regno)
4821 if (TARGET_SVE && FP_REGNUM_P (regno))
4822 /* Don't use the SVE part of the register for __builtin_apply and
4823 __builtin_return. The SVE registers aren't used by the normal PCS,
4824 so using them there would be a waste of time. The PCS extensions
4825 for SVE types are fundamentally incompatible with the
4826 __builtin_return/__builtin_apply interface. */
4827 return as_a <fixed_size_mode> (V16QImode);
4828 return default_get_reg_raw_mode (regno);
4831 /* Implement TARGET_FUNCTION_ARG_PADDING.
4833 Small aggregate types are placed in the lowest memory address.
4835 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4837 static pad_direction
4838 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4840 /* On little-endian targets, the least significant byte of every stack
4841 argument is passed at the lowest byte address of the stack slot. */
4842 if (!BYTES_BIG_ENDIAN)
4843 return PAD_UPWARD;
4845 /* Otherwise, integral, floating-point and pointer types are padded downward:
4846 the least significant byte of a stack argument is passed at the highest
4847 byte address of the stack slot. */
4848 if (type
4849 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4850 || POINTER_TYPE_P (type))
4851 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4852 return PAD_DOWNWARD;
4854 /* Everything else is padded upward, i.e. data in the first byte of the stack slot. */
4855 return PAD_UPWARD;
4858 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4860 It specifies padding for the last (and possibly only)
4861 element of a block move between registers and memory. If the
4862 block is assumed to be in memory, padding upward means that the
4863 last element is padded after its most significant byte, while
4864 with downward padding the last element is padded on its least
4865 significant byte side.
4867 Small aggregates and small complex types are always padded
4868 upwards.
4870 We don't need to worry about homogeneous floating-point or
4871 short-vector aggregates; their move is not affected by the
4872 padding direction determined here. Regardless of endianness,
4873 each element of such an aggregate is put in the least
4874 significant bits of a fp/simd register.
4876 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4877 register has useful data, and return the opposite if the most
4878 significant byte does. */
4880 bool
4881 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4882 bool first ATTRIBUTE_UNUSED)
4885 /* Small composite types are always padded upward. */
4886 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4888 HOST_WIDE_INT size;
4889 if (type)
4890 size = int_size_in_bytes (type);
4891 else
4892 /* No frontends can create types with variable-sized modes, so we
4893 shouldn't be asked to pass or return them. */
4894 size = GET_MODE_SIZE (mode).to_constant ();
4895 if (size < 2 * UNITS_PER_WORD)
4896 return true;
4899 /* Otherwise, use the default padding. */
4900 return !BYTES_BIG_ENDIAN;
4903 static scalar_int_mode
4904 aarch64_libgcc_cmp_return_mode (void)
4906 return SImode;
4909 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4911 /* We use the 12-bit shifted immediate arithmetic instructions so values
4912 must be multiple of (1 << 12), i.e. 4096. */
4913 #define ARITH_FACTOR 4096
4915 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4916 #error Cannot use simple address calculation for stack probing
4917 #endif
4919 /* The pair of scratch registers used for stack probing. */
4920 #define PROBE_STACK_FIRST_REG R9_REGNUM
4921 #define PROBE_STACK_SECOND_REG R10_REGNUM
4923 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4924 inclusive. These are offsets from the current stack pointer. */
4926 static void
4927 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4929 HOST_WIDE_INT size;
4930 if (!poly_size.is_constant (&size))
4932 sorry ("stack probes for SVE frames");
4933 return;
4936 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4938 /* See the same assertion on PROBE_INTERVAL above. */
4939 gcc_assert ((first % ARITH_FACTOR) == 0);
4941 /* See if we have a constant small number of probes to generate. If so,
4942 that's the easy case. */
4943 if (size <= PROBE_INTERVAL)
4945 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4947 emit_set_insn (reg1,
4948 plus_constant (Pmode,
4949 stack_pointer_rtx, -(first + base)));
4950 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4953 /* The run-time loop is made up of 8 insns in the generic case while the
4954 compile-time loop is made up of 4 + 2*(n-2) insns for n intervals. */
4955 else if (size <= 4 * PROBE_INTERVAL)
4957 HOST_WIDE_INT i, rem;
4959 emit_set_insn (reg1,
4960 plus_constant (Pmode,
4961 stack_pointer_rtx,
4962 -(first + PROBE_INTERVAL)));
4963 emit_stack_probe (reg1);
4965 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4966 it exceeds SIZE. If only two probes are needed, this will not
4967 generate any code. Then probe at FIRST + SIZE. */
4968 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4970 emit_set_insn (reg1,
4971 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4972 emit_stack_probe (reg1);
4975 rem = size - (i - PROBE_INTERVAL);
4976 if (rem > 256)
4978 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4980 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4981 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4983 else
4984 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4987 /* Otherwise, do the same as above, but in a loop. Note that we must be
4988 extra careful with variables wrapping around because we might be at
4989 the very top (or the very bottom) of the address space and we have
4990 to be able to handle this case properly; in particular, we use an
4991 equality test for the loop condition. */
4992 else
4994 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4996 /* Step 1: round SIZE to the previous multiple of the interval. */
4998 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5001 /* Step 2: compute initial and final value of the loop counter. */
5003 /* TEST_ADDR = SP + FIRST. */
5004 emit_set_insn (reg1,
5005 plus_constant (Pmode, stack_pointer_rtx, -first));
5007 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5008 HOST_WIDE_INT adjustment = - (first + rounded_size);
5009 if (! aarch64_uimm12_shift (adjustment))
5011 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5012 true, Pmode);
5013 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5015 else
5016 emit_set_insn (reg2,
5017 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5019 /* Step 3: the loop
5023 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5024 probe at TEST_ADDR
5026 while (TEST_ADDR != LAST_ADDR)
5028 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5029 until it is equal to ROUNDED_SIZE. */
5031 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5034 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5035 that SIZE is equal to ROUNDED_SIZE. */
5037 if (size != rounded_size)
5039 HOST_WIDE_INT rem = size - rounded_size;
5041 if (rem > 256)
5043 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5045 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5046 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5048 else
5049 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5053 /* Make sure nothing is scheduled before we are done. */
5054 emit_insn (gen_blockage ());
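/* A rough model of the offsets probed above in the constant-size cases
   (sketch only; sketch_probe_offsets is a hypothetical name, not a GCC
   interface).  Probes land roughly at FIRST + N * INTERVAL for
   N = 1, 2, ... below SIZE, with a final probe at FIRST + SIZE.  */

static unsigned int
sketch_probe_offsets (long long first, long long size, long long interval,
                      long long *offsets)
{
  unsigned int n = 0;
  for (long long off = interval; off < size; off += interval)
    offsets[n++] = first + off;
  offsets[n++] = first + size;
  return n;
}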
5057 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5058 absolute addresses. */
5060 const char *
5061 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5063 static int labelno = 0;
5064 char loop_lab[32];
5065 rtx xops[2];
5067 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5069 /* Loop. */
5070 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5072 HOST_WIDE_INT stack_clash_probe_interval
5073 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5075 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5076 xops[0] = reg1;
5077 HOST_WIDE_INT interval;
5078 if (flag_stack_clash_protection)
5079 interval = stack_clash_probe_interval;
5080 else
5081 interval = PROBE_INTERVAL;
5083 gcc_assert (aarch64_uimm12_shift (interval));
5084 xops[1] = GEN_INT (interval);
5086 output_asm_insn ("sub\t%0, %0, %1", xops);
5088 /* If doing stack clash protection then we probe up by the ABI specified
5089 amount. We do this because we're dropping full pages at a time in the
5090 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5091 if (flag_stack_clash_protection)
5092 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5093 else
5094 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5096 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5097 by this amount for each iteration. */
5098 output_asm_insn ("str\txzr, [%0, %1]", xops);
5100 /* Test if TEST_ADDR == LAST_ADDR. */
5101 xops[1] = reg2;
5102 output_asm_insn ("cmp\t%0, %1", xops);
5104 /* Branch. */
5105 fputs ("\tb.ne\t", asm_out_file);
5106 assemble_name_raw (asm_out_file, loop_lab);
5107 fputc ('\n', asm_out_file);
5109 return "";
5112 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5113 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5114 of GUARD_SIZE. When a probe is emitted it is done at most
5115 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5116 at most MIN_PROBE_THRESHOLD. By the end of this function
5117 BASE = BASE - ADJUSTMENT. */
5119 const char *
5120 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5121 rtx min_probe_threshold, rtx guard_size)
5123 /* This function is not allowed to use any instruction generation function
5124 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5125 so instead emit the code you want using output_asm_insn. */
5126 gcc_assert (flag_stack_clash_protection);
5127 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5128 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5130 /* The minimum required allocation before the residual requires probing. */
5131 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5133 /* Clamp the value down to the nearest value that can be used with a cmp. */
5134 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5135 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5137 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5138 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5140 static int labelno = 0;
5141 char loop_start_lab[32];
5142 char loop_end_lab[32];
5143 rtx xops[2];
5145 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5146 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5148 /* Emit loop start label. */
5149 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5151 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5152 xops[0] = adjustment;
5153 xops[1] = probe_offset_value_rtx;
5154 output_asm_insn ("cmp\t%0, %1", xops);
5156 /* Branch to end if not enough adjustment to probe. */
5157 fputs ("\tb.lt\t", asm_out_file);
5158 assemble_name_raw (asm_out_file, loop_end_lab);
5159 fputc ('\n', asm_out_file);
5161 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5162 xops[0] = base;
5163 xops[1] = probe_offset_value_rtx;
5164 output_asm_insn ("sub\t%0, %0, %1", xops);
5166 /* Probe at BASE. */
5167 xops[1] = const0_rtx;
5168 output_asm_insn ("str\txzr, [%0, %1]", xops);
5170 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5171 xops[0] = adjustment;
5172 xops[1] = probe_offset_value_rtx;
5173 output_asm_insn ("sub\t%0, %0, %1", xops);
5175 /* Branch to start if still more bytes to allocate. */
5176 fputs ("\tb\t", asm_out_file);
5177 assemble_name_raw (asm_out_file, loop_start_lab);
5178 fputc ('\n', asm_out_file);
5180 /* Loop exit: the remaining adjustment is below the probe threshold. */
5181 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5183 /* BASE = BASE - ADJUSTMENT. */
5184 xops[0] = base;
5185 xops[1] = adjustment;
5186 output_asm_insn ("sub\t%0, %0, %1", xops);
5187 return "";
5190 /* Determine whether a frame chain needs to be generated. */
5191 static bool
5192 aarch64_needs_frame_chain (void)
5194 /* Force a frame chain for EH returns so the return address is at FP+8. */
5195 if (frame_pointer_needed || crtl->calls_eh_return)
5196 return true;
5198 /* A leaf function cannot have calls or write LR. */
5199 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5201 /* Don't use a frame chain in leaf functions if leaf frame pointers
5202 are disabled. */
5203 if (flag_omit_leaf_frame_pointer && is_leaf)
5204 return false;
5206 return aarch64_use_frame_pointer;
5209 /* Mark the registers that need to be saved by the callee and calculate
5210 the size of the callee-saved registers area and frame record (both FP
5211 and LR may be omitted). */
5212 static void
5213 aarch64_layout_frame (void)
5215 HOST_WIDE_INT offset = 0;
5216 int regno, last_fp_reg = INVALID_REGNUM;
5217 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5219 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5221 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5222 the mid-end is doing. */
5223 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5225 #define SLOT_NOT_REQUIRED (-2)
5226 #define SLOT_REQUIRED (-1)
5228 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5229 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5231 /* If this is a non-leaf simd function with calls we assume that
5232 at least one of those calls is to a non-simd function and thus
5233 we must save V8 to V23 in the prologue. */
5235 if (simd_function && !crtl->is_leaf)
5237 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5238 if (FP_SIMD_SAVED_REGNUM_P (regno))
5239 df_set_regs_ever_live (regno, true);
5242 /* First mark all the registers that really need to be saved... */
5243 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5244 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5246 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5247 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5249 /* ... that includes the eh data registers (if needed)... */
5250 if (crtl->calls_eh_return)
5251 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5252 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5253 = SLOT_REQUIRED;
5255 /* ... and any callee saved register that dataflow says is live. */
5256 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5257 if (df_regs_ever_live_p (regno)
5258 && (regno == R30_REGNUM
5259 || !call_used_regs[regno]))
5260 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5262 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5263 if (df_regs_ever_live_p (regno)
5264 && (!call_used_regs[regno]
5265 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5267 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5268 last_fp_reg = regno;
5271 if (cfun->machine->frame.emit_frame_chain)
5273 /* FP and LR are placed in the linkage record. */
5274 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5275 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5276 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5277 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5278 offset = 2 * UNITS_PER_WORD;
5281 /* With stack-clash, LR must be saved in non-leaf functions. */
5282 gcc_assert (crtl->is_leaf
5283 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5284 != SLOT_NOT_REQUIRED));
5286 /* Now assign stack slots for them. */
5287 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5288 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5290 cfun->machine->frame.reg_offset[regno] = offset;
5291 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5292 cfun->machine->frame.wb_candidate1 = regno;
5293 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5294 cfun->machine->frame.wb_candidate2 = regno;
5295 offset += UNITS_PER_WORD;
5298 HOST_WIDE_INT max_int_offset = offset;
5299 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5300 bool has_align_gap = offset != max_int_offset;
5302 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5303 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5305 /* If there is an alignment gap between integer and fp callee-saves,
5306 allocate the last fp register to it if possible. */
5307 if (regno == last_fp_reg
5308 && has_align_gap
5309 && !simd_function
5310 && (offset & 8) == 0)
5312 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5313 break;
5316 cfun->machine->frame.reg_offset[regno] = offset;
5317 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5318 cfun->machine->frame.wb_candidate1 = regno;
5319 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5320 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5321 cfun->machine->frame.wb_candidate2 = regno;
5322 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5325 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5327 cfun->machine->frame.saved_regs_size = offset;
5329 HOST_WIDE_INT varargs_and_saved_regs_size
5330 = offset + cfun->machine->frame.saved_varargs_size;
5332 cfun->machine->frame.hard_fp_offset
5333 = aligned_upper_bound (varargs_and_saved_regs_size
5334 + get_frame_size (),
5335 STACK_BOUNDARY / BITS_PER_UNIT);
5337 /* Both these values are already aligned. */
5338 gcc_assert (multiple_p (crtl->outgoing_args_size,
5339 STACK_BOUNDARY / BITS_PER_UNIT));
5340 cfun->machine->frame.frame_size
5341 = (cfun->machine->frame.hard_fp_offset
5342 + crtl->outgoing_args_size);
5344 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5346 cfun->machine->frame.initial_adjust = 0;
5347 cfun->machine->frame.final_adjust = 0;
5348 cfun->machine->frame.callee_adjust = 0;
5349 cfun->machine->frame.callee_offset = 0;
5351 HOST_WIDE_INT max_push_offset = 0;
5352 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5353 max_push_offset = 512;
5354 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5355 max_push_offset = 256;
5357 HOST_WIDE_INT const_size, const_fp_offset;
5358 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5359 && const_size < max_push_offset
5360 && known_eq (crtl->outgoing_args_size, 0))
5362 /* Simple, small frame with no outgoing arguments:
5363 stp reg1, reg2, [sp, -frame_size]!
5364 stp reg3, reg4, [sp, 16] */
5365 cfun->machine->frame.callee_adjust = const_size;
5367 else if (known_lt (crtl->outgoing_args_size
5368 + cfun->machine->frame.saved_regs_size, 512)
5369 && !(cfun->calls_alloca
5370 && known_lt (cfun->machine->frame.hard_fp_offset,
5371 max_push_offset)))
5373 /* Frame with small outgoing arguments:
5374 sub sp, sp, frame_size
5375 stp reg1, reg2, [sp, outgoing_args_size]
5376 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5377 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5378 cfun->machine->frame.callee_offset
5379 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5381 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5382 && const_fp_offset < max_push_offset)
5384 /* Frame with large outgoing arguments but a small local area:
5385 stp reg1, reg2, [sp, -hard_fp_offset]!
5386 stp reg3, reg4, [sp, 16]
5387 sub sp, sp, outgoing_args_size */
5388 cfun->machine->frame.callee_adjust = const_fp_offset;
5389 cfun->machine->frame.final_adjust
5390 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5392 else
5394 /* Frame with large local area and outgoing arguments using frame pointer:
5395 sub sp, sp, hard_fp_offset
5396 stp x29, x30, [sp, 0]
5397 add x29, sp, 0
5398 stp reg3, reg4, [sp, 16]
5399 sub sp, sp, outgoing_args_size */
5400 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5401 cfun->machine->frame.final_adjust
5402 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5405 cfun->machine->frame.laid_out = true;
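/* A rough model of the allocation strategy choice above, with every size
   assumed to be a compile-time constant (sketch only;
   sketch_frame_strategy is a hypothetical name, not a GCC interface).
   The return values 0..3 correspond, in order, to the four commented
   stp/sub sequences above.  */

static int
sketch_frame_strategy (long long frame_size, long long outgoing_args_size,
                       long long saved_regs_size, long long hard_fp_offset,
                       long long max_push_offset, int calls_alloca)
{
  if (frame_size < max_push_offset && outgoing_args_size == 0)
    return 0;  /* Small frame, no outgoing args: single writeback push.  */
  if (outgoing_args_size + saved_regs_size < 512
      && !(calls_alloca && hard_fp_offset < max_push_offset))
    return 1;  /* Small outgoing args: one sub, saves above the args.  */
  if (hard_fp_offset < max_push_offset)
    return 2;  /* Large outgoing args, small locals: push then sub.  */
  return 3;    /* Large locals and outgoing args: frame pointer case.  */
}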
5408 /* Return true if the register REGNO is saved on entry to
5409 the current function. */
5411 static bool
5412 aarch64_register_saved_on_entry (int regno)
5414 return cfun->machine->frame.reg_offset[regno] >= 0;
5417 /* Return the next register up from REGNO up to LIMIT for the callee
5418 to save. */
5420 static unsigned
5421 aarch64_next_callee_save (unsigned regno, unsigned limit)
5423 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5424 regno ++;
5425 return regno;
5428 /* Push the register number REGNO of mode MODE to the stack with write-back
5429 adjusting the stack by ADJUSTMENT. */
5431 static void
5432 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5433 HOST_WIDE_INT adjustment)
5435 rtx base_rtx = stack_pointer_rtx;
5436 rtx insn, reg, mem;
5438 reg = gen_rtx_REG (mode, regno);
5439 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5440 plus_constant (Pmode, base_rtx, -adjustment));
5441 mem = gen_frame_mem (mode, mem);
5443 insn = emit_move_insn (mem, reg);
5444 RTX_FRAME_RELATED_P (insn) = 1;
5447 /* Generate and return an instruction to store the pair of registers
5448 REG and REG2 of mode MODE to location BASE with write-back adjusting
5449 the stack location BASE by ADJUSTMENT. */
5451 static rtx
5452 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5453 HOST_WIDE_INT adjustment)
5455 switch (mode)
5457 case E_DImode:
5458 return gen_storewb_pairdi_di (base, base, reg, reg2,
5459 GEN_INT (-adjustment),
5460 GEN_INT (UNITS_PER_WORD - adjustment));
5461 case E_DFmode:
5462 return gen_storewb_pairdf_di (base, base, reg, reg2,
5463 GEN_INT (-adjustment),
5464 GEN_INT (UNITS_PER_WORD - adjustment));
5465 case E_TFmode:
5466 return gen_storewb_pairtf_di (base, base, reg, reg2,
5467 GEN_INT (-adjustment),
5468 GEN_INT (UNITS_PER_VREG - adjustment));
5469 default:
5470 gcc_unreachable ();
5474 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5475 stack pointer by ADJUSTMENT. */
5477 static void
5478 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5480 rtx_insn *insn;
5481 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5483 if (regno2 == INVALID_REGNUM)
5484 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5486 rtx reg1 = gen_rtx_REG (mode, regno1);
5487 rtx reg2 = gen_rtx_REG (mode, regno2);
5489 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5490 reg2, adjustment));
5491 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5492 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5493 RTX_FRAME_RELATED_P (insn) = 1;
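/* Illustrative example (assumed values, not from the original source):
   with regno1 == R19_REGNUM, regno2 == R20_REGNUM, DImode saves and an
   adjustment of 32, the sequence above emits the equivalent of
   "stp x19, x20, [sp, -32]!": the stack pointer drops by 32 bytes and
   both registers are stored into the newly allocated space.  */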
5496 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5497 adjusting it by ADJUSTMENT afterwards. */
5499 static rtx
5500 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5501 HOST_WIDE_INT adjustment)
5503 switch (mode)
5505 case E_DImode:
5506 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5507 GEN_INT (UNITS_PER_WORD));
5508 case E_DFmode:
5509 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5510 GEN_INT (UNITS_PER_WORD));
5511 case E_TFmode:
5512 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5513 GEN_INT (UNITS_PER_VREG));
5514 default:
5515 gcc_unreachable ();
5519 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5520 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5521 into CFI_OPS. */
5523 static void
5524 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5525 rtx *cfi_ops)
5527 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5528 rtx reg1 = gen_rtx_REG (mode, regno1);
5530 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5532 if (regno2 == INVALID_REGNUM)
5534 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5535 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5536 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5538 else
5540 rtx reg2 = gen_rtx_REG (mode, regno2);
5541 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5542 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5543 reg2, adjustment));
5547 /* Generate and return a store pair instruction of mode MODE to store
5548 register REG1 to MEM1 and register REG2 to MEM2. */
5550 static rtx
5551 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5552 rtx reg2)
5554 switch (mode)
5556 case E_DImode:
5557 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5559 case E_DFmode:
5560 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5562 case E_TFmode:
5563 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5565 default:
5566 gcc_unreachable ();
5570 /* Generate and return a load pair instruction of mode MODE to load register
5571 REG1 from MEM1 and register REG2 from MEM2. */
5573 static rtx
5574 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5575 rtx mem2)
5577 switch (mode)
5579 case E_DImode:
5580 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5582 case E_DFmode:
5583 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5585 case E_TFmode:
5586 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5588 default:
5589 gcc_unreachable ();
5593 /* Return TRUE if return address signing should be enabled for the current
5594 function, otherwise return FALSE. */
5596 bool
5597 aarch64_return_address_signing_enabled (void)
5599 /* This function should only be called after the frame is laid out. */
5600 gcc_assert (cfun->machine->frame.laid_out);
5602 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5603 if its LR is pushed onto the stack. */
5604 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5605 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5606 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5609 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5610 bool
5611 aarch64_bti_enabled (void)
5613 return (aarch64_enable_bti == 1);
5616 /* Emit code to save the callee-saved registers from register number START
5617 to LIMIT to the stack at the location starting at offset START_OFFSET,
5618 skipping any write-back candidates if SKIP_WB is true. */
5620 static void
5621 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5622 unsigned start, unsigned limit, bool skip_wb)
5624 rtx_insn *insn;
5625 unsigned regno;
5626 unsigned regno2;
5628 for (regno = aarch64_next_callee_save (start, limit);
5629 regno <= limit;
5630 regno = aarch64_next_callee_save (regno + 1, limit))
5632 rtx reg, mem;
5633 poly_int64 offset;
5634 int offset_diff;
5636 if (skip_wb
5637 && (regno == cfun->machine->frame.wb_candidate1
5638 || regno == cfun->machine->frame.wb_candidate2))
5639 continue;
5641 if (cfun->machine->reg_is_wrapped_separately[regno])
5642 continue;
5644 reg = gen_rtx_REG (mode, regno);
5645 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5646 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5647 offset));
5649 regno2 = aarch64_next_callee_save (regno + 1, limit);
5650 offset_diff = cfun->machine->frame.reg_offset[regno2]
5651 - cfun->machine->frame.reg_offset[regno];
5653 if (regno2 <= limit
5654 && !cfun->machine->reg_is_wrapped_separately[regno2]
5655 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5657 rtx reg2 = gen_rtx_REG (mode, regno2);
5658 rtx mem2;
5660 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5661 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5662 offset));
5663 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5664 reg2));
5666 /* The first part of a frame-related parallel insn is
5667 always assumed to be relevant to the frame
5668 calculations; subsequent parts are only
5669 frame-related if explicitly marked. */
5670 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5671 regno = regno2;
5673 else
5674 insn = emit_move_insn (mem, reg);
5676 RTX_FRAME_RELATED_P (insn) = 1;
5680 /* Emit code to restore the callee registers of mode MODE from register
5681 number START up to and including LIMIT. Restore from the stack offset
5682 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5683 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5685 static void
5686 aarch64_restore_callee_saves (machine_mode mode,
5687 poly_int64 start_offset, unsigned start,
5688 unsigned limit, bool skip_wb, rtx *cfi_ops)
5690 rtx base_rtx = stack_pointer_rtx;
5691 unsigned regno;
5692 unsigned regno2;
5693 poly_int64 offset;
5695 for (regno = aarch64_next_callee_save (start, limit);
5696 regno <= limit;
5697 regno = aarch64_next_callee_save (regno + 1, limit))
5699 if (cfun->machine->reg_is_wrapped_separately[regno])
5700 continue;
5702 rtx reg, mem;
5703 int offset_diff;
5705 if (skip_wb
5706 && (regno == cfun->machine->frame.wb_candidate1
5707 || regno == cfun->machine->frame.wb_candidate2))
5708 continue;
5710 reg = gen_rtx_REG (mode, regno);
5711 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5712 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5714 regno2 = aarch64_next_callee_save (regno + 1, limit);
5715 offset_diff = cfun->machine->frame.reg_offset[regno2]
5716 - cfun->machine->frame.reg_offset[regno];
5718 if (regno2 <= limit
5719 && !cfun->machine->reg_is_wrapped_separately[regno2]
5720 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5722 rtx reg2 = gen_rtx_REG (mode, regno2);
5723 rtx mem2;
5725 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5726 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5727 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5729 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5730 regno = regno2;
5732 else
5733 emit_move_insn (reg, mem);
5734 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5738 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5739 of MODE. */
5741 static inline bool
5742 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5744 HOST_WIDE_INT multiple;
5745 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5746 && IN_RANGE (multiple, -8, 7));
5749 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5750 of MODE. */
5752 static inline bool
5753 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5755 HOST_WIDE_INT multiple;
5756 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5757 && IN_RANGE (multiple, 0, 63));
5760 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5761 of MODE. */
5763 bool
5764 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5766 HOST_WIDE_INT multiple;
5767 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5768 && IN_RANGE (multiple, -64, 63));
5771 /* Return true if OFFSET is a signed 9-bit value. */
5773 bool
5774 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5775 poly_int64 offset)
5777 HOST_WIDE_INT const_offset;
5778 return (offset.is_constant (&const_offset)
5779 && IN_RANGE (const_offset, -256, 255));
5782 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5783 of MODE. */
5785 static inline bool
5786 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5788 HOST_WIDE_INT multiple;
5789 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5790 && IN_RANGE (multiple, -256, 255));
5793 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5794 of MODE. */
5796 static inline bool
5797 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5799 HOST_WIDE_INT multiple;
5800 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5801 && IN_RANGE (multiple, 0, 4095));
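/* Worked example for the predicates above (illustrative): for DImode,
   GET_MODE_SIZE is 8 bytes, so offset_12bit_unsigned_scaled_p accepts
   byte offsets 0, 8, ..., 32760 (4095 * 8),
   aarch64_offset_7bit_signed_scaled_p accepts -512, ..., 504, and
   aarch64_offset_9bit_signed_unscaled_p accepts any constant offset in
   [-256, 255] regardless of alignment.  */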
5804 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5806 static sbitmap
5807 aarch64_get_separate_components (void)
5809 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5810 bitmap_clear (components);
5812 /* The registers we need saved to the frame. */
5813 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5814 if (aarch64_register_saved_on_entry (regno))
5816 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5817 if (!frame_pointer_needed)
5818 offset += cfun->machine->frame.frame_size
5819 - cfun->machine->frame.hard_fp_offset;
5820 /* Check that we can access the stack slot of the register with one
5821 direct load with no adjustments needed. */
5822 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5823 bitmap_set_bit (components, regno);
5826 /* Don't mess with the hard frame pointer. */
5827 if (frame_pointer_needed)
5828 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5830 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5831 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5832 /* If registers have been chosen to be stored/restored with
5833 writeback, don't interfere with them to avoid having to output explicit
5834 stack adjustment instructions. */
5835 if (reg2 != INVALID_REGNUM)
5836 bitmap_clear_bit (components, reg2);
5837 if (reg1 != INVALID_REGNUM)
5838 bitmap_clear_bit (components, reg1);
5840 bitmap_clear_bit (components, LR_REGNUM);
5841 bitmap_clear_bit (components, SP_REGNUM);
5843 return components;
5846 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5848 static sbitmap
5849 aarch64_components_for_bb (basic_block bb)
5851 bitmap in = DF_LIVE_IN (bb);
5852 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5853 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5854 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5856 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5857 bitmap_clear (components);
5859 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5860 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5861 if ((!call_used_regs[regno]
5862 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5863 && (bitmap_bit_p (in, regno)
5864 || bitmap_bit_p (gen, regno)
5865 || bitmap_bit_p (kill, regno)))
5867 unsigned regno2, offset, offset2;
5868 bitmap_set_bit (components, regno);
5870 /* If there is a callee-save at an adjacent offset, add it too
5871 to increase the use of LDP/STP. */
5872 offset = cfun->machine->frame.reg_offset[regno];
5873 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5875 if (regno2 <= LAST_SAVED_REGNUM)
5877 offset2 = cfun->machine->frame.reg_offset[regno2];
5878 if ((offset & ~8) == (offset2 & ~8))
5879 bitmap_set_bit (components, regno2);
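/* For instance (illustrative): a callee-save slotted at offset 16 has
   (offset & 8) == 0, so its candidate partner is regno + 1; if that
   register's slot is at offset 24, the two offsets agree once bit 3 is
   masked off and both registers are marked, allowing a later STP/LDP.  */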
5883 return components;
5886 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5887 Nothing to do for aarch64. */
5889 static void
5890 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5894 /* Return the next set bit in BMP from START onwards. Return the total number
5895 of bits in BMP if no set bit is found at or after START. */
5897 static unsigned int
5898 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5900 unsigned int nbits = SBITMAP_SIZE (bmp);
5901 if (start == nbits)
5902 return start;
5904 gcc_assert (start < nbits);
5905 for (unsigned int i = start; i < nbits; i++)
5906 if (bitmap_bit_p (bmp, i))
5907 return i;
5909 return nbits;
5912 /* Do the work for aarch64_emit_prologue_components and
5913 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5914 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5915 for these components or the epilogue sequence. That is, it determines
5916 whether we should emit stores or loads and what kind of CFA notes to attach
5917 to the insns. Otherwise the logic for the two sequences is very
5918 similar. */
5920 static void
5921 aarch64_process_components (sbitmap components, bool prologue_p)
5923 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5924 ? HARD_FRAME_POINTER_REGNUM
5925 : STACK_POINTER_REGNUM);
5927 unsigned last_regno = SBITMAP_SIZE (components);
5928 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5929 rtx_insn *insn = NULL;
5931 while (regno != last_regno)
5933 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5934 so DFmode for the vector registers is enough. For simd functions
5935 we want to save the low 128 bits. */
5936 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5938 rtx reg = gen_rtx_REG (mode, regno);
5939 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5940 if (!frame_pointer_needed)
5941 offset += cfun->machine->frame.frame_size
5942 - cfun->machine->frame.hard_fp_offset;
5943 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5944 rtx mem = gen_frame_mem (mode, addr);
5946 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5947 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5948 /* No more registers to handle after REGNO.
5949 Emit a single save/restore and exit. */
5950 if (regno2 == last_regno)
5952 insn = emit_insn (set);
5953 RTX_FRAME_RELATED_P (insn) = 1;
5954 if (prologue_p)
5955 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5956 else
5957 add_reg_note (insn, REG_CFA_RESTORE, reg);
5958 break;
5961 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5962 /* The next register is not of the same class or its offset is not
5963 mergeable with the current one into a pair. */
5964 if (!satisfies_constraint_Ump (mem)
5965 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5966 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5967 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5968 GET_MODE_SIZE (mode)))
5970 insn = emit_insn (set);
5971 RTX_FRAME_RELATED_P (insn) = 1;
5972 if (prologue_p)
5973 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5974 else
5975 add_reg_note (insn, REG_CFA_RESTORE, reg);
5977 regno = regno2;
5978 continue;
5981 /* REGNO2 can be saved/restored in a pair with REGNO. */
5982 rtx reg2 = gen_rtx_REG (mode, regno2);
5983 if (!frame_pointer_needed)
5984 offset2 += cfun->machine->frame.frame_size
5985 - cfun->machine->frame.hard_fp_offset;
5986 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5987 rtx mem2 = gen_frame_mem (mode, addr2);
5988 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5989 : gen_rtx_SET (reg2, mem2);
5991 if (prologue_p)
5992 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5993 else
5994 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5996 RTX_FRAME_RELATED_P (insn) = 1;
5997 if (prologue_p)
5999 add_reg_note (insn, REG_CFA_OFFSET, set);
6000 add_reg_note (insn, REG_CFA_OFFSET, set2);
6002 else
6004 add_reg_note (insn, REG_CFA_RESTORE, reg);
6005 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6008 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6012 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6014 static void
6015 aarch64_emit_prologue_components (sbitmap components)
6017 aarch64_process_components (components, true);
6020 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6022 static void
6023 aarch64_emit_epilogue_components (sbitmap components)
6025 aarch64_process_components (components, false);
6028 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6030 static void
6031 aarch64_set_handled_components (sbitmap components)
6033 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6034 if (bitmap_bit_p (components, regno))
6035 cfun->machine->reg_is_wrapped_separately[regno] = true;
6038 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
6039 determine the probe offset for alloca. */
6041 static HOST_WIDE_INT
6042 aarch64_stack_clash_protection_alloca_probe_range (void)
6044 return STACK_CLASH_CALLER_GUARD;
6048 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6049 registers. If POLY_SIZE is not large enough to require a probe this function
6050 will only adjust the stack. When allocating the stack space
6051 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6052 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6053 arguments. If we are then we ensure that any allocation larger than the ABI
6054 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6055 maintained.
6057 We emit barriers after each stack adjustment to prevent optimizations from
6058 breaking the invariant that we never drop the stack more than a page. This
6059 invariant is needed to make it easier to correctly handle asynchronous
6060 events, e.g. if we were to allow the stack to be dropped by more than a page
6061 and then emit multiple probes to catch up, and a signal were taken somewhere
6062 in between, the signal handler would not know the state of the stack and
6063 could make no assumptions about which pages have been probed. */
6065 static void
6066 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6067 poly_int64 poly_size,
6068 bool frame_related_p,
6069 bool final_adjustment_p)
6071 HOST_WIDE_INT guard_size
6072 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6073 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6074 /* When doing the final adjustment for the outgoing argument size we can't
6075 assume that LR was saved at position 0. So subtract its offset from the
6076 ABI safe buffer so that we don't accidentally allow an adjustment that
6077 would result in an allocation larger than the ABI buffer without
6078 probing. */
6079 HOST_WIDE_INT min_probe_threshold
6080 = final_adjustment_p
6081 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6082 : guard_size - guard_used_by_caller;
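/* Hypothetical numbers for illustration: with the default 64KB guard and
   a 1KB STACK_CLASH_CALLER_GUARD, a non-final adjustment may allocate up
   to 64K - 1K = 63K bytes before a probe is required, whereas a final
   (outgoing argument) adjustment is limited to 1K minus the saved offset
   of LR.  */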
6084 poly_int64 frame_size = cfun->machine->frame.frame_size;
6086 /* We should always have a positive probe threshold. */
6087 gcc_assert (min_probe_threshold > 0);
6089 if (flag_stack_clash_protection && !final_adjustment_p)
6091 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6092 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6094 if (known_eq (frame_size, 0))
6096 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6098 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6099 && known_lt (final_adjust, guard_used_by_caller))
6101 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6105 /* If SIZE is not large enough to require probing, just adjust the stack and
6106 exit. */
6107 if (known_lt (poly_size, min_probe_threshold)
6108 || !flag_stack_clash_protection)
6110 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6111 return;
6114 HOST_WIDE_INT size;
6115 /* Handle the SVE non-constant case first. */
6116 if (!poly_size.is_constant (&size))
6118 if (dump_file)
6120 fprintf (dump_file, "Stack clash SVE prologue: ");
6121 print_dec (poly_size, dump_file);
6122 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6125 /* First calculate the amount of bytes we're actually spilling. */
6126 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6127 poly_size, temp1, temp2, false, true);
6129 rtx_insn *insn = get_last_insn ();
6131 if (frame_related_p)
6133 /* This is done to provide unwinding information for the stack
6134 adjustments we're about to do.  However, to prevent the optimizers
6135 from removing the R11 move and leaving the CFA note (which would be
6136 very wrong) we tie the old and new stack pointer together.
6137 The tie will expand to nothing but the optimizers will not touch
6138 the instruction. */
6139 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6140 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6141 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6143 /* We want the CFA independent of the stack pointer for the
6144 duration of the loop. */
6145 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6146 RTX_FRAME_RELATED_P (insn) = 1;
6149 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6150 rtx guard_const = gen_int_mode (guard_size, Pmode);
6152 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6153 stack_pointer_rtx, temp1,
6154 probe_const, guard_const));
6156 /* Now reset the CFA register if needed. */
6157 if (frame_related_p)
6159 add_reg_note (insn, REG_CFA_DEF_CFA,
6160 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6161 gen_int_mode (poly_size, Pmode)));
6162 RTX_FRAME_RELATED_P (insn) = 1;
6165 return;
6168 if (dump_file)
6169 fprintf (dump_file,
6170 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6171 " bytes, probing will be required.\n", size);
6173 /* Round size down to the nearest multiple of guard_size, and calculate the
6174 residual as the difference between the original size and the rounded
6175 size. */
6176 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6177 HOST_WIDE_INT residual = size - rounded_size;
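/* Example of this arithmetic (illustrative values only): with a 64KB
   guard and size == 145KB, rounded_size is ROUND_DOWN (145K, 64K) == 128K,
   i.e. two full guard-sized allocations, and residual == 17K, which is
   handled separately below.  */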
6179 /* We can handle a small number of allocations/probes inline. Otherwise
6180 punt to a loop. */
6181 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6183 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6185 aarch64_sub_sp (NULL, temp2, guard_size, true);
6186 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6187 guard_used_by_caller));
6188 emit_insn (gen_blockage ());
6190 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6192 else
6194 /* Compute the ending address. */
6195 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6196 temp1, NULL, false, true);
6197 rtx_insn *insn = get_last_insn ();
6199 /* For the initial allocation, we don't have a frame pointer
6200 set up, so we always need CFI notes. If we're doing the
6201 final allocation, then we may have a frame pointer, in which
6202 case it is the CFA, otherwise we need CFI notes.
6204 We can determine which allocation we are doing by looking at
6205 the value of FRAME_RELATED_P since the final allocations are not
6206 frame related. */
6207 if (frame_related_p)
6209 /* We want the CFA independent of the stack pointer for the
6210 duration of the loop. */
6211 add_reg_note (insn, REG_CFA_DEF_CFA,
6212 plus_constant (Pmode, temp1, rounded_size));
6213 RTX_FRAME_RELATED_P (insn) = 1;
6216 /* This allocates and probes the stack. Note that this re-uses some of
6217 the existing Ada stack protection code.  However, we are guaranteed not
6218 to enter the non-loop or residual branches of that code.
6220 The non-loop part won't be entered because if our allocation amount
6221 doesn't require a loop, the case above would handle it.
6223 The residual amount won't be entered because TEMP1 is a multiple of
6224 the allocation size. The residual will always be 0. As such, the only
6225 part we are actually using from that code is the loop setup. The
6226 actual probing is done in aarch64_output_probe_stack_range. */
6227 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6228 stack_pointer_rtx, temp1));
6230 /* Now reset the CFA register if needed. */
6231 if (frame_related_p)
6233 add_reg_note (insn, REG_CFA_DEF_CFA,
6234 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6235 RTX_FRAME_RELATED_P (insn) = 1;
6238 emit_insn (gen_blockage ());
6239 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6242 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6243 be probed. This maintains the requirement that each page is probed at
6244 least once. For initial probing we probe only if the allocation is
6245 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6246 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6247 GUARD_SIZE.  This ensures that for any allocation that is large enough to
6248 trigger a probe here, we'll have at least one, and if an allocation is not
6249 large enough for this code to emit anything for it, the page will already
6250 have been probed by the saving of FP/LR, either by this function or any callees.  If
6251 we don't have any callees then we won't have more stack adjustments and so
6252 are still safe. */
6253 if (residual)
6255 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6256 /* If we're doing final adjustments, and we've done any full page
6257 allocations then any residual needs to be probed. */
6258 if (final_adjustment_p && rounded_size != 0)
6259 min_probe_threshold = 0;
6260 /* If doing a small final adjustment, we always probe at offset 0.
6261 This is done to avoid issues when LR is not at position 0 or when
6262 the final adjustment is smaller than the probing offset. */
6263 else if (final_adjustment_p && rounded_size == 0)
6264 residual_probe_offset = 0;
6266 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6267 if (residual >= min_probe_threshold)
6269 if (dump_file)
6270 fprintf (dump_file,
6271 "Stack clash AArch64 prologue residuals: "
6272 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6273 "\n", residual);
6275 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6276 residual_probe_offset));
6277 emit_insn (gen_blockage ());
6282 /* Return 1 if the register is used by the epilogue. We need to say the
6283 return register is used, but only after epilogue generation is complete.
6284 Note that in the case of sibcalls, the values "used by the epilogue" are
6285 considered live at the start of the called function.
6287 For SIMD functions we need to return 1 for FP registers that are saved and
6288 restored by a function but are not zero in call_used_regs. If we do not do
6289 this, optimizations may remove the restore of the register. */
6292 aarch64_epilogue_uses (int regno)
6294 if (epilogue_completed)
6296 if (regno == LR_REGNUM)
6297 return 1;
6298 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6299 return 1;
6301 return 0;
6304 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6305 is saved at BASE + OFFSET. */
6307 static void
6308 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6309 rtx base, poly_int64 offset)
6311 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6312 add_reg_note (insn, REG_CFA_EXPRESSION,
6313 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6316 /* AArch64 stack frames generated by this compiler look like:
6318 +-------------------------------+
6320 | incoming stack arguments |
6322 +-------------------------------+
6323 | | <-- incoming stack pointer (aligned)
6324 | callee-allocated save area |
6325 | for register varargs |
6327 +-------------------------------+
6328 | local variables | <-- frame_pointer_rtx
6330 +-------------------------------+
6331 | padding | \
6332 +-------------------------------+ |
6333 | callee-saved registers | | frame.saved_regs_size
6334 +-------------------------------+ |
6335 | LR' | |
6336 +-------------------------------+ |
6337 | FP' | / <- hard_frame_pointer_rtx (aligned)
6338 +-------------------------------+
6339 | dynamic allocation |
6340 +-------------------------------+
6341 | padding |
6342 +-------------------------------+
6343 | outgoing stack arguments | <-- arg_pointer
6345 +-------------------------------+
6346 | | <-- stack_pointer_rtx (aligned)
6348 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6349 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6350 unchanged.
6352 By default for stack-clash we assume the guard is at least 64KB, but this
6353 value is configurable to either 4KB or 64KB. We also force the guard size to
6354 be the same as the probing interval and both values are kept in sync.
6356 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6357 on the guard size) of stack space without probing.
6359 When probing is needed, we emit a probe at the start of the prologue
6360 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6362 We have to track how much space has been allocated and the only stores
6363 to the stack we track as implicit probes are the FP/LR stores.
6365 For outgoing arguments we probe if the size is larger than 1KB, such that
6366 the ABI specified buffer is maintained for the next callee.
6368 The following registers are reserved during frame layout and should not be
6369 used for any other purpose:
6371 - r11: Used by stack clash protection when SVE is enabled.
6372 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6373 - r14 and r15: Used for speculation tracking.
6374 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6375 - r30(LR), r29(FP): Used by standard frame layout.
6377 These registers must be avoided in frame layout related code unless the
6378 explicit intention is to interact with one of the features listed above. */
6380 /* Generate the prologue instructions for entry into a function.
6381 Establish the stack frame by decreasing the stack pointer with a
6382 properly calculated size and, if necessary, create a frame record
6383 filled with the values of LR and previous frame pointer. The
6384 current FP is also set up if it is in use. */
6386 void
6387 aarch64_expand_prologue (void)
6389 poly_int64 frame_size = cfun->machine->frame.frame_size;
6390 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6391 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6392 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6393 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6394 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6395 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6396 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6397 rtx_insn *insn;
6399 /* Sign return address for functions. */
6400 if (aarch64_return_address_signing_enabled ())
6402 switch (aarch64_ra_sign_key)
6404 case AARCH64_KEY_A:
6405 insn = emit_insn (gen_paciasp ());
6406 break;
6407 case AARCH64_KEY_B:
6408 insn = emit_insn (gen_pacibsp ());
6409 break;
6410 default:
6411 gcc_unreachable ();
6413 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6414 RTX_FRAME_RELATED_P (insn) = 1;
6417 if (flag_stack_usage_info)
6418 current_function_static_stack_size = constant_lower_bound (frame_size);
6420 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6422 if (crtl->is_leaf && !cfun->calls_alloca)
6424 if (maybe_gt (frame_size, PROBE_INTERVAL)
6425 && maybe_gt (frame_size, get_stack_check_protect ()))
6426 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6427 (frame_size
6428 - get_stack_check_protect ()));
6430 else if (maybe_gt (frame_size, 0))
6431 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6434 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6435 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6437 /* In theory we should never have both an initial adjustment
6438 and a callee save adjustment. Verify that is the case since the
6439 code below does not handle it for -fstack-clash-protection. */
6440 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6442 /* Will only probe if the initial adjustment is larger than the guard
6443 less the amount of the guard reserved for use by the caller's
6444 outgoing args. */
6445 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6446 true, false);
6448 if (callee_adjust != 0)
6449 aarch64_push_regs (reg1, reg2, callee_adjust);
6451 if (emit_frame_chain)
6453 poly_int64 reg_offset = callee_adjust;
6454 if (callee_adjust == 0)
6456 reg1 = R29_REGNUM;
6457 reg2 = R30_REGNUM;
6458 reg_offset = callee_offset;
6459 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6461 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6462 stack_pointer_rtx, callee_offset,
6463 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6464 if (frame_pointer_needed && !frame_size.is_constant ())
6466 /* Variable-sized frames need to describe the save slot
6467 address using DW_CFA_expression rather than DW_CFA_offset.
6468 This means that, without taking further action, the
6469 locations of the registers that we've already saved would
6470 remain based on the stack pointer even after we redefine
6471 the CFA based on the frame pointer. We therefore need new
6472 DW_CFA_expressions to re-express the save slots with addresses
6473 based on the frame pointer. */
6474 rtx_insn *insn = get_last_insn ();
6475 gcc_assert (RTX_FRAME_RELATED_P (insn));
6477 /* Add an explicit CFA definition if this was previously
6478 implicit. */
6479 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6481 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6482 callee_offset);
6483 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6484 gen_rtx_SET (hard_frame_pointer_rtx, src));
6487 /* Change the save slot expressions for the registers that
6488 we've already saved. */
6489 reg_offset -= callee_offset;
6490 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6491 reg_offset + UNITS_PER_WORD);
6492 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6493 reg_offset);
6495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6498 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6499 callee_adjust != 0 || emit_frame_chain);
6500 if (aarch64_simd_decl_p (cfun->decl))
6501 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6502 callee_adjust != 0 || emit_frame_chain);
6503 else
6504 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6505 callee_adjust != 0 || emit_frame_chain);
6507 /* We may need to probe the final adjustment if it is larger than the guard
6508 that is assumed by the callee. */
6509 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6510 !frame_pointer_needed, true);
6513 /* Return TRUE if we can use a simple_return insn.
6515 This function checks whether the callee saved stack is empty, which
6516 means no restore actions are needed.  The pro_and_epilogue will use
6517 this to check whether shrink-wrapping opt is feasible. */
6519 bool
6520 aarch64_use_return_insn_p (void)
6522 if (!reload_completed)
6523 return false;
6525 if (crtl->profile)
6526 return false;
6528 return known_eq (cfun->machine->frame.frame_size, 0);
6531 /* Return false for non-leaf SIMD functions in order to avoid
6532 shrink-wrapping them. Doing this will lose the necessary
6533 save/restore of FP registers. */
6535 bool
6536 aarch64_use_simple_return_insn_p (void)
6538 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6539 return false;
6541 return true;
6544 /* Generate the epilogue instructions for returning from a function.
6545 This is almost exactly the reverse of the prologue sequence, except
6546 that we need to insert barriers to avoid scheduling loads that read
6547 from a deallocated stack, and we optimize the unwind records by
6548 emitting them all together if possible. */
6549 void
6550 aarch64_expand_epilogue (bool for_sibcall)
6552 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6553 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6554 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6555 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6556 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6557 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6558 rtx cfi_ops = NULL;
6559 rtx_insn *insn;
6560 /* A stack clash protection prologue may not have left EP0_REGNUM or
6561 EP1_REGNUM in a usable state. The same is true for allocations
6562 with an SVE component, since we then need both temporary registers
6563 for each allocation. For stack clash we are in a usable state if
6564 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6565 HOST_WIDE_INT guard_size
6566 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6567 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6569 /* We can re-use the registers when the allocation amount is smaller than
6570 guard_size - guard_used_by_caller because we won't be doing any probes
6571 then. In such situations the register should remain live with the correct
6572 value. */
6573 bool can_inherit_p = (initial_adjust.is_constant ()
6574 && final_adjust.is_constant ())
6575 && (!flag_stack_clash_protection
6576 || known_lt (initial_adjust,
6577 guard_size - guard_used_by_caller));
6579 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6580 bool need_barrier_p
6581 = maybe_ne (get_frame_size ()
6582 + cfun->machine->frame.saved_varargs_size, 0);
6584 /* Emit a barrier to prevent loads from a deallocated stack. */
6585 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6586 || cfun->calls_alloca
6587 || crtl->calls_eh_return)
6589 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6590 need_barrier_p = false;
6593 /* Restore the stack pointer from the frame pointer if it may not
6594 be the same as the stack pointer. */
6595 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6596 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6597 if (frame_pointer_needed
6598 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6599 /* If writeback is used when restoring callee-saves, the CFA
6600 is restored on the instruction doing the writeback. */
6601 aarch64_add_offset (Pmode, stack_pointer_rtx,
6602 hard_frame_pointer_rtx, -callee_offset,
6603 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6604 else
6605 /* The case where we need to re-use the register here is very rare, so
6606 avoid the complicated condition and just always emit a move if the
6607 immediate doesn't fit. */
6608 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6610 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6611 callee_adjust != 0, &cfi_ops);
6612 if (aarch64_simd_decl_p (cfun->decl))
6613 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6614 callee_adjust != 0, &cfi_ops);
6615 else
6616 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6617 callee_adjust != 0, &cfi_ops);
6619 if (need_barrier_p)
6620 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6622 if (callee_adjust != 0)
6623 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6625 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6627 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6628 insn = get_last_insn ();
6629 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6630 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6631 RTX_FRAME_RELATED_P (insn) = 1;
6632 cfi_ops = NULL;
6635 /* The liveness of EP0_REGNUM cannot be trusted across function calls either, so
6636 restrict the emit_move optimization to leaf functions. */
6637 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6638 (!can_inherit_p || !crtl->is_leaf
6639 || df_regs_ever_live_p (EP0_REGNUM)));
6641 if (cfi_ops)
6643 /* Emit delayed restores and reset the CFA to be SP. */
6644 insn = get_last_insn ();
6645 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6646 REG_NOTES (insn) = cfi_ops;
6647 RTX_FRAME_RELATED_P (insn) = 1;
6650 /* We prefer to emit the combined return/authenticate instruction RETAA,
6651 however there are three cases in which we must instead emit an explicit
6652 authentication instruction.
6654 1) Sibcalls don't return in a normal way, so if we're about to call one
6655 we must authenticate.
6657 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6658 generating code for !TARGET_ARMV8_3 we can't use it and must
6659 explicitly authenticate.
6661 3) On an eh_return path we make extra stack adjustments to update the
6662 canonical frame address to be the exception handler's CFA. We want
6663 to authenticate using the CFA of the function which calls eh_return.  */
6665 if (aarch64_return_address_signing_enabled ()
6666 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6668 switch (aarch64_ra_sign_key)
6670 case AARCH64_KEY_A:
6671 insn = emit_insn (gen_autiasp ());
6672 break;
6673 case AARCH64_KEY_B:
6674 insn = emit_insn (gen_autibsp ());
6675 break;
6676 default:
6677 gcc_unreachable ();
6679 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6680 RTX_FRAME_RELATED_P (insn) = 1;
6683 /* Stack adjustment for exception handler. */
6684 if (crtl->calls_eh_return && !for_sibcall)
6686 /* We need to unwind the stack by the offset computed by
6687 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6688 to be SP; letting the CFA move during this adjustment
6689 is just as correct as retaining the CFA from the body
6690 of the function. Therefore, do nothing special. */
6691 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6694 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6695 if (!for_sibcall)
6696 emit_jump_insn (ret_rtx);
6699 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6700 normally or return to a previous frame after unwinding.
6702 An EH return uses a single shared return sequence. The epilogue is
6703 exactly like a normal epilogue except that it has an extra input
6704 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6705 that must be applied after the frame has been destroyed. An extra label
6706 is inserted before the epilogue which initializes this register to zero,
6707 and this is the entry point for a normal return.
6709 An actual EH return updates the return address, initializes the stack
6710 adjustment and jumps directly into the epilogue (bypassing the zeroing
6711 of the adjustment). Since the return address is typically saved on the
6712 stack when a function makes a call, the saved LR must be updated outside
6713 the epilogue.
6715 This poses problems as the store is generated well before the epilogue,
6716 so the offset of LR is not known yet. Also optimizations will remove the
6717 store as it appears dead, even after the epilogue is generated (as the
6718 base or offset for loading LR is different in many cases).
6720 To avoid these problems this implementation forces the frame pointer
6721 in eh_return functions so that the location of LR is fixed and known early.
6722 It also marks the store volatile, so no optimization is permitted to
6723 remove the store. */
6725 aarch64_eh_return_handler_rtx (void)
6727 rtx tmp = gen_frame_mem (Pmode,
6728 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6730 /* Mark the store volatile, so no optimization is permitted to remove it. */
6731 MEM_VOLATILE_P (tmp) = true;
6732 return tmp;
6735 /* Output code to add DELTA to the first argument, and then jump
6736 to FUNCTION. Used for C++ multiple inheritance. */
6737 static void
6738 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6739 HOST_WIDE_INT delta,
6740 HOST_WIDE_INT vcall_offset,
6741 tree function)
6743 /* The this pointer is always in x0. Note that this differs from
6744 Arm where the this pointer may be bumped to r1 if r0 is required
6745 to return a pointer to an aggregate. On AArch64 a result value
6746 pointer will be in x8. */
6747 int this_regno = R0_REGNUM;
6748 rtx this_rtx, temp0, temp1, addr, funexp;
6749 rtx_insn *insn;
6750 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6752 if (aarch64_bti_enabled ())
6753 emit_insn (gen_bti_c());
6755 reload_completed = 1;
6756 emit_note (NOTE_INSN_PROLOGUE_END);
6758 this_rtx = gen_rtx_REG (Pmode, this_regno);
6759 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6760 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6762 if (vcall_offset == 0)
6763 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6764 else
6766 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6768 addr = this_rtx;
6769 if (delta != 0)
6771 if (delta >= -256 && delta < 256)
6772 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6773 plus_constant (Pmode, this_rtx, delta));
6774 else
6775 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6776 temp1, temp0, false);
6779 if (Pmode == ptr_mode)
6780 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6781 else
6782 aarch64_emit_move (temp0,
6783 gen_rtx_ZERO_EXTEND (Pmode,
6784 gen_rtx_MEM (ptr_mode, addr)));
6786 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6787 addr = plus_constant (Pmode, temp0, vcall_offset);
6788 else
6790 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6791 Pmode);
6792 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6795 if (Pmode == ptr_mode)
6796 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6797 else
6798 aarch64_emit_move (temp1,
6799 gen_rtx_SIGN_EXTEND (Pmode,
6800 gen_rtx_MEM (ptr_mode, addr)));
6802 emit_insn (gen_add2_insn (this_rtx, temp1));
6805 /* Generate a tail call to the target function. */
6806 if (!TREE_USED (function))
6808 assemble_external (function);
6809 TREE_USED (function) = 1;
6811 funexp = XEXP (DECL_RTL (function), 0);
6812 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6813 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6814 SIBLING_CALL_P (insn) = 1;
6816 insn = get_insns ();
6817 shorten_branches (insn);
6819 assemble_start_function (thunk, fnname);
6820 final_start_function (insn, file, 1);
6821 final (insn, file, 1);
6822 final_end_function ();
6823 assemble_end_function (thunk, fnname);
6825 /* Stop pretending to be a post-reload pass. */
6826 reload_completed = 0;
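/* Rough illustration of the output (assumed, not taken from the source):
   for a simple thunk with delta == 8 and vcall_offset == 0 the code above
   produces something equivalent to
       add x0, x0, #8
       b   <function>
   i.e. the this pointer in x0 is adjusted and control tail-calls the
   target function.  */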
6829 static bool
6830 aarch64_tls_referenced_p (rtx x)
6832 if (!TARGET_HAVE_TLS)
6833 return false;
6834 subrtx_iterator::array_type array;
6835 FOR_EACH_SUBRTX (iter, array, x, ALL)
6837 const_rtx x = *iter;
6838 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6839 return true;
6840 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6841 TLS offsets, not real symbol references. */
6842 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6843 iter.skip_subrtxes ();
6845 return false;
6849 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6850 a left shift of 0 or 12 bits. */
6851 bool
6852 aarch64_uimm12_shift (HOST_WIDE_INT val)
6854 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6855 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6859 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6860 that can be created with a left shift of 0 or 12. */
6861 static HOST_WIDE_INT
6862 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6864 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6865 handle correctly. */
6866 gcc_assert ((val & 0xffffff) == val);
6868 if (((val & 0xfff) << 0) == val)
6869 return val;
6871 return val & (0xfff << 12);
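/* For example (illustrative): 0x123456 does not fit in the low 12 bits,
   so this returns 0x123456 & 0xfff000 == 0x123000, which is representable
   as a 12-bit unsigned immediate shifted left by 12.  */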
6874 /* Return true if val is an immediate that can be loaded into a
6875 register by a MOVZ instruction. */
6876 static bool
6877 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6879 if (GET_MODE_SIZE (mode) > 4)
6881 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6882 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6883 return 1;
6885 else
6887 /* Ignore sign extension. */
6888 val &= (HOST_WIDE_INT) 0xffffffff;
6890 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6891 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
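/* Illustrative examples: 0x12340000 is accepted, since it can be
   materialised with a single "movz w0, #0x1234, lsl #16", whereas 0x12345
   spans two 16-bit halves and is rejected (it would need a MOVZ/MOVK
   pair).  */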
6894 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6895 64-bit (DImode) integer. */
6897 static unsigned HOST_WIDE_INT
6898 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6900 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6901 while (size < 64)
6903 val &= (HOST_WIDE_INT_1U << size) - 1;
6904 val |= val << size;
6905 size *= 2;
6907 return val;
6910 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6912 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6914 0x0000000100000001ull,
6915 0x0001000100010001ull,
6916 0x0101010101010101ull,
6917 0x1111111111111111ull,
6918 0x5555555555555555ull,
6922 /* Return true if val is a valid bitmask immediate. */
6924 bool
6925 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6927 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6928 int bits;
6930 /* Check for a single sequence of one bits and return quickly if so.
6931 The special cases of all ones and all zeroes return false. */
6932 val = aarch64_replicate_bitmask_imm (val_in, mode);
6933 tmp = val + (val & -val);
6935 if (tmp == (tmp & -tmp))
6936 return (val + 1) > 1;
6938 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6939 if (mode == SImode)
6940 val = (val << 32) | (val & 0xffffffff);
6942 /* Invert if the immediate doesn't start with a zero bit - this means we
6943 only need to search for sequences of one bits. */
6944 if (val & 1)
6945 val = ~val;
6947 /* Find the first set bit and set tmp to val with the first sequence of one
6948 bits removed. Return success if there is a single sequence of ones. */
6949 first_one = val & -val;
6950 tmp = val & (val + first_one);
6952 if (tmp == 0)
6953 return true;
6955 /* Find the next set bit and compute the difference in bit position. */
6956 next_one = tmp & -tmp;
6957 bits = clz_hwi (first_one) - clz_hwi (next_one);
6958 mask = val ^ tmp;
6960 /* Check the bit position difference is a power of 2, and that the first
6961 sequence of one bits fits within 'bits' bits. */
6962 if ((mask >> bits) != 0 || bits != (bits & -bits))
6963 return false;
6965 /* Check the sequence of one bits is repeated 64/bits times. */
6966 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
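/* Illustrative examples (not exhaustive): 0x3f3f3f3f3f3f3f3f is accepted
   as a bitmask immediate (a run of six ones repeated in every 8-bit
   element); 0 and ~0 are rejected by the early single-run check; and a
   value such as 0x1234 is rejected because its set bits do not form a
   single repeated run.  */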
6969 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6970 Assumed precondition: VAL_IN is not zero. */
6972 unsigned HOST_WIDE_INT
6973 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6975 int lowest_bit_set = ctz_hwi (val_in);
6976 int highest_bit_set = floor_log2 (val_in);
6977 gcc_assert (val_in != 0);
6979 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6980 (HOST_WIDE_INT_1U << lowest_bit_set));
6983 /* Create a constant in which all bits outside the range from the lowest set
6984 bit to the highest set bit of VAL_IN are set to 1. */
6986 unsigned HOST_WIDE_INT
6987 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6989 return val_in | ~aarch64_and_split_imm1 (val_in);
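/* Worked example (illustrative): for val_in == 0x960 (bits 5, 6, 8 and 11
   set), aarch64_and_split_imm1 returns 0xfe0, the contiguous mask covering
   bits 5 to 11, and aarch64_and_split_imm2 returns 0x960 | ~0xfe0, i.e.
   val_in with every bit outside that range set to one.  ANDing with these
   two masks in sequence is equivalent to ANDing with the original value,
   since imm1 & imm2 == val_in.  */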
6992 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6994 bool
6995 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6997 scalar_int_mode int_mode;
6998 if (!is_a <scalar_int_mode> (mode, &int_mode))
6999 return false;
7001 if (aarch64_bitmask_imm (val_in, int_mode))
7002 return false;
7004 if (aarch64_move_imm (val_in, int_mode))
7005 return false;
7007 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7009 return aarch64_bitmask_imm (imm2, int_mode);
7012 /* Return true if val is an immediate that can be loaded into a
7013 register in a single instruction. */
7014 bool
7015 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7017 scalar_int_mode int_mode;
7018 if (!is_a <scalar_int_mode> (mode, &int_mode))
7019 return false;
7021 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7022 return 1;
7023 return aarch64_bitmask_imm (val, int_mode);
7026 static bool
7027 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7029 rtx base, offset;
7031 if (GET_CODE (x) == HIGH)
7032 return true;
7034 /* There's no way to calculate VL-based values using relocations. */
7035 subrtx_iterator::array_type array;
7036 FOR_EACH_SUBRTX (iter, array, x, ALL)
7037 if (GET_CODE (*iter) == CONST_POLY_INT)
7038 return true;
7040 split_const (x, &base, &offset);
7041 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7043 if (aarch64_classify_symbol (base, INTVAL (offset))
7044 != SYMBOL_FORCE_TO_MEM)
7045 return true;
7046 else
7047 /* Avoid generating a 64-bit relocation in ILP32; leave it
7048 to aarch64_expand_mov_immediate to handle it properly. */
7049 return mode != ptr_mode;
7052 return aarch64_tls_referenced_p (x);
7055 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7056 The expansion for a table switch is quite expensive due to the number
7057 of instructions, the table lookup and the hard-to-predict indirect jump.
7058 When optimizing for speed, and -O3 enabled, use the per-core tuning if
7059 set, otherwise use tables for > 16 cases as a tradeoff between size and
7060 performance. When optimizing for size, use the default setting. */
7062 static unsigned int
7063 aarch64_case_values_threshold (void)
7065 /* Use the specified limit for the number of cases before using jump
7066 tables at higher optimization levels. */
7067 if (optimize > 2
7068 && selected_cpu->tune->max_case_values != 0)
7069 return selected_cpu->tune->max_case_values;
7070 else
7071 return optimize_size ? default_case_values_threshold () : 17;
7074 /* Return true if register REGNO is a valid index register.
7075 STRICT_P is true if REG_OK_STRICT is in effect. */
7077 bool
7078 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7080 if (!HARD_REGISTER_NUM_P (regno))
7082 if (!strict_p)
7083 return true;
7085 if (!reg_renumber)
7086 return false;
7088 regno = reg_renumber[regno];
7090 return GP_REGNUM_P (regno);
7093 /* Return true if register REGNO is a valid base register for mode MODE.
7094 STRICT_P is true if REG_OK_STRICT is in effect. */
7096 bool
7097 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7099 if (!HARD_REGISTER_NUM_P (regno))
7101 if (!strict_p)
7102 return true;
7104 if (!reg_renumber)
7105 return false;
7107 regno = reg_renumber[regno];
7110 /* The fake registers will be eliminated to either the stack or
7111 hard frame pointer, both of which are usually valid base registers.
7112 Reload deals with the cases where the eliminated form isn't valid. */
7113 return (GP_REGNUM_P (regno)
7114 || regno == SP_REGNUM
7115 || regno == FRAME_POINTER_REGNUM
7116 || regno == ARG_POINTER_REGNUM);
7119 /* Return true if X is a valid base register for mode MODE.
7120 STRICT_P is true if REG_OK_STRICT is in effect. */
7122 static bool
7123 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7125 if (!strict_p
7126 && GET_CODE (x) == SUBREG
7127 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7128 x = SUBREG_REG (x);
7130 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7133 /* Return true if address offset is a valid index. If it is, fill in INFO
7134 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7136 static bool
7137 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7138 machine_mode mode, bool strict_p)
7140 enum aarch64_address_type type;
7141 rtx index;
7142 int shift;
7144 /* (reg:P) */
7145 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7146 && GET_MODE (x) == Pmode)
7148 type = ADDRESS_REG_REG;
7149 index = x;
7150 shift = 0;
7152 /* (sign_extend:DI (reg:SI)) */
7153 else if ((GET_CODE (x) == SIGN_EXTEND
7154 || GET_CODE (x) == ZERO_EXTEND)
7155 && GET_MODE (x) == DImode
7156 && GET_MODE (XEXP (x, 0)) == SImode)
7158 type = (GET_CODE (x) == SIGN_EXTEND)
7159 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7160 index = XEXP (x, 0);
7161 shift = 0;
7163 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7164 else if (GET_CODE (x) == MULT
7165 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7166 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7167 && GET_MODE (XEXP (x, 0)) == DImode
7168 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7169 && CONST_INT_P (XEXP (x, 1)))
7171 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7172 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7173 index = XEXP (XEXP (x, 0), 0);
7174 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7176 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7177 else if (GET_CODE (x) == ASHIFT
7178 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7179 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7180 && GET_MODE (XEXP (x, 0)) == DImode
7181 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7182 && CONST_INT_P (XEXP (x, 1)))
7184 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7185 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7186 index = XEXP (XEXP (x, 0), 0);
7187 shift = INTVAL (XEXP (x, 1));
7189 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7190 else if ((GET_CODE (x) == SIGN_EXTRACT
7191 || GET_CODE (x) == ZERO_EXTRACT)
7192 && GET_MODE (x) == DImode
7193 && GET_CODE (XEXP (x, 0)) == MULT
7194 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7195 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7197 type = (GET_CODE (x) == SIGN_EXTRACT)
7198 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7199 index = XEXP (XEXP (x, 0), 0);
7200 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7201 if (INTVAL (XEXP (x, 1)) != 32 + shift
7202 || INTVAL (XEXP (x, 2)) != 0)
7203 shift = -1;
7205 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7206 (const_int 0xffffffff<<shift)) */
7207 else if (GET_CODE (x) == AND
7208 && GET_MODE (x) == DImode
7209 && GET_CODE (XEXP (x, 0)) == MULT
7210 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7211 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7212 && CONST_INT_P (XEXP (x, 1)))
7214 type = ADDRESS_REG_UXTW;
7215 index = XEXP (XEXP (x, 0), 0);
7216 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7217 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7218 shift = -1;
7220 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7221 else if ((GET_CODE (x) == SIGN_EXTRACT
7222 || GET_CODE (x) == ZERO_EXTRACT)
7223 && GET_MODE (x) == DImode
7224 && GET_CODE (XEXP (x, 0)) == ASHIFT
7225 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7226 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7228 type = (GET_CODE (x) == SIGN_EXTRACT)
7229 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7230 index = XEXP (XEXP (x, 0), 0);
7231 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7232 if (INTVAL (XEXP (x, 1)) != 32 + shift
7233 || INTVAL (XEXP (x, 2)) != 0)
7234 shift = -1;
7236 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7237 (const_int 0xffffffff<<shift)) */
7238 else if (GET_CODE (x) == AND
7239 && GET_MODE (x) == DImode
7240 && GET_CODE (XEXP (x, 0)) == ASHIFT
7241 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7242 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7243 && CONST_INT_P (XEXP (x, 1)))
7245 type = ADDRESS_REG_UXTW;
7246 index = XEXP (XEXP (x, 0), 0);
7247 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7248 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7249 shift = -1;
7251 /* (mult:P (reg:P) (const_int scale)) */
7252 else if (GET_CODE (x) == MULT
7253 && GET_MODE (x) == Pmode
7254 && GET_MODE (XEXP (x, 0)) == Pmode
7255 && CONST_INT_P (XEXP (x, 1)))
7257 type = ADDRESS_REG_REG;
7258 index = XEXP (x, 0);
7259 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7261 /* (ashift:P (reg:P) (const_int shift)) */
7262 else if (GET_CODE (x) == ASHIFT
7263 && GET_MODE (x) == Pmode
7264 && GET_MODE (XEXP (x, 0)) == Pmode
7265 && CONST_INT_P (XEXP (x, 1)))
7267 type = ADDRESS_REG_REG;
7268 index = XEXP (x, 0);
7269 shift = INTVAL (XEXP (x, 1));
7271 else
7272 return false;
7274 if (!strict_p
7275 && GET_CODE (index) == SUBREG
7276 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7277 index = SUBREG_REG (index);
7279 if (aarch64_sve_data_mode_p (mode))
7281 if (type != ADDRESS_REG_REG
7282 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7283 return false;
7285 else
7287 if (shift != 0
7288 && !(IN_RANGE (shift, 1, 3)
7289 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7290 return false;
7293 if (REG_P (index)
7294 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7296 info->type = type;
7297 info->offset = index;
7298 info->shift = shift;
7299 return true;
7302 return false;
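/* Illustrative examples of index operands accepted above (register names
   are arbitrary; assume a 4-byte SImode access with base register x0):
     (reg:DI x1)                               -> [x0, x1]
     (ashift:DI (reg:DI x1) (const_int 2))     -> [x0, x1, lsl 2]
     (mult:DI (sign_extend:DI (reg:SI w1))
              (const_int 4))                   -> [x0, w1, sxtw 2]  */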
7305 /* Return true if MODE is one of the modes for which we
7306 support LDP/STP operations. */
7308 static bool
7309 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7311 return mode == SImode || mode == DImode
7312 || mode == SFmode || mode == DFmode
7313 || (aarch64_vector_mode_supported_p (mode)
7314 && (known_eq (GET_MODE_SIZE (mode), 8)
7315 || (known_eq (GET_MODE_SIZE (mode), 16)
7316 && (aarch64_tune_params.extra_tuning_flags
7317 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7320 /* Return true if REGNO is a virtual pointer register, or an eliminable
7321 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7322 include stack_pointer or hard_frame_pointer. */
7323 static bool
7324 virt_or_elim_regno_p (unsigned regno)
7326 return ((regno >= FIRST_VIRTUAL_REGISTER
7327 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7328 || regno == FRAME_POINTER_REGNUM
7329 || regno == ARG_POINTER_REGNUM);
7332 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7333 If it is, fill in INFO appropriately. STRICT_P is true if
7334 REG_OK_STRICT is in effect. */
7336 bool
7337 aarch64_classify_address (struct aarch64_address_info *info,
7338 rtx x, machine_mode mode, bool strict_p,
7339 aarch64_addr_query_type type)
7341 enum rtx_code code = GET_CODE (x);
7342 rtx op0, op1;
7343 poly_int64 offset;
7345 HOST_WIDE_INT const_size;
7347 /* On BE, we use a load/store pair for all large int mode load/stores.
7348 TI/TFmode may also use a load/store pair. */
7349 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7350 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7351 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7352 || type == ADDR_QUERY_LDP_STP_N
7353 || mode == TImode
7354 || mode == TFmode
7355 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7357 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the full
7358 size of the memory being loaded/stored, and the mode used for the
7359 address calculation is half of that. */
7360 if (type == ADDR_QUERY_LDP_STP_N
7361 && known_eq (GET_MODE_SIZE (mode), 16))
7362 mode = DFmode;
7364 bool allow_reg_index_p = (!load_store_pair_p
7365 && (known_lt (GET_MODE_SIZE (mode), 16)
7366 || vec_flags == VEC_ADVSIMD
7367 || vec_flags & VEC_SVE_DATA));
7369 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7370 [Rn, #offset, MUL VL]. */
7371 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7372 && (code != REG && code != PLUS))
7373 return false;
7375 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7376 REG addressing. */
7377 if (advsimd_struct_p
7378 && !BYTES_BIG_ENDIAN
7379 && (code != POST_INC && code != REG))
7380 return false;
7382 gcc_checking_assert (GET_MODE (x) == VOIDmode
7383 || SCALAR_INT_MODE_P (GET_MODE (x)));
7385 switch (code)
7387 case REG:
7388 case SUBREG:
7389 info->type = ADDRESS_REG_IMM;
7390 info->base = x;
7391 info->offset = const0_rtx;
7392 info->const_offset = 0;
7393 return aarch64_base_register_rtx_p (x, strict_p);
7395 case PLUS:
7396 op0 = XEXP (x, 0);
7397 op1 = XEXP (x, 1);
7399 if (! strict_p
7400 && REG_P (op0)
7401 && virt_or_elim_regno_p (REGNO (op0))
7402 && poly_int_rtx_p (op1, &offset))
7404 info->type = ADDRESS_REG_IMM;
7405 info->base = op0;
7406 info->offset = op1;
7407 info->const_offset = offset;
7409 return true;
7412 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7413 && aarch64_base_register_rtx_p (op0, strict_p)
7414 && poly_int_rtx_p (op1, &offset))
7416 info->type = ADDRESS_REG_IMM;
7417 info->base = op0;
7418 info->offset = op1;
7419 info->const_offset = offset;
7421 /* TImode and TFmode values are allowed in both pairs of X
7422 registers and individual Q registers. The available
7423 address modes are:
7424 X,X: 7-bit signed scaled offset
7425 Q: 9-bit signed offset
7426 We conservatively require an offset representable in either mode.
7427 When performing the check for pairs of X registers i.e. LDP/STP
7428 pass down DImode since that is the natural size of the LDP/STP
7429 instruction memory accesses. */
7430 if (mode == TImode || mode == TFmode)
7431 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7432 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7433 || offset_12bit_unsigned_scaled_p (mode, offset)));
7435 /* A 7-bit offset check because OImode will emit an ldp/stp
7436 instruction (only big endian will get here).
7437 For ldp/stp instructions, the offset is scaled for the size of a
7438 single element of the pair. */
7439 if (mode == OImode)
7440 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7442 /* Three 9/12-bit offset checks because CImode will emit three
7443 ldr/str instructions (only big endian will get here). */
7444 if (mode == CImode)
7445 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7446 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7447 offset + 32)
7448 || offset_12bit_unsigned_scaled_p (V16QImode,
7449 offset + 32)));
7451 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7452 instructions (only big endian will get here). */
7453 if (mode == XImode)
7454 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7455 && aarch64_offset_7bit_signed_scaled_p (TImode,
7456 offset + 32));
7458 /* Make "m" use the LD1 offset range for SVE data modes, so
7459 that pre-RTL optimizers like ivopts will work to that range
7460 instead of the wider LDR/STR range. */
7461 if (vec_flags == VEC_SVE_DATA)
7462 return (type == ADDR_QUERY_M
7463 ? offset_4bit_signed_scaled_p (mode, offset)
7464 : offset_9bit_signed_scaled_p (mode, offset));
7466 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7468 poly_int64 end_offset = (offset
7469 + GET_MODE_SIZE (mode)
7470 - BYTES_PER_SVE_VECTOR);
7471 return (type == ADDR_QUERY_M
7472 ? offset_4bit_signed_scaled_p (mode, offset)
7473 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7474 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7475 end_offset)));
7478 if (vec_flags == VEC_SVE_PRED)
7479 return offset_9bit_signed_scaled_p (mode, offset);
7481 if (load_store_pair_p)
7482 return ((known_eq (GET_MODE_SIZE (mode), 4)
7483 || known_eq (GET_MODE_SIZE (mode), 8)
7484 || known_eq (GET_MODE_SIZE (mode), 16))
7485 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7486 else
7487 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7488 || offset_12bit_unsigned_scaled_p (mode, offset));
7491 if (allow_reg_index_p)
7493 /* Look for base + (scaled/extended) index register. */
7494 if (aarch64_base_register_rtx_p (op0, strict_p)
7495 && aarch64_classify_index (info, op1, mode, strict_p))
7497 info->base = op0;
7498 return true;
7500 if (aarch64_base_register_rtx_p (op1, strict_p)
7501 && aarch64_classify_index (info, op0, mode, strict_p))
7503 info->base = op1;
7504 return true;
7508 return false;
7510 case POST_INC:
7511 case POST_DEC:
7512 case PRE_INC:
7513 case PRE_DEC:
7514 info->type = ADDRESS_REG_WB;
7515 info->base = XEXP (x, 0);
7516 info->offset = NULL_RTX;
7517 return aarch64_base_register_rtx_p (info->base, strict_p);
7519 case POST_MODIFY:
7520 case PRE_MODIFY:
7521 info->type = ADDRESS_REG_WB;
7522 info->base = XEXP (x, 0);
7523 if (GET_CODE (XEXP (x, 1)) == PLUS
7524 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7525 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7526 && aarch64_base_register_rtx_p (info->base, strict_p))
7528 info->offset = XEXP (XEXP (x, 1), 1);
7529 info->const_offset = offset;
7531 /* TImode and TFmode values are allowed in both pairs of X
7532 registers and individual Q registers. The available
7533 address modes are:
7534 X,X: 7-bit signed scaled offset
7535 Q: 9-bit signed offset
7536 We conservatively require an offset representable in either mode. */
7538 if (mode == TImode || mode == TFmode)
7539 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7540 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7542 if (load_store_pair_p)
7543 return ((known_eq (GET_MODE_SIZE (mode), 4)
7544 || known_eq (GET_MODE_SIZE (mode), 8)
7545 || known_eq (GET_MODE_SIZE (mode), 16))
7546 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7547 else
7548 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7550 return false;
7552 case CONST:
7553 case SYMBOL_REF:
7554 case LABEL_REF:
7555 /* load literal: pc-relative constant pool entry. Only supported
7556 for SI mode or larger. */
7557 info->type = ADDRESS_SYMBOLIC;
7559 if (!load_store_pair_p
7560 && GET_MODE_SIZE (mode).is_constant (&const_size)
7561 && const_size >= 4)
7563 rtx sym, addend;
7565 split_const (x, &sym, &addend);
7566 return ((GET_CODE (sym) == LABEL_REF
7567 || (GET_CODE (sym) == SYMBOL_REF
7568 && CONSTANT_POOL_ADDRESS_P (sym)
7569 && aarch64_pcrelative_literal_loads)));
7571 return false;
7573 case LO_SUM:
7574 info->type = ADDRESS_LO_SUM;
7575 info->base = XEXP (x, 0);
7576 info->offset = XEXP (x, 1);
7577 if (allow_reg_index_p
7578 && aarch64_base_register_rtx_p (info->base, strict_p))
7580 rtx sym, offs;
7581 split_const (info->offset, &sym, &offs);
7582 if (GET_CODE (sym) == SYMBOL_REF
7583 && (aarch64_classify_symbol (sym, INTVAL (offs))
7584 == SYMBOL_SMALL_ABSOLUTE))
7586 /* The symbol and offset must be aligned to the access size. */
7587 unsigned int align;
7589 if (CONSTANT_POOL_ADDRESS_P (sym))
7590 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7591 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7593 tree exp = SYMBOL_REF_DECL (sym);
7594 align = TYPE_ALIGN (TREE_TYPE (exp));
7595 align = aarch64_constant_alignment (exp, align);
7597 else if (SYMBOL_REF_DECL (sym))
7598 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7599 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7600 && SYMBOL_REF_BLOCK (sym) != NULL)
7601 align = SYMBOL_REF_BLOCK (sym)->alignment;
7602 else
7603 align = BITS_PER_UNIT;
7605 poly_int64 ref_size = GET_MODE_SIZE (mode);
7606 if (known_eq (ref_size, 0))
7607 ref_size = GET_MODE_SIZE (DImode);
7609 return (multiple_p (INTVAL (offs), ref_size)
7610 && multiple_p (align / BITS_PER_UNIT, ref_size));
7613 return false;
7615 default:
7616 return false;
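/* A few illustrative classifications (LP64, DImode access, arbitrary
   register numbers):
     (reg:DI x0)                                  -> ADDRESS_REG_IMM, offset 0
     (plus:DI (reg:DI x0) (const_int 32))         -> ADDRESS_REG_IMM
     (plus:DI (reg:DI x0)
              (ashift:DI (reg:DI x1) (const_int 3)))
                                                  -> ADDRESS_REG_REG, shift 3
     (post_inc:DI (reg:DI x0))                    -> ADDRESS_REG_WB
     (lo_sum:DI (reg:DI x0) (symbol_ref "var"))   -> ADDRESS_LO_SUM, provided
                                                     the symbol is suitably
                                                     aligned and classified
                                                     SYMBOL_SMALL_ABSOLUTE.  */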
7620 /* Return true if the address X is valid for a PRFM instruction.
7621 STRICT_P is true if we should do strict checking with
7622 aarch64_classify_address. */
7624 bool
7625 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7627 struct aarch64_address_info addr;
7629 /* PRFM accepts the same addresses as DImode... */
7630 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7631 if (!res)
7632 return false;
7634 /* ... except writeback forms. */
7635 return addr.type != ADDRESS_REG_WB;
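/* For example, [x0, 8] is a valid PRFM address, but the writeback form
   [x0], 8 (POST_MODIFY) is rejected even though it is a valid DImode
   load/store address.  */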
7638 bool
7639 aarch64_symbolic_address_p (rtx x)
7641 rtx offset;
7643 split_const (x, &x, &offset);
7644 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7647 /* Classify the base of symbolic expression X. */
7649 enum aarch64_symbol_type
7650 aarch64_classify_symbolic_expression (rtx x)
7652 rtx offset;
7654 split_const (x, &x, &offset);
7655 return aarch64_classify_symbol (x, INTVAL (offset));
7659 /* Return TRUE if X is a legitimate address for accessing memory in
7660 mode MODE. */
7661 static bool
7662 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7664 struct aarch64_address_info addr;
7666 return aarch64_classify_address (&addr, x, mode, strict_p);
7669 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7670 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7671 bool
7672 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7673 aarch64_addr_query_type type)
7675 struct aarch64_address_info addr;
7677 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7680 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7682 static bool
7683 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7684 poly_int64 orig_offset,
7685 machine_mode mode)
7687 HOST_WIDE_INT size;
7688 if (GET_MODE_SIZE (mode).is_constant (&size))
7690 HOST_WIDE_INT const_offset, second_offset;
7692 /* A general SVE offset is A * VQ + B. Remove the A component from
7693 coefficient 0 in order to get the constant B. */
7694 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7696 /* Split an out-of-range address displacement into a base and
7697 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7698 range otherwise to increase opportunities for sharing the base
7699 address between accesses of different sizes. Unaligned accesses use the signed
7700 9-bit range, TImode/TFmode use the intersection of signed
7701 scaled 7-bit and signed 9-bit offset. */
7702 if (mode == TImode || mode == TFmode)
7703 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7704 else if ((const_offset & (size - 1)) != 0)
7705 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7706 else
7707 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7709 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7710 return false;
7712 /* Split the offset into second_offset and the rest. */
7713 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7714 *offset2 = gen_int_mode (second_offset, Pmode);
7715 return true;
7717 else
7719 /* Get the mode we should use as the basis of the range. For structure
7720 modes this is the mode of one vector. */
7721 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7722 machine_mode step_mode
7723 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7725 /* Get the "mul vl" multiplier we'd like to use. */
7726 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7727 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7728 if (vec_flags & VEC_SVE_DATA)
7729 /* LDR supports a 9-bit range, but the move patterns for
7730 structure modes require all vectors to be in range of the
7731 same base. The simplest way of accommodating that while still
7732 promoting reuse of anchor points between different modes is
7733 to use an 8-bit range unconditionally. */
7734 vnum = ((vnum + 128) & 255) - 128;
7735 else
7736 /* Predicates are only handled singly, so we might as well use
7737 the full range. */
7738 vnum = ((vnum + 256) & 511) - 256;
7739 if (vnum == 0)
7740 return false;
7742 /* Convert the "mul vl" multiplier into a byte offset. */
7743 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7744 if (known_eq (second_offset, orig_offset))
7745 return false;
7747 /* Split the offset into second_offset and the rest. */
7748 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7749 *offset2 = gen_int_mode (second_offset, Pmode);
7750 return true;
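/* Worked example for the constant-size path (numbers are illustrative):
   a DImode access at base + 40000 is out of range, the offset is aligned
   and the access size is >= 4 bytes, so second_offset = 40000 & 0x3ffc
   = 7232 and the split is 40000 = 32768 + 7232.  The add of 32768 can be
   shared with neighbouring accesses while 7232 fits the scaled 12-bit
   LDR/STR offset range.  */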
7754 /* Return the binary representation of floating point constant VALUE in INTVAL.
7755 If the value cannot be converted, return false without setting INTVAL.
7756 The conversion is done in the given MODE. */
7757 bool
7758 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7761 /* We make a general exception for 0. */
7762 if (aarch64_float_const_zero_rtx_p (value))
7764 *intval = 0;
7765 return true;
7768 scalar_float_mode mode;
7769 if (GET_CODE (value) != CONST_DOUBLE
7770 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7771 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7772 /* Only support up to DF mode. */
7773 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7774 return false;
7776 unsigned HOST_WIDE_INT ival = 0;
7778 long res[2];
7779 real_to_target (res,
7780 CONST_DOUBLE_REAL_VALUE (value),
7781 REAL_MODE_FORMAT (mode));
7783 if (mode == DFmode)
7785 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7786 ival = zext_hwi (res[order], 32);
7787 ival |= (zext_hwi (res[1 - order], 32) << 32);
7789 else
7790 ival = zext_hwi (res[0], 32);
7792 *intval = ival;
7793 return true;
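/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0 as 0x3f800000 (the usual IEEE 754 bit
   patterns); HFmode and SFmode values occupy the low bits of *INTVAL.  */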
7796 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7797 single MOV(+MOVK) followed by an FMOV. */
7798 bool
7799 aarch64_float_const_rtx_p (rtx x)
7801 machine_mode mode = GET_MODE (x);
7802 if (mode == VOIDmode)
7803 return false;
7805 /* Determine whether it's cheaper to write float constants as
7806 mov/movk pairs rather than ldr/adrp pairs. */
7807 unsigned HOST_WIDE_INT ival;
7809 if (GET_CODE (x) == CONST_DOUBLE
7810 && SCALAR_FLOAT_MODE_P (mode)
7811 && aarch64_reinterpret_float_as_int (x, &ival))
7813 scalar_int_mode imode = (mode == HFmode
7814 ? SImode
7815 : int_mode_for_mode (mode).require ());
7816 int num_instr = aarch64_internal_mov_immediate
7817 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7818 return num_instr < 3;
7821 return false;
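/* For example, 1.0 in DFmode reinterprets as 0x3ff0000000000000, which a
   single MOVZ (#0x3ff0, LSL #48) can materialize, so MOV+FMOV is preferred
   over an ADRP+LDR literal load; constants whose bit pattern would need
   three or more MOV/MOVK instructions are rejected here.  */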
7824 /* Return TRUE if rtx X is immediate constant 0.0 */
7825 bool
7826 aarch64_float_const_zero_rtx_p (rtx x)
7828 if (GET_MODE (x) == VOIDmode)
7829 return false;
7831 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7832 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7833 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7836 /* Return TRUE if rtx X is immediate constant that fits in a single
7837 MOVI immediate operation. */
7838 bool
7839 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7841 if (!TARGET_SIMD)
7842 return false;
7844 machine_mode vmode;
7845 scalar_int_mode imode;
7846 unsigned HOST_WIDE_INT ival;
7848 if (GET_CODE (x) == CONST_DOUBLE
7849 && SCALAR_FLOAT_MODE_P (mode))
7851 if (!aarch64_reinterpret_float_as_int (x, &ival))
7852 return false;
7854 /* We make a general exception for 0. */
7855 if (aarch64_float_const_zero_rtx_p (x))
7856 return true;
7858 imode = int_mode_for_mode (mode).require ();
7860 else if (GET_CODE (x) == CONST_INT
7861 && is_a <scalar_int_mode> (mode, &imode))
7862 ival = INTVAL (x);
7863 else
7864 return false;
7866 /* Use a 64-bit container mode for everything except DImode/DFmode,
7867 where we use a 128-bit vector mode. */
7868 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7870 vmode = aarch64_simd_container_mode (imode, width);
7871 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7873 return aarch64_simd_valid_immediate (v_op, NULL);
7877 /* Return the fixed registers used for condition codes. */
7879 static bool
7880 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7882 *p1 = CC_REGNUM;
7883 *p2 = INVALID_REGNUM;
7884 return true;
7887 /* This function is used by the call expanders of the machine description.
7888 RESULT is the register in which the result is returned. It's NULL for
7889 "call" and "sibcall".
7890 MEM is the location of the function call.
7891 SIBCALL indicates whether this is a normal call or a sibling call;
7892 a different pattern is generated accordingly. */
7894 void
7895 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7897 rtx call, callee, tmp;
7898 rtvec vec;
7899 machine_mode mode;
7901 gcc_assert (MEM_P (mem));
7902 callee = XEXP (mem, 0);
7903 mode = GET_MODE (callee);
7904 gcc_assert (mode == Pmode);
7906 /* Decide if we should generate indirect calls by loading the
7907 address of the callee into a register before performing
7908 the branch-and-link. */
7909 if (SYMBOL_REF_P (callee)
7910 ? (aarch64_is_long_call_p (callee)
7911 || aarch64_is_noplt_call_p (callee))
7912 : !REG_P (callee))
7913 XEXP (mem, 0) = force_reg (mode, callee);
7915 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7917 if (result != NULL_RTX)
7918 call = gen_rtx_SET (result, call);
7920 if (sibcall)
7921 tmp = ret_rtx;
7922 else
7923 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7925 vec = gen_rtvec (2, call, tmp);
7926 call = gen_rtx_PARALLEL (VOIDmode, vec);
7928 aarch64_emit_call_insn (call);
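/* Roughly, a normal call whose value is returned in x0 ends up as
     (parallel [(set (reg:DI x0)
                     (call (mem:DI (reg/symbol)) (const_int 0)))
                (clobber (reg:DI LR_REGNUM))])
   while a sibcall uses (return) in place of the LR clobber (the exact
   modes and operands here are illustrative).  */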
7931 /* Emit call insn with PAT and do aarch64-specific handling. */
7933 void
7934 aarch64_emit_call_insn (rtx pat)
7936 rtx insn = emit_call_insn (pat);
7938 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7939 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7940 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7943 machine_mode
7944 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7946 machine_mode mode_x = GET_MODE (x);
7947 rtx_code code_x = GET_CODE (x);
7949 /* All floating point compares return CCFP if it is an equality
7950 comparison, and CCFPE otherwise. */
7951 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7953 switch (code)
7955 case EQ:
7956 case NE:
7957 case UNORDERED:
7958 case ORDERED:
7959 case UNLT:
7960 case UNLE:
7961 case UNGT:
7962 case UNGE:
7963 case UNEQ:
7964 return CCFPmode;
7966 case LT:
7967 case LE:
7968 case GT:
7969 case GE:
7970 case LTGT:
7971 return CCFPEmode;
7973 default:
7974 gcc_unreachable ();
7978 /* Equality comparisons of short modes against zero can be performed
7979 using the TST instruction with the appropriate bitmask. */
7980 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7981 && (code == EQ || code == NE)
7982 && (mode_x == HImode || mode_x == QImode))
7983 return CC_NZmode;
7985 /* Similarly, comparisons of zero_extends from shorter modes can
7986 be performed using an ANDS with an immediate mask. */
7987 if (y == const0_rtx && code_x == ZERO_EXTEND
7988 && (mode_x == SImode || mode_x == DImode)
7989 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7990 && (code == EQ || code == NE))
7991 return CC_NZmode;
7993 if ((mode_x == SImode || mode_x == DImode)
7994 && y == const0_rtx
7995 && (code == EQ || code == NE || code == LT || code == GE)
7996 && (code_x == PLUS || code_x == MINUS || code_x == AND
7997 || code_x == NEG
7998 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7999 && CONST_INT_P (XEXP (x, 2)))))
8000 return CC_NZmode;
8002 /* A compare with a shifted operand. Because of canonicalization,
8003 the comparison will have to be swapped when we emit the assembly
8004 code. */
8005 if ((mode_x == SImode || mode_x == DImode)
8006 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8007 && (code_x == ASHIFT || code_x == ASHIFTRT
8008 || code_x == LSHIFTRT
8009 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8010 return CC_SWPmode;
8012 /* Similarly for a negated operand, but we can only do this for
8013 equalities. */
8014 if ((mode_x == SImode || mode_x == DImode)
8015 && (REG_P (y) || GET_CODE (y) == SUBREG)
8016 && (code == EQ || code == NE)
8017 && code_x == NEG)
8018 return CC_Zmode;
8020 /* A test for unsigned overflow from an addition. */
8021 if ((mode_x == DImode || mode_x == TImode)
8022 && (code == LTU || code == GEU)
8023 && code_x == PLUS
8024 && rtx_equal_p (XEXP (x, 0), y))
8025 return CC_Cmode;
8027 /* A test for unsigned overflow from an add with carry. */
8028 if ((mode_x == DImode || mode_x == TImode)
8029 && (code == LTU || code == GEU)
8030 && code_x == PLUS
8031 && CONST_SCALAR_INT_P (y)
8032 && (rtx_mode_t (y, mode_x)
8033 == (wi::shwi (1, mode_x)
8034 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8035 return CC_ADCmode;
8037 /* A test for signed overflow. */
8038 if ((mode_x == DImode || mode_x == TImode)
8039 && code == NE
8040 && code_x == PLUS
8041 && GET_CODE (y) == SIGN_EXTEND)
8042 return CC_Vmode;
8044 /* For everything else, return CCmode. */
8045 return CCmode;
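/* For example, (compare (plus:DI x y) (const_int 0)) tested with EQ
   selects CC_NZmode, so the addition and the comparison can be combined
   into a single ADDS; a compare whose first operand is a shift, such as
   (compare (ashift:DI x n) y), selects CC_SWPmode because the CMP has to
   be emitted with its operands swapped.  */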
8048 static int
8049 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8052 aarch64_get_condition_code (rtx x)
8054 machine_mode mode = GET_MODE (XEXP (x, 0));
8055 enum rtx_code comp_code = GET_CODE (x);
8057 if (GET_MODE_CLASS (mode) != MODE_CC)
8058 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8059 return aarch64_get_condition_code_1 (mode, comp_code);
8062 static int
8063 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8065 switch (mode)
8067 case E_CCFPmode:
8068 case E_CCFPEmode:
8069 switch (comp_code)
8071 case GE: return AARCH64_GE;
8072 case GT: return AARCH64_GT;
8073 case LE: return AARCH64_LS;
8074 case LT: return AARCH64_MI;
8075 case NE: return AARCH64_NE;
8076 case EQ: return AARCH64_EQ;
8077 case ORDERED: return AARCH64_VC;
8078 case UNORDERED: return AARCH64_VS;
8079 case UNLT: return AARCH64_LT;
8080 case UNLE: return AARCH64_LE;
8081 case UNGT: return AARCH64_HI;
8082 case UNGE: return AARCH64_PL;
8083 default: return -1;
8085 break;
8087 case E_CCmode:
8088 switch (comp_code)
8090 case NE: return AARCH64_NE;
8091 case EQ: return AARCH64_EQ;
8092 case GE: return AARCH64_GE;
8093 case GT: return AARCH64_GT;
8094 case LE: return AARCH64_LE;
8095 case LT: return AARCH64_LT;
8096 case GEU: return AARCH64_CS;
8097 case GTU: return AARCH64_HI;
8098 case LEU: return AARCH64_LS;
8099 case LTU: return AARCH64_CC;
8100 default: return -1;
8102 break;
8104 case E_CC_SWPmode:
8105 switch (comp_code)
8107 case NE: return AARCH64_NE;
8108 case EQ: return AARCH64_EQ;
8109 case GE: return AARCH64_LE;
8110 case GT: return AARCH64_LT;
8111 case LE: return AARCH64_GE;
8112 case LT: return AARCH64_GT;
8113 case GEU: return AARCH64_LS;
8114 case GTU: return AARCH64_CC;
8115 case LEU: return AARCH64_CS;
8116 case LTU: return AARCH64_HI;
8117 default: return -1;
8119 break;
8121 case E_CC_NZCmode:
8122 switch (comp_code)
8124 case NE: return AARCH64_NE; /* = any */
8125 case EQ: return AARCH64_EQ; /* = none */
8126 case GE: return AARCH64_PL; /* = nfrst */
8127 case LT: return AARCH64_MI; /* = first */
8128 case GEU: return AARCH64_CS; /* = nlast */
8129 case GTU: return AARCH64_HI; /* = pmore */
8130 case LEU: return AARCH64_LS; /* = plast */
8131 case LTU: return AARCH64_CC; /* = last */
8132 default: return -1;
8134 break;
8136 case E_CC_NZmode:
8137 switch (comp_code)
8139 case NE: return AARCH64_NE;
8140 case EQ: return AARCH64_EQ;
8141 case GE: return AARCH64_PL;
8142 case LT: return AARCH64_MI;
8143 default: return -1;
8145 break;
8147 case E_CC_Zmode:
8148 switch (comp_code)
8150 case NE: return AARCH64_NE;
8151 case EQ: return AARCH64_EQ;
8152 default: return -1;
8154 break;
8156 case E_CC_Cmode:
8157 switch (comp_code)
8159 case LTU: return AARCH64_CS;
8160 case GEU: return AARCH64_CC;
8161 default: return -1;
8163 break;
8165 case E_CC_ADCmode:
8166 switch (comp_code)
8168 case GEU: return AARCH64_CS;
8169 case LTU: return AARCH64_CC;
8170 default: return -1;
8172 break;
8174 case E_CC_Vmode:
8175 switch (comp_code)
8177 case NE: return AARCH64_VS;
8178 case EQ: return AARCH64_VC;
8179 default: return -1;
8181 break;
8183 default:
8184 return -1;
8187 return -1;
8190 bool
8191 aarch64_const_vec_all_same_in_range_p (rtx x,
8192 HOST_WIDE_INT minval,
8193 HOST_WIDE_INT maxval)
8195 rtx elt;
8196 return (const_vec_duplicate_p (x, &elt)
8197 && CONST_INT_P (elt)
8198 && IN_RANGE (INTVAL (elt), minval, maxval));
8201 bool
8202 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8204 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8207 /* Return true if VEC is a constant in which every element is in the range
8208 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8210 static bool
8211 aarch64_const_vec_all_in_range_p (rtx vec,
8212 HOST_WIDE_INT minval,
8213 HOST_WIDE_INT maxval)
8215 if (GET_CODE (vec) != CONST_VECTOR
8216 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8217 return false;
8219 int nunits;
8220 if (!CONST_VECTOR_STEPPED_P (vec))
8221 nunits = const_vector_encoded_nelts (vec);
8222 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8223 return false;
8225 for (int i = 0; i < nunits; i++)
8227 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8228 if (!CONST_INT_P (vec_elem)
8229 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8230 return false;
8232 return true;
8235 /* N Z C V. */
8236 #define AARCH64_CC_V 1
8237 #define AARCH64_CC_C (1 << 1)
8238 #define AARCH64_CC_Z (1 << 2)
8239 #define AARCH64_CC_N (1 << 3)
8241 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8242 static const int aarch64_nzcv_codes[] =
8244 0, /* EQ, Z == 1. */
8245 AARCH64_CC_Z, /* NE, Z == 0. */
8246 0, /* CS, C == 1. */
8247 AARCH64_CC_C, /* CC, C == 0. */
8248 0, /* MI, N == 1. */
8249 AARCH64_CC_N, /* PL, N == 0. */
8250 0, /* VS, V == 1. */
8251 AARCH64_CC_V, /* VC, V == 0. */
8252 0, /* HI, C == 1 && Z == 0. */
8253 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8254 AARCH64_CC_V, /* GE, N == V. */
8255 0, /* LT, N != V. */
8256 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8257 0, /* LE, !(Z == 0 && N == V). */
8258 0, /* AL, Any. */
8259 0 /* NV, Any. */
8262 /* Print floating-point vector immediate operand X to F, negating it
8263 first if NEGATE is true. Return true on success, false if it isn't
8264 a constant we can handle. */
8266 static bool
8267 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8269 rtx elt;
8271 if (!const_vec_duplicate_p (x, &elt))
8272 return false;
8274 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8275 if (negate)
8276 r = real_value_negate (&r);
8278 /* We only handle the SVE single-bit immediates here. */
8279 if (real_equal (&r, &dconst0))
8280 asm_fprintf (f, "0.0");
8281 else if (real_equal (&r, &dconst1))
8282 asm_fprintf (f, "1.0");
8283 else if (real_equal (&r, &dconsthalf))
8284 asm_fprintf (f, "0.5");
8285 else
8286 return false;
8288 return true;
8291 /* Return the equivalent letter for size. */
8292 static char
8293 sizetochar (int size)
8295 switch (size)
8297 case 64: return 'd';
8298 case 32: return 's';
8299 case 16: return 'h';
8300 case 8 : return 'b';
8301 default: gcc_unreachable ();
8305 /* Print operand X to file F in a target specific manner according to CODE.
8306 The acceptable formatting commands given by CODE are:
8307 'c': An integer or symbol address without a preceding #
8308 sign.
8309 'C': Take the duplicated element in a vector constant
8310 and print it in hex.
8311 'D': Take the duplicated element in a vector constant
8312 and print it as an unsigned integer, in decimal.
8313 'e': Print the sign/zero-extend size as a character 8->b,
8314 16->h, 32->w.
8315 'p': Prints N such that 2^N == X (X must be a power of 2 and
8316 a const_int).
8317 'P': Print the number of non-zero bits in X (a const_int).
8318 'H': Print the higher numbered register of a pair (TImode)
8319 of regs.
8320 'm': Print a condition (eq, ne, etc).
8321 'M': Same as 'm', but invert condition.
8322 'N': Take the duplicated element in a vector constant
8323 and print the negative of it in decimal.
8324 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8325 'S/T/U/V': Print a FP/SIMD register name for a register list.
8326 The register printed is the FP/SIMD register name
8327 of X + 0/1/2/3 for S/T/U/V.
8328 'R': Print a scalar FP/SIMD register name + 1.
8329 'X': Print bottom 16 bits of integer constant in hex.
8330 'w/x': Print a general register name or the zero register
8331 (32-bit or 64-bit).
8332 '0': Print a normal operand; if it's a general register,
8333 then we assume DImode.
8334 'k': Print NZCV for conditional compare instructions.
8335 'A': Output address constant representing the first
8336 argument of X, specifying a relocation offset
8337 if appropriate.
8338 'L': Output constant address specified by X
8339 with a relocation offset if appropriate.
8340 'G': Prints address of X, specifying a PC relative
8341 relocation mode if appropriate.
8342 'y': Output address of LDP or STP - this is used for
8343 some LDP/STPs which don't use a PARALLEL in their
8344 pattern (so the mode needs to be adjusted).
8345 'z': Output address of a typical LDP or STP. */
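/* As a rough illustration of how these modifiers are used (the template
   text below is an example, not quoted from aarch64.md): an output
   template "add\t%w0, %w1, %w2" prints the 32-bit names of three general
   registers (or wzr for a zero), "%x0" prints the 64-bit name, and "%d1"
   prints the scalar D-register name of a vector register.  */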
8347 static void
8348 aarch64_print_operand (FILE *f, rtx x, int code)
8350 rtx elt;
8351 switch (code)
8353 case 'c':
8354 switch (GET_CODE (x))
8356 case CONST_INT:
8357 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8358 break;
8360 case SYMBOL_REF:
8361 output_addr_const (f, x);
8362 break;
8364 case CONST:
8365 if (GET_CODE (XEXP (x, 0)) == PLUS
8366 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8368 output_addr_const (f, x);
8369 break;
8371 /* Fall through. */
8373 default:
8374 output_operand_lossage ("unsupported operand for code '%c'", code);
8376 break;
8378 case 'e':
8380 int n;
8382 if (!CONST_INT_P (x)
8383 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
8385 output_operand_lossage ("invalid operand for '%%%c'", code);
8386 return;
8389 switch (n)
8391 case 3:
8392 fputc ('b', f);
8393 break;
8394 case 4:
8395 fputc ('h', f);
8396 break;
8397 case 5:
8398 fputc ('w', f);
8399 break;
8400 default:
8401 output_operand_lossage ("invalid operand for '%%%c'", code);
8402 return;
8405 break;
8407 case 'p':
8409 int n;
8411 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8413 output_operand_lossage ("invalid operand for '%%%c'", code);
8414 return;
8417 asm_fprintf (f, "%d", n);
8419 break;
8421 case 'P':
8422 if (!CONST_INT_P (x))
8424 output_operand_lossage ("invalid operand for '%%%c'", code);
8425 return;
8428 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8429 break;
8431 case 'H':
8432 if (x == const0_rtx)
8434 asm_fprintf (f, "xzr");
8435 break;
8438 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8440 output_operand_lossage ("invalid operand for '%%%c'", code);
8441 return;
8444 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8445 break;
8447 case 'M':
8448 case 'm':
8450 int cond_code;
8451 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8452 if (x == const_true_rtx)
8454 if (code == 'M')
8455 fputs ("nv", f);
8456 return;
8459 if (!COMPARISON_P (x))
8461 output_operand_lossage ("invalid operand for '%%%c'", code);
8462 return;
8465 cond_code = aarch64_get_condition_code (x);
8466 gcc_assert (cond_code >= 0);
8467 if (code == 'M')
8468 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8469 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8470 fputs (aarch64_sve_condition_codes[cond_code], f);
8471 else
8472 fputs (aarch64_condition_codes[cond_code], f);
8474 break;
8476 case 'N':
8477 if (!const_vec_duplicate_p (x, &elt))
8479 output_operand_lossage ("invalid vector constant");
8480 return;
8483 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8484 asm_fprintf (f, "%wd", -INTVAL (elt));
8485 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8486 && aarch64_print_vector_float_operand (f, x, true))
8488 else
8490 output_operand_lossage ("invalid vector constant");
8491 return;
8493 break;
8495 case 'b':
8496 case 'h':
8497 case 's':
8498 case 'd':
8499 case 'q':
8500 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8502 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8503 return;
8505 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8506 break;
8508 case 'S':
8509 case 'T':
8510 case 'U':
8511 case 'V':
8512 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8514 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8515 return;
8517 asm_fprintf (f, "%c%d",
8518 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8519 REGNO (x) - V0_REGNUM + (code - 'S'));
8520 break;
8522 case 'R':
8523 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8525 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8526 return;
8528 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8529 break;
8531 case 'X':
8532 if (!CONST_INT_P (x))
8534 output_operand_lossage ("invalid operand for '%%%c'", code);
8535 return;
8537 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8538 break;
8540 case 'C':
8542 /* Print a replicated constant in hex. */
8543 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8545 output_operand_lossage ("invalid operand for '%%%c'", code);
8546 return;
8548 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8549 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8551 break;
8553 case 'D':
8555 /* Print a replicated constant in decimal, treating it as
8556 unsigned. */
8557 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8559 output_operand_lossage ("invalid operand for '%%%c'", code);
8560 return;
8562 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8563 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8565 break;
8567 case 'w':
8568 case 'x':
8569 if (x == const0_rtx
8570 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8572 asm_fprintf (f, "%czr", code);
8573 break;
8576 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8578 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8579 break;
8582 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8584 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8585 break;
8588 /* Fall through */
8590 case 0:
8591 if (x == NULL)
8593 output_operand_lossage ("missing operand");
8594 return;
8597 switch (GET_CODE (x))
8599 case REG:
8600 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8602 if (REG_NREGS (x) == 1)
8603 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8604 else
8606 char suffix
8607 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8608 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8609 REGNO (x) - V0_REGNUM, suffix,
8610 END_REGNO (x) - V0_REGNUM - 1, suffix);
8613 else
8614 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8615 break;
8617 case MEM:
8618 output_address (GET_MODE (x), XEXP (x, 0));
8619 break;
8621 case LABEL_REF:
8622 case SYMBOL_REF:
8623 output_addr_const (asm_out_file, x);
8624 break;
8626 case CONST_INT:
8627 asm_fprintf (f, "%wd", INTVAL (x));
8628 break;
8630 case CONST:
8631 if (!VECTOR_MODE_P (GET_MODE (x)))
8633 output_addr_const (asm_out_file, x);
8634 break;
8636 /* fall through */
8638 case CONST_VECTOR:
8639 if (!const_vec_duplicate_p (x, &elt))
8641 output_operand_lossage ("invalid vector constant");
8642 return;
8645 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8646 asm_fprintf (f, "%wd", INTVAL (elt));
8647 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8648 && aarch64_print_vector_float_operand (f, x, false))
8650 else
8652 output_operand_lossage ("invalid vector constant");
8653 return;
8655 break;
8657 case CONST_DOUBLE:
8658 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8659 be getting CONST_DOUBLEs holding integers. */
8660 gcc_assert (GET_MODE (x) != VOIDmode);
8661 if (aarch64_float_const_zero_rtx_p (x))
8663 fputc ('0', f);
8664 break;
8666 else if (aarch64_float_const_representable_p (x))
8668 #define buf_size 20
8669 char float_buf[buf_size] = {'\0'};
8670 real_to_decimal_for_mode (float_buf,
8671 CONST_DOUBLE_REAL_VALUE (x),
8672 buf_size, buf_size,
8673 1, GET_MODE (x));
8674 asm_fprintf (asm_out_file, "%s", float_buf);
8675 break;
8676 #undef buf_size
8678 output_operand_lossage ("invalid constant");
8679 return;
8680 default:
8681 output_operand_lossage ("invalid operand");
8682 return;
8684 break;
8686 case 'A':
8687 if (GET_CODE (x) == HIGH)
8688 x = XEXP (x, 0);
8690 switch (aarch64_classify_symbolic_expression (x))
8692 case SYMBOL_SMALL_GOT_4G:
8693 asm_fprintf (asm_out_file, ":got:");
8694 break;
8696 case SYMBOL_SMALL_TLSGD:
8697 asm_fprintf (asm_out_file, ":tlsgd:");
8698 break;
8700 case SYMBOL_SMALL_TLSDESC:
8701 asm_fprintf (asm_out_file, ":tlsdesc:");
8702 break;
8704 case SYMBOL_SMALL_TLSIE:
8705 asm_fprintf (asm_out_file, ":gottprel:");
8706 break;
8708 case SYMBOL_TLSLE24:
8709 asm_fprintf (asm_out_file, ":tprel:");
8710 break;
8712 case SYMBOL_TINY_GOT:
8713 gcc_unreachable ();
8714 break;
8716 default:
8717 break;
8719 output_addr_const (asm_out_file, x);
8720 break;
8722 case 'L':
8723 switch (aarch64_classify_symbolic_expression (x))
8725 case SYMBOL_SMALL_GOT_4G:
8726 asm_fprintf (asm_out_file, ":lo12:");
8727 break;
8729 case SYMBOL_SMALL_TLSGD:
8730 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8731 break;
8733 case SYMBOL_SMALL_TLSDESC:
8734 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8735 break;
8737 case SYMBOL_SMALL_TLSIE:
8738 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8739 break;
8741 case SYMBOL_TLSLE12:
8742 asm_fprintf (asm_out_file, ":tprel_lo12:");
8743 break;
8745 case SYMBOL_TLSLE24:
8746 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8747 break;
8749 case SYMBOL_TINY_GOT:
8750 asm_fprintf (asm_out_file, ":got:");
8751 break;
8753 case SYMBOL_TINY_TLSIE:
8754 asm_fprintf (asm_out_file, ":gottprel:");
8755 break;
8757 default:
8758 break;
8760 output_addr_const (asm_out_file, x);
8761 break;
8763 case 'G':
8764 switch (aarch64_classify_symbolic_expression (x))
8766 case SYMBOL_TLSLE24:
8767 asm_fprintf (asm_out_file, ":tprel_hi12:");
8768 break;
8769 default:
8770 break;
8772 output_addr_const (asm_out_file, x);
8773 break;
8775 case 'k':
8777 HOST_WIDE_INT cond_code;
8779 if (!CONST_INT_P (x))
8781 output_operand_lossage ("invalid operand for '%%%c'", code);
8782 return;
8785 cond_code = INTVAL (x);
8786 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8787 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8789 break;
8791 case 'y':
8792 case 'z':
8794 machine_mode mode = GET_MODE (x);
8796 if (GET_CODE (x) != MEM
8797 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8799 output_operand_lossage ("invalid operand for '%%%c'", code);
8800 return;
8803 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8804 code == 'y'
8805 ? ADDR_QUERY_LDP_STP_N
8806 : ADDR_QUERY_LDP_STP))
8807 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8809 break;
8811 default:
8812 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8813 return;
8817 /* Print address 'x' of a memory access with mode 'mode'.
8818 'type' is the aarch64_addr_query_type context required by
8819 aarch64_classify_address; it distinguishes plain accesses from LDP/STP. */
8820 static bool
8821 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8822 aarch64_addr_query_type type)
8824 struct aarch64_address_info addr;
8825 unsigned int size;
8827 /* Check all addresses are Pmode - including ILP32. */
8828 if (GET_MODE (x) != Pmode
8829 && (!CONST_INT_P (x)
8830 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8832 output_operand_lossage ("invalid address mode");
8833 return false;
8836 if (aarch64_classify_address (&addr, x, mode, true, type))
8837 switch (addr.type)
8839 case ADDRESS_REG_IMM:
8840 if (known_eq (addr.const_offset, 0))
8841 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8842 else if (aarch64_sve_data_mode_p (mode))
8844 HOST_WIDE_INT vnum
8845 = exact_div (addr.const_offset,
8846 BYTES_PER_SVE_VECTOR).to_constant ();
8847 asm_fprintf (f, "[%s, #%wd, mul vl]",
8848 reg_names[REGNO (addr.base)], vnum);
8850 else if (aarch64_sve_pred_mode_p (mode))
8852 HOST_WIDE_INT vnum
8853 = exact_div (addr.const_offset,
8854 BYTES_PER_SVE_PRED).to_constant ();
8855 asm_fprintf (f, "[%s, #%wd, mul vl]",
8856 reg_names[REGNO (addr.base)], vnum);
8858 else
8859 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8860 INTVAL (addr.offset));
8861 return true;
8863 case ADDRESS_REG_REG:
8864 if (addr.shift == 0)
8865 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8866 reg_names [REGNO (addr.offset)]);
8867 else
8868 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8869 reg_names [REGNO (addr.offset)], addr.shift);
8870 return true;
8872 case ADDRESS_REG_UXTW:
8873 if (addr.shift == 0)
8874 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8875 REGNO (addr.offset) - R0_REGNUM);
8876 else
8877 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8878 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8879 return true;
8881 case ADDRESS_REG_SXTW:
8882 if (addr.shift == 0)
8883 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8884 REGNO (addr.offset) - R0_REGNUM);
8885 else
8886 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8887 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8888 return true;
8890 case ADDRESS_REG_WB:
8891 /* Writeback is only supported for fixed-width modes. */
8892 size = GET_MODE_SIZE (mode).to_constant ();
8893 switch (GET_CODE (x))
8895 case PRE_INC:
8896 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8897 return true;
8898 case POST_INC:
8899 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8900 return true;
8901 case PRE_DEC:
8902 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8903 return true;
8904 case POST_DEC:
8905 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8906 return true;
8907 case PRE_MODIFY:
8908 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8909 INTVAL (addr.offset));
8910 return true;
8911 case POST_MODIFY:
8912 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8913 INTVAL (addr.offset));
8914 return true;
8915 default:
8916 break;
8918 break;
8920 case ADDRESS_LO_SUM:
8921 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8922 output_addr_const (f, addr.offset);
8923 asm_fprintf (f, "]");
8924 return true;
8926 case ADDRESS_SYMBOLIC:
8927 output_addr_const (f, x);
8928 return true;
8931 return false;
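/* Example renderings (illustrative register numbers):
     ADDRESS_REG_IMM, base x0, offset 16, DImode      -> [x0, 16]
     ADDRESS_REG_IMM, SVE data mode, one vector ahead -> [x0, #1, mul vl]
     ADDRESS_REG_SXTW, base x0, index w1, shift 2     -> [x0, w1, sxtw 2]
     ADDRESS_REG_WB, PRE_INC, 8-byte mode             -> [x0, 8]!  */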
8934 /* Print address 'x' of a memory access with mode 'mode'. */
8935 static void
8936 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8938 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8939 output_addr_const (f, x);
8942 bool
8943 aarch64_label_mentioned_p (rtx x)
8945 const char *fmt;
8946 int i;
8948 if (GET_CODE (x) == LABEL_REF)
8949 return true;
8951 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8952 referencing instruction, but they are constant offsets, not
8953 symbols. */
8954 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8955 return false;
8957 fmt = GET_RTX_FORMAT (GET_CODE (x));
8958 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8960 if (fmt[i] == 'E')
8962 int j;
8964 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8965 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8966 return 1;
8968 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8969 return 1;
8972 return 0;
8975 /* Implement REGNO_REG_CLASS. */
8977 enum reg_class
8978 aarch64_regno_regclass (unsigned regno)
8980 if (GP_REGNUM_P (regno))
8981 return GENERAL_REGS;
8983 if (regno == SP_REGNUM)
8984 return STACK_REG;
8986 if (regno == FRAME_POINTER_REGNUM
8987 || regno == ARG_POINTER_REGNUM)
8988 return POINTER_REGS;
8990 if (FP_REGNUM_P (regno))
8991 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
8992 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
8994 if (PR_REGNUM_P (regno))
8995 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8997 return NO_REGS;
9000 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9001 If OFFSET is out of range, return an offset of an anchor point
9002 that is in range. Return 0 otherwise. */
9004 static HOST_WIDE_INT
9005 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9006 machine_mode mode)
9008 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9009 if (size > 16)
9010 return (offset + 0x400) & ~0x7f0;
9012 /* For offsets that aren't a multiple of the access size, the limit is
9013 -256...255. */
9014 if (offset & (size - 1))
9016 /* BLKmode typically uses LDP of X-registers. */
9017 if (mode == BLKmode)
9018 return (offset + 512) & ~0x3ff;
9019 return (offset + 0x100) & ~0x1ff;
9022 /* Small negative offsets are supported. */
9023 if (IN_RANGE (offset, -256, 0))
9024 return 0;
9026 if (mode == TImode || mode == TFmode)
9027 return (offset + 0x100) & ~0x1ff;
9029 /* Use 12-bit offset by access size. */
9030 return offset & (~0xfff * size);
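/* Worked example (illustrative): an aligned DImode access at offset 70000
   takes the final case, so the anchor is 70000 & (~0xfff * 8) = 65536 and
   the remaining offset of 4464 (558 * 8) fits the scaled 12-bit range.  */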
9033 static rtx
9034 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9036 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9037 where mask is selected by alignment and size of the offset.
9038 We try to pick as large a range for the offset as possible to
9039 maximize the chance of a CSE. However, for aligned addresses
9040 we limit the range to 4k so that structures with different sized
9041 elements are likely to use the same base. We need to be careful
9042 not to split a CONST for some forms of address expression, otherwise
9043 it will generate sub-optimal code. */
9045 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9047 rtx base = XEXP (x, 0);
9048 rtx offset_rtx = XEXP (x, 1);
9049 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9051 if (GET_CODE (base) == PLUS)
9053 rtx op0 = XEXP (base, 0);
9054 rtx op1 = XEXP (base, 1);
9056 /* Force any scaling into a temp for CSE. */
9057 op0 = force_reg (Pmode, op0);
9058 op1 = force_reg (Pmode, op1);
9060 /* Let the pointer register be in op0. */
9061 if (REG_POINTER (op1))
9062 std::swap (op0, op1);
9064 /* If the pointer is virtual or frame related, then we know that
9065 virtual register instantiation or register elimination is going
9066 to apply a second constant. We want the two constants folded
9067 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9068 if (virt_or_elim_regno_p (REGNO (op0)))
9070 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9071 NULL_RTX, true, OPTAB_DIRECT);
9072 return gen_rtx_PLUS (Pmode, base, op1);
9075 /* Otherwise, in order to encourage CSE (and thence loop strength
9076 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9077 base = expand_binop (Pmode, add_optab, op0, op1,
9078 NULL_RTX, true, OPTAB_DIRECT);
9079 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9082 HOST_WIDE_INT size;
9083 if (GET_MODE_SIZE (mode).is_constant (&size))
9085 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9086 mode);
9087 if (base_offset != 0)
9089 base = plus_constant (Pmode, base, base_offset);
9090 base = force_operand (base, NULL_RTX);
9091 return plus_constant (Pmode, base, offset - base_offset);
9096 return x;
9099 static reg_class_t
9100 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9101 reg_class_t rclass,
9102 machine_mode mode,
9103 secondary_reload_info *sri)
9105 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9106 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9107 comment at the head of aarch64-sve.md for more details about the
9108 big-endian handling. */
9109 if (BYTES_BIG_ENDIAN
9110 && reg_class_subset_p (rclass, FP_REGS)
9111 && !((REG_P (x) && HARD_REGISTER_P (x))
9112 || aarch64_simd_valid_immediate (x, NULL))
9113 && aarch64_sve_data_mode_p (mode))
9115 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9116 return NO_REGS;
9119 /* If we have to disable direct literal pool loads and stores because the
9120 function is too big, then we need a scratch register. */
9121 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9122 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9123 || targetm.vector_mode_supported_p (GET_MODE (x)))
9124 && !aarch64_pcrelative_literal_loads)
9126 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9127 return NO_REGS;
9130 /* Without the TARGET_SIMD instructions we cannot move a Q register
9131 to a Q register directly. We need a scratch. */
9132 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9133 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9134 && reg_class_subset_p (rclass, FP_REGS))
9136 sri->icode = code_for_aarch64_reload_mov (mode);
9137 return NO_REGS;
9140 /* A TFmode or TImode memory access should be handled via FP_REGS
9141 because AArch64 has richer addressing modes for LDR/STR instructions
9142 than LDP/STP instructions. */
9143 if (TARGET_FLOAT && rclass == GENERAL_REGS
9144 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9145 return FP_REGS;
9147 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9148 return GENERAL_REGS;
9150 return NO_REGS;
9153 static bool
9154 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9156 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9158 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9159 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9160 if (frame_pointer_needed)
9161 return to == HARD_FRAME_POINTER_REGNUM;
9162 return true;
9165 poly_int64
9166 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9168 if (to == HARD_FRAME_POINTER_REGNUM)
9170 if (from == ARG_POINTER_REGNUM)
9171 return cfun->machine->frame.hard_fp_offset;
9173 if (from == FRAME_POINTER_REGNUM)
9174 return cfun->machine->frame.hard_fp_offset
9175 - cfun->machine->frame.locals_offset;
9178 if (to == STACK_POINTER_REGNUM)
9180 if (from == FRAME_POINTER_REGNUM)
9181 return cfun->machine->frame.frame_size
9182 - cfun->machine->frame.locals_offset;
9185 return cfun->machine->frame.frame_size;
9188 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9189 previous frame. */
9191 rtx
9192 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9194 if (count != 0)
9195 return const0_rtx;
9196 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
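/* The trampoline emitted below has the following layout for LP64 when
BTI is disabled (with BTI, a "bti c" is emitted first, the literal
offsets shrink by 4 and the padding word is dropped):

0: ldr <IP1>, .+16 // load the target address
4: ldr <static chain reg>, .+20 // load the static chain value
8: br <IP1>
12: <padding word>
16: <target address>
24: <static chain value> */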
9200 static void
9201 aarch64_asm_trampoline_template (FILE *f)
9203 int offset1 = 16;
9204 int offset2 = 20;
9206 if (aarch64_bti_enabled ())
9208 asm_fprintf (f, "\thint\t34 // bti c\n");
9209 offset1 -= 4;
9210 offset2 -= 4;
9213 if (TARGET_ILP32)
9215 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9216 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9217 offset1);
9219 else
9221 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9222 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9223 offset2);
9225 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9227 /* The trampoline needs an extra padding instruction. If BTI is
9228 enabled, the padding instruction is replaced by the BTI instruction at
9229 the beginning. */
9230 if (!aarch64_bti_enabled ())
9231 assemble_aligned_integer (4, const0_rtx);
9233 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9234 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9237 static void
9238 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9240 rtx fnaddr, mem, a_tramp;
9241 const int tramp_code_sz = 16;
9243 /* We don't need to copy the trailing D-words; we fill those in below. */
9244 emit_block_move (m_tramp, assemble_trampoline_template (),
9245 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9246 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9247 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9248 if (GET_MODE (fnaddr) != ptr_mode)
9249 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9250 emit_move_insn (mem, fnaddr);
9252 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9253 emit_move_insn (mem, chain_value);
9255 /* XXX We should really define a "clear_cache" pattern and use
9256 gen_clear_cache(). */
9257 a_tramp = XEXP (m_tramp, 0);
9258 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9259 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9260 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9261 ptr_mode);
9264 static unsigned char
9265 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9267 /* ??? Logically we should only need to provide a value when
9268 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9269 can hold MODE, but at the moment we need to handle all modes.
9270 Just ignore any runtime parts for registers that can't store them. */
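/* For example, a V4SImode value needs one FP register
(CEIL (16, UNITS_PER_VREG) == 1), a TImode value in GENERAL_REGS needs
two X registers, and an SVE VNx4SImode value needs exactly one Z
register regardless of the runtime vector length. */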
9271 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9272 unsigned int nregs;
9273 switch (regclass)
9275 case TAILCALL_ADDR_REGS:
9276 case POINTER_REGS:
9277 case GENERAL_REGS:
9278 case ALL_REGS:
9279 case POINTER_AND_FP_REGS:
9280 case FP_REGS:
9281 case FP_LO_REGS:
9282 case FP_LO8_REGS:
9283 if (aarch64_sve_data_mode_p (mode)
9284 && constant_multiple_p (GET_MODE_SIZE (mode),
9285 BYTES_PER_SVE_VECTOR, &nregs))
9286 return nregs;
9287 return (aarch64_vector_data_mode_p (mode)
9288 ? CEIL (lowest_size, UNITS_PER_VREG)
9289 : CEIL (lowest_size, UNITS_PER_WORD));
9290 case STACK_REG:
9291 case PR_REGS:
9292 case PR_LO_REGS:
9293 case PR_HI_REGS:
9294 return 1;
9296 case NO_REGS:
9297 return 0;
9299 default:
9300 break;
9302 gcc_unreachable ();
9305 static reg_class_t
9306 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9308 if (regclass == POINTER_REGS)
9309 return GENERAL_REGS;
9311 if (regclass == STACK_REG)
9313 if (REG_P(x)
9314 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9315 return regclass;
9317 return NO_REGS;
9320 /* Register elimination can result in a request for
9321 SP+constant->FP_REGS. We cannot support such operations, which
9322 use SP as the source and an FP_REG as the destination, so reject
9323 them outright. */
9324 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9326 rtx lhs = XEXP (x, 0);
9328 /* Look through a possible SUBREG introduced by ILP32. */
9329 if (GET_CODE (lhs) == SUBREG)
9330 lhs = SUBREG_REG (lhs);
9332 gcc_assert (REG_P (lhs));
9333 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9334 POINTER_REGS));
9335 return NO_REGS;
9338 return regclass;
9341 void
9342 aarch64_asm_output_labelref (FILE* f, const char *name)
9344 asm_fprintf (f, "%U%s", name);
9347 static void
9348 aarch64_elf_asm_constructor (rtx symbol, int priority)
9350 if (priority == DEFAULT_INIT_PRIORITY)
9351 default_ctor_section_asm_out_constructor (symbol, priority);
9352 else
9354 section *s;
9355 /* Although priority is known to be in the range [0, 65535], so that
9356 18 bytes would be enough, the compiler might not know that. To avoid
9357 a -Wformat-truncation false positive, use a larger size. */
9358 char buf[23];
9359 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9360 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9361 switch_to_section (s);
9362 assemble_align (POINTER_SIZE);
9363 assemble_aligned_integer (POINTER_BYTES, symbol);
9367 static void
9368 aarch64_elf_asm_destructor (rtx symbol, int priority)
9370 if (priority == DEFAULT_INIT_PRIORITY)
9371 default_dtor_section_asm_out_destructor (symbol, priority);
9372 else
9374 section *s;
9375 /* Although priority is known to be in the range [0, 65535], so that
9376 18 bytes would be enough, the compiler might not know that. To avoid
9377 a -Wformat-truncation false positive, use a larger size. */
9378 char buf[23];
9379 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9380 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9381 switch_to_section (s);
9382 assemble_align (POINTER_SIZE);
9383 assemble_aligned_integer (POINTER_BYTES, symbol);
9387 const char*
9388 aarch64_output_casesi (rtx *operands)
9390 char buf[100];
9391 char label[100];
9392 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9393 int index;
9394 static const char *const patterns[4][2] =
9397 "ldrb\t%w3, [%0,%w1,uxtw]",
9398 "add\t%3, %4, %w3, sxtb #2"
9401 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9402 "add\t%3, %4, %w3, sxth #2"
9405 "ldr\t%w3, [%0,%w1,uxtw #2]",
9406 "add\t%3, %4, %w3, sxtw #2"
9408 /* We assume that DImode is only generated when not optimizing and
9409 that we don't really need 64-bit address offsets. That would
9410 imply an object file with 8GB of code in a single function! */
9412 "ldr\t%w3, [%0,%w1,uxtw #2]",
9413 "add\t%3, %4, %w3, sxtw #2"
9417 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9419 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9420 index = exact_log2 (GET_MODE_SIZE (mode));
9422 gcc_assert (index >= 0 && index <= 3);
9424 /* Need to implement table size reduction, by changing the code below. */
9425 output_asm_insn (patterns[index][0], operands);
9426 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9427 snprintf (buf, sizeof (buf),
9428 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9429 output_asm_insn (buf, operands);
9430 output_asm_insn (patterns[index][1], operands);
9431 output_asm_insn ("br\t%3", operands);
9432 assemble_label (asm_out_file, label);
9433 return "";
9437 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9438 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9439 operator. */
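/* For example, aarch64_uxt_size (4, 0xff0) returns 8 (a byte shifted
left by 4, i.e. a UXTB operand), and aarch64_uxt_size (2, 0x3fffc)
returns 16 (a half-word shifted left by 2, i.e. a UXTH operand). */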
9441 int
9442 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9444 if (shift >= 0 && shift <= 3)
9446 int size;
9447 for (size = 8; size <= 32; size *= 2)
9449 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9450 if (mask == bits << shift)
9451 return size;
9454 return 0;
9457 /* Constant pools are per-function only when PC-relative
9458 literal loads are enabled or we are in the large memory
9459 model. */
9461 static inline bool
9462 aarch64_can_use_per_function_literal_pools_p (void)
9464 return (aarch64_pcrelative_literal_loads
9465 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9468 static bool
9469 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9471 /* We can't use blocks for constants when we're using a per-function
9472 constant pool. */
9473 return !aarch64_can_use_per_function_literal_pools_p ();
9476 /* Select appropriate section for constants depending
9477 on where we place literal pools. */
9479 static section *
9480 aarch64_select_rtx_section (machine_mode mode,
9481 rtx x,
9482 unsigned HOST_WIDE_INT align)
9484 if (aarch64_can_use_per_function_literal_pools_p ())
9485 return function_section (current_function_decl);
9487 return default_elf_select_rtx_section (mode, x, align);
9490 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9491 void
9492 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9493 HOST_WIDE_INT offset)
9495 /* When using per-function literal pools, we must ensure that any code
9496 section is aligned to the minimal instruction length, lest we get
9497 errors from the assembler re "unaligned instructions". */
9498 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9499 ASM_OUTPUT_ALIGN (f, 2);
9502 /* Costs. */
9504 /* Helper function for rtx cost calculation. Strip a shift expression
9505 from X. Returns the inner operand if successful, or the original
9506 expression on failure. */
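/* For example, both (ashift:DI (reg:DI x0) (const_int 3)) and
(mult:DI (reg:DI x0) (const_int 8)) strip to (reg:DI x0); a shift by a
register amount is returned unchanged. */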
9507 static rtx
9508 aarch64_strip_shift (rtx x)
9510 rtx op = x;
9512 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9513 we can convert both to ROR during final output. */
9514 if ((GET_CODE (op) == ASHIFT
9515 || GET_CODE (op) == ASHIFTRT
9516 || GET_CODE (op) == LSHIFTRT
9517 || GET_CODE (op) == ROTATERT
9518 || GET_CODE (op) == ROTATE)
9519 && CONST_INT_P (XEXP (op, 1)))
9520 return XEXP (op, 0);
9522 if (GET_CODE (op) == MULT
9523 && CONST_INT_P (XEXP (op, 1))
9524 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9525 return XEXP (op, 0);
9527 return x;
9530 /* Helper function for rtx cost calculation. Strip an extend
9531 expression from X. Returns the inner operand if successful, or the
9532 original expression on failure. We deal with a number of possible
9533 canonicalization variations here. If STRIP_SHIFT is true, then
9534 we can strip off a shift also. */
9535 static rtx
9536 aarch64_strip_extend (rtx x, bool strip_shift)
9538 scalar_int_mode mode;
9539 rtx op = x;
9541 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9542 return op;
9544 /* Zero and sign extraction of a widened value. */
9545 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9546 && XEXP (op, 2) == const0_rtx
9547 && GET_CODE (XEXP (op, 0)) == MULT
9548 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9549 XEXP (op, 1)))
9550 return XEXP (XEXP (op, 0), 0);
9552 /* It can also be represented (for zero-extend) as an AND with an
9553 immediate. */
9554 if (GET_CODE (op) == AND
9555 && GET_CODE (XEXP (op, 0)) == MULT
9556 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9557 && CONST_INT_P (XEXP (op, 1))
9558 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9559 INTVAL (XEXP (op, 1))) != 0)
9560 return XEXP (XEXP (op, 0), 0);
9562 /* Now handle extended register, as this may also have an optional
9563 left shift by 1..4. */
9564 if (strip_shift
9565 && GET_CODE (op) == ASHIFT
9566 && CONST_INT_P (XEXP (op, 1))
9567 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9568 op = XEXP (op, 0);
9570 if (GET_CODE (op) == ZERO_EXTEND
9571 || GET_CODE (op) == SIGN_EXTEND)
9572 op = XEXP (op, 0);
9574 if (op != x)
9575 return op;
9577 return x;
9580 /* Return true iff CODE is a shift supported in combination
9581 with arithmetic instructions. */
9583 static bool
9584 aarch64_shift_p (enum rtx_code code)
9586 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9590 /* Return true iff X is a cheap shift without a sign extend. */
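/* For example, on cores with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND set,
(ashift:DI (reg:DI x0) (const_int 3)) and
(mult:DI (reg:DI x0) (const_int 8)) both count as cheap, whereas a
sign-extended operand or a shift amount greater than 4 does not. */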
9592 static bool
9593 aarch64_cheap_mult_shift_p (rtx x)
9595 rtx op0, op1;
9597 op0 = XEXP (x, 0);
9598 op1 = XEXP (x, 1);
9600 if (!(aarch64_tune_params.extra_tuning_flags
9601 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9602 return false;
9604 if (GET_CODE (op0) == SIGN_EXTEND)
9605 return false;
9607 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9608 && UINTVAL (op1) <= 4)
9609 return true;
9611 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9612 return false;
9614 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9616 if (l2 > 0 && l2 <= 4)
9617 return true;
9619 return false;
9622 /* Helper function for rtx cost calculation. Calculate the cost of
9623 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9624 Return the calculated cost of the expression, recursing manually into
9625 operands where needed. */
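/* For example, (plus:DI (mult:DI (reg:DI x1) (const_int 4)) (reg:DI x2))
is typically costed as a single ADD with a shifted operand
(add x0, x2, x1, lsl #2) rather than as a separate shift plus add. */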
9627 static int
9628 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9630 rtx op0, op1;
9631 const struct cpu_cost_table *extra_cost
9632 = aarch64_tune_params.insn_extra_cost;
9633 int cost = 0;
9634 bool compound_p = (outer == PLUS || outer == MINUS);
9635 machine_mode mode = GET_MODE (x);
9637 gcc_checking_assert (code == MULT);
9639 op0 = XEXP (x, 0);
9640 op1 = XEXP (x, 1);
9642 if (VECTOR_MODE_P (mode))
9643 mode = GET_MODE_INNER (mode);
9645 /* Integer multiply/fma. */
9646 if (GET_MODE_CLASS (mode) == MODE_INT)
9648 /* The multiply will be canonicalized as a shift, cost it as such. */
9649 if (aarch64_shift_p (GET_CODE (x))
9650 || (CONST_INT_P (op1)
9651 && exact_log2 (INTVAL (op1)) > 0))
9653 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9654 || GET_CODE (op0) == SIGN_EXTEND;
9655 if (speed)
9657 if (compound_p)
9659 /* If the shift is considered cheap,
9660 then don't add any cost. */
9661 if (aarch64_cheap_mult_shift_p (x))
9663 else if (REG_P (op1))
9664 /* ARITH + shift-by-register. */
9665 cost += extra_cost->alu.arith_shift_reg;
9666 else if (is_extend)
9667 /* ARITH + extended register. We don't have a cost field
9668 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9669 cost += extra_cost->alu.extend_arith;
9670 else
9671 /* ARITH + shift-by-immediate. */
9672 cost += extra_cost->alu.arith_shift;
9674 else
9675 /* LSL (immediate). */
9676 cost += extra_cost->alu.shift;
9679 /* Strip extends as we will have costed them in the case above. */
9680 if (is_extend)
9681 op0 = aarch64_strip_extend (op0, true);
9683 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9685 return cost;
9688 /* MNEG or [US]MNEGL. Extract the NEG operand, indicate that it's a
9689 compound operation, and let the cases below handle it. After all, MNEG
9690 is a special-case alias of MSUB. */
9691 if (GET_CODE (op0) == NEG)
9693 op0 = XEXP (op0, 0);
9694 compound_p = true;
9697 /* Integer multiplies or FMAs have zero/sign extending variants. */
9698 if ((GET_CODE (op0) == ZERO_EXTEND
9699 && GET_CODE (op1) == ZERO_EXTEND)
9700 || (GET_CODE (op0) == SIGN_EXTEND
9701 && GET_CODE (op1) == SIGN_EXTEND))
9703 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9704 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9706 if (speed)
9708 if (compound_p)
9709 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9710 cost += extra_cost->mult[0].extend_add;
9711 else
9712 /* MUL/SMULL/UMULL. */
9713 cost += extra_cost->mult[0].extend;
9716 return cost;
9719 /* This is either an integer multiply or a MADD. In both cases
9720 we want to recurse and cost the operands. */
9721 cost += rtx_cost (op0, mode, MULT, 0, speed);
9722 cost += rtx_cost (op1, mode, MULT, 1, speed);
9724 if (speed)
9726 if (compound_p)
9727 /* MADD/MSUB. */
9728 cost += extra_cost->mult[mode == DImode].add;
9729 else
9730 /* MUL. */
9731 cost += extra_cost->mult[mode == DImode].simple;
9734 return cost;
9736 else
9738 if (speed)
9740 /* Floating-point FMA/FMUL can also support negations of the
9741 operands, unless the rounding mode is upward or downward, in
9742 which case FNMUL differs from FMUL with operand negation. */
9743 bool neg0 = GET_CODE (op0) == NEG;
9744 bool neg1 = GET_CODE (op1) == NEG;
9745 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9747 if (neg0)
9748 op0 = XEXP (op0, 0);
9749 if (neg1)
9750 op1 = XEXP (op1, 0);
9753 if (compound_p)
9754 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9755 cost += extra_cost->fp[mode == DFmode].fma;
9756 else
9757 /* FMUL/FNMUL. */
9758 cost += extra_cost->fp[mode == DFmode].mult;
9761 cost += rtx_cost (op0, mode, MULT, 0, speed);
9762 cost += rtx_cost (op1, mode, MULT, 1, speed);
9763 return cost;
9767 static int
9768 aarch64_address_cost (rtx x,
9769 machine_mode mode,
9770 addr_space_t as ATTRIBUTE_UNUSED,
9771 bool speed)
9773 enum rtx_code c = GET_CODE (x);
9774 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9775 struct aarch64_address_info info;
9776 int cost = 0;
9777 info.shift = 0;
9779 if (!aarch64_classify_address (&info, x, mode, false))
9781 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9783 /* This is a CONST or SYMBOL ref which will be split
9784 in a different way depending on the code model in use.
9785 Cost it through the generic infrastructure. */
9786 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9787 /* Divide through by the cost of one instruction to
9788 bring it to the same units as the address costs. */
9789 cost_symbol_ref /= COSTS_N_INSNS (1);
9790 /* The cost is then the cost of preparing the address,
9791 followed by an immediate (possibly 0) offset. */
9792 return cost_symbol_ref + addr_cost->imm_offset;
9794 else
9796 /* This is most likely a jump table from a case
9797 statement. */
9798 return addr_cost->register_offset;
9802 switch (info.type)
9804 case ADDRESS_LO_SUM:
9805 case ADDRESS_SYMBOLIC:
9806 case ADDRESS_REG_IMM:
9807 cost += addr_cost->imm_offset;
9808 break;
9810 case ADDRESS_REG_WB:
9811 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9812 cost += addr_cost->pre_modify;
9813 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9814 cost += addr_cost->post_modify;
9815 else
9816 gcc_unreachable ();
9818 break;
9820 case ADDRESS_REG_REG:
9821 cost += addr_cost->register_offset;
9822 break;
9824 case ADDRESS_REG_SXTW:
9825 cost += addr_cost->register_sextend;
9826 break;
9828 case ADDRESS_REG_UXTW:
9829 cost += addr_cost->register_zextend;
9830 break;
9832 default:
9833 gcc_unreachable ();
9837 if (info.shift > 0)
9839 /* For the sake of calculating the cost of the shifted register
9840 component, we can treat same sized modes in the same way. */
9841 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9842 cost += addr_cost->addr_scale_costs.hi;
9843 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9844 cost += addr_cost->addr_scale_costs.si;
9845 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9846 cost += addr_cost->addr_scale_costs.di;
9847 else
9848 /* We can't tell, or this is a 128-bit vector. */
9849 cost += addr_cost->addr_scale_costs.ti;
9852 return cost;
9855 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9856 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9857 to be taken. */
9859 int
9860 aarch64_branch_cost (bool speed_p, bool predictable_p)
9862 /* When optimizing for speed, use the cost of unpredictable branches. */
9863 const struct cpu_branch_cost *branch_costs =
9864 aarch64_tune_params.branch_costs;
9866 if (!speed_p || predictable_p)
9867 return branch_costs->predictable;
9868 else
9869 return branch_costs->unpredictable;
9872 /* Return true if the RTX X in mode MODE is a zero or sign extract
9873 usable in an ADD or SUB (extended register) instruction. */
9874 static bool
9875 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9877 /* Catch add with a sign extract.
9878 This is add_<optab><mode>_multp2. */
9879 if (GET_CODE (x) == SIGN_EXTRACT
9880 || GET_CODE (x) == ZERO_EXTRACT)
9882 rtx op0 = XEXP (x, 0);
9883 rtx op1 = XEXP (x, 1);
9884 rtx op2 = XEXP (x, 2);
9886 if (GET_CODE (op0) == MULT
9887 && CONST_INT_P (op1)
9888 && op2 == const0_rtx
9889 && CONST_INT_P (XEXP (op0, 1))
9890 && aarch64_is_extend_from_extract (mode,
9891 XEXP (op0, 1),
9892 op1))
9894 return true;
9897 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9898 No shift. */
9899 else if (GET_CODE (x) == SIGN_EXTEND
9900 || GET_CODE (x) == ZERO_EXTEND)
9901 return REG_P (XEXP (x, 0));
9903 return false;
9906 static bool
9907 aarch64_frint_unspec_p (unsigned int u)
9909 switch (u)
9911 case UNSPEC_FRINTZ:
9912 case UNSPEC_FRINTP:
9913 case UNSPEC_FRINTM:
9914 case UNSPEC_FRINTA:
9915 case UNSPEC_FRINTN:
9916 case UNSPEC_FRINTX:
9917 case UNSPEC_FRINTI:
9918 return true;
9920 default:
9921 return false;
9925 /* Return true iff X is an rtx that will match an extr instruction
9926 i.e. as described in the *extr<mode>5_insn family of patterns.
9927 OP0 and OP1 will be set to the operands of the shifts involved
9928 on success and will be NULL_RTX otherwise. */
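/* For example,
(ior:DI (ashift:DI (reg:DI x1) (const_int 16))
(lshiftrt:DI (reg:DI x2) (const_int 48)))
matches, since the two shift amounts sum to 64, and corresponds to
"extr x0, x1, x2, #48". */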
9930 static bool
9931 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9933 rtx op0, op1;
9934 scalar_int_mode mode;
9935 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9936 return false;
9938 *res_op0 = NULL_RTX;
9939 *res_op1 = NULL_RTX;
9941 if (GET_CODE (x) != IOR)
9942 return false;
9944 op0 = XEXP (x, 0);
9945 op1 = XEXP (x, 1);
9947 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9948 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9950 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9951 if (GET_CODE (op1) == ASHIFT)
9952 std::swap (op0, op1);
9954 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9955 return false;
9957 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9958 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9960 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9961 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9963 *res_op0 = XEXP (op0, 0);
9964 *res_op1 = XEXP (op1, 0);
9965 return true;
9969 return false;
9972 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9973 storing it in *COST. Result is true if the total cost of the operation
9974 has now been calculated. */
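/* For example, a branch such as
(if_then_else (ne (reg:SI w0) (const_int 0)) (label_ref ...) (pc))
is costed as a CBNZ, while
(if_then_else (ne (reg:CC CC_REGNUM) (const_int 0)) (reg:DI x1) (reg:DI x2))
is costed as a CSEL on already-computed flags. */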
9975 static bool
9976 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9978 rtx inner;
9979 rtx comparator;
9980 enum rtx_code cmpcode;
9982 if (COMPARISON_P (op0))
9984 inner = XEXP (op0, 0);
9985 comparator = XEXP (op0, 1);
9986 cmpcode = GET_CODE (op0);
9988 else
9990 inner = op0;
9991 comparator = const0_rtx;
9992 cmpcode = NE;
9995 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9997 /* Conditional branch. */
9998 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9999 return true;
10000 else
10002 if (cmpcode == NE || cmpcode == EQ)
10004 if (comparator == const0_rtx)
10006 /* TBZ/TBNZ/CBZ/CBNZ. */
10007 if (GET_CODE (inner) == ZERO_EXTRACT)
10008 /* TBZ/TBNZ. */
10009 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10010 ZERO_EXTRACT, 0, speed);
10011 else
10012 /* CBZ/CBNZ. */
10013 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10015 return true;
10018 else if (cmpcode == LT || cmpcode == GE)
10020 /* TBZ/TBNZ. */
10021 if (comparator == const0_rtx)
10022 return true;
10026 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10028 /* CCMP. */
10029 if (GET_CODE (op1) == COMPARE)
10031 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10032 if (XEXP (op1, 1) == const0_rtx)
10033 *cost += 1;
10034 if (speed)
10036 machine_mode mode = GET_MODE (XEXP (op1, 0));
10037 const struct cpu_cost_table *extra_cost
10038 = aarch64_tune_params.insn_extra_cost;
10040 if (GET_MODE_CLASS (mode) == MODE_INT)
10041 *cost += extra_cost->alu.arith;
10042 else
10043 *cost += extra_cost->fp[mode == DFmode].compare;
10045 return true;
10048 /* It's a conditional operation based on the status flags,
10049 so it must be some flavor of CSEL. */
10051 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10052 if (GET_CODE (op1) == NEG
10053 || GET_CODE (op1) == NOT
10054 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10055 op1 = XEXP (op1, 0);
10056 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10058 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10059 op1 = XEXP (op1, 0);
10060 op2 = XEXP (op2, 0);
10063 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10064 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10065 return true;
10068 /* We don't know what this is, cost all operands. */
10069 return false;
10072 /* Check whether X is a bitfield operation of the form shift + extend that
10073 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10074 operand to which the bitfield operation is applied. Otherwise return
10075 NULL_RTX. */
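/* For example, (zero_extend:DI (lshiftrt:HI (reg:HI w0) (const_int 3)))
maps to a UBFX and returns (reg:HI w0), while
(sign_extend:SI (ashift:QI (reg:QI w1) (const_int 2))) maps to an
SBFIZ. */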
10077 static rtx
10078 aarch64_extend_bitfield_pattern_p (rtx x)
10080 rtx_code outer_code = GET_CODE (x);
10081 machine_mode outer_mode = GET_MODE (x);
10083 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10084 && outer_mode != SImode && outer_mode != DImode)
10085 return NULL_RTX;
10087 rtx inner = XEXP (x, 0);
10088 rtx_code inner_code = GET_CODE (inner);
10089 machine_mode inner_mode = GET_MODE (inner);
10090 rtx op = NULL_RTX;
10092 switch (inner_code)
10094 case ASHIFT:
10095 if (CONST_INT_P (XEXP (inner, 1))
10096 && (inner_mode == QImode || inner_mode == HImode))
10097 op = XEXP (inner, 0);
10098 break;
10099 case LSHIFTRT:
10100 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10101 && (inner_mode == QImode || inner_mode == HImode))
10102 op = XEXP (inner, 0);
10103 break;
10104 case ASHIFTRT:
10105 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10106 && (inner_mode == QImode || inner_mode == HImode))
10107 op = XEXP (inner, 0);
10108 break;
10109 default:
10110 break;
10113 return op;
10116 /* Return true if the mask and a shift amount from an RTX of the form
10117 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10118 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
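/* For example, in SImode a shift amount of 4 and a mask of 0xff0 pass
this test, and (x << 4) & 0xff0 can then be emitted as
"ubfiz w0, w1, #4, #8". */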
10120 bool
10121 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10122 rtx shft_amnt)
10124 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10125 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10126 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10127 && (INTVAL (mask)
10128 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10131 /* Return true if the masks and a shift amount from an RTX of the form
10132 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10133 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
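/* For example, in DImode with MASK1 == 0xfffffffffffff00f, SHIFT_AMNT == 4
and MASK2 == 0xff0 the combination is valid and corresponds to
"bfi x0, x1, #4, #8". */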
10135 bool
10136 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10137 unsigned HOST_WIDE_INT mask1,
10138 unsigned HOST_WIDE_INT shft_amnt,
10139 unsigned HOST_WIDE_INT mask2)
10141 unsigned HOST_WIDE_INT t;
10143 /* Verify that there is no overlap in what bits are set in the two masks. */
10144 if (mask1 != ~mask2)
10145 return false;
10147 /* Verify that mask2 is not all zeros or ones. */
10148 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10149 return false;
10151 /* The shift amount should always be less than the mode size. */
10152 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10154 /* Verify that the mask being shifted is contiguous and would be in the
10155 least significant bits after shifting by shft_amnt. */
10156 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10157 return (t == (t & -t));
10160 /* Calculate the cost of calculating X, storing it in *COST. Result
10161 is true if the total cost of the operation has now been calculated. */
10162 static bool
10163 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10164 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10166 rtx op0, op1, op2;
10167 const struct cpu_cost_table *extra_cost
10168 = aarch64_tune_params.insn_extra_cost;
10169 int code = GET_CODE (x);
10170 scalar_int_mode int_mode;
10172 /* By default, assume that everything has equivalent cost to the
10173 cheapest instruction. Any additional costs are applied as a delta
10174 above this default. */
10175 *cost = COSTS_N_INSNS (1);
10177 switch (code)
10179 case SET:
10180 /* The cost depends entirely on the operands to SET. */
10181 *cost = 0;
10182 op0 = SET_DEST (x);
10183 op1 = SET_SRC (x);
10185 switch (GET_CODE (op0))
10187 case MEM:
10188 if (speed)
10190 rtx address = XEXP (op0, 0);
10191 if (VECTOR_MODE_P (mode))
10192 *cost += extra_cost->ldst.storev;
10193 else if (GET_MODE_CLASS (mode) == MODE_INT)
10194 *cost += extra_cost->ldst.store;
10195 else if (mode == SFmode)
10196 *cost += extra_cost->ldst.storef;
10197 else if (mode == DFmode)
10198 *cost += extra_cost->ldst.stored;
10200 *cost +=
10201 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10202 0, speed));
10205 *cost += rtx_cost (op1, mode, SET, 1, speed);
10206 return true;
10208 case SUBREG:
10209 if (! REG_P (SUBREG_REG (op0)))
10210 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10212 /* Fall through. */
10213 case REG:
10214 /* The cost is one per vector-register copied. */
10215 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10217 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10218 *cost = COSTS_N_INSNS (nregs);
10220 /* const0_rtx is in general free, but we will use an
10221 instruction to set a register to 0. */
10222 else if (REG_P (op1) || op1 == const0_rtx)
10224 /* The cost is 1 per register copied. */
10225 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10226 *cost = COSTS_N_INSNS (nregs);
10228 else
10229 /* Cost is just the cost of the RHS of the set. */
10230 *cost += rtx_cost (op1, mode, SET, 1, speed);
10231 return true;
10233 case ZERO_EXTRACT:
10234 case SIGN_EXTRACT:
10235 /* Bit-field insertion. Strip any redundant widening of
10236 the RHS to meet the width of the target. */
10237 if (GET_CODE (op1) == SUBREG)
10238 op1 = SUBREG_REG (op1);
10239 if ((GET_CODE (op1) == ZERO_EXTEND
10240 || GET_CODE (op1) == SIGN_EXTEND)
10241 && CONST_INT_P (XEXP (op0, 1))
10242 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10243 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10244 op1 = XEXP (op1, 0);
10246 if (CONST_INT_P (op1))
10248 /* MOV immediate is assumed to always be cheap. */
10249 *cost = COSTS_N_INSNS (1);
10251 else
10253 /* BFM. */
10254 if (speed)
10255 *cost += extra_cost->alu.bfi;
10256 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10259 return true;
10261 default:
10262 /* We can't make sense of this, assume default cost. */
10263 *cost = COSTS_N_INSNS (1);
10264 return false;
10266 return false;
10268 case CONST_INT:
10269 /* If an instruction can incorporate a constant within the
10270 instruction, the instruction's expression avoids calling
10271 rtx_cost() on the constant. If rtx_cost() is called on a
10272 constant, then it is usually because the constant must be
10273 moved into a register by one or more instructions.
10275 The exception is constant 0, which can be expressed
10276 as XZR/WZR and is therefore free. The exception to that, in turn,
10277 is (set (reg) (const0_rtx)), in which case we must cost
10278 the move. However, we can catch that when we cost the SET, so
10279 we don't need to consider it here. */
10280 if (x == const0_rtx)
10281 *cost = 0;
10282 else
10284 /* To an approximation, building any other constant is
10285 proportionally expensive to the number of instructions
10286 required to build that constant. This is true whether we
10287 are compiling for SPEED or otherwise. */
10288 if (!is_a <scalar_int_mode> (mode, &int_mode))
10289 int_mode = word_mode;
10290 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10291 (NULL_RTX, x, false, int_mode));
10293 return true;
10295 case CONST_DOUBLE:
10297 /* First determine number of instructions to do the move
10298 as an integer constant. */
10299 if (!aarch64_float_const_representable_p (x)
10300 && !aarch64_can_const_movi_rtx_p (x, mode)
10301 && aarch64_float_const_rtx_p (x))
10303 unsigned HOST_WIDE_INT ival;
10304 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10305 gcc_assert (succeed);
10307 scalar_int_mode imode = (mode == HFmode
10308 ? SImode
10309 : int_mode_for_mode (mode).require ());
10310 int ncost = aarch64_internal_mov_immediate
10311 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10312 *cost += COSTS_N_INSNS (ncost);
10313 return true;
10316 if (speed)
10318 /* mov[df,sf]_aarch64. */
10319 if (aarch64_float_const_representable_p (x))
10320 /* FMOV (scalar immediate). */
10321 *cost += extra_cost->fp[mode == DFmode].fpconst;
10322 else if (!aarch64_float_const_zero_rtx_p (x))
10324 /* This will be a load from memory. */
10325 if (mode == DFmode)
10326 *cost += extra_cost->ldst.loadd;
10327 else
10328 *cost += extra_cost->ldst.loadf;
10330 else
10331 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10332 or MOV v0.s[0], wzr - neither of which is modeled by the
10333 cost tables. Just use the default cost. */
10338 return true;
10340 case MEM:
10341 if (speed)
10343 /* For loads we want the base cost of a load, plus an
10344 approximation for the additional cost of the addressing
10345 mode. */
10346 rtx address = XEXP (x, 0);
10347 if (VECTOR_MODE_P (mode))
10348 *cost += extra_cost->ldst.loadv;
10349 else if (GET_MODE_CLASS (mode) == MODE_INT)
10350 *cost += extra_cost->ldst.load;
10351 else if (mode == SFmode)
10352 *cost += extra_cost->ldst.loadf;
10353 else if (mode == DFmode)
10354 *cost += extra_cost->ldst.loadd;
10356 *cost +=
10357 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10358 0, speed));
10361 return true;
10363 case NEG:
10364 op0 = XEXP (x, 0);
10366 if (VECTOR_MODE_P (mode))
10368 if (speed)
10370 /* FNEG. */
10371 *cost += extra_cost->vect.alu;
10373 return false;
10376 if (GET_MODE_CLASS (mode) == MODE_INT)
10378 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10379 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10381 /* CSETM. */
10382 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10383 return true;
10386 /* Cost this as SUB wzr, X. */
10387 op0 = CONST0_RTX (mode);
10388 op1 = XEXP (x, 0);
10389 goto cost_minus;
10392 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10394 /* Support (neg(fma...)) as a single instruction only if
10395 sign of zeros is unimportant. This matches the decision
10396 making in aarch64.md. */
10397 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10399 /* FNMADD. */
10400 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10401 return true;
10403 if (GET_CODE (op0) == MULT)
10405 /* FNMUL. */
10406 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10407 return true;
10409 if (speed)
10410 /* FNEG. */
10411 *cost += extra_cost->fp[mode == DFmode].neg;
10412 return false;
10415 return false;
10417 case CLRSB:
10418 case CLZ:
10419 if (speed)
10421 if (VECTOR_MODE_P (mode))
10422 *cost += extra_cost->vect.alu;
10423 else
10424 *cost += extra_cost->alu.clz;
10427 return false;
10429 case COMPARE:
10430 op0 = XEXP (x, 0);
10431 op1 = XEXP (x, 1);
10433 if (op1 == const0_rtx
10434 && GET_CODE (op0) == AND)
10436 x = op0;
10437 mode = GET_MODE (op0);
10438 goto cost_logic;
10441 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10443 /* TODO: A write to the CC flags possibly costs extra, this
10444 needs encoding in the cost tables. */
10446 mode = GET_MODE (op0);
10447 /* ANDS. */
10448 if (GET_CODE (op0) == AND)
10450 x = op0;
10451 goto cost_logic;
10454 if (GET_CODE (op0) == PLUS)
10456 /* ADDS (and CMN alias). */
10457 x = op0;
10458 goto cost_plus;
10461 if (GET_CODE (op0) == MINUS)
10463 /* SUBS. */
10464 x = op0;
10465 goto cost_minus;
10468 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10469 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10470 && CONST_INT_P (XEXP (op0, 2)))
10472 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10473 Handle it here directly rather than going to cost_logic
10474 since we know the immediate generated for the TST is valid
10475 so we can avoid creating an intermediate rtx for it only
10476 for costing purposes. */
10477 if (speed)
10478 *cost += extra_cost->alu.logical;
10480 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10481 ZERO_EXTRACT, 0, speed);
10482 return true;
10485 if (GET_CODE (op1) == NEG)
10487 /* CMN. */
10488 if (speed)
10489 *cost += extra_cost->alu.arith;
10491 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10492 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10493 return true;
10496 /* CMP.
10498 Compare can freely swap the order of operands, and
10499 canonicalization puts the more complex operation first.
10500 But the integer MINUS logic expects the shift/extend
10501 operation in op1. */
10502 if (! (REG_P (op0)
10503 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10505 op0 = XEXP (x, 1);
10506 op1 = XEXP (x, 0);
10508 goto cost_minus;
10511 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10513 /* FCMP. */
10514 if (speed)
10515 *cost += extra_cost->fp[mode == DFmode].compare;
10517 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10519 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10520 /* FCMP supports constant 0.0 for no extra cost. */
10521 return true;
10523 return false;
10526 if (VECTOR_MODE_P (mode))
10528 /* Vector compare. */
10529 if (speed)
10530 *cost += extra_cost->vect.alu;
10532 if (aarch64_float_const_zero_rtx_p (op1))
10534 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10535 cost. */
10536 return true;
10538 return false;
10540 return false;
10542 case MINUS:
10544 op0 = XEXP (x, 0);
10545 op1 = XEXP (x, 1);
10547 cost_minus:
10548 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10550 /* Detect valid immediates. */
10551 if ((GET_MODE_CLASS (mode) == MODE_INT
10552 || (GET_MODE_CLASS (mode) == MODE_CC
10553 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10554 && CONST_INT_P (op1)
10555 && aarch64_uimm12_shift (INTVAL (op1)))
10557 if (speed)
10558 /* SUB(S) (immediate). */
10559 *cost += extra_cost->alu.arith;
10560 return true;
10563 /* Look for SUB (extended register). */
10564 if (is_a <scalar_int_mode> (mode, &int_mode)
10565 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10567 if (speed)
10568 *cost += extra_cost->alu.extend_arith;
10570 op1 = aarch64_strip_extend (op1, true);
10571 *cost += rtx_cost (op1, VOIDmode,
10572 (enum rtx_code) GET_CODE (op1), 0, speed);
10573 return true;
10576 rtx new_op1 = aarch64_strip_extend (op1, false);
10578 /* Cost this as an FMA-alike operation. */
10579 if ((GET_CODE (new_op1) == MULT
10580 || aarch64_shift_p (GET_CODE (new_op1)))
10581 && code != COMPARE)
10583 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10584 (enum rtx_code) code,
10585 speed);
10586 return true;
10589 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10591 if (speed)
10593 if (VECTOR_MODE_P (mode))
10595 /* Vector SUB. */
10596 *cost += extra_cost->vect.alu;
10598 else if (GET_MODE_CLASS (mode) == MODE_INT)
10600 /* SUB(S). */
10601 *cost += extra_cost->alu.arith;
10603 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10605 /* FSUB. */
10606 *cost += extra_cost->fp[mode == DFmode].addsub;
10609 return true;
10612 case PLUS:
10614 rtx new_op0;
10616 op0 = XEXP (x, 0);
10617 op1 = XEXP (x, 1);
10619 cost_plus:
10620 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10621 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10623 /* CSINC. */
10624 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10625 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10626 return true;
10629 if (GET_MODE_CLASS (mode) == MODE_INT
10630 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10631 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10633 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10635 if (speed)
10636 /* ADD (immediate). */
10637 *cost += extra_cost->alu.arith;
10638 return true;
10641 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10643 /* Look for ADD (extended register). */
10644 if (is_a <scalar_int_mode> (mode, &int_mode)
10645 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10647 if (speed)
10648 *cost += extra_cost->alu.extend_arith;
10650 op0 = aarch64_strip_extend (op0, true);
10651 *cost += rtx_cost (op0, VOIDmode,
10652 (enum rtx_code) GET_CODE (op0), 0, speed);
10653 return true;
10656 /* Strip any extend, leave shifts behind as we will
10657 cost them through mult_cost. */
10658 new_op0 = aarch64_strip_extend (op0, false);
10660 if (GET_CODE (new_op0) == MULT
10661 || aarch64_shift_p (GET_CODE (new_op0)))
10663 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10664 speed);
10665 return true;
10668 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10670 if (speed)
10672 if (VECTOR_MODE_P (mode))
10674 /* Vector ADD. */
10675 *cost += extra_cost->vect.alu;
10677 else if (GET_MODE_CLASS (mode) == MODE_INT)
10679 /* ADD. */
10680 *cost += extra_cost->alu.arith;
10682 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10684 /* FADD. */
10685 *cost += extra_cost->fp[mode == DFmode].addsub;
10688 return true;
10691 case BSWAP:
10692 *cost = COSTS_N_INSNS (1);
10694 if (speed)
10696 if (VECTOR_MODE_P (mode))
10697 *cost += extra_cost->vect.alu;
10698 else
10699 *cost += extra_cost->alu.rev;
10701 return false;
10703 case IOR:
10704 if (aarch_rev16_p (x))
10706 *cost = COSTS_N_INSNS (1);
10708 if (speed)
10710 if (VECTOR_MODE_P (mode))
10711 *cost += extra_cost->vect.alu;
10712 else
10713 *cost += extra_cost->alu.rev;
10715 return true;
10718 if (aarch64_extr_rtx_p (x, &op0, &op1))
10720 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10721 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10722 if (speed)
10723 *cost += extra_cost->alu.shift;
10725 return true;
10727 /* Fall through. */
10728 case XOR:
10729 case AND:
10730 cost_logic:
10731 op0 = XEXP (x, 0);
10732 op1 = XEXP (x, 1);
10734 if (VECTOR_MODE_P (mode))
10736 if (speed)
10737 *cost += extra_cost->vect.alu;
10738 return true;
10741 if (code == AND
10742 && GET_CODE (op0) == MULT
10743 && CONST_INT_P (XEXP (op0, 1))
10744 && CONST_INT_P (op1)
10745 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10746 INTVAL (op1)) != 0)
10748 /* This is a UBFM/SBFM. */
10749 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10750 if (speed)
10751 *cost += extra_cost->alu.bfx;
10752 return true;
10755 if (is_int_mode (mode, &int_mode))
10757 if (CONST_INT_P (op1))
10759 /* We have a mask + shift version of a UBFIZ
10760 i.e. the *andim_ashift<mode>_bfiz pattern. */
10761 if (GET_CODE (op0) == ASHIFT
10762 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10763 XEXP (op0, 1)))
10765 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10766 (enum rtx_code) code, 0, speed);
10767 if (speed)
10768 *cost += extra_cost->alu.bfx;
10770 return true;
10772 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10774 /* We possibly get the immediate for free; this is not
10775 modelled. */
10776 *cost += rtx_cost (op0, int_mode,
10777 (enum rtx_code) code, 0, speed);
10778 if (speed)
10779 *cost += extra_cost->alu.logical;
10781 return true;
10784 else
10786 rtx new_op0 = op0;
10788 /* Handle ORN, EON, or BIC. */
10789 if (GET_CODE (op0) == NOT)
10790 op0 = XEXP (op0, 0);
10792 new_op0 = aarch64_strip_shift (op0);
10794 /* If we had a shift on op0 then this is a logical-shift-
10795 by-register/immediate operation. Otherwise, this is just
10796 a logical operation. */
10797 if (speed)
10799 if (new_op0 != op0)
10801 /* Shift by immediate. */
10802 if (CONST_INT_P (XEXP (op0, 1)))
10803 *cost += extra_cost->alu.log_shift;
10804 else
10805 *cost += extra_cost->alu.log_shift_reg;
10807 else
10808 *cost += extra_cost->alu.logical;
10811 /* In both cases we want to cost both operands. */
10812 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10813 0, speed);
10814 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10815 1, speed);
10817 return true;
10820 return false;
10822 case NOT:
10823 x = XEXP (x, 0);
10824 op0 = aarch64_strip_shift (x);
10826 if (VECTOR_MODE_P (mode))
10828 /* Vector NOT. */
10829 *cost += extra_cost->vect.alu;
10830 return false;
10833 /* MVN-shifted-reg. */
10834 if (op0 != x)
10836 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10838 if (speed)
10839 *cost += extra_cost->alu.log_shift;
10841 return true;
10843 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10844 Handle the second form here, taking care that 'a' above can
10845 be a shift. */
10846 else if (GET_CODE (op0) == XOR)
10848 rtx newop0 = XEXP (op0, 0);
10849 rtx newop1 = XEXP (op0, 1);
10850 rtx op0_stripped = aarch64_strip_shift (newop0);
10852 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10853 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10855 if (speed)
10857 if (op0_stripped != newop0)
10858 *cost += extra_cost->alu.log_shift;
10859 else
10860 *cost += extra_cost->alu.logical;
10863 return true;
10865 /* MVN. */
10866 if (speed)
10867 *cost += extra_cost->alu.logical;
10869 return false;
10871 case ZERO_EXTEND:
10873 op0 = XEXP (x, 0);
10874 /* If a value is written in SImode and then zero-extended to DImode,
10875 the operation will in general be free, as a write to a 'w'
10876 register implicitly zeroes the upper bits of the corresponding 'x'
10877 register. However, if this is
10879 (set (reg) (zero_extend (reg)))
10881 we must cost the explicit register move. */
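/* For example, (set (reg:DI x0) (zero_extend:DI (plus:SI (reg:SI w1)
(reg:SI w2)))) costs no more than the SImode addition itself, because
"add w0, w1, w2" already clears bits 63:32 of x0. */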
10882 if (mode == DImode
10883 && GET_MODE (op0) == SImode
10884 && outer == SET)
10886 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10888 /* If OP_COST is non-zero, then the cost of the zero extend
10889 is effectively the cost of the inner operation. Otherwise
10890 we have a MOV instruction and we take the cost from the MOV
10891 itself. This is true independently of whether we are
10892 optimizing for space or time. */
10893 if (op_cost)
10894 *cost = op_cost;
10896 return true;
10898 else if (MEM_P (op0))
10900 /* All loads can zero extend to any size for free. */
10901 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10902 return true;
10905 op0 = aarch64_extend_bitfield_pattern_p (x);
10906 if (op0)
10908 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10909 if (speed)
10910 *cost += extra_cost->alu.bfx;
10911 return true;
10914 if (speed)
10916 if (VECTOR_MODE_P (mode))
10918 /* UMOV. */
10919 *cost += extra_cost->vect.alu;
10921 else
10923 /* We generate an AND instead of UXTB/UXTH. */
10924 *cost += extra_cost->alu.logical;
10927 return false;
10929 case SIGN_EXTEND:
10930 if (MEM_P (XEXP (x, 0)))
10932 /* LDRSH. */
10933 if (speed)
10935 rtx address = XEXP (XEXP (x, 0), 0);
10936 *cost += extra_cost->ldst.load_sign_extend;
10938 *cost +=
10939 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10940 0, speed));
10942 return true;
10945 op0 = aarch64_extend_bitfield_pattern_p (x);
10946 if (op0)
10948 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10949 if (speed)
10950 *cost += extra_cost->alu.bfx;
10951 return true;
10954 if (speed)
10956 if (VECTOR_MODE_P (mode))
10957 *cost += extra_cost->vect.alu;
10958 else
10959 *cost += extra_cost->alu.extend;
10961 return false;
10963 case ASHIFT:
10964 op0 = XEXP (x, 0);
10965 op1 = XEXP (x, 1);
10967 if (CONST_INT_P (op1))
10969 if (speed)
10971 if (VECTOR_MODE_P (mode))
10973 /* Vector shift (immediate). */
10974 *cost += extra_cost->vect.alu;
10976 else
10978 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10979 aliases. */
10980 *cost += extra_cost->alu.shift;
10984 /* We can incorporate zero/sign extend for free. */
10985 if (GET_CODE (op0) == ZERO_EXTEND
10986 || GET_CODE (op0) == SIGN_EXTEND)
10987 op0 = XEXP (op0, 0);
10989 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10990 return true;
10992 else
10994 if (VECTOR_MODE_P (mode))
10996 if (speed)
10997 /* Vector shift (register). */
10998 *cost += extra_cost->vect.alu;
11000 else
11002 if (speed)
11003 /* LSLV. */
11004 *cost += extra_cost->alu.shift_reg;
11006 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11007 && CONST_INT_P (XEXP (op1, 1))
11008 && known_eq (INTVAL (XEXP (op1, 1)),
11009 GET_MODE_BITSIZE (mode) - 1))
11011 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11012 /* We already demanded XEXP (op1, 0) to be REG_P, so
11013 don't recurse into it. */
11014 return true;
11017 return false; /* All arguments need to be in registers. */
11020 case ROTATE:
11021 case ROTATERT:
11022 case LSHIFTRT:
11023 case ASHIFTRT:
11024 op0 = XEXP (x, 0);
11025 op1 = XEXP (x, 1);
11027 if (CONST_INT_P (op1))
11029 /* ASR (immediate) and friends. */
11030 if (speed)
11032 if (VECTOR_MODE_P (mode))
11033 *cost += extra_cost->vect.alu;
11034 else
11035 *cost += extra_cost->alu.shift;
11038 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11039 return true;
11041 else
11043 if (VECTOR_MODE_P (mode))
11045 if (speed)
11046 /* Vector shift (register). */
11047 *cost += extra_cost->vect.alu;
11049 else
11051 if (speed)
11052 /* ASR (register) and friends. */
11053 *cost += extra_cost->alu.shift_reg;
11055 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11056 && CONST_INT_P (XEXP (op1, 1))
11057 && known_eq (INTVAL (XEXP (op1, 1)),
11058 GET_MODE_BITSIZE (mode) - 1))
11060 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11061 /* We already demanded XEXP (op1, 0) to be REG_P, so
11062 don't recurse into it. */
11063 return true;
11066 return false; /* All arguments need to be in registers. */
11069 case SYMBOL_REF:
11071 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11072 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11074 /* LDR. */
11075 if (speed)
11076 *cost += extra_cost->ldst.load;
11078 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11079 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11081 /* ADRP, followed by ADD. */
11082 *cost += COSTS_N_INSNS (1);
11083 if (speed)
11084 *cost += 2 * extra_cost->alu.arith;
11086 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11087 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11089 /* ADR. */
11090 if (speed)
11091 *cost += extra_cost->alu.arith;
11094 if (flag_pic)
11096 /* One extra load instruction, after accessing the GOT. */
11097 *cost += COSTS_N_INSNS (1);
11098 if (speed)
11099 *cost += extra_cost->ldst.load;
11101 return true;
11103 case HIGH:
11104 case LO_SUM:
11105 /* ADRP/ADD (immediate). */
11106 if (speed)
11107 *cost += extra_cost->alu.arith;
11108 return true;
11110 case ZERO_EXTRACT:
11111 case SIGN_EXTRACT:
11112 /* UBFX/SBFX. */
11113 if (speed)
11115 if (VECTOR_MODE_P (mode))
11116 *cost += extra_cost->vect.alu;
11117 else
11118 *cost += extra_cost->alu.bfx;
11121 /* We can trust that the immediates used will be correct (there
11122 are no by-register forms), so we need only cost op0. */
11123 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11124 return true;
11126 case MULT:
11127 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11128 /* aarch64_rtx_mult_cost always handles recursion to its
11129 operands. */
11130 return true;
11132 case MOD:
11133 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
11134 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
11135 that of an unconditional negate. This case should only ever be reached
11136 through the set_smod_pow2_cheap check in expmed.c. */
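/* For example, x % 4 in SImode expands to a sequence along the lines of
negs w1, w0
and w0, w0, #3
and w1, w1, #3
csneg w0, w0, w1, mi
hence the COSTS_N_INSNS (4) baseline below. */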
11137 if (CONST_INT_P (XEXP (x, 1))
11138 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11139 && (mode == SImode || mode == DImode))
11141 /* We expand to 4 instructions. Reset the baseline. */
11142 *cost = COSTS_N_INSNS (4);
11144 if (speed)
11145 *cost += 2 * extra_cost->alu.logical
11146 + 2 * extra_cost->alu.arith;
11148 return true;
11151 /* Fall-through. */
11152 case UMOD:
11153 if (speed)
11155 /* Slightly prefer UMOD over SMOD. */
11156 if (VECTOR_MODE_P (mode))
11157 *cost += extra_cost->vect.alu;
11158 else if (GET_MODE_CLASS (mode) == MODE_INT)
11159 *cost += (extra_cost->mult[mode == DImode].add
11160 + extra_cost->mult[mode == DImode].idiv
11161 + (code == MOD ? 1 : 0));
11163 return false; /* All arguments need to be in registers. */
11165 case DIV:
11166 case UDIV:
11167 case SQRT:
11168 if (speed)
11170 if (VECTOR_MODE_P (mode))
11171 *cost += extra_cost->vect.alu;
11172 else if (GET_MODE_CLASS (mode) == MODE_INT)
11173 /* There is no integer SQRT, so only DIV and UDIV can get
11174 here. */
11175 *cost += (extra_cost->mult[mode == DImode].idiv
11176 /* Slightly prefer UDIV over SDIV. */
11177 + (code == DIV ? 1 : 0));
11178 else
11179 *cost += extra_cost->fp[mode == DFmode].div;
11181 return false; /* All arguments need to be in registers. */
11183 case IF_THEN_ELSE:
11184 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11185 XEXP (x, 2), cost, speed);
11187 case EQ:
11188 case NE:
11189 case GT:
11190 case GTU:
11191 case LT:
11192 case LTU:
11193 case GE:
11194 case GEU:
11195 case LE:
11196 case LEU:
11198 return false; /* All arguments must be in registers. */
11200 case FMA:
11201 op0 = XEXP (x, 0);
11202 op1 = XEXP (x, 1);
11203 op2 = XEXP (x, 2);
11205 if (speed)
11207 if (VECTOR_MODE_P (mode))
11208 *cost += extra_cost->vect.alu;
11209 else
11210 *cost += extra_cost->fp[mode == DFmode].fma;
11213 /* FMSUB, FNMADD, and FNMSUB are free. */
11214 if (GET_CODE (op0) == NEG)
11215 op0 = XEXP (op0, 0);
11217 if (GET_CODE (op2) == NEG)
11218 op2 = XEXP (op2, 0);
11220 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11221 and the by-element operand as operand 0. */
11222 if (GET_CODE (op1) == NEG)
11223 op1 = XEXP (op1, 0);
11225 /* Catch vector-by-element operations. The by-element operand can
11226 either be (vec_duplicate (vec_select (x))) or just
11227 (vec_select (x)), depending on whether we are multiplying by
11228 a vector or a scalar.
11230 Canonicalization is not very good in these cases: FMA4 will put the
11231 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11232 if (GET_CODE (op0) == VEC_DUPLICATE)
11233 op0 = XEXP (op0, 0);
11234 else if (GET_CODE (op1) == VEC_DUPLICATE)
11235 op1 = XEXP (op1, 0);
11237 if (GET_CODE (op0) == VEC_SELECT)
11238 op0 = XEXP (op0, 0);
11239 else if (GET_CODE (op1) == VEC_SELECT)
11240 op1 = XEXP (op1, 0);
11242 /* If the remaining parameters are not registers,
11243 get the cost to put them into registers. */
11244 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11245 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11246 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11247 return true;
11249 case FLOAT:
11250 case UNSIGNED_FLOAT:
11251 if (speed)
11252 *cost += extra_cost->fp[mode == DFmode].fromint;
11253 return false;
11255 case FLOAT_EXTEND:
11256 if (speed)
11258 if (VECTOR_MODE_P (mode))
11260 /* Vector widening conversion. */
11261 *cost += extra_cost->vect.alu;
11263 else
11264 *cost += extra_cost->fp[mode == DFmode].widen;
11266 return false;
11268 case FLOAT_TRUNCATE:
11269 if (speed)
11271 if (VECTOR_MODE_P (mode))
11273 /* Vector narrowing conversion. */
11274 *cost += extra_cost->vect.alu;
11276 else
11277 *cost += extra_cost->fp[mode == DFmode].narrow;
11279 return false;
11281 case FIX:
11282 case UNSIGNED_FIX:
11283 x = XEXP (x, 0);
11284 /* Strip the rounding part. They will all be implemented
11285 by the fcvt* family of instructions anyway. */
11286 if (GET_CODE (x) == UNSPEC)
11288 unsigned int uns_code = XINT (x, 1);
11290 if (uns_code == UNSPEC_FRINTA
11291 || uns_code == UNSPEC_FRINTM
11292 || uns_code == UNSPEC_FRINTN
11293 || uns_code == UNSPEC_FRINTP
11294 || uns_code == UNSPEC_FRINTZ)
11295 x = XVECEXP (x, 0, 0);
11298 if (speed)
11300 if (VECTOR_MODE_P (mode))
11301 *cost += extra_cost->vect.alu;
11302 else
11303 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11306 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11307 fixed-point fcvt. */
11308 if (GET_CODE (x) == MULT
11309 && ((VECTOR_MODE_P (mode)
11310 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11311 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11313 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11314 0, speed);
11315 return true;
11318 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11319 return true;
11321 case ABS:
11322 if (VECTOR_MODE_P (mode))
11324 /* ABS (vector). */
11325 if (speed)
11326 *cost += extra_cost->vect.alu;
11328 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11330 op0 = XEXP (x, 0);
11332 /* FABD, which is analogous to FADD. */
11333 if (GET_CODE (op0) == MINUS)
11335 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11336 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11337 if (speed)
11338 *cost += extra_cost->fp[mode == DFmode].addsub;
11340 return true;
11342 /* Simple FABS is analogous to FNEG. */
11343 if (speed)
11344 *cost += extra_cost->fp[mode == DFmode].neg;
11346 else
11348 /* Integer ABS will either be split into
11349 two arithmetic instructions, or will be an ABS
11350 (scalar), which we don't model. */
11351 *cost = COSTS_N_INSNS (2);
11352 if (speed)
11353 *cost += 2 * extra_cost->alu.arith;
11355 return false;
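/* For example, fabsf (x - y) (or the equivalent vector form) can be
   implemented as a single FABD, which is why the operands of the MINUS
   are costed directly in the FABD case above.  */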
11357 case SMAX:
11358 case SMIN:
11359 if (speed)
11361 if (VECTOR_MODE_P (mode))
11362 *cost += extra_cost->vect.alu;
11363 else
11365 /* FMAXNM/FMINNM/FMAX/FMIN.
11366 TODO: This may not be accurate for all implementations, but
11367 we do not model this in the cost tables. */
11368 *cost += extra_cost->fp[mode == DFmode].addsub;
11371 return false;
11373 case UNSPEC:
11374 /* The floating point round to integer frint* instructions. */
11375 if (aarch64_frint_unspec_p (XINT (x, 1)))
11377 if (speed)
11378 *cost += extra_cost->fp[mode == DFmode].roundint;
11380 return false;
11383 if (XINT (x, 1) == UNSPEC_RBIT)
11385 if (speed)
11386 *cost += extra_cost->alu.rev;
11388 return false;
11390 break;
11392 case TRUNCATE:
11394 /* Decompose <su>muldi3_highpart. */
11395 if (/* (truncate:DI */
11396 mode == DImode
11397 /* (lshiftrt:TI */
11398 && GET_MODE (XEXP (x, 0)) == TImode
11399 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11400 /* (mult:TI */
11401 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11402 /* (ANY_EXTEND:TI (reg:DI))
11403 (ANY_EXTEND:TI (reg:DI))) */
11404 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11405 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11406 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11407 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11408 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11409 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11410 /* (const_int 64) */
11411 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11412 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11414 /* UMULH/SMULH. */
11415 if (speed)
11416 *cost += extra_cost->mult[mode == DImode].extend;
11417 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11418 mode, MULT, 0, speed);
11419 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11420 mode, MULT, 1, speed);
11421 return true;
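/* For reference, the RTL matched above is roughly what the mid-end produces
   for a 64x64->128-bit high-part multiply written in C as:

     long smulh (long a, long b)
     {
       return (long) (((__int128) a * b) >> 64);
     }

   which maps to a single SMULH (UMULH for the unsigned variant).  */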
11424 /* Fall through. */
11425 default:
11426 break;
11429 if (dump_file
11430 && flag_aarch64_verbose_cost)
11431 fprintf (dump_file,
11432 "\nFailed to cost RTX. Assuming default cost.\n");
11434 return true;
11437 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
11438 calculated for X. This cost is stored in *COST. Returns true
11439 if the total cost of X was calculated. */
11440 static bool
11441 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11442 int param, int *cost, bool speed)
11444 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11446 if (dump_file
11447 && flag_aarch64_verbose_cost)
11449 print_rtl_single (dump_file, x);
11450 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11451 speed ? "Hot" : "Cold",
11452 *cost, result ? "final" : "partial");
11455 return result;
11458 static int
11459 aarch64_register_move_cost (machine_mode mode,
11460 reg_class_t from_i, reg_class_t to_i)
11462 enum reg_class from = (enum reg_class) from_i;
11463 enum reg_class to = (enum reg_class) to_i;
11464 const struct cpu_regmove_cost *regmove_cost
11465 = aarch64_tune_params.regmove_cost;
11467 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11468 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11469 to = GENERAL_REGS;
11471 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11472 from = GENERAL_REGS;
11474 /* Moving between a GPR and the stack costs the same as GP2GP. */
11475 if ((from == GENERAL_REGS && to == STACK_REG)
11476 || (to == GENERAL_REGS && from == STACK_REG))
11477 return regmove_cost->GP2GP;
11479 /* To/From the stack register, we move via the gprs. */
11480 if (to == STACK_REG || from == STACK_REG)
11481 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11482 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11484 if (known_eq (GET_MODE_SIZE (mode), 16))
11486 /* 128-bit operations on general registers require 2 instructions. */
11487 if (from == GENERAL_REGS && to == GENERAL_REGS)
11488 return regmove_cost->GP2GP * 2;
11489 else if (from == GENERAL_REGS)
11490 return regmove_cost->GP2FP * 2;
11491 else if (to == GENERAL_REGS)
11492 return regmove_cost->FP2GP * 2;
11494 /* When AdvSIMD instructions are disabled it is not possible to move
11495 a 128-bit value directly between Q registers. This is handled in
11496 secondary reload. A general register is used as a scratch to move
11497 the upper DI value and the lower DI value is moved directly,
11498 hence the cost is the sum of three moves. */
11499 if (! TARGET_SIMD)
11500 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11502 return regmove_cost->FP2FP;
11505 if (from == GENERAL_REGS && to == GENERAL_REGS)
11506 return regmove_cost->GP2GP;
11507 else if (from == GENERAL_REGS)
11508 return regmove_cost->GP2FP;
11509 else if (to == GENERAL_REGS)
11510 return regmove_cost->FP2GP;
11512 return regmove_cost->FP2FP;
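/* As a worked example: moving a 128-bit (e.g. TImode) value from a general
   register to an FP register is costed as GP2FP * 2, reflecting the two
   64-bit moves required; the individual figures come from the selected
   cpu_regmove_cost table.  */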
11515 static int
11516 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11517 reg_class_t rclass ATTRIBUTE_UNUSED,
11518 bool in ATTRIBUTE_UNUSED)
11520 return aarch64_tune_params.memmov_cost;
11523 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11524 to optimize 1.0/sqrt. */
11526 static bool
11527 use_rsqrt_p (machine_mode mode)
11529 return (!flag_trapping_math
11530 && flag_unsafe_math_optimizations
11531 && ((aarch64_tune_params.approx_modes->recip_sqrt
11532 & AARCH64_APPROX_MODE (mode))
11533 || flag_mrecip_low_precision_sqrt));
11536 /* Function to decide when to use the approximate reciprocal square root
11537 builtin. */
11539 static tree
11540 aarch64_builtin_reciprocal (tree fndecl)
11542 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11544 if (!use_rsqrt_p (mode))
11545 return NULL_TREE;
11546 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11549 /* Emit instruction sequence to compute either the approximate square root
11550 or its approximate reciprocal, depending on the flag RECP, and return
11551 whether the sequence was emitted or not. */
11553 bool
11554 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11556 machine_mode mode = GET_MODE (dst);
11558 if (GET_MODE_INNER (mode) == HFmode)
11560 gcc_assert (!recp);
11561 return false;
11564 if (!recp)
11566 if (!(flag_mlow_precision_sqrt
11567 || (aarch64_tune_params.approx_modes->sqrt
11568 & AARCH64_APPROX_MODE (mode))))
11569 return false;
11571 if (flag_finite_math_only
11572 || flag_trapping_math
11573 || !flag_unsafe_math_optimizations
11574 || optimize_function_for_size_p (cfun))
11575 return false;
11577 else
11578 /* Caller assumes we cannot fail. */
11579 gcc_assert (use_rsqrt_p (mode));
11581 machine_mode mmsk = mode_for_int_vector (mode).require ();
11582 rtx xmsk = gen_reg_rtx (mmsk);
11583 if (!recp)
11584 /* When calculating the approximate square root, compare the
11585 argument with 0.0 and create a mask. */
11586 emit_insn (gen_rtx_SET (xmsk,
11587 gen_rtx_NEG (mmsk,
11588 gen_rtx_EQ (mmsk, src,
11589 CONST0_RTX (mode)))));
11591 /* Estimate the approximate reciprocal square root. */
11592 rtx xdst = gen_reg_rtx (mode);
11593 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11595 /* Iterate over the series twice for SF and thrice for DF. */
11596 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11598 /* Optionally iterate over the series once less for faster performance,
11599 at the expense of some accuracy. */
11600 if ((recp && flag_mrecip_low_precision_sqrt)
11601 || (!recp && flag_mlow_precision_sqrt))
11602 iterations--;
11604 /* Iterate over the series to calculate the approximate reciprocal square
11605 root. */
11606 rtx x1 = gen_reg_rtx (mode);
11607 while (iterations--)
11609 rtx x2 = gen_reg_rtx (mode);
11610 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11612 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11614 if (iterations > 0)
11615 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11618 if (!recp)
11620 /* Qualify the approximate reciprocal square root when the argument is
11621 0.0 by squashing the intermediary result to 0.0. */
11622 rtx xtmp = gen_reg_rtx (mmsk);
11623 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11624 gen_rtx_SUBREG (mmsk, xdst, 0)));
11625 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11627 /* Calculate the approximate square root. */
11628 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11631 /* Finalize the approximation. */
11632 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11634 return true;
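/* A minimal scalar sketch of the refinement step used above, assuming
   FRSQRTE provides an initial estimate X of 1/sqrt(D) and FRSQRTS computes
   (3 - D * X * X) / 2:

     static float
     rsqrt_step (float d, float x)
     {
       return x * ((3.0f - d * x * x) * 0.5f);   // one Newton-Raphson step
     }

   Two such steps for SF (three for DF) refine the estimate; the square root
   itself is then obtained as D * (1/sqrt(D)).  */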
11637 /* Emit the instruction sequence to compute the approximation for the division
11638 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11640 bool
11641 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11643 machine_mode mode = GET_MODE (quo);
11645 if (GET_MODE_INNER (mode) == HFmode)
11646 return false;
11648 bool use_approx_division_p = (flag_mlow_precision_div
11649 || (aarch64_tune_params.approx_modes->division
11650 & AARCH64_APPROX_MODE (mode)));
11652 if (!flag_finite_math_only
11653 || flag_trapping_math
11654 || !flag_unsafe_math_optimizations
11655 || optimize_function_for_size_p (cfun)
11656 || !use_approx_division_p)
11657 return false;
11659 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11660 return false;
11662 /* Estimate the approximate reciprocal. */
11663 rtx xrcp = gen_reg_rtx (mode);
11664 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11666 /* Iterate over the series twice for SF and thrice for DF. */
11667 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11669 /* Optionally iterate over the series once less for faster performance,
11670 at the expense of some accuracy. */
11671 if (flag_mlow_precision_div)
11672 iterations--;
11674 /* Iterate over the series to calculate the approximate reciprocal. */
11675 rtx xtmp = gen_reg_rtx (mode);
11676 while (iterations--)
11678 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11680 if (iterations > 0)
11681 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11684 if (num != CONST1_RTX (mode))
11686 /* As the approximate reciprocal of DEN is already calculated, only
11687 calculate the approximate division when NUM is not 1.0. */
11688 rtx xnum = force_reg (mode, num);
11689 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11692 /* Finalize the approximation. */
11693 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11694 return true;
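/* A minimal scalar sketch of the refinement step used above, assuming
   FRECPE provides an initial estimate X of 1/D and FRECPS computes
   2 - D * X:

     static float
     recip_step (float d, float x)
     {
       return x * (2.0f - d * x);   // one Newton-Raphson step
     }

   After the final step, NUM / DEN is approximated as NUM * (1/DEN).  */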
11697 /* Return the number of instructions that can be issued per cycle. */
11698 static int
11699 aarch64_sched_issue_rate (void)
11701 return aarch64_tune_params.issue_rate;
11704 static int
11705 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11707 int issue_rate = aarch64_sched_issue_rate ();
11709 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11713 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11714 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11715 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11717 static int
11718 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11719 int ready_index)
11721 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11725 /* Vectorizer cost model target hooks. */
11727 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11728 static int
11729 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11730 tree vectype,
11731 int misalign ATTRIBUTE_UNUSED)
11733 unsigned elements;
11734 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11735 bool fp = false;
11737 if (vectype != NULL)
11738 fp = FLOAT_TYPE_P (vectype);
11740 switch (type_of_cost)
11742 case scalar_stmt:
11743 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11745 case scalar_load:
11746 return costs->scalar_load_cost;
11748 case scalar_store:
11749 return costs->scalar_store_cost;
11751 case vector_stmt:
11752 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11754 case vector_load:
11755 return costs->vec_align_load_cost;
11757 case vector_store:
11758 return costs->vec_store_cost;
11760 case vec_to_scalar:
11761 return costs->vec_to_scalar_cost;
11763 case scalar_to_vec:
11764 return costs->scalar_to_vec_cost;
11766 case unaligned_load:
11767 case vector_gather_load:
11768 return costs->vec_unalign_load_cost;
11770 case unaligned_store:
11771 case vector_scatter_store:
11772 return costs->vec_unalign_store_cost;
11774 case cond_branch_taken:
11775 return costs->cond_taken_branch_cost;
11777 case cond_branch_not_taken:
11778 return costs->cond_not_taken_branch_cost;
11780 case vec_perm:
11781 return costs->vec_permute_cost;
11783 case vec_promote_demote:
11784 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11786 case vec_construct:
11787 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11788 return elements / 2 + 1;
11790 default:
11791 gcc_unreachable ();
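/* As a worked example of the vec_construct case: for a V4SI vector,
   elements == 4, so the heuristic above returns 4 / 2 + 1 == 3 cost
   units.  */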
11795 /* Implement targetm.vectorize.add_stmt_cost. */
11796 static unsigned
11797 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11798 struct _stmt_vec_info *stmt_info, int misalign,
11799 enum vect_cost_model_location where)
11801 unsigned *cost = (unsigned *) data;
11802 unsigned retval = 0;
11804 if (flag_vect_cost_model)
11806 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11807 int stmt_cost =
11808 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11810 /* Statements in an inner loop relative to the loop being
11811 vectorized are weighted more heavily. The value here is
11812 arbitrary and could potentially be improved with analysis. */
11813 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11814 count *= 50; /* FIXME */
11816 retval = (unsigned) (count * stmt_cost);
11817 cost[where] += retval;
11820 return retval;
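/* For example, a vector statement with a base cost of 1 that sits in an
   inner loop relative to the loop being vectorized is recorded as
   1 * 50 == 50 units in the vect_body bucket, whereas the same statement
   in the outermost vectorized loop contributes just 1.  */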
11823 static void initialize_aarch64_code_model (struct gcc_options *);
11825 /* Parse the TO_PARSE string and put the architecture struct that it
11826 selects into RES and the architectural features into ISA_FLAGS.
11827 Return an aarch64_parse_opt_result describing the parse result.
11828 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11829 When the TO_PARSE string contains an invalid extension,
11830 a copy of the string is created and stored to INVALID_EXTENSION. */
11832 static enum aarch64_parse_opt_result
11833 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11834 uint64_t *isa_flags, std::string *invalid_extension)
11836 const char *ext;
11837 const struct processor *arch;
11838 size_t len;
11840 ext = strchr (to_parse, '+');
11842 if (ext != NULL)
11843 len = ext - to_parse;
11844 else
11845 len = strlen (to_parse);
11847 if (len == 0)
11848 return AARCH64_PARSE_MISSING_ARG;
11851 /* Loop through the list of supported ARCHes to find a match. */
11852 for (arch = all_architectures; arch->name != NULL; arch++)
11854 if (strlen (arch->name) == len
11855 && strncmp (arch->name, to_parse, len) == 0)
11857 uint64_t isa_temp = arch->flags;
11859 if (ext != NULL)
11861 /* TO_PARSE string contains at least one extension. */
11862 enum aarch64_parse_opt_result ext_res
11863 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11865 if (ext_res != AARCH64_PARSE_OK)
11866 return ext_res;
11868 /* Extension parsing was successful. Confirm the result
11869 arch and ISA flags. */
11870 *res = arch;
11871 *isa_flags = isa_temp;
11872 return AARCH64_PARSE_OK;
11876 /* ARCH name not found in list. */
11877 return AARCH64_PARSE_INVALID_ARG;
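/* For example, "-march=armv8.2-a+sve" is split here into the architecture
   name "armv8.2-a" (LEN stops at the '+') and the extension string "+sve",
   which is handed to aarch64_parse_extension to adjust the ISA flags.  */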
11880 /* Parse the TO_PARSE string and put the result tuning in RES and the
11881 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11882 describing the parse result. If there is an error parsing, RES and
11883 ISA_FLAGS are left unchanged.
11884 When the TO_PARSE string contains an invalid extension,
11885 a copy of the string is created and stored to INVALID_EXTENSION. */
11887 static enum aarch64_parse_opt_result
11888 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11889 uint64_t *isa_flags, std::string *invalid_extension)
11891 const char *ext;
11892 const struct processor *cpu;
11893 size_t len;
11895 ext = strchr (to_parse, '+');
11897 if (ext != NULL)
11898 len = ext - to_parse;
11899 else
11900 len = strlen (to_parse);
11902 if (len == 0)
11903 return AARCH64_PARSE_MISSING_ARG;
11906 /* Loop through the list of supported CPUs to find a match. */
11907 for (cpu = all_cores; cpu->name != NULL; cpu++)
11909 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11911 uint64_t isa_temp = cpu->flags;
11914 if (ext != NULL)
11916 /* TO_PARSE string contains at least one extension. */
11917 enum aarch64_parse_opt_result ext_res
11918 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11920 if (ext_res != AARCH64_PARSE_OK)
11921 return ext_res;
11923 /* Extension parsing was successful. Confirm the result
11924 cpu and ISA flags. */
11925 *res = cpu;
11926 *isa_flags = isa_temp;
11927 return AARCH64_PARSE_OK;
11931 /* CPU name not found in list. */
11932 return AARCH64_PARSE_INVALID_ARG;
11935 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11936 Return an aarch64_parse_opt_result describing the parse result.
11937 If the parsing fails the RES does not change. */
11939 static enum aarch64_parse_opt_result
11940 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11942 const struct processor *cpu;
11944 /* Loop through the list of supported CPUs to find a match. */
11945 for (cpu = all_cores; cpu->name != NULL; cpu++)
11947 if (strcmp (cpu->name, to_parse) == 0)
11949 *res = cpu;
11950 return AARCH64_PARSE_OK;
11954 /* CPU name not found in list. */
11955 return AARCH64_PARSE_INVALID_ARG;
11958 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11959 described in FLAG. If it is, return the index bit for that fusion type.
11960 If not, error (printing OPTION_NAME) and return zero. */
11962 static unsigned int
11963 aarch64_parse_one_option_token (const char *token,
11964 size_t length,
11965 const struct aarch64_flag_desc *flag,
11966 const char *option_name)
11968 for (; flag->name != NULL; flag++)
11970 if (length == strlen (flag->name)
11971 && !strncmp (flag->name, token, length))
11972 return flag->flag;
11975 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11976 return 0;
11979 /* Parse OPTION which is a comma-separated list of flags to enable.
11980 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11981 default state we inherit from the CPU tuning structures. OPTION_NAME
11982 gives the top-level option we are parsing in the -moverride string,
11983 for use in error messages. */
11985 static unsigned int
11986 aarch64_parse_boolean_options (const char *option,
11987 const struct aarch64_flag_desc *flags,
11988 unsigned int initial_state,
11989 const char *option_name)
11991 const char separator = '.';
11992 const char* specs = option;
11993 const char* ntoken = option;
11994 unsigned int found_flags = initial_state;
11996 while ((ntoken = strchr (specs, separator)))
11998 size_t token_length = ntoken - specs;
11999 unsigned token_ops = aarch64_parse_one_option_token (specs,
12000 token_length,
12001 flags,
12002 option_name);
12003 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12004 in the token stream, reset the supported operations. So:
12006 adrp+add.cmp+branch.none.adrp+add
12008 would have the result of turning on only adrp+add fusion. */
12009 if (!token_ops)
12010 found_flags = 0;
12012 found_flags |= token_ops;
12013 specs = ++ntoken;
12016 /* We ended with a trailing separator; the string is ill-formed. */
12017 if (!(*specs))
12019 error ("%s string ill-formed\n", option_name);
12020 return 0;
12023 /* We still have one more token to parse. */
12024 size_t token_length = strlen (specs);
12025 unsigned token_ops = aarch64_parse_one_option_token (specs,
12026 token_length,
12027 flags,
12028 option_name);
12029 if (!token_ops)
12030 found_flags = 0;
12032 found_flags |= token_ops;
12033 return found_flags;
12036 /* Support for overriding instruction fusion. */
12038 static void
12039 aarch64_parse_fuse_string (const char *fuse_string,
12040 struct tune_params *tune)
12042 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12043 aarch64_fusible_pairs,
12044 tune->fusible_ops,
12045 "fuse=");
12048 /* Support for overriding other tuning flags. */
12050 static void
12051 aarch64_parse_tune_string (const char *tune_string,
12052 struct tune_params *tune)
12054 tune->extra_tuning_flags
12055 = aarch64_parse_boolean_options (tune_string,
12056 aarch64_tuning_flags,
12057 tune->extra_tuning_flags,
12058 "tune=");
12061 /* Parse the sve_width tuning -moverride string in TUNE_STRING.
12062 Accept the valid SVE vector widths allowed by
12063 aarch64_sve_vector_bits_enum and use it to override sve_width
12064 in TUNE. */
12066 static void
12067 aarch64_parse_sve_width_string (const char *tune_string,
12068 struct tune_params *tune)
12070 int width = -1;
12072 int n = sscanf (tune_string, "%d", &width);
12073 if (n != 1)
12075 error ("invalid format for sve_width");
12076 return;
12078 switch (width)
12080 case SVE_128:
12081 case SVE_256:
12082 case SVE_512:
12083 case SVE_1024:
12084 case SVE_2048:
12085 break;
12086 default:
12087 error ("invalid sve_width value: %d", width);
12089 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12092 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12093 we understand. If it is, extract the option string and hand it off to
12094 the appropriate function. */
12096 void
12097 aarch64_parse_one_override_token (const char* token,
12098 size_t length,
12099 struct tune_params *tune)
12101 const struct aarch64_tuning_override_function *fn
12102 = aarch64_tuning_override_functions;
12104 const char *option_part = strchr (token, '=');
12105 if (!option_part)
12107 error ("tuning string missing in option (%s)", token);
12108 return;
12111 /* Get the length of the option name. */
12112 length = option_part - token;
12113 /* Skip the '=' to get to the option string. */
12114 option_part++;
12116 for (; fn->name != NULL; fn++)
12118 if (!strncmp (fn->name, token, length))
12120 fn->parse_override (option_part, tune);
12121 return;
12125 error ("unknown tuning option (%s)", token);
12126 return;
12129 /* A checking mechanism for the implementation of the TLS size. */
12131 static void
12132 initialize_aarch64_tls_size (struct gcc_options *opts)
12134 if (aarch64_tls_size == 0)
12135 aarch64_tls_size = 24;
12137 switch (opts->x_aarch64_cmodel_var)
12139 case AARCH64_CMODEL_TINY:
12140 /* Both the default and maximum TLS size allowed under tiny are 1M, which
12141 needs two instructions to address, so we clamp the size to 24. */
12142 if (aarch64_tls_size > 24)
12143 aarch64_tls_size = 24;
12144 break;
12145 case AARCH64_CMODEL_SMALL:
12146 /* The maximum TLS size allowed under small is 4G. */
12147 if (aarch64_tls_size > 32)
12148 aarch64_tls_size = 32;
12149 break;
12150 case AARCH64_CMODEL_LARGE:
12151 /* The maximum TLS size allowed under large is 16E.
12152 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
12153 if (aarch64_tls_size > 48)
12154 aarch64_tls_size = 48;
12155 break;
12156 default:
12157 gcc_unreachable ();
12160 return;
12163 /* Parse STRING looking for options in the format:
12164 string :: option:string
12165 option :: name=substring
12166 name :: {a-z}
12167 substring :: defined by option. */
12169 static void
12170 aarch64_parse_override_string (const char* input_string,
12171 struct tune_params* tune)
12173 const char separator = ':';
12174 size_t string_length = strlen (input_string) + 1;
12175 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12176 char *string = string_root;
12177 strncpy (string, input_string, string_length);
12178 string[string_length - 1] = '\0';
12180 char* ntoken = string;
12182 while ((ntoken = strchr (string, separator)))
12184 size_t token_length = ntoken - string;
12185 /* Make this substring look like a string. */
12186 *ntoken = '\0';
12187 aarch64_parse_one_override_token (string, token_length, tune);
12188 string = ++ntoken;
12191 /* One last option to parse. */
12192 aarch64_parse_one_override_token (string, strlen (string), tune);
12193 free (string_root);
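/* For example (using option names registered in
   aarch64_tuning_override_functions), an override string such as:

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   is split on ':' into the tokens "fuse=adrp+add.cmp+branch" and
   "sve_width=256", which are handled by aarch64_parse_fuse_string and
   aarch64_parse_sve_width_string respectively.  */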
12197 static void
12198 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12200 if (accepted_branch_protection_string)
12202 opts->x_aarch64_branch_protection_string
12203 = xstrdup (accepted_branch_protection_string);
12206 /* PR 70044: We have to be careful about being called multiple times for the
12207 same function. This means all changes should be repeatable. */
12209 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12210 Disable the frame pointer flag so the mid-end will not use a frame
12211 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12212 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12213 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12214 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12215 if (opts->x_flag_omit_frame_pointer == 0)
12216 opts->x_flag_omit_frame_pointer = 2;
12218 /* If not optimizing for size, set the default
12219 alignment to what the target wants. */
12220 if (!opts->x_optimize_size)
12222 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12223 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12224 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12225 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12226 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12227 opts->x_str_align_functions = aarch64_tune_params.function_align;
12230 /* We default to no pc-relative literal loads. */
12232 aarch64_pcrelative_literal_loads = false;
12234 /* If -mpc-relative-literal-loads is set on the command line, this
12235 implies that the user asked for PC relative literal loads. */
12236 if (opts->x_pcrelative_literal_loads == 1)
12237 aarch64_pcrelative_literal_loads = true;
12239 /* In the tiny memory model it makes no sense to disallow PC relative
12240 literal pool loads. */
12241 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12242 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12243 aarch64_pcrelative_literal_loads = true;
12245 /* When enabling the lower precision Newton series for the square root, also
12246 enable it for the reciprocal square root, since the latter is an
12247 intermediary step for the former. */
12248 if (flag_mlow_precision_sqrt)
12249 flag_mrecip_low_precision_sqrt = true;
12252 /* 'Unpack' up the internal tuning structs and update the options
12253 in OPTS. The caller must have set up selected_tune and selected_arch
12254 as all the other target-specific codegen decisions are
12255 derived from them. */
12257 void
12258 aarch64_override_options_internal (struct gcc_options *opts)
12260 aarch64_tune_flags = selected_tune->flags;
12261 aarch64_tune = selected_tune->sched_core;
12262 /* Make a copy of the tuning parameters attached to the core, which
12263 we may later overwrite. */
12264 aarch64_tune_params = *(selected_tune->tune);
12265 aarch64_architecture_version = selected_arch->architecture_version;
12267 if (opts->x_aarch64_override_tune_string)
12268 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12269 &aarch64_tune_params);
12271 /* This target defaults to strict volatile bitfields. */
12272 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12273 opts->x_flag_strict_volatile_bitfields = 1;
12275 if (aarch64_stack_protector_guard == SSP_GLOBAL
12276 && opts->x_aarch64_stack_protector_guard_offset_str)
12278 error ("incompatible options %<-mstack-protector-guard=global%> and "
12279 "%<-mstack-protector-guard-offset=%s%>",
12280 aarch64_stack_protector_guard_offset_str);
12283 if (aarch64_stack_protector_guard == SSP_SYSREG
12284 && !(opts->x_aarch64_stack_protector_guard_offset_str
12285 && opts->x_aarch64_stack_protector_guard_reg_str))
12287 error ("both %<-mstack-protector-guard-offset%> and "
12288 "%<-mstack-protector-guard-reg%> must be used "
12289 "with %<-mstack-protector-guard=sysreg%>");
12292 if (opts->x_aarch64_stack_protector_guard_reg_str)
12294 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12295 error ("specify a system register with a small string length");
12298 if (opts->x_aarch64_stack_protector_guard_offset_str)
12300 char *end;
12301 const char *str = aarch64_stack_protector_guard_offset_str;
12302 errno = 0;
12303 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12304 if (!*str || *end || errno)
12305 error ("%qs is not a valid offset in %qs", str,
12306 "-mstack-protector-guard-offset=");
12307 aarch64_stack_protector_guard_offset = offs;
12310 initialize_aarch64_code_model (opts);
12311 initialize_aarch64_tls_size (opts);
12313 int queue_depth = 0;
12314 switch (aarch64_tune_params.autoprefetcher_model)
12316 case tune_params::AUTOPREFETCHER_OFF:
12317 queue_depth = -1;
12318 break;
12319 case tune_params::AUTOPREFETCHER_WEAK:
12320 queue_depth = 0;
12321 break;
12322 case tune_params::AUTOPREFETCHER_STRONG:
12323 queue_depth = max_insn_queue_index + 1;
12324 break;
12325 default:
12326 gcc_unreachable ();
12329 /* We don't mind passing in global_options_set here as we don't use
12330 the *options_set structs anyway. */
12331 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12332 queue_depth,
12333 opts->x_param_values,
12334 global_options_set.x_param_values);
12336 /* Set up parameters to be used in prefetching algorithm. Do not
12337 override the defaults unless we are tuning for a core we have
12338 researched values for. */
12339 if (aarch64_tune_params.prefetch->num_slots > 0)
12340 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12341 aarch64_tune_params.prefetch->num_slots,
12342 opts->x_param_values,
12343 global_options_set.x_param_values);
12344 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12345 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12346 aarch64_tune_params.prefetch->l1_cache_size,
12347 opts->x_param_values,
12348 global_options_set.x_param_values);
12349 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12350 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12351 aarch64_tune_params.prefetch->l1_cache_line_size,
12352 opts->x_param_values,
12353 global_options_set.x_param_values);
12354 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12355 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12356 aarch64_tune_params.prefetch->l2_cache_size,
12357 opts->x_param_values,
12358 global_options_set.x_param_values);
12359 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12360 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12362 opts->x_param_values,
12363 global_options_set.x_param_values);
12364 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12365 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12366 aarch64_tune_params.prefetch->minimum_stride,
12367 opts->x_param_values,
12368 global_options_set.x_param_values);
12370 /* Use the alternative scheduling-pressure algorithm by default. */
12371 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12372 opts->x_param_values,
12373 global_options_set.x_param_values);
12375 /* If the user hasn't changed it via configure then set the default to 64 KB
12376 for the backend. */
12377 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12378 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12379 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12380 opts->x_param_values,
12381 global_options_set.x_param_values);
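/* The guard size parameter is a power of two, so the default of 16 used
   above corresponds to a 2^16-byte (64 KiB) guard region, and the probing
   interval is forced to match it.  */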
12383 /* Validate the guard size. */
12384 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12386 /* Enforce that the probing interval is the same as the guard size so the
12387 mid-end does the right thing. */
12388 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12389 guard_size,
12390 opts->x_param_values,
12391 global_options_set.x_param_values);
12393 /* The maybe_set calls won't update the value if the user has explicitly set
12394 one, which means we need to validate that the probing interval and the
12395 guard size are equal. */
12396 int probe_interval
12397 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12398 if (guard_size != probe_interval)
12399 error ("stack clash guard size %<%d%> must be equal to probing interval "
12400 "%<%d%>", guard_size, probe_interval);
12402 /* Enable software prefetching at the specified optimization level for
12403 CPUs that have prefetch. Lower the optimization level threshold by 1
12404 when profiling is enabled. */
12405 if (opts->x_flag_prefetch_loop_arrays < 0
12406 && !opts->x_optimize_size
12407 && aarch64_tune_params.prefetch->default_opt_level >= 0
12408 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12409 opts->x_flag_prefetch_loop_arrays = 1;
12411 if (opts->x_aarch64_arch_string == NULL)
12412 opts->x_aarch64_arch_string = selected_arch->name;
12413 if (opts->x_aarch64_cpu_string == NULL)
12414 opts->x_aarch64_cpu_string = selected_cpu->name;
12415 if (opts->x_aarch64_tune_string == NULL)
12416 opts->x_aarch64_tune_string = selected_tune->name;
12418 aarch64_override_options_after_change_1 (opts);
12421 /* Print a hint with a suggestion for a core or architecture name that
12422 most closely resembles what the user passed in STR. ARCH is true if
12423 the user is asking for an architecture name. ARCH is false if the user
12424 is asking for a core name. */
12426 static void
12427 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12429 auto_vec<const char *> candidates;
12430 const struct processor *entry = arch ? all_architectures : all_cores;
12431 for (; entry->name != NULL; entry++)
12432 candidates.safe_push (entry->name);
12434 #ifdef HAVE_LOCAL_CPU_DETECT
12435 /* Add also "native" as possible value. */
12436 if (arch)
12437 candidates.safe_push ("native");
12438 #endif
12440 char *s;
12441 const char *hint = candidates_list_and_hint (str, s, candidates);
12442 if (hint)
12443 inform (input_location, "valid arguments are: %s;"
12444 " did you mean %qs?", s, hint);
12445 else
12446 inform (input_location, "valid arguments are: %s", s);
12448 XDELETEVEC (s);
12451 /* Print a hint with a suggestion for a core name that most closely resembles
12452 what the user passed in STR. */
12454 inline static void
12455 aarch64_print_hint_for_core (const char *str)
12457 aarch64_print_hint_for_core_or_arch (str, false);
12460 /* Print a hint with a suggestion for an architecture name that most closely
12461 resembles what the user passed in STR. */
12463 inline static void
12464 aarch64_print_hint_for_arch (const char *str)
12466 aarch64_print_hint_for_core_or_arch (str, true);
12470 /* Print a hint with a suggestion for an extension name
12471 that most closely resembles what the user passed in STR. */
12473 void
12474 aarch64_print_hint_for_extensions (const std::string &str)
12476 auto_vec<const char *> candidates;
12477 aarch64_get_all_extension_candidates (&candidates);
12478 char *s;
12479 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12480 if (hint)
12481 inform (input_location, "valid arguments are: %s;"
12482 " did you mean %qs?", s, hint);
12483 else
12484 inform (input_location, "valid arguments are: %s", s);
12486 XDELETEVEC (s);
12489 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12490 specified in STR and throw errors if appropriate. Put the results if
12491 they are valid in RES and ISA_FLAGS. Return whether the option is
12492 valid. */
12494 static bool
12495 aarch64_validate_mcpu (const char *str, const struct processor **res,
12496 uint64_t *isa_flags)
12498 std::string invalid_extension;
12499 enum aarch64_parse_opt_result parse_res
12500 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12502 if (parse_res == AARCH64_PARSE_OK)
12503 return true;
12505 switch (parse_res)
12507 case AARCH64_PARSE_MISSING_ARG:
12508 error ("missing cpu name in %<-mcpu=%s%>", str);
12509 break;
12510 case AARCH64_PARSE_INVALID_ARG:
12511 error ("unknown value %qs for %<-mcpu%>", str);
12512 aarch64_print_hint_for_core (str);
12513 break;
12514 case AARCH64_PARSE_INVALID_FEATURE:
12515 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12516 invalid_extension.c_str (), str);
12517 aarch64_print_hint_for_extensions (invalid_extension);
12518 break;
12519 default:
12520 gcc_unreachable ();
12523 return false;
12526 /* Parses CONST_STR for branch protection features specified in
12527 aarch64_branch_protect_types, and sets any global variables required.
12528 Returns the parsing result and assigns LAST_STR to the last processed
12529 token from CONST_STR so that it can be used for error reporting. */
12531 static enum
12532 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12533 char** last_str)
12535 char *str_root = xstrdup (const_str);
12536 char* token_save = NULL;
12537 char *str = strtok_r (str_root, "+", &token_save);
12538 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12539 if (!str)
12540 res = AARCH64_PARSE_MISSING_ARG;
12541 else
12543 char *next_str = strtok_r (NULL, "+", &token_save);
12544 /* Reset the branch protection features to their defaults. */
12545 aarch64_handle_no_branch_protection (NULL, NULL);
12547 while (str && res == AARCH64_PARSE_OK)
12549 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12550 bool found = false;
12551 /* Search for this type. */
12552 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12554 if (strcmp (str, type->name) == 0)
12556 found = true;
12557 res = type->handler (str, next_str);
12558 str = next_str;
12559 next_str = strtok_r (NULL, "+", &token_save);
12561 else
12562 type++;
12564 if (found && res == AARCH64_PARSE_OK)
12566 bool found_subtype = true;
12567 /* Loop through each token until we find one that isn't a
12568 subtype. */
12569 while (found_subtype)
12571 found_subtype = false;
12572 const aarch64_branch_protect_type *subtype = type->subtypes;
12573 /* Search for the subtype. */
12574 while (str && subtype && subtype->name && !found_subtype
12575 && res == AARCH64_PARSE_OK)
12577 if (strcmp (str, subtype->name) == 0)
12579 found_subtype = true;
12580 res = subtype->handler (str, next_str);
12581 str = next_str;
12582 next_str = strtok_r (NULL, "+", &token_save);
12584 else
12585 subtype++;
12589 else if (!found)
12590 res = AARCH64_PARSE_INVALID_ARG;
12593 /* Copy the last processed token into the argument to pass it back.
12594 Used by option and attribute validation to print the offending token. */
12595 if (last_str)
12597 if (str) strcpy (*last_str, str);
12598 else *last_str = NULL;
12600 if (res == AARCH64_PARSE_OK)
12602 /* If needed, alloc the accepted string then copy in const_str.
12603 Used by override_option_after_change_1. */
12604 if (!accepted_branch_protection_string)
12605 accepted_branch_protection_string = (char *) xmalloc (
12606 BRANCH_PROTECT_STR_MAX
12607 + 1);
12608 strncpy (accepted_branch_protection_string, const_str,
12609 BRANCH_PROTECT_STR_MAX + 1);
12610 /* Forcibly null-terminate. */
12611 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12613 return res;
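/* For example, "-mbranch-protection=pac-ret+leaf+bti" is tokenized on '+'
   into "pac-ret", "leaf" and "bti": "pac-ret" matches a top-level type,
   "leaf" is then accepted as one of its subtypes, and "bti" matches another
   top-level type.  */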
12616 static bool
12617 aarch64_validate_mbranch_protection (const char *const_str)
12619 char *str = (char *) xmalloc (strlen (const_str) + 1);
12620 enum aarch64_parse_opt_result res =
12621 aarch64_parse_branch_protection (const_str, &str);
12622 if (res == AARCH64_PARSE_INVALID_ARG)
12623 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12624 else if (res == AARCH64_PARSE_MISSING_ARG)
12625 error ("missing argument for %<-mbranch-protection=%>");
12626 free (str);
12627 return res == AARCH64_PARSE_OK;
12630 /* Validate a command-line -march option. Parse the arch and extensions
12631 (if any) specified in STR and throw errors if appropriate. Put the
12632 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12633 option is valid. */
12635 static bool
12636 aarch64_validate_march (const char *str, const struct processor **res,
12637 uint64_t *isa_flags)
12639 std::string invalid_extension;
12640 enum aarch64_parse_opt_result parse_res
12641 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12643 if (parse_res == AARCH64_PARSE_OK)
12644 return true;
12646 switch (parse_res)
12648 case AARCH64_PARSE_MISSING_ARG:
12649 error ("missing arch name in %<-march=%s%>", str);
12650 break;
12651 case AARCH64_PARSE_INVALID_ARG:
12652 error ("unknown value %qs for %<-march%>", str);
12653 aarch64_print_hint_for_arch (str);
12654 break;
12655 case AARCH64_PARSE_INVALID_FEATURE:
12656 error ("invalid feature modifier %qs in %<-march=%s%>",
12657 invalid_extension.c_str (), str);
12658 aarch64_print_hint_for_extensions (invalid_extension);
12659 break;
12660 default:
12661 gcc_unreachable ();
12664 return false;
12667 /* Validate a command-line -mtune option. Parse the cpu
12668 specified in STR and throw errors if appropriate. Put the
12669 result, if it is valid, in RES. Return whether the option is
12670 valid. */
12672 static bool
12673 aarch64_validate_mtune (const char *str, const struct processor **res)
12675 enum aarch64_parse_opt_result parse_res
12676 = aarch64_parse_tune (str, res);
12678 if (parse_res == AARCH64_PARSE_OK)
12679 return true;
12681 switch (parse_res)
12683 case AARCH64_PARSE_MISSING_ARG:
12684 error ("missing cpu name in %<-mtune=%s%>", str);
12685 break;
12686 case AARCH64_PARSE_INVALID_ARG:
12687 error ("unknown value %qs for %<-mtune%>", str);
12688 aarch64_print_hint_for_core (str);
12689 break;
12690 default:
12691 gcc_unreachable ();
12693 return false;
12696 /* Return the CPU corresponding to the enum CPU.
12697 If it doesn't specify a cpu, return the default. */
12699 static const struct processor *
12700 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12702 if (cpu != aarch64_none)
12703 return &all_cores[cpu];
12705 /* The & 0x3f is to extract the bottom 6 bits that encode the
12706 default cpu as selected by the --with-cpu GCC configure option
12707 in config.gcc.
12708 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12709 flags mechanism should be reworked to make it more sane. */
12710 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12713 /* Return the architecture corresponding to the enum ARCH.
12714 If it doesn't specify a valid architecture, return the default. */
12716 static const struct processor *
12717 aarch64_get_arch (enum aarch64_arch arch)
12719 if (arch != aarch64_no_arch)
12720 return &all_architectures[arch];
12722 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12724 return &all_architectures[cpu->arch];
12727 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12729 static poly_uint16
12730 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12732 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12733 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12734 deciding which .md file patterns to use and when deciding whether
12735 something is a legitimate address or constant. */
12736 if (value == SVE_SCALABLE || value == SVE_128)
12737 return poly_uint16 (2, 2);
12738 else
12739 return (int) value / 64;
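/* For example, -msve-vector-bits=256 yields 256 / 64 == 4 (four 64-bit
   granules per vector), whereas SVE_SCALABLE and SVE_128 both produce the
   runtime-variable poly_uint16 (2, 2), so the rest of the compiler treats
   the vector length as unknown at compile time.  */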
12742 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12743 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12744 tuning structs. In particular it must set selected_tune and
12745 aarch64_isa_flags that define the available ISA features and tuning
12746 decisions. It must also set selected_arch as this will be used to
12747 output the .arch asm tags for each function. */
12749 static void
12750 aarch64_override_options (void)
12752 uint64_t cpu_isa = 0;
12753 uint64_t arch_isa = 0;
12754 aarch64_isa_flags = 0;
12756 bool valid_cpu = true;
12757 bool valid_tune = true;
12758 bool valid_arch = true;
12760 selected_cpu = NULL;
12761 selected_arch = NULL;
12762 selected_tune = NULL;
12764 if (aarch64_branch_protection_string)
12765 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12767 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12768 If either of -march or -mtune is given, they override their
12769 respective component of -mcpu. */
12770 if (aarch64_cpu_string)
12771 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12772 &cpu_isa);
12774 if (aarch64_arch_string)
12775 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12776 &arch_isa);
12778 if (aarch64_tune_string)
12779 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12781 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12782 SUBTARGET_OVERRIDE_OPTIONS;
12783 #endif
12785 /* If the user did not specify a processor, choose the default
12786 one for them. This will be the CPU set during configuration using
12787 --with-cpu, otherwise it is "generic". */
12788 if (!selected_cpu)
12790 if (selected_arch)
12792 selected_cpu = &all_cores[selected_arch->ident];
12793 aarch64_isa_flags = arch_isa;
12794 explicit_arch = selected_arch->arch;
12796 else
12798 /* Get default configure-time CPU. */
12799 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12800 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12803 if (selected_tune)
12804 explicit_tune_core = selected_tune->ident;
12806 /* If both -mcpu and -march are specified check that they are architecturally
12807 compatible, warn if they're not and prefer the -march ISA flags. */
12808 else if (selected_arch)
12810 if (selected_arch->arch != selected_cpu->arch)
12812 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12813 all_architectures[selected_cpu->arch].name,
12814 selected_arch->name);
12816 aarch64_isa_flags = arch_isa;
12817 explicit_arch = selected_arch->arch;
12818 explicit_tune_core = selected_tune ? selected_tune->ident
12819 : selected_cpu->ident;
12821 else
12823 /* -mcpu but no -march. */
12824 aarch64_isa_flags = cpu_isa;
12825 explicit_tune_core = selected_tune ? selected_tune->ident
12826 : selected_cpu->ident;
12827 gcc_assert (selected_cpu);
12828 selected_arch = &all_architectures[selected_cpu->arch];
12829 explicit_arch = selected_arch->arch;
12832 /* Set the arch as well, as we will need it when outputting
12833 the .arch directive in assembly. */
12834 if (!selected_arch)
12836 gcc_assert (selected_cpu);
12837 selected_arch = &all_architectures[selected_cpu->arch];
12840 if (!selected_tune)
12841 selected_tune = selected_cpu;
12843 if (aarch64_enable_bti == 2)
12845 #ifdef TARGET_ENABLE_BTI
12846 aarch64_enable_bti = 1;
12847 #else
12848 aarch64_enable_bti = 0;
12849 #endif
12852 /* Return address signing is currently not supported for ILP32 targets. For
12853 LP64 targets use the configured option in the absence of a command-line
12854 option for -mbranch-protection. */
12855 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12857 #ifdef TARGET_ENABLE_PAC_RET
12858 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12859 #else
12860 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12861 #endif
12864 #ifndef HAVE_AS_MABI_OPTION
12865 /* The compiler may have been configured with 2.23.* binutils, which does
12866 not have support for ILP32. */
12867 if (TARGET_ILP32)
12868 error ("assembler does not support %<-mabi=ilp32%>");
12869 #endif
12871 /* Convert -msve-vector-bits to a VG count. */
12872 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12874 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12875 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12877 /* Make sure we properly set up the explicit options. */
12878 if ((aarch64_cpu_string && valid_cpu)
12879 || (aarch64_tune_string && valid_tune))
12880 gcc_assert (explicit_tune_core != aarch64_none);
12882 if ((aarch64_cpu_string && valid_cpu)
12883 || (aarch64_arch_string && valid_arch))
12884 gcc_assert (explicit_arch != aarch64_no_arch);
12886 /* The pass to insert speculation tracking runs before
12887 shrink-wrapping and the latter does not know how to update the
12888 tracking status. So disable it in this case. */
12889 if (aarch64_track_speculation)
12890 flag_shrink_wrap = 0;
12892 aarch64_override_options_internal (&global_options);
12894 /* Save these options as the default ones in case we push and pop them later
12895 while processing functions with potential target attributes. */
12896 target_option_default_node = target_option_current_node
12897 = build_target_option_node (&global_options);
12900 /* Implement targetm.override_options_after_change. */
12902 static void
12903 aarch64_override_options_after_change (void)
12905 aarch64_override_options_after_change_1 (&global_options);
12908 static struct machine_function *
12909 aarch64_init_machine_status (void)
12911 struct machine_function *machine;
12912 machine = ggc_cleared_alloc<machine_function> ();
12913 return machine;
12916 void
12917 aarch64_init_expanders (void)
12919 init_machine_status = aarch64_init_machine_status;
12922 /* A checking mechanism for the implementation of the various code models. */
12923 static void
12924 initialize_aarch64_code_model (struct gcc_options *opts)
12926 if (opts->x_flag_pic)
12928 switch (opts->x_aarch64_cmodel_var)
12930 case AARCH64_CMODEL_TINY:
12931 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12932 break;
12933 case AARCH64_CMODEL_SMALL:
12934 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12935 aarch64_cmodel = (flag_pic == 2
12936 ? AARCH64_CMODEL_SMALL_PIC
12937 : AARCH64_CMODEL_SMALL_SPIC);
12938 #else
12939 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12940 #endif
12941 break;
12942 case AARCH64_CMODEL_LARGE:
12943 sorry ("code model %qs with %<-f%s%>", "large",
12944 opts->x_flag_pic > 1 ? "PIC" : "pic");
12945 break;
12946 default:
12947 gcc_unreachable ();
12950 else
12951 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12954 /* Implement TARGET_OPTION_SAVE. */
12956 static void
12957 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12959 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12960 ptr->x_aarch64_branch_protection_string
12961 = opts->x_aarch64_branch_protection_string;
12964 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12965 using the information saved in PTR. */
12967 static void
12968 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12970 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12971 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12972 opts->x_explicit_arch = ptr->x_explicit_arch;
12973 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12974 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12975 opts->x_aarch64_branch_protection_string
12976 = ptr->x_aarch64_branch_protection_string;
12977 if (opts->x_aarch64_branch_protection_string)
12979 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12980 NULL);
12983 aarch64_override_options_internal (opts);
12986 /* Implement TARGET_OPTION_PRINT. */
12988 static void
12989 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12991 const struct processor *cpu
12992 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12993 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12994 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12995 std::string extension
12996 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12998 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12999 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13000 arch->name, extension.c_str ());
13003 static GTY(()) tree aarch64_previous_fndecl;
13005 void
13006 aarch64_reset_previous_fndecl (void)
13008 aarch64_previous_fndecl = NULL;
13011 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13012 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13013 make sure optab availability predicates are recomputed when necessary. */
13015 void
13016 aarch64_save_restore_target_globals (tree new_tree)
13018 if (TREE_TARGET_GLOBALS (new_tree))
13019 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13020 else if (new_tree == target_option_default_node)
13021 restore_target_globals (&default_target_globals);
13022 else
13023 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13026 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13027 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13028 of the function, if such exists. This function may be called multiple
13029 times on a single function so use aarch64_previous_fndecl to avoid
13030 setting up identical state. */
13032 static void
13033 aarch64_set_current_function (tree fndecl)
13035 if (!fndecl || fndecl == aarch64_previous_fndecl)
13036 return;
13038 tree old_tree = (aarch64_previous_fndecl
13039 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13040 : NULL_TREE);
13042 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13044 /* If current function has no attributes but the previous one did,
13045 use the default node. */
13046 if (!new_tree && old_tree)
13047 new_tree = target_option_default_node;
13049 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13050 the default have been handled by aarch64_save_restore_target_globals from
13051 aarch64_pragma_target_parse. */
13052 if (old_tree == new_tree)
13053 return;
13055 aarch64_previous_fndecl = fndecl;
13057 /* First set the target options. */
13058 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13060 aarch64_save_restore_target_globals (new_tree);
13063 /* Enum describing the various ways we can handle attributes.
13064 In many cases we can reuse the generic option handling machinery. */
13066 enum aarch64_attr_opt_type
13068 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13069 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13070 aarch64_attr_enum, /* Attribute sets an enum variable. */
13071 aarch64_attr_custom /* Attribute requires a custom handling function. */
13074 /* All the information needed to handle a target attribute.
13075 NAME is the name of the attribute.
13076 ATTR_TYPE specifies the type of behavior of the attribute as described
13077 in the definition of enum aarch64_attr_opt_type.
13078 ALLOW_NEG is true if the attribute supports a "no-" form.
13079 HANDLER is the function that takes the attribute string as an argument.
13080 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13081 OPT_NUM is the enum specifying the option that the attribute modifies.
13082 This is needed for attributes that mirror the behavior of a command-line
13083 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13084 aarch64_attr_enum. */
13086 struct aarch64_attribute_info
13088 const char *name;
13089 enum aarch64_attr_opt_type attr_type;
13090 bool allow_neg;
13091 bool (*handler) (const char *);
13092 enum opt_code opt_num;
13095 /* Handle the ARCH_STR argument to the arch= target attribute. */
13097 static bool
13098 aarch64_handle_attr_arch (const char *str)
13100 const struct processor *tmp_arch = NULL;
13101 std::string invalid_extension;
13102 enum aarch64_parse_opt_result parse_res
13103 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13105 if (parse_res == AARCH64_PARSE_OK)
13107 gcc_assert (tmp_arch);
13108 selected_arch = tmp_arch;
13109 explicit_arch = selected_arch->arch;
13110 return true;
13113 switch (parse_res)
13115 case AARCH64_PARSE_MISSING_ARG:
13116 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13117 break;
13118 case AARCH64_PARSE_INVALID_ARG:
13119 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13120 aarch64_print_hint_for_arch (str);
13121 break;
13122 case AARCH64_PARSE_INVALID_FEATURE:
13123 error ("invalid feature modifier %s of value (\"%s\") in "
13124 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13125 aarch64_print_hint_for_extensions (invalid_extension);
13126 break;
13127 default:
13128 gcc_unreachable ();
13131 return false;
13134 /* Handle the argument CPU_STR to the cpu= target attribute. */
13136 static bool
13137 aarch64_handle_attr_cpu (const char *str)
13139 const struct processor *tmp_cpu = NULL;
13140 std::string invalid_extension;
13141 enum aarch64_parse_opt_result parse_res
13142 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13144 if (parse_res == AARCH64_PARSE_OK)
13146 gcc_assert (tmp_cpu);
13147 selected_tune = tmp_cpu;
13148 explicit_tune_core = selected_tune->ident;
13150 selected_arch = &all_architectures[tmp_cpu->arch];
13151 explicit_arch = selected_arch->arch;
13152 return true;
13155 switch (parse_res)
13157 case AARCH64_PARSE_MISSING_ARG:
13158 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13159 break;
13160 case AARCH64_PARSE_INVALID_ARG:
13161 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13162 aarch64_print_hint_for_core (str);
13163 break;
13164 case AARCH64_PARSE_INVALID_FEATURE:
13165 error ("invalid feature modifier %s of value (\"%s\") in "
13166 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13167 aarch64_print_hint_for_extensions (invalid_extension);
13168 break;
13169 default:
13170 gcc_unreachable ();
13173 return false;
13176 /* Handle the argument STR to the branch-protection= attribute. */
13178 static bool
13179 aarch64_handle_attr_branch_protection (const char* str)
13181 char *err_str = (char *) xmalloc (strlen (str) + 1);
13182 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13183 &err_str);
13184 bool success = false;
13185 switch (res)
13187 case AARCH64_PARSE_MISSING_ARG:
13188 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13189 " attribute");
13190 break;
13191 case AARCH64_PARSE_INVALID_ARG:
13192 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13193 "=\")%> pragma or attribute", err_str);
13194 break;
13195 case AARCH64_PARSE_OK:
13196 success = true;
13197 /* Fall through. */
13198 case AARCH64_PARSE_INVALID_FEATURE:
13199 break;
13200 default:
13201 gcc_unreachable ();
13203 free (err_str);
13204 return success;
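/* Illustrative usage (hypothetical function name, not part of this file):
   the handler above accepts the same strings as -mbranch-protection, e.g.

     __attribute__ ((target ("branch-protection=standard")))
     void sensitive_fn (void);

   An unrecognised protection type is reported via the error above and the
   whole attribute is rejected.  */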
13207 /* Handle the argument STR to the tune= target attribute. */
13209 static bool
13210 aarch64_handle_attr_tune (const char *str)
13212 const struct processor *tmp_tune = NULL;
13213 enum aarch64_parse_opt_result parse_res
13214 = aarch64_parse_tune (str, &tmp_tune);
13216 if (parse_res == AARCH64_PARSE_OK)
13218 gcc_assert (tmp_tune);
13219 selected_tune = tmp_tune;
13220 explicit_tune_core = selected_tune->ident;
13221 return true;
13224 switch (parse_res)
13226 case AARCH64_PARSE_INVALID_ARG:
13227 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13228 aarch64_print_hint_for_core (str);
13229 break;
13230 default:
13231 gcc_unreachable ();
13234 return false;
13237 /* Parse an architecture extensions target attribute string specified in STR.
13238 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13239 if successful. Update aarch64_isa_flags to reflect the ISA features
13240 modified. */
13242 static bool
13243 aarch64_handle_attr_isa_flags (char *str)
13245 enum aarch64_parse_opt_result parse_res;
13246 uint64_t isa_flags = aarch64_isa_flags;
13248 /* We allow "+nothing" at the beginning to clear out all architectural
13249 features if the user wants to handpick specific features. */
13250 if (strncmp ("+nothing", str, 8) == 0)
13252 isa_flags = 0;
13253 str += 8;
13256 std::string invalid_extension;
13257 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13259 if (parse_res == AARCH64_PARSE_OK)
13261 aarch64_isa_flags = isa_flags;
13262 return true;
13265 switch (parse_res)
13267 case AARCH64_PARSE_MISSING_ARG:
13268 error ("missing value in %<target()%> pragma or attribute");
13269 break;
13271 case AARCH64_PARSE_INVALID_FEATURE:
13272 error ("invalid feature modifier %s of value (\"%s\") in "
13273 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13274 break;
13276 default:
13277 gcc_unreachable ();
13280 return false;
13283 /* The target attributes that we support. On top of these we also support just
13284 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13285 handled explicitly in aarch64_process_one_target_attr. */
13287 static const struct aarch64_attribute_info aarch64_attributes[] =
13289 { "general-regs-only", aarch64_attr_mask, false, NULL,
13290 OPT_mgeneral_regs_only },
13291 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13292 OPT_mfix_cortex_a53_835769 },
13293 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13294 OPT_mfix_cortex_a53_843419 },
13295 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13296 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13297 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13298 OPT_momit_leaf_frame_pointer },
13299 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13300 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13301 OPT_march_ },
13302 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13303 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13304 OPT_mtune_ },
13305 { "branch-protection", aarch64_attr_custom, false,
13306 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13307 { "sign-return-address", aarch64_attr_enum, false, NULL,
13308 OPT_msign_return_address_ },
13309 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
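/* Illustrative usage (hypothetical code, not part of this file): entries in
   the table above correspond to attribute strings such as

     __attribute__ ((target ("arch=armv8-a+crc")))           // aarch64_handle_attr_arch
     __attribute__ ((target ("tune=cortex-a72")))            // aarch64_handle_attr_tune
     __attribute__ ((target ("no-omit-leaf-frame-pointer"))) // negated boolean entry

   Bare ISA strings such as "+crc" bypass the table and are handled directly
   in aarch64_process_one_target_attr below.  */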
13312 /* Parse ARG_STR which contains the definition of one target attribute.
13313 Show appropriate errors if any or return true if the attribute is valid. */
13315 static bool
13316 aarch64_process_one_target_attr (char *arg_str)
13318 bool invert = false;
13320 size_t len = strlen (arg_str);
13322 if (len == 0)
13324 error ("malformed %<target()%> pragma or attribute");
13325 return false;
13328 char *str_to_check = (char *) alloca (len + 1);
13329 strcpy (str_to_check, arg_str);
13331 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13332 It is easier to detect and handle it explicitly here rather than going
13333 through the machinery for the rest of the target attributes in this
13334 function. */
13335 if (*str_to_check == '+')
13336 return aarch64_handle_attr_isa_flags (str_to_check);
13338 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13340 invert = true;
13341 str_to_check += 3;
13343 char *arg = strchr (str_to_check, '=');
13345 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13346 and point ARG to "foo". */
13347 if (arg)
13349 *arg = '\0';
13350 arg++;
13352 const struct aarch64_attribute_info *p_attr;
13353 bool found = false;
13354 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13356 /* If the names don't match up, or the user has given an argument
13357 to an attribute that doesn't accept one, or didn't give an argument
13358 to an attribute that expects one, fail to match. */
13359 if (strcmp (str_to_check, p_attr->name) != 0)
13360 continue;
13362 found = true;
13363 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13364 || p_attr->attr_type == aarch64_attr_enum;
13366 if (attr_need_arg_p ^ (arg != NULL))
13368 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13369 return false;
13372 /* If the name matches but the attribute does not allow "no-" versions
13373 then we can't match. */
13374 if (invert && !p_attr->allow_neg)
13376 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13377 return false;
13380 switch (p_attr->attr_type)
13382 /* Has a custom handler registered.
13383 For example, cpu=, arch=, tune=. */
13384 case aarch64_attr_custom:
13385 gcc_assert (p_attr->handler);
13386 if (!p_attr->handler (arg))
13387 return false;
13388 break;
13390 /* Either set or unset a boolean option. */
13391 case aarch64_attr_bool:
13393 struct cl_decoded_option decoded;
13395 generate_option (p_attr->opt_num, NULL, !invert,
13396 CL_TARGET, &decoded);
13397 aarch64_handle_option (&global_options, &global_options_set,
13398 &decoded, input_location);
13399 break;
13401 /* Set or unset a bit in the target_flags. aarch64_handle_option
13402 should know what mask to apply given the option number. */
13403 case aarch64_attr_mask:
13405 struct cl_decoded_option decoded;
13406 /* We only need to specify the option number.
13407 aarch64_handle_option will know which mask to apply. */
13408 decoded.opt_index = p_attr->opt_num;
13409 decoded.value = !invert;
13410 aarch64_handle_option (&global_options, &global_options_set,
13411 &decoded, input_location);
13412 break;
13414 /* Use the option setting machinery to set an option to an enum. */
13415 case aarch64_attr_enum:
13417 gcc_assert (arg);
13418 bool valid;
13419 int value;
13420 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13421 &value, CL_TARGET);
13422 if (valid)
13424 set_option (&global_options, NULL, p_attr->opt_num, value,
13425 NULL, DK_UNSPECIFIED, input_location,
13426 global_dc);
13428 else
13430 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13432 break;
13434 default:
13435 gcc_unreachable ();
13439 /* If we reached here we either have found an attribute and validated
13440 it or didn't match any. If we matched an attribute but its arguments
13441 were malformed we will have returned false already. */
13442 return found;
13445 /* Count how many times the character C appears in
13446 NULL-terminated string STR. */
13448 static unsigned int
13449 num_occurences_in_str (char c, char *str)
13451 unsigned int res = 0;
13452 while (*str != '\0')
13454 if (*str == c)
13455 res++;
13457 str++;
13460 return res;
13463 /* Parse the tree in ARGS that contains the target attribute information
13464 and update the global target options space. */
13466 bool
13467 aarch64_process_target_attr (tree args)
13469 if (TREE_CODE (args) == TREE_LIST)
13473 tree head = TREE_VALUE (args);
13474 if (head)
13476 if (!aarch64_process_target_attr (head))
13477 return false;
13479 args = TREE_CHAIN (args);
13480 } while (args);
13482 return true;
13485 if (TREE_CODE (args) != STRING_CST)
13487 error ("attribute %<target%> argument not a string");
13488 return false;
13491 size_t len = strlen (TREE_STRING_POINTER (args));
13492 char *str_to_check = (char *) alloca (len + 1);
13493 strcpy (str_to_check, TREE_STRING_POINTER (args));
13495 if (len == 0)
13497 error ("malformed %<target()%> pragma or attribute");
13498 return false;
13501 /* Used to catch empty strings between commas, e.g.
13502 attribute ((target ("attr1,,attr2"))). */
13503 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13505 /* Handle multiple target attributes separated by ','. */
13506 char *token = strtok_r (str_to_check, ",", &str_to_check);
13508 unsigned int num_attrs = 0;
13509 while (token)
13511 num_attrs++;
13512 if (!aarch64_process_one_target_attr (token))
13514 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13515 return false;
13518 token = strtok_r (NULL, ",", &str_to_check);
13521 if (num_attrs != num_commas + 1)
13523 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13524 return false;
13527 return true;
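/* Illustrative examples (hypothetical): the loop above accepts a
   comma-separated list such as

     __attribute__ ((target ("cpu=cortex-a57,strict-align")))

   whereas an empty token, as in target ("strict-align,,cmodel=small"),
   leaves NUM_ATTRS smaller than NUM_COMMAS + 1 and the whole string is
   rejected as malformed.  */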
13530 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13531 process attribute ((target ("..."))). */
13533 static bool
13534 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13536 struct cl_target_option cur_target;
13537 bool ret;
13538 tree old_optimize;
13539 tree new_target, new_optimize;
13540 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13542 /* If what we're processing is the current pragma string then the
13543 target option node is already stored in target_option_current_node
13544 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13545 having to re-parse the string. This is especially useful to keep
13546 arm_neon.h compile times down since that header contains a lot
13547 of intrinsics enclosed in pragmas. */
13548 if (!existing_target && args == current_target_pragma)
13550 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13551 return true;
13553 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13555 old_optimize = build_optimization_node (&global_options);
13556 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13558 /* If the function changed the optimization levels as well as setting
13559 target options, start with the optimizations specified. */
13560 if (func_optimize && func_optimize != old_optimize)
13561 cl_optimization_restore (&global_options,
13562 TREE_OPTIMIZATION (func_optimize));
13564 /* Save the current target options to restore at the end. */
13565 cl_target_option_save (&cur_target, &global_options);
13567 /* If fndecl already has some target attributes applied to it, unpack
13568 them so that we add this attribute on top of them, rather than
13569 overwriting them. */
13570 if (existing_target)
13572 struct cl_target_option *existing_options
13573 = TREE_TARGET_OPTION (existing_target);
13575 if (existing_options)
13576 cl_target_option_restore (&global_options, existing_options);
13578 else
13579 cl_target_option_restore (&global_options,
13580 TREE_TARGET_OPTION (target_option_current_node));
13582 ret = aarch64_process_target_attr (args);
13584 /* Set up any additional state. */
13585 if (ret)
13587 aarch64_override_options_internal (&global_options);
13588 /* Initialize SIMD builtins if we haven't already.
13589 Set current_target_pragma to NULL for the duration so that
13590 the builtin initialization code doesn't try to tag the functions
13591 being built with the attributes specified by any current pragma, thus
13592 going into an infinite recursion. */
13593 if (TARGET_SIMD)
13595 tree saved_current_target_pragma = current_target_pragma;
13596 current_target_pragma = NULL;
13597 aarch64_init_simd_builtins ();
13598 current_target_pragma = saved_current_target_pragma;
13600 new_target = build_target_option_node (&global_options);
13602 else
13603 new_target = NULL;
13605 new_optimize = build_optimization_node (&global_options);
13607 if (fndecl && ret)
13609 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13611 if (old_optimize != new_optimize)
13612 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13615 cl_target_option_restore (&global_options, &cur_target);
13617 if (old_optimize != new_optimize)
13618 cl_optimization_restore (&global_options,
13619 TREE_OPTIMIZATION (old_optimize));
13620 return ret;
13623 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13624 tri-bool options (yes, no, don't care) and the default value is
13625 DEF, determine whether to reject inlining. */
13627 static bool
13628 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13629 int dont_care, int def)
13631 /* If the callee doesn't care, always allow inlining. */
13632 if (callee == dont_care)
13633 return true;
13635 /* If the caller doesn't care, always allow inlining. */
13636 if (caller == dont_care)
13637 return true;
13639 /* Otherwise, allow inlining if either the callee and caller values
13640 agree, or if the callee is using the default value. */
13641 return (callee == caller || callee == def);
13644 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13645 to inline CALLEE into CALLER based on target-specific info.
13646 Make sure that the caller and callee have compatible architectural
13647 features. Then go through the other possible target attributes
13648 and see if they can block inlining. Try not to reject always_inline
13649 callees unless they are incompatible architecturally. */
13651 static bool
13652 aarch64_can_inline_p (tree caller, tree callee)
13654 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13655 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13657 struct cl_target_option *caller_opts
13658 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13659 : target_option_default_node);
13661 struct cl_target_option *callee_opts
13662 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13663 : target_option_default_node);
13665 /* Callee's ISA flags should be a subset of the caller's. */
13666 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13667 != callee_opts->x_aarch64_isa_flags)
13668 return false;
13670 /* Allow inlining of non-strict-aligned functions into strict-aligned
13671 ones. */
13672 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13673 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13674 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13675 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13676 return false;
13678 bool always_inline = lookup_attribute ("always_inline",
13679 DECL_ATTRIBUTES (callee));
13681 /* If the architectural features match up and the callee is always_inline
13682 then the other attributes don't matter. */
13683 if (always_inline)
13684 return true;
13686 if (caller_opts->x_aarch64_cmodel_var
13687 != callee_opts->x_aarch64_cmodel_var)
13688 return false;
13690 if (caller_opts->x_aarch64_tls_dialect
13691 != callee_opts->x_aarch64_tls_dialect)
13692 return false;
13694 /* Honour explicit requests to workaround errata. */
13695 if (!aarch64_tribools_ok_for_inlining_p (
13696 caller_opts->x_aarch64_fix_a53_err835769,
13697 callee_opts->x_aarch64_fix_a53_err835769,
13698 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13699 return false;
13701 if (!aarch64_tribools_ok_for_inlining_p (
13702 caller_opts->x_aarch64_fix_a53_err843419,
13703 callee_opts->x_aarch64_fix_a53_err843419,
13704 2, TARGET_FIX_ERR_A53_843419))
13705 return false;
13707 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13708 caller and callee and they don't match up, reject inlining. */
13709 if (!aarch64_tribools_ok_for_inlining_p (
13710 caller_opts->x_flag_omit_leaf_frame_pointer,
13711 callee_opts->x_flag_omit_leaf_frame_pointer,
13712 2, 1))
13713 return false;
13715 /* If the callee has specific tuning overrides, respect them. */
13716 if (callee_opts->x_aarch64_override_tune_string != NULL
13717 && caller_opts->x_aarch64_override_tune_string == NULL)
13718 return false;
13720 /* If the user specified tuning override strings for the
13721 caller and callee and they don't match up, reject inlining.
13722 We just do a string compare here, we don't analyze the meaning
13723 of the string, as it would be too costly for little gain. */
13724 if (callee_opts->x_aarch64_override_tune_string
13725 && caller_opts->x_aarch64_override_tune_string
13726 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13727 caller_opts->x_aarch64_override_tune_string) != 0))
13728 return false;
13730 return true;
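/* Illustrative example (hypothetical declarations, not from this file) of
   the ISA-subset rule checked above: a callee whose ISA flags are a subset
   of the caller's can be inlined, but not the reverse.

     __attribute__ ((target ("+sve"))) void caller_sve (void);
     void callee_base (void);     // baseline ISA: may be inlined into caller_sve
     __attribute__ ((target ("+sve"))) void callee_sve (void);
                                  // rejected when the caller lacks SVE  */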
13733 /* Return true if SYMBOL_REF X binds locally. */
13735 static bool
13736 aarch64_symbol_binds_local_p (const_rtx x)
13738 return (SYMBOL_REF_DECL (x)
13739 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13740 : SYMBOL_REF_LOCAL_P (x));
13743 /* Return true if SYMBOL_REF X is thread-local.  */
13744 static bool
13745 aarch64_tls_symbol_p (rtx x)
13747 if (! TARGET_HAVE_TLS)
13748 return false;
13750 if (GET_CODE (x) != SYMBOL_REF)
13751 return false;
13753 return SYMBOL_REF_TLS_MODEL (x) != 0;
13756 /* Classify a TLS symbol into one of the TLS kinds. */
13757 enum aarch64_symbol_type
13758 aarch64_classify_tls_symbol (rtx x)
13760 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13762 switch (tls_kind)
13764 case TLS_MODEL_GLOBAL_DYNAMIC:
13765 case TLS_MODEL_LOCAL_DYNAMIC:
13766 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13768 case TLS_MODEL_INITIAL_EXEC:
13769 switch (aarch64_cmodel)
13771 case AARCH64_CMODEL_TINY:
13772 case AARCH64_CMODEL_TINY_PIC:
13773 return SYMBOL_TINY_TLSIE;
13774 default:
13775 return SYMBOL_SMALL_TLSIE;
13778 case TLS_MODEL_LOCAL_EXEC:
13779 if (aarch64_tls_size == 12)
13780 return SYMBOL_TLSLE12;
13781 else if (aarch64_tls_size == 24)
13782 return SYMBOL_TLSLE24;
13783 else if (aarch64_tls_size == 32)
13784 return SYMBOL_TLSLE32;
13785 else if (aarch64_tls_size == 48)
13786 return SYMBOL_TLSLE48;
13787 else
13788 gcc_unreachable ();
13790 case TLS_MODEL_EMULATED:
13791 case TLS_MODEL_NONE:
13792 return SYMBOL_FORCE_TO_MEM;
13794 default:
13795 gcc_unreachable ();
13799 /* Return the correct method for accessing X + OFFSET, where X is either
13800 a SYMBOL_REF or LABEL_REF. */
13802 enum aarch64_symbol_type
13803 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13805 if (GET_CODE (x) == LABEL_REF)
13807 switch (aarch64_cmodel)
13809 case AARCH64_CMODEL_LARGE:
13810 return SYMBOL_FORCE_TO_MEM;
13812 case AARCH64_CMODEL_TINY_PIC:
13813 case AARCH64_CMODEL_TINY:
13814 return SYMBOL_TINY_ABSOLUTE;
13816 case AARCH64_CMODEL_SMALL_SPIC:
13817 case AARCH64_CMODEL_SMALL_PIC:
13818 case AARCH64_CMODEL_SMALL:
13819 return SYMBOL_SMALL_ABSOLUTE;
13821 default:
13822 gcc_unreachable ();
13826 if (GET_CODE (x) == SYMBOL_REF)
13828 if (aarch64_tls_symbol_p (x))
13829 return aarch64_classify_tls_symbol (x);
13831 switch (aarch64_cmodel)
13833 case AARCH64_CMODEL_TINY:
13834 /* When we retrieve symbol + offset address, we have to make sure
13835 the offset does not cause overflow of the final address. But
13836 we have no way of knowing the address of symbol at compile time
13837 so we can't accurately say if the distance between the PC and
13838 symbol + offset is outside the addressable range of +/-1M in the
13839 TINY code model.  So we rely on images not being greater than 1M,
13840 cap the offset at 1M, and load anything beyond that using an
13841 alternative mechanism.  Furthermore, if the symbol is a weak
13842 reference to something that isn't known to resolve to a symbol
13843 in this module, then force it to memory. */
13844 if ((SYMBOL_REF_WEAK (x)
13845 && !aarch64_symbol_binds_local_p (x))
13846 || !IN_RANGE (offset, -1048575, 1048575))
13847 return SYMBOL_FORCE_TO_MEM;
13848 return SYMBOL_TINY_ABSOLUTE;
13850 case AARCH64_CMODEL_SMALL:
13851 /* Same reasoning as the tiny code model, but the offset cap here is
13852 4G. */
13853 if ((SYMBOL_REF_WEAK (x)
13854 && !aarch64_symbol_binds_local_p (x))
13855 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13856 HOST_WIDE_INT_C (4294967264)))
13857 return SYMBOL_FORCE_TO_MEM;
13858 return SYMBOL_SMALL_ABSOLUTE;
13860 case AARCH64_CMODEL_TINY_PIC:
13861 if (!aarch64_symbol_binds_local_p (x))
13862 return SYMBOL_TINY_GOT;
13863 return SYMBOL_TINY_ABSOLUTE;
13865 case AARCH64_CMODEL_SMALL_SPIC:
13866 case AARCH64_CMODEL_SMALL_PIC:
13867 if (!aarch64_symbol_binds_local_p (x))
13868 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13869 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13870 return SYMBOL_SMALL_ABSOLUTE;
13872 case AARCH64_CMODEL_LARGE:
13873 /* This is alright even in PIC code as the constant
13874 pool reference is always PC relative and within
13875 the same translation unit. */
13876 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13877 return SYMBOL_SMALL_ABSOLUTE;
13878 else
13879 return SYMBOL_FORCE_TO_MEM;
13881 default:
13882 gcc_unreachable ();
13886 /* By default push everything into the constant pool. */
13887 return SYMBOL_FORCE_TO_MEM;
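/* Worked example (illustrative): under -mcmodel=tiny, a reference such as
   "array + 0x200000" (a 2M offset) fails the +/-1M range check above and is
   classified as SYMBOL_FORCE_TO_MEM, while "array + 0x1000" remains
   SYMBOL_TINY_ABSOLUTE, assuming "array" is not a weak reference and binds
   locally.  */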
13890 bool
13891 aarch64_constant_address_p (rtx x)
13893 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13896 bool
13897 aarch64_legitimate_pic_operand_p (rtx x)
13899 if (GET_CODE (x) == SYMBOL_REF
13900 || (GET_CODE (x) == CONST
13901 && GET_CODE (XEXP (x, 0)) == PLUS
13902 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13903 return false;
13905 return true;
13908 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13909 that should be rematerialized rather than spilled. */
13911 static bool
13912 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13914 /* Support CSE and rematerialization of common constants. */
13915 if (CONST_INT_P (x)
13916 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13917 || GET_CODE (x) == CONST_VECTOR)
13918 return true;
13920 /* Do not allow vector struct mode constants for Advanced SIMD.
13921 We could support 0 and -1 easily, but they need support in
13922 aarch64-simd.md. */
13923 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13924 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13925 return false;
13927 /* Only accept variable-length vector constants if they can be
13928 handled directly.
13930 ??? It would be possible to handle rematerialization of other
13931 constants via secondary reloads. */
13932 if (vec_flags & VEC_ANY_SVE)
13933 return aarch64_simd_valid_immediate (x, NULL);
13935 if (GET_CODE (x) == HIGH)
13936 x = XEXP (x, 0);
13938 /* Accept polynomial constants that can be calculated by using the
13939 destination of a move as the sole temporary. Constants that
13940 require a second temporary cannot be rematerialized (they can't be
13941 forced to memory and also aren't legitimate constants). */
13942 poly_int64 offset;
13943 if (poly_int_rtx_p (x, &offset))
13944 return aarch64_offset_temporaries (false, offset) <= 1;
13946 /* If an offset is being added to something else, we need to allow the
13947 base to be moved into the destination register, meaning that there
13948 are no free temporaries for the offset. */
13949 x = strip_offset (x, &offset);
13950 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13951 return false;
13953 /* Do not allow const (plus (anchor_symbol, const_int)). */
13954 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13955 return false;
13957 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13958 so spilling them is better than rematerialization. */
13959 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13960 return true;
13962 /* Label references are always constant. */
13963 if (GET_CODE (x) == LABEL_REF)
13964 return true;
13966 return false;
13969 static rtx
13970 aarch64_load_tp (rtx target)
13972 if (!target
13973 || GET_MODE (target) != Pmode
13974 || !register_operand (target, Pmode))
13975 target = gen_reg_rtx (Pmode);
13977 /* Can return in any reg. */
13978 emit_insn (gen_aarch64_load_tp_hard (target));
13979 return target;
13982 /* On AAPCS systems, this is the "struct __va_list". */
13983 static GTY(()) tree va_list_type;
13985 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13986 Return the type to use as __builtin_va_list.
13988 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13990 struct __va_list
13992 void *__stack;
13993 void *__gr_top;
13994 void *__vr_top;
13995 int __gr_offs;
13996 int __vr_offs;
13997 }; */
13999 static tree
14000 aarch64_build_builtin_va_list (void)
14002 tree va_list_name;
14003 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14005 /* Create the type. */
14006 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14007 /* Give it the required name. */
14008 va_list_name = build_decl (BUILTINS_LOCATION,
14009 TYPE_DECL,
14010 get_identifier ("__va_list"),
14011 va_list_type);
14012 DECL_ARTIFICIAL (va_list_name) = 1;
14013 TYPE_NAME (va_list_type) = va_list_name;
14014 TYPE_STUB_DECL (va_list_type) = va_list_name;
14016 /* Create the fields. */
14017 f_stack = build_decl (BUILTINS_LOCATION,
14018 FIELD_DECL, get_identifier ("__stack"),
14019 ptr_type_node);
14020 f_grtop = build_decl (BUILTINS_LOCATION,
14021 FIELD_DECL, get_identifier ("__gr_top"),
14022 ptr_type_node);
14023 f_vrtop = build_decl (BUILTINS_LOCATION,
14024 FIELD_DECL, get_identifier ("__vr_top"),
14025 ptr_type_node);
14026 f_groff = build_decl (BUILTINS_LOCATION,
14027 FIELD_DECL, get_identifier ("__gr_offs"),
14028 integer_type_node);
14029 f_vroff = build_decl (BUILTINS_LOCATION,
14030 FIELD_DECL, get_identifier ("__vr_offs"),
14031 integer_type_node);
14033 /* Tell the tree-stdarg pass about our internal offset fields.
14034 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14035 purposes, to identify whether the code is updating the va_list internal
14036 offset fields in an irregular way. */
14037 va_list_gpr_counter_field = f_groff;
14038 va_list_fpr_counter_field = f_vroff;
14040 DECL_ARTIFICIAL (f_stack) = 1;
14041 DECL_ARTIFICIAL (f_grtop) = 1;
14042 DECL_ARTIFICIAL (f_vrtop) = 1;
14043 DECL_ARTIFICIAL (f_groff) = 1;
14044 DECL_ARTIFICIAL (f_vroff) = 1;
14046 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14047 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14048 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14049 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14050 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14052 TYPE_FIELDS (va_list_type) = f_stack;
14053 DECL_CHAIN (f_stack) = f_grtop;
14054 DECL_CHAIN (f_grtop) = f_vrtop;
14055 DECL_CHAIN (f_vrtop) = f_groff;
14056 DECL_CHAIN (f_groff) = f_vroff;
14058 /* Compute its layout. */
14059 layout_type (va_list_type);
14061 return va_list_type;
14064 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14065 static void
14066 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14068 const CUMULATIVE_ARGS *cum;
14069 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14070 tree stack, grtop, vrtop, groff, vroff;
14071 tree t;
14072 int gr_save_area_size = cfun->va_list_gpr_size;
14073 int vr_save_area_size = cfun->va_list_fpr_size;
14074 int vr_offset;
14076 cum = &crtl->args.info;
14077 if (cfun->va_list_gpr_size)
14078 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14079 cfun->va_list_gpr_size);
14080 if (cfun->va_list_fpr_size)
14081 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14082 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14084 if (!TARGET_FLOAT)
14086 gcc_assert (cum->aapcs_nvrn == 0);
14087 vr_save_area_size = 0;
14090 f_stack = TYPE_FIELDS (va_list_type_node);
14091 f_grtop = DECL_CHAIN (f_stack);
14092 f_vrtop = DECL_CHAIN (f_grtop);
14093 f_groff = DECL_CHAIN (f_vrtop);
14094 f_vroff = DECL_CHAIN (f_groff);
14096 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14097 NULL_TREE);
14098 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14099 NULL_TREE);
14100 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14101 NULL_TREE);
14102 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14103 NULL_TREE);
14104 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14105 NULL_TREE);
14107 /* Emit code to initialize STACK, which points to the next varargs stack
14108 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14109 by named arguments. STACK is 8-byte aligned. */
14110 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14111 if (cum->aapcs_stack_size > 0)
14112 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14113 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14114 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14116 /* Emit code to initialize GRTOP, the top of the GR save area.
14117 virtual_incoming_args_rtx should have been 16 byte aligned. */
14118 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14119 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14120 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14122 /* Emit code to initialize VRTOP, the top of the VR save area.
14123 This address is gr_save_area_size bytes below GRTOP, rounded
14124 down to the next 16-byte boundary. */
14125 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14126 vr_offset = ROUND_UP (gr_save_area_size,
14127 STACK_BOUNDARY / BITS_PER_UNIT);
14129 if (vr_offset)
14130 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14131 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14132 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14134 /* Emit code to initialize GROFF, the offset from GRTOP of the
14135 next GPR argument. */
14136 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14137 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14138 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14140 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14141 of the next VR argument. */
14142 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14143 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14144 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
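/* Worked example (illustrative, assuming the tree-stdarg pass has not
   limited the save-area sizes): for "void f (int n, ...)" the only named
   argument occupies x0, so aapcs_ncrn == 1 and aapcs_nvrn == 0.  The code
   above then sets

     gr_save_area_size = (8 - 1) * 8 = 56 bytes
     vr_save_area_size = 8 * 16      = 128 bytes
     __stack  = incoming argument pointer
     __gr_top = incoming argument pointer
     __vr_top = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128.  */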
14147 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14149 static tree
14150 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14151 gimple_seq *post_p ATTRIBUTE_UNUSED)
14153 tree addr;
14154 bool indirect_p;
14155 bool is_ha; /* is HFA or HVA. */
14156 bool dw_align; /* double-word align. */
14157 machine_mode ag_mode = VOIDmode;
14158 int nregs;
14159 machine_mode mode;
14161 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14162 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14163 HOST_WIDE_INT size, rsize, adjust, align;
14164 tree t, u, cond1, cond2;
14166 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14167 if (indirect_p)
14168 type = build_pointer_type (type);
14170 mode = TYPE_MODE (type);
14172 f_stack = TYPE_FIELDS (va_list_type_node);
14173 f_grtop = DECL_CHAIN (f_stack);
14174 f_vrtop = DECL_CHAIN (f_grtop);
14175 f_groff = DECL_CHAIN (f_vrtop);
14176 f_vroff = DECL_CHAIN (f_groff);
14178 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14179 f_stack, NULL_TREE);
14180 size = int_size_in_bytes (type);
14182 bool abi_break;
14183 align
14184 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14186 dw_align = false;
14187 adjust = 0;
14188 if (aarch64_vfp_is_call_or_return_candidate (mode,
14189 type,
14190 &ag_mode,
14191 &nregs,
14192 &is_ha))
14194 /* No frontends can create types with variable-sized modes, so we
14195 shouldn't be asked to pass or return them. */
14196 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14198 /* TYPE passed in fp/simd registers. */
14199 if (!TARGET_FLOAT)
14200 aarch64_err_no_fpadvsimd (mode);
14202 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14203 unshare_expr (valist), f_vrtop, NULL_TREE);
14204 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14205 unshare_expr (valist), f_vroff, NULL_TREE);
14207 rsize = nregs * UNITS_PER_VREG;
14209 if (is_ha)
14211 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14212 adjust = UNITS_PER_VREG - ag_size;
14214 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14215 && size < UNITS_PER_VREG)
14217 adjust = UNITS_PER_VREG - size;
14220 else
14222 /* TYPE passed in general registers. */
14223 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14224 unshare_expr (valist), f_grtop, NULL_TREE);
14225 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14226 unshare_expr (valist), f_groff, NULL_TREE);
14227 rsize = ROUND_UP (size, UNITS_PER_WORD);
14228 nregs = rsize / UNITS_PER_WORD;
14230 if (align > 8)
14232 if (abi_break && warn_psabi)
14233 inform (input_location, "parameter passing for argument of type "
14234 "%qT changed in GCC 9.1", type);
14235 dw_align = true;
14238 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14239 && size < UNITS_PER_WORD)
14241 adjust = UNITS_PER_WORD - size;
14245 /* Get a local temporary for the field value. */
14246 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14248 /* Emit code to branch if off >= 0. */
14249 t = build2 (GE_EXPR, boolean_type_node, off,
14250 build_int_cst (TREE_TYPE (off), 0));
14251 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14253 if (dw_align)
14255 /* Emit: offs = (offs + 15) & -16. */
14256 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14257 build_int_cst (TREE_TYPE (off), 15));
14258 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14259 build_int_cst (TREE_TYPE (off), -16));
14260 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14262 else
14263 roundup = NULL;
14265 /* Update ap.__[g|v]r_offs */
14266 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14267 build_int_cst (TREE_TYPE (off), rsize));
14268 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14270 /* String up. */
14271 if (roundup)
14272 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14274 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14275 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14276 build_int_cst (TREE_TYPE (f_off), 0));
14277 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14279 /* String up: make sure the assignment happens before the use. */
14280 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14281 COND_EXPR_ELSE (cond1) = t;
14283 /* Prepare the trees handling the argument that is passed on the stack;
14284 the top-level node is stored in ON_STACK. */
14285 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14286 if (align > 8)
14288 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14289 t = fold_build_pointer_plus_hwi (arg, 15);
14290 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14291 build_int_cst (TREE_TYPE (t), -16));
14292 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14294 else
14295 roundup = NULL;
14296 /* Advance ap.__stack */
14297 t = fold_build_pointer_plus_hwi (arg, size + 7);
14298 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14299 build_int_cst (TREE_TYPE (t), -8));
14300 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14301 /* String up roundup and advance. */
14302 if (roundup)
14303 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14304 /* String up with arg */
14305 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14306 /* Big-endianness related address adjustment. */
14307 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14308 && size < UNITS_PER_WORD)
14310 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14311 size_int (UNITS_PER_WORD - size));
14312 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14315 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14316 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14318 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14319 t = off;
14320 if (adjust)
14321 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14322 build_int_cst (TREE_TYPE (off), adjust));
14324 t = fold_convert (sizetype, t);
14325 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14327 if (is_ha)
14329 /* type ha; // treat as "struct {ftype field[n];}"
14330 ... [computing offs]
14331 for (i = 0; i < nregs; ++i, offs += 16)
14332 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14333 return ha; */
14334 int i;
14335 tree tmp_ha, field_t, field_ptr_t;
14337 /* Declare a local variable. */
14338 tmp_ha = create_tmp_var_raw (type, "ha");
14339 gimple_add_tmp_var (tmp_ha);
14341 /* Establish the base type. */
14342 switch (ag_mode)
14344 case E_SFmode:
14345 field_t = float_type_node;
14346 field_ptr_t = float_ptr_type_node;
14347 break;
14348 case E_DFmode:
14349 field_t = double_type_node;
14350 field_ptr_t = double_ptr_type_node;
14351 break;
14352 case E_TFmode:
14353 field_t = long_double_type_node;
14354 field_ptr_t = long_double_ptr_type_node;
14355 break;
14356 case E_HFmode:
14357 field_t = aarch64_fp16_type_node;
14358 field_ptr_t = aarch64_fp16_ptr_type_node;
14359 break;
14360 case E_V2SImode:
14361 case E_V4SImode:
14363 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14364 field_t = build_vector_type_for_mode (innertype, ag_mode);
14365 field_ptr_t = build_pointer_type (field_t);
14367 break;
14368 default:
14369 gcc_assert (0);
14372 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14373 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14374 addr = t;
14375 t = fold_convert (field_ptr_t, addr);
14376 t = build2 (MODIFY_EXPR, field_t,
14377 build1 (INDIRECT_REF, field_t, tmp_ha),
14378 build1 (INDIRECT_REF, field_t, t));
14380 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14381 for (i = 1; i < nregs; ++i)
14383 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14384 u = fold_convert (field_ptr_t, addr);
14385 u = build2 (MODIFY_EXPR, field_t,
14386 build2 (MEM_REF, field_t, tmp_ha,
14387 build_int_cst (field_ptr_t,
14388 (i *
14389 int_size_in_bytes (field_t)))),
14390 build1 (INDIRECT_REF, field_t, u));
14391 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14394 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14395 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14398 COND_EXPR_ELSE (cond2) = t;
14399 addr = fold_convert (build_pointer_type (type), cond1);
14400 addr = build_va_arg_indirect_ref (addr);
14402 if (indirect_p)
14403 addr = build_va_arg_indirect_ref (addr);
14405 return addr;
14408 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14410 static void
14411 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14412 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14413 int no_rtl)
14415 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14416 CUMULATIVE_ARGS local_cum;
14417 int gr_saved = cfun->va_list_gpr_size;
14418 int vr_saved = cfun->va_list_fpr_size;
14420 /* The caller has advanced CUM up to, but not beyond, the last named
14421 argument. Advance a local copy of CUM past the last "real" named
14422 argument, to find out how many registers are left over. */
14423 local_cum = *cum;
14424 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
14426 /* Find out how many registers we need to save.
14427 Honor the tree-stdarg analysis results. */
14428 if (cfun->va_list_gpr_size)
14429 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14430 cfun->va_list_gpr_size / UNITS_PER_WORD);
14431 if (cfun->va_list_fpr_size)
14432 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14433 cfun->va_list_fpr_size / UNITS_PER_VREG);
14435 if (!TARGET_FLOAT)
14437 gcc_assert (local_cum.aapcs_nvrn == 0);
14438 vr_saved = 0;
14441 if (!no_rtl)
14443 if (gr_saved > 0)
14445 rtx ptr, mem;
14447 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14448 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14449 - gr_saved * UNITS_PER_WORD);
14450 mem = gen_frame_mem (BLKmode, ptr);
14451 set_mem_alias_set (mem, get_varargs_alias_set ());
14453 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14454 mem, gr_saved);
14456 if (vr_saved > 0)
14458 /* We can't use move_block_from_reg, because it will use
14459 the wrong mode, storing D regs only. */
14460 machine_mode mode = TImode;
14461 int off, i, vr_start;
14463 /* Set OFF to the offset from virtual_incoming_args_rtx of
14464 the first vector register. The VR save area lies below
14465 the GR one, and is aligned to 16 bytes. */
14466 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14467 STACK_BOUNDARY / BITS_PER_UNIT);
14468 off -= vr_saved * UNITS_PER_VREG;
14470 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14471 for (i = 0; i < vr_saved; ++i)
14473 rtx ptr, mem;
14475 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14476 mem = gen_frame_mem (mode, ptr);
14477 set_mem_alias_set (mem, get_varargs_alias_set ());
14478 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14479 off += UNITS_PER_VREG;
14484 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14485 any complication of having crtl->args.pretend_args_size changed. */
14486 cfun->machine->frame.saved_varargs_size
14487 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14488 STACK_BOUNDARY / BITS_PER_UNIT)
14489 + vr_saved * UNITS_PER_VREG);
14492 static void
14493 aarch64_conditional_register_usage (void)
14495 int i;
14496 if (!TARGET_FLOAT)
14498 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14500 fixed_regs[i] = 1;
14501 call_used_regs[i] = 1;
14504 if (!TARGET_SVE)
14505 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14507 fixed_regs[i] = 1;
14508 call_used_regs[i] = 1;
14511 /* When tracking speculation, we need a couple of call-clobbered registers
14512 to track the speculation state. It would be nice to just use
14513 IP0 and IP1, but currently there are numerous places that just
14514 assume these registers are free for other uses (eg pointer
14515 authentication). */
14516 if (aarch64_track_speculation)
14518 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14519 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14520 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14521 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14525 /* Walk down the type tree of TYPE counting consecutive base elements.
14526 If *MODEP is VOIDmode, then set it to the first valid floating point
14527 type. If a non-floating point type is found, or if a floating point
14528 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14529 otherwise return the count in the sub-tree. */
14530 static int
14531 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14533 machine_mode mode;
14534 HOST_WIDE_INT size;
14536 switch (TREE_CODE (type))
14538 case REAL_TYPE:
14539 mode = TYPE_MODE (type);
14540 if (mode != DFmode && mode != SFmode
14541 && mode != TFmode && mode != HFmode)
14542 return -1;
14544 if (*modep == VOIDmode)
14545 *modep = mode;
14547 if (*modep == mode)
14548 return 1;
14550 break;
14552 case COMPLEX_TYPE:
14553 mode = TYPE_MODE (TREE_TYPE (type));
14554 if (mode != DFmode && mode != SFmode
14555 && mode != TFmode && mode != HFmode)
14556 return -1;
14558 if (*modep == VOIDmode)
14559 *modep = mode;
14561 if (*modep == mode)
14562 return 2;
14564 break;
14566 case VECTOR_TYPE:
14567 /* Use V2SImode and V4SImode as representatives of all 64-bit
14568 and 128-bit vector types. */
14569 size = int_size_in_bytes (type);
14570 switch (size)
14572 case 8:
14573 mode = V2SImode;
14574 break;
14575 case 16:
14576 mode = V4SImode;
14577 break;
14578 default:
14579 return -1;
14582 if (*modep == VOIDmode)
14583 *modep = mode;
14585 /* Vector modes are considered to be opaque: two vectors are
14586 equivalent for the purposes of being homogeneous aggregates
14587 if they are the same size. */
14588 if (*modep == mode)
14589 return 1;
14591 break;
14593 case ARRAY_TYPE:
14595 int count;
14596 tree index = TYPE_DOMAIN (type);
14598 /* Can't handle incomplete types nor sizes that are not
14599 fixed. */
14600 if (!COMPLETE_TYPE_P (type)
14601 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14602 return -1;
14604 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14605 if (count == -1
14606 || !index
14607 || !TYPE_MAX_VALUE (index)
14608 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14609 || !TYPE_MIN_VALUE (index)
14610 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14611 || count < 0)
14612 return -1;
14614 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14615 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14617 /* There must be no padding. */
14618 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14619 count * GET_MODE_BITSIZE (*modep)))
14620 return -1;
14622 return count;
14625 case RECORD_TYPE:
14627 int count = 0;
14628 int sub_count;
14629 tree field;
14631 /* Can't handle incomplete types nor sizes that are not
14632 fixed. */
14633 if (!COMPLETE_TYPE_P (type)
14634 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14635 return -1;
14637 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14639 if (TREE_CODE (field) != FIELD_DECL)
14640 continue;
14642 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14643 if (sub_count < 0)
14644 return -1;
14645 count += sub_count;
14648 /* There must be no padding. */
14649 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14650 count * GET_MODE_BITSIZE (*modep)))
14651 return -1;
14653 return count;
14656 case UNION_TYPE:
14657 case QUAL_UNION_TYPE:
14659 /* These aren't very interesting except in a degenerate case. */
14660 int count = 0;
14661 int sub_count;
14662 tree field;
14664 /* Can't handle incomplete types nor sizes that are not
14665 fixed. */
14666 if (!COMPLETE_TYPE_P (type)
14667 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14668 return -1;
14670 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14672 if (TREE_CODE (field) != FIELD_DECL)
14673 continue;
14675 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14676 if (sub_count < 0)
14677 return -1;
14678 count = count > sub_count ? count : sub_count;
14681 /* There must be no padding. */
14682 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14683 count * GET_MODE_BITSIZE (*modep)))
14684 return -1;
14686 return count;
14689 default:
14690 break;
14693 return -1;
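/* Illustrative examples (hypothetical types, not from this file) of what
   the walk above computes:

     struct hfa { double x, y, z; };     // count 3, *modep == DFmode  (an HFA)
     struct hva { int32x4_t a, b; };     // two 128-bit vectors: count 2,
                                         // *modep == V4SImode  (an HVA)
     struct mix { float f; double d; };  // element modes differ: returns -1

   (int32x4_t here stands for any 128-bit vector type, e.g. from
   arm_neon.h.)  */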
14696 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14697 type as described in AAPCS64 \S 4.1.2.
14699 See the comment above aarch64_composite_type_p for the notes on MODE. */
14701 static bool
14702 aarch64_short_vector_p (const_tree type,
14703 machine_mode mode)
14705 poly_int64 size = -1;
14707 if (type && TREE_CODE (type) == VECTOR_TYPE)
14708 size = int_size_in_bytes (type);
14709 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14710 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14711 size = GET_MODE_SIZE (mode);
14713 return known_eq (size, 8) || known_eq (size, 16);
14716 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14717 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14718 array types. The C99 floating-point complex types are also considered
14719 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14720 types, which are GCC extensions and out of the scope of AAPCS64, are
14721 treated as composite types here as well.
14723 Note that MODE itself is not sufficient in determining whether a type
14724 is such a composite type or not. This is because
14725 stor-layout.c:compute_record_mode may have already changed the MODE
14726 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14727 structure with only one field may have its MODE set to the mode of the
14728 field. Also an integer mode whose size matches the size of the
14729 RECORD_TYPE type may be used to substitute the original mode
14730 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14731 solely relied on. */
14733 static bool
14734 aarch64_composite_type_p (const_tree type,
14735 machine_mode mode)
14737 if (aarch64_short_vector_p (type, mode))
14738 return false;
14740 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14741 return true;
14743 if (mode == BLKmode
14744 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14745 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14746 return true;
14748 return false;
14751 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14752 shall be passed or returned in simd/fp register(s) (providing these
14753 parameter passing registers are available).
14755 Upon successful return, *COUNT returns the number of needed registers,
14756 *BASE_MODE returns the mode of the individual register and when IS_HAF
14757 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14758 floating-point aggregate or a homogeneous short-vector aggregate. */
14760 static bool
14761 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14762 const_tree type,
14763 machine_mode *base_mode,
14764 int *count,
14765 bool *is_ha)
14767 machine_mode new_mode = VOIDmode;
14768 bool composite_p = aarch64_composite_type_p (type, mode);
14770 if (is_ha != NULL) *is_ha = false;
14772 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14773 || aarch64_short_vector_p (type, mode))
14775 *count = 1;
14776 new_mode = mode;
14778 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14780 if (is_ha != NULL) *is_ha = true;
14781 *count = 2;
14782 new_mode = GET_MODE_INNER (mode);
14784 else if (type && composite_p)
14786 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14788 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14790 if (is_ha != NULL) *is_ha = true;
14791 *count = ag_count;
14793 else
14794 return false;
14796 else
14797 return false;
14799 *base_mode = new_mode;
14800 return true;
14803 /* Implement TARGET_STRUCT_VALUE_RTX. */
14805 static rtx
14806 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14807 int incoming ATTRIBUTE_UNUSED)
14809 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14812 /* Implements target hook vector_mode_supported_p. */
14813 static bool
14814 aarch64_vector_mode_supported_p (machine_mode mode)
14816 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14817 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14820 /* Return the full-width SVE vector mode for element mode MODE, if one
14821 exists. */
14822 opt_machine_mode
14823 aarch64_full_sve_mode (scalar_mode mode)
14825 switch (mode)
14827 case E_DFmode:
14828 return VNx2DFmode;
14829 case E_SFmode:
14830 return VNx4SFmode;
14831 case E_HFmode:
14832 return VNx8HFmode;
14833 case E_DImode:
14834 return VNx2DImode;
14835 case E_SImode:
14836 return VNx4SImode;
14837 case E_HImode:
14838 return VNx8HImode;
14839 case E_QImode:
14840 return VNx16QImode;
14841 default:
14842 return opt_machine_mode ();
14846 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14847 if it exists. */
14848 opt_machine_mode
14849 aarch64_vq_mode (scalar_mode mode)
14851 switch (mode)
14853 case E_DFmode:
14854 return V2DFmode;
14855 case E_SFmode:
14856 return V4SFmode;
14857 case E_HFmode:
14858 return V8HFmode;
14859 case E_SImode:
14860 return V4SImode;
14861 case E_HImode:
14862 return V8HImode;
14863 case E_QImode:
14864 return V16QImode;
14865 case E_DImode:
14866 return V2DImode;
14867 default:
14868 return opt_machine_mode ();
14872 /* Return appropriate SIMD container
14873 for MODE within a vector of WIDTH bits. */
14874 static machine_mode
14875 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14877 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14878 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14880 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14881 if (TARGET_SIMD)
14883 if (known_eq (width, 128))
14884 return aarch64_vq_mode (mode).else_mode (word_mode);
14885 else
14886 switch (mode)
14888 case E_SFmode:
14889 return V2SFmode;
14890 case E_HFmode:
14891 return V4HFmode;
14892 case E_SImode:
14893 return V2SImode;
14894 case E_HImode:
14895 return V4HImode;
14896 case E_QImode:
14897 return V8QImode;
14898 default:
14899 break;
14902 return word_mode;
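/* Illustrative mappings implied by the code above: with TARGET_SVE and
   WIDTH == BITS_PER_SVE_VECTOR, SFmode maps to VNx4SFmode; with
   WIDTH == 128, SFmode -> V4SFmode and HImode -> V8HImode via
   aarch64_vq_mode; with WIDTH == 64, SFmode -> V2SFmode and
   QImode -> V8QImode.  Anything unhandled falls back to word_mode.  */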
14905 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14906 static machine_mode
14907 aarch64_preferred_simd_mode (scalar_mode mode)
14909 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14910 return aarch64_simd_container_mode (mode, bits);
14913 /* Return a list of possible vector sizes for the vectorizer
14914 to iterate over. */
14915 static void
14916 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14918 if (TARGET_SVE)
14919 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14920 sizes->safe_push (16);
14921 sizes->safe_push (8);
14924 /* Implement TARGET_MANGLE_TYPE. */
14926 static const char *
14927 aarch64_mangle_type (const_tree type)
14929 /* The AArch64 ABI documents say that "__va_list" has to be
14930 mangled as if it is in the "std" namespace. */
14931 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14932 return "St9__va_list";
14934 /* Half-precision float. */
14935 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14936 return "Dh";
14938 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14939 builtin types. */
14940 if (TYPE_NAME (type) != NULL)
14941 return aarch64_mangle_builtin_type (type);
14943 /* Use the default mangling. */
14944 return NULL;
14947 /* Find the first rtx_insn before insn that will generate an assembly
14948 instruction. */
14950 static rtx_insn *
14951 aarch64_prev_real_insn (rtx_insn *insn)
14953 if (!insn)
14954 return NULL;
14958 insn = prev_real_insn (insn);
14960 while (insn && recog_memoized (insn) < 0);
14962 return insn;
14965 static bool
14966 is_madd_op (enum attr_type t1)
14968 unsigned int i;
14969 /* A number of these may be AArch32 only. */
14970 enum attr_type mlatypes[] = {
14971 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14972 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14973 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14976 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14978 if (t1 == mlatypes[i])
14979 return true;
14982 return false;
14985 /* Check if there is a register dependency between a load and the insn
14986 for which we hold recog_data. */
14988 static bool
14989 dep_between_memop_and_curr (rtx memop)
14991 rtx load_reg;
14992 int opno;
14994 gcc_assert (GET_CODE (memop) == SET);
14996 if (!REG_P (SET_DEST (memop)))
14997 return false;
14999 load_reg = SET_DEST (memop);
15000 for (opno = 1; opno < recog_data.n_operands; opno++)
15002 rtx operand = recog_data.operand[opno];
15003 if (REG_P (operand)
15004 && reg_overlap_mentioned_p (load_reg, operand))
15005 return true;
15008 return false;
15012 /* When working around the Cortex-A53 erratum 835769,
15013 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15014 instruction and has a preceding memory instruction such that a NOP
15015 should be inserted between them. */
15017 bool
15018 aarch64_madd_needs_nop (rtx_insn* insn)
15020 enum attr_type attr_type;
15021 rtx_insn *prev;
15022 rtx body;
15024 if (!TARGET_FIX_ERR_A53_835769)
15025 return false;
15027 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15028 return false;
15030 attr_type = get_attr_type (insn);
15031 if (!is_madd_op (attr_type))
15032 return false;
15034 prev = aarch64_prev_real_insn (insn);
15035 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15036 Restore recog state to INSN to avoid state corruption. */
15037 extract_constrain_insn_cached (insn);
15039 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15040 return false;
15042 body = single_set (prev);
15044 /* If the previous insn is a memory op and there is no dependency between
15045 it and the DImode madd, emit a NOP between them. If body is NULL then we
15046 have a complex memory operation, probably a load/store pair.
15047 Be conservative for now and emit a NOP. */
15048 if (GET_MODE (recog_data.operand[0]) == DImode
15049 && (!body || !dep_between_memop_and_curr (body)))
15050 return true;
15052 return false;
15057 /* Implement FINAL_PRESCAN_INSN. */
15059 void
15060 aarch64_final_prescan_insn (rtx_insn *insn)
15062 if (aarch64_madd_needs_nop (insn))
15063 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
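/* Illustrative sketch of the workaround (assumed example):
     ldr  x1, [x2]
     madd x0, x3, x4, x5
   is emitted as
     ldr  x1, [x2]
     nop  // between mem op and mult-accumulate
     madd x0, x3, x4, x5
   because the multiply-accumulate is 64-bit and has no register dependency
   on the preceding memory operation, which is the condition checked above.  */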
15067 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15068 instruction. */
15070 bool
15071 aarch64_sve_index_immediate_p (rtx base_or_step)
15073 return (CONST_INT_P (base_or_step)
15074 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15077 /* Return true if X is a valid immediate for the SVE ADD and SUB
15078 instructions. Negate X first if NEGATE_P is true. */
15080 bool
15081 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15083 rtx elt;
15085 if (!const_vec_duplicate_p (x, &elt)
15086 || !CONST_INT_P (elt))
15087 return false;
15089 HOST_WIDE_INT val = INTVAL (elt);
15090 if (negate_p)
15091 val = -val;
15092 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15094 if (val & 0xff)
15095 return IN_RANGE (val, 0, 0xff);
15096 return IN_RANGE (val, 0, 0xff00);
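/* Worked examples (illustrative): a duplicate of 3 is accepted (plain 8-bit
   immediate); a duplicate of 0x1100 has a zero low byte and equals 0x11 << 8,
   so it is accepted via the shifted form; a duplicate of 0x101 is rejected
   because its low byte is nonzero and the value does not fit in 8 bits.  */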
15099 /* Return true if X is a valid immediate operand for an SVE logical
15100 instruction such as AND. */
15102 bool
15103 aarch64_sve_bitmask_immediate_p (rtx x)
15105 rtx elt;
15107 return (const_vec_duplicate_p (x, &elt)
15108 && CONST_INT_P (elt)
15109 && aarch64_bitmask_imm (INTVAL (elt),
15110 GET_MODE_INNER (GET_MODE (x))));
15113 /* Return true if X is a valid immediate for the SVE DUP and CPY
15114 instructions. */
15116 bool
15117 aarch64_sve_dup_immediate_p (rtx x)
15119 rtx elt;
15121 if (!const_vec_duplicate_p (x, &elt)
15122 || !CONST_INT_P (elt))
15123 return false;
15125 HOST_WIDE_INT val = INTVAL (elt);
15126 if (val & 0xff)
15127 return IN_RANGE (val, -0x80, 0x7f);
15128 return IN_RANGE (val, -0x8000, 0x7f00);
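/* Worked examples (illustrative): -128 and 127 are accepted directly;
   0x7f00 has a zero low byte and is accepted as 0x7f shifted left by 8;
   0x180 is rejected because its low byte is nonzero and the value lies
   outside [-0x80, 0x7f].  */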
15131 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15132 SIGNED_P says whether the operand is signed rather than unsigned. */
15134 bool
15135 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15137 rtx elt;
15139 return (const_vec_duplicate_p (x, &elt)
15140 && CONST_INT_P (elt)
15141 && (signed_p
15142 ? IN_RANGE (INTVAL (elt), -16, 15)
15143 : IN_RANGE (INTVAL (elt), 0, 127)));
15146 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15147 instruction. Negate X first if NEGATE_P is true. */
15149 bool
15150 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15152 rtx elt;
15153 REAL_VALUE_TYPE r;
15155 if (!const_vec_duplicate_p (x, &elt)
15156 || GET_CODE (elt) != CONST_DOUBLE)
15157 return false;
15159 r = *CONST_DOUBLE_REAL_VALUE (elt);
15161 if (negate_p)
15162 r = real_value_negate (&r);
15164 if (real_equal (&r, &dconst1))
15165 return true;
15166 if (real_equal (&r, &dconsthalf))
15167 return true;
15168 return false;
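/* Illustrative note: only +0.5 and +1.0 are accepted here, matching the
   limited immediate range of the SVE FADD/FSUB forms; with NEGATE_P the
   caller can also accept -0.5 and -1.0, since they negate to the valid
   values.  */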
15171 /* Return true if X is a valid immediate operand for an SVE FMUL
15172 instruction. */
15174 bool
15175 aarch64_sve_float_mul_immediate_p (rtx x)
15177 rtx elt;
15179 /* GCC will never generate a multiply with an immediate of 2, so there is no
15180 point testing for it (even though it is a valid constant). */
15181 return (const_vec_duplicate_p (x, &elt)
15182 && GET_CODE (elt) == CONST_DOUBLE
15183 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
15186 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15187 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15188 is nonnull, use it to describe valid immediates. */
15189 static bool
15190 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15191 simd_immediate_info *info,
15192 enum simd_immediate_check which,
15193 simd_immediate_info::insn_type insn)
15195 /* Try a 4-byte immediate with LSL. */
15196 for (unsigned int shift = 0; shift < 32; shift += 8)
15197 if ((val32 & (0xff << shift)) == val32)
15199 if (info)
15200 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15201 simd_immediate_info::LSL, shift);
15202 return true;
15205 /* Try a 2-byte immediate with LSL. */
15206 unsigned int imm16 = val32 & 0xffff;
15207 if (imm16 == (val32 >> 16))
15208 for (unsigned int shift = 0; shift < 16; shift += 8)
15209 if ((imm16 & (0xff << shift)) == imm16)
15211 if (info)
15212 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15213 simd_immediate_info::LSL, shift);
15214 return true;
15217 /* Try a 4-byte immediate with MSL, except for cases that MVN
15218 can handle. */
15219 if (which == AARCH64_CHECK_MOV)
15220 for (unsigned int shift = 8; shift < 24; shift += 8)
15222 unsigned int low = (1 << shift) - 1;
15223 if (((val32 & (0xff << shift)) | low) == val32)
15225 if (info)
15226 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15227 simd_immediate_info::MSL, shift);
15228 return true;
15232 return false;
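/* Worked examples (illustrative): VAL32 == 0x00ab0000 matches the 4-byte
   form as 0xab with LSL #16; VAL32 == 0x00c300c3 has equal 16-bit halves
   and matches the 2-byte form as 0xc3 with LSL #0; VAL32 == 0x0012ffff
   matches the MSL form as 0x12 with MSL #16 when WHICH is
   AARCH64_CHECK_MOV.  */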
15235 /* Return true if replicating VAL64 is a valid immediate for the
15236 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15237 use it to describe valid immediates. */
15238 static bool
15239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15240 simd_immediate_info *info,
15241 enum simd_immediate_check which)
15243 unsigned int val32 = val64 & 0xffffffff;
15244 unsigned int val16 = val64 & 0xffff;
15245 unsigned int val8 = val64 & 0xff;
15247 if (val32 == (val64 >> 32))
15249 if ((which & AARCH64_CHECK_ORR) != 0
15250 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15251 simd_immediate_info::MOV))
15252 return true;
15254 if ((which & AARCH64_CHECK_BIC) != 0
15255 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15256 simd_immediate_info::MVN))
15257 return true;
15259 /* Try using a replicated byte. */
15260 if (which == AARCH64_CHECK_MOV
15261 && val16 == (val32 >> 16)
15262 && val8 == (val16 >> 8))
15264 if (info)
15265 *info = simd_immediate_info (QImode, val8);
15266 return true;
15270 /* Try using a bit-to-bytemask. */
15271 if (which == AARCH64_CHECK_MOV)
15273 unsigned int i;
15274 for (i = 0; i < 64; i += 8)
15276 unsigned char byte = (val64 >> i) & 0xff;
15277 if (byte != 0 && byte != 0xff)
15278 break;
15280 if (i == 64)
15282 if (info)
15283 *info = simd_immediate_info (DImode, val64);
15284 return true;
15287 return false;
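/* Worked example (illustrative): for VAL64 == 0x00ff0000ff0000ff the two
   32-bit halves differ, so the MOV/MVN cases above do not apply, but every
   byte is either 0 or 0xff, so the bit-to-bytemask case accepts it as a
   DImode immediate.  */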
15290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15291 instruction. If INFO is nonnull, use it to describe valid immediates. */
15293 static bool
15294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15295 simd_immediate_info *info)
15297 scalar_int_mode mode = DImode;
15298 unsigned int val32 = val64 & 0xffffffff;
15299 if (val32 == (val64 >> 32))
15301 mode = SImode;
15302 unsigned int val16 = val32 & 0xffff;
15303 if (val16 == (val32 >> 16))
15305 mode = HImode;
15306 unsigned int val8 = val16 & 0xff;
15307 if (val8 == (val16 >> 8))
15308 mode = QImode;
15311 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15312 if (IN_RANGE (val, -0x80, 0x7f))
15314 /* DUP with no shift. */
15315 if (info)
15316 *info = simd_immediate_info (mode, val);
15317 return true;
15319 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15321 /* DUP with LSL #8. */
15322 if (info)
15323 *info = simd_immediate_info (mode, val);
15324 return true;
15326 if (aarch64_bitmask_imm (val64, mode))
15328 /* DUPM. */
15329 if (info)
15330 *info = simd_immediate_info (mode, val);
15331 return true;
15333 return false;
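/* Worked examples (illustrative): VAL64 == 0x0001000100010001 repeats every
   16 bits, so it is treated as HImode 1 and accepted as a plain DUP;
   VAL64 == 0xff00ff00ff00ff00 truncates to -256 in HImode and is accepted
   via DUP with LSL #8; a repeating 0x0ff0 pattern fails both DUP ranges but
   is a valid bitmask immediate, so it is accepted as a DUPM.  */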
15336 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15337 it to describe valid immediates. */
15339 static bool
15340 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15342 if (x == CONST0_RTX (GET_MODE (x)))
15344 if (info)
15345 *info = simd_immediate_info (DImode, 0);
15346 return true;
15349 /* Analyze the value as a VNx16BImode. This should be relatively
15350 efficient, since rtx_vector_builder has enough built-in capacity
15351 to store all VLA predicate constants without needing the heap. */
15352 rtx_vector_builder builder;
15353 if (!aarch64_get_sve_pred_bits (builder, x))
15354 return false;
15356 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15357 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15359 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15360 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15361 if (pattern != AARCH64_NUM_SVPATTERNS)
15363 if (info)
15365 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15366 *info = simd_immediate_info (int_mode, pattern);
15368 return true;
15371 return false;
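/* Illustrative examples (assumed): an all-zero predicate constant is handled
   by the CONST0 case above (a PFALSE, represented here as a zero immediate);
   a VNx4BI constant whose first two elements are set and whose remaining
   elements are clear corresponds to a PTRUE with the VL2 pattern on a
   32-bit-element predicate, provided aarch64_svpattern_for_vl recognises
   that vector length.  */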
15374 /* Return true if OP is a valid SIMD immediate for the operation
15375 described by WHICH. If INFO is nonnull, use it to describe valid
15376 immediates. */
15377 bool
15378 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15379 enum simd_immediate_check which)
15381 machine_mode mode = GET_MODE (op);
15382 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15383 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15384 return false;
15386 if (vec_flags & VEC_SVE_PRED)
15387 return aarch64_sve_pred_valid_immediate (op, info);
15389 scalar_mode elt_mode = GET_MODE_INNER (mode);
15390 rtx base, step;
15391 unsigned int n_elts;
15392 if (GET_CODE (op) == CONST_VECTOR
15393 && CONST_VECTOR_DUPLICATE_P (op))
15394 n_elts = CONST_VECTOR_NPATTERNS (op);
15395 else if ((vec_flags & VEC_SVE_DATA)
15396 && const_vec_series_p (op, &base, &step))
15398 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15399 if (!aarch64_sve_index_immediate_p (base)
15400 || !aarch64_sve_index_immediate_p (step))
15401 return false;
15403 if (info)
15404 *info = simd_immediate_info (elt_mode, base, step);
15405 return true;
15407 else if (GET_CODE (op) == CONST_VECTOR
15408 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15409 /* N_ELTS set above. */;
15410 else
15411 return false;
15413 scalar_float_mode elt_float_mode;
15414 if (n_elts == 1
15415 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15417 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15418 if (aarch64_float_const_zero_rtx_p (elt)
15419 || aarch64_float_const_representable_p (elt))
15421 if (info)
15422 *info = simd_immediate_info (elt_float_mode, elt);
15423 return true;
15427 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15428 if (elt_size > 8)
15429 return false;
15431 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15433 /* Expand the vector constant out into a byte vector, with the least
15434 significant byte of the register first. */
15435 auto_vec<unsigned char, 16> bytes;
15436 bytes.reserve (n_elts * elt_size);
15437 for (unsigned int i = 0; i < n_elts; i++)
15439 /* The vector is provided in gcc endian-neutral fashion.
15440 For aarch64_be Advanced SIMD, it must be laid out in the vector
15441 register in reverse order. */
15442 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15443 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15445 if (elt_mode != elt_int_mode)
15446 elt = gen_lowpart (elt_int_mode, elt);
15448 if (!CONST_INT_P (elt))
15449 return false;
15451 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15452 for (unsigned int byte = 0; byte < elt_size; byte++)
15454 bytes.quick_push (elt_val & 0xff);
15455 elt_val >>= BITS_PER_UNIT;
15459 /* The immediate must repeat every eight bytes. */
15460 unsigned int nbytes = bytes.length ();
15461 for (unsigned i = 8; i < nbytes; ++i)
15462 if (bytes[i] != bytes[i - 8])
15463 return false;
15465 /* Get the repeating 8-byte value as an integer. No endian correction
15466 is needed here because bytes is already in lsb-first order. */
15467 unsigned HOST_WIDE_INT val64 = 0;
15468 for (unsigned int i = 0; i < 8; i++)
15469 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15470 << (i * BITS_PER_UNIT));
15472 if (vec_flags & VEC_SVE_DATA)
15473 return aarch64_sve_valid_immediate (val64, info);
15474 else
15475 return aarch64_advsimd_valid_immediate (val64, info, which);
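/* Illustrative example (assumed): an SVE VNx4SI constant of the form
   { 0, 2, 4, 6, ... } is a VEC_SERIES with base 0 and step 2; both values
   lie in [-16, 15], so the series case above accepts it and describes it
   as an INDEX immediate rather than falling through to the byte-replication
   checks.  */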
15478 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15479 has a step in the range of INDEX. Return the index expression if so,
15480 otherwise return null. */
15482 aarch64_check_zero_based_sve_index_immediate (rtx x)
15484 rtx base, step;
15485 if (const_vec_series_p (x, &base, &step)
15486 && base == const0_rtx
15487 && aarch64_sve_index_immediate_p (step))
15488 return step;
15489 return NULL_RTX;
15492 /* Check whether immediate shift constants are within range. */
15493 bool
15494 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15496 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15497 if (left)
15498 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15499 else
15500 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15503 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15504 operation of width WIDTH at bit position POS. */
15507 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15509 gcc_assert (CONST_INT_P (width));
15510 gcc_assert (CONST_INT_P (pos));
15512 unsigned HOST_WIDE_INT mask
15513 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15514 return GEN_INT (mask << UINTVAL (pos));
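/* Worked example (illustrative): WIDTH == 8 and POS == 16 give the mask
   ((1 << 8) - 1) << 16 == 0x00ff0000, i.e. exactly the byte being
   extracted.  */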
15517 bool
15518 aarch64_mov_operand_p (rtx x, machine_mode mode)
15520 if (GET_CODE (x) == HIGH
15521 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15522 return true;
15524 if (CONST_INT_P (x))
15525 return true;
15527 if (VECTOR_MODE_P (GET_MODE (x)))
15529 /* Require predicate constants to be VNx16BI before RA, so that we
15530 force everything to have a canonical form. */
15531 if (!lra_in_progress
15532 && !reload_completed
15533 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15534 && GET_MODE (x) != VNx16BImode)
15535 return false;
15537 return aarch64_simd_valid_immediate (x, NULL);
15540 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15541 return true;
15543 if (aarch64_sve_cnt_immediate_p (x))
15544 return true;
15546 return aarch64_classify_symbolic_expression (x)
15547 == SYMBOL_TINY_ABSOLUTE;
15550 /* Return a const_int vector of VAL. */
15552 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15554 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15555 return gen_const_vec_duplicate (mode, c);
15558 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15560 bool
15561 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15563 machine_mode vmode;
15565 vmode = aarch64_simd_container_mode (mode, 64);
15566 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15567 return aarch64_simd_valid_immediate (op_v, NULL);
15570 /* Construct and return a PARALLEL RTX vector with elements numbering the
15571 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15572 the vector - from the perspective of the architecture. This does not
15573 line up with GCC's perspective on lane numbers, so we end up with
15574 different masks depending on our target endian-ness. The diagram
15575 below may help. We must draw the distinction when building masks
15576 which select one half of the vector. An instruction selecting
15577 architectural low-lanes for a big-endian target, must be described using
15578 a mask selecting GCC high-lanes.
15580 Big-Endian Little-Endian
15582 GCC 0 1 2 3 3 2 1 0
15583 | x | x | x | x | | x | x | x | x |
15584 Architecture 3 2 1 0 3 2 1 0
15586 Low Mask: { 2, 3 } { 0, 1 }
15587 High Mask: { 0, 1 } { 2, 3 }
15589 MODE is the mode of the vector and NUNITS is the number of units in it. */
15592 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15594 rtvec v = rtvec_alloc (nunits / 2);
15595 int high_base = nunits / 2;
15596 int low_base = 0;
15597 int base;
15598 rtx t1;
15599 int i;
15601 if (BYTES_BIG_ENDIAN)
15602 base = high ? low_base : high_base;
15603 else
15604 base = high ? high_base : low_base;
15606 for (i = 0; i < nunits / 2; i++)
15607 RTVEC_ELT (v, i) = GEN_INT (base + i);
15609 t1 = gen_rtx_PARALLEL (mode, v);
15610 return t1;
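/* Worked example (illustrative): for V4SImode with HIGH true this returns
   (parallel [2 3]) on little-endian but (parallel [0 1]) on big-endian,
   matching the diagram above.  */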
15613 /* Check OP for validity as a PARALLEL RTX vector with elements
15614 numbering either the high (HIGH == TRUE) or low (HIGH == FALSE) lanes,
15615 from the perspective of the architecture. See the diagram above
15616 aarch64_simd_vect_par_cnst_half for more details. */
15618 bool
15619 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15620 bool high)
15622 int nelts;
15623 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15624 return false;
15626 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15627 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15628 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15629 int i = 0;
15631 if (count_op != count_ideal)
15632 return false;
15634 for (i = 0; i < count_ideal; i++)
15636 rtx elt_op = XVECEXP (op, 0, i);
15637 rtx elt_ideal = XVECEXP (ideal, 0, i);
15639 if (!CONST_INT_P (elt_op)
15640 || INTVAL (elt_ideal) != INTVAL (elt_op))
15641 return false;
15643 return true;
15646 /* Return a PARALLEL containing NELTS elements, with element I equal
15647 to BASE + I * STEP. */
15650 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15652 rtvec vec = rtvec_alloc (nelts);
15653 for (unsigned int i = 0; i < nelts; ++i)
15654 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15655 return gen_rtx_PARALLEL (VOIDmode, vec);
15658 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15659 series with step STEP. */
15661 bool
15662 aarch64_stepped_int_parallel_p (rtx op, int step)
15664 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15665 return false;
15667 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15668 for (int i = 1; i < XVECLEN (op, 0); ++i)
15669 if (!CONST_INT_P (XVECEXP (op, 0, i))
15670 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15671 return false;
15673 return true;
15676 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15677 HIGH (exclusive). */
15678 void
15679 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15680 const_tree exp)
15682 HOST_WIDE_INT lane;
15683 gcc_assert (CONST_INT_P (operand));
15684 lane = INTVAL (operand);
15686 if (lane < low || lane >= high)
15688 if (exp)
15689 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15690 else
15691 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15695 /* Perform endian correction on lane number N, which indexes a vector
15696 of mode MODE, and return the result as an SImode rtx. */
15699 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15701 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15704 /* Return TRUE if OP is a valid vector addressing mode. */
15706 bool
15707 aarch64_simd_mem_operand_p (rtx op)
15709 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15710 || REG_P (XEXP (op, 0)));
15713 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15715 bool
15716 aarch64_sve_ld1r_operand_p (rtx op)
15718 struct aarch64_address_info addr;
15719 scalar_mode mode;
15721 return (MEM_P (op)
15722 && is_a <scalar_mode> (GET_MODE (op), &mode)
15723 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15724 && addr.type == ADDRESS_REG_IMM
15725 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
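/* Illustrative note: the offset accepted here is a 6-bit unsigned immediate
   scaled by the element size, so for 32-bit elements the valid offsets are
   0, 4, 8, ..., 252.  */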
15728 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15729 bool
15730 aarch64_sve_ld1rq_operand_p (rtx op)
15732 struct aarch64_address_info addr;
15733 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15734 if (!MEM_P (op)
15735 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15736 return false;
15738 if (addr.type == ADDRESS_REG_IMM)
15739 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15741 if (addr.type == ADDRESS_REG_REG)
15742 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15744 return false;
15747 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15748 The conditions for STR are the same. */
15749 bool
15750 aarch64_sve_ldr_operand_p (rtx op)
15752 struct aarch64_address_info addr;
15754 return (MEM_P (op)
15755 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15756 false, ADDR_QUERY_ANY)
15757 && addr.type == ADDRESS_REG_IMM);
15760 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15761 We need to be able to access the individual pieces, so the range
15762 is different from LD[234] and ST[234]. */
15763 bool
15764 aarch64_sve_struct_memory_operand_p (rtx op)
15766 if (!MEM_P (op))
15767 return false;
15769 machine_mode mode = GET_MODE (op);
15770 struct aarch64_address_info addr;
15771 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15772 ADDR_QUERY_ANY)
15773 || addr.type != ADDRESS_REG_IMM)
15774 return false;
15776 poly_int64 first = addr.const_offset;
15777 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15778 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15779 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15782 /* Emit a register copy from operand to operand, taking care not to
15783 early-clobber source registers in the process.
15785 COUNT is the number of components into which the copy needs to be
15786 decomposed. */
15787 void
15788 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15789 unsigned int count)
15791 unsigned int i;
15792 int rdest = REGNO (operands[0]);
15793 int rsrc = REGNO (operands[1]);
15795 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15796 || rdest < rsrc)
15797 for (i = 0; i < count; i++)
15798 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15799 gen_rtx_REG (mode, rsrc + i));
15800 else
15801 for (i = 0; i < count; i++)
15802 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15803 gen_rtx_REG (mode, rsrc + count - i - 1));
15806 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15807 one of VSTRUCT modes: OI, CI, or XI. */
15809 aarch64_simd_attr_length_rglist (machine_mode mode)
15811 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15812 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15815 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15816 alignment of a vector to 128 bits. SVE predicates have an alignment of
15817 16 bits. */
15818 static HOST_WIDE_INT
15819 aarch64_simd_vector_alignment (const_tree type)
15821 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15822 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15823 be set for non-predicate vectors of booleans. Modes are the most
15824 direct way we have of identifying real SVE predicate types. */
15825 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15826 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15829 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15830 static poly_uint64
15831 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15833 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15835 /* If the length of the vector is fixed, try to align to that length,
15836 otherwise don't try to align at all. */
15837 HOST_WIDE_INT result;
15838 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15839 result = TYPE_ALIGN (TREE_TYPE (type));
15840 return result;
15842 return TYPE_ALIGN (type);
15845 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15846 static bool
15847 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15849 if (is_packed)
15850 return false;
15852 /* For fixed-length vectors, check that the vectorizer will aim for
15853 full-vector alignment. This isn't true for generic GCC vectors
15854 that are wider than the ABI maximum of 128 bits. */
15855 poly_uint64 preferred_alignment =
15856 aarch64_vectorize_preferred_vector_alignment (type);
15857 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15858 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15859 preferred_alignment))
15860 return false;
15862 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15863 return true;
15866 /* Return true if the vector misalignment factor is supported by the
15867 target. */
15868 static bool
15869 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15870 const_tree type, int misalignment,
15871 bool is_packed)
15873 if (TARGET_SIMD && STRICT_ALIGNMENT)
15875 /* Return if movmisalign pattern is not supported for this mode. */
15876 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15877 return false;
15879 /* Misalignment factor is unknown at compile time. */
15880 if (misalignment == -1)
15881 return false;
15883 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15884 is_packed);
15887 /* If VALS is a vector constant that can be loaded into a register
15888 using DUP, generate instructions to do so and return an RTX to
15889 assign to the register. Otherwise return NULL_RTX. */
15890 static rtx
15891 aarch64_simd_dup_constant (rtx vals)
15893 machine_mode mode = GET_MODE (vals);
15894 machine_mode inner_mode = GET_MODE_INNER (mode);
15895 rtx x;
15897 if (!const_vec_duplicate_p (vals, &x))
15898 return NULL_RTX;
15900 /* We can load this constant by using DUP and a constant in a
15901 single ARM register. This will be cheaper than a vector
15902 load. */
15903 x = copy_to_mode_reg (inner_mode, x);
15904 return gen_vec_duplicate (mode, x);
15908 /* Generate code to load VALS, which is a PARALLEL containing only
15909 constants (for vec_init) or CONST_VECTOR, efficiently into a
15910 register. Returns an RTX to copy into the register, or NULL_RTX
15911 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15912 static rtx
15913 aarch64_simd_make_constant (rtx vals)
15915 machine_mode mode = GET_MODE (vals);
15916 rtx const_dup;
15917 rtx const_vec = NULL_RTX;
15918 int n_const = 0;
15919 int i;
15921 if (GET_CODE (vals) == CONST_VECTOR)
15922 const_vec = vals;
15923 else if (GET_CODE (vals) == PARALLEL)
15925 /* A CONST_VECTOR must contain only CONST_INTs and
15926 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15927 Only store valid constants in a CONST_VECTOR. */
15928 int n_elts = XVECLEN (vals, 0);
15929 for (i = 0; i < n_elts; ++i)
15931 rtx x = XVECEXP (vals, 0, i);
15932 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15933 n_const++;
15935 if (n_const == n_elts)
15936 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15938 else
15939 gcc_unreachable ();
15941 if (const_vec != NULL_RTX
15942 && aarch64_simd_valid_immediate (const_vec, NULL))
15943 /* Load using MOVI/MVNI. */
15944 return const_vec;
15945 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15946 /* Loaded using DUP. */
15947 return const_dup;
15948 else if (const_vec != NULL_RTX)
15949 /* Load from constant pool. We cannot take advantage of single-cycle
15950 LD1 because we need a PC-relative addressing mode. */
15951 return const_vec;
15952 else
15953 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15954 We cannot construct an initializer. */
15955 return NULL_RTX;
15958 /* Expand a vector initialisation sequence, such that TARGET is
15959 initialised to contain VALS. */
15961 void
15962 aarch64_expand_vector_init (rtx target, rtx vals)
15964 machine_mode mode = GET_MODE (target);
15965 scalar_mode inner_mode = GET_MODE_INNER (mode);
15966 /* The number of vector elements. */
15967 int n_elts = XVECLEN (vals, 0);
15968 /* The number of vector elements which are not constant. */
15969 int n_var = 0;
15970 rtx any_const = NULL_RTX;
15971 /* The first element of vals. */
15972 rtx v0 = XVECEXP (vals, 0, 0);
15973 bool all_same = true;
15975 /* This is a special vec_init<M><N> where N is not an element mode but a
15976 vector mode with half the elements of M. We expect to find two entries
15977 of mode N in VALS and we must put their concatenation into TARGET. */
15978 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15980 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15981 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15982 rtx lo = XVECEXP (vals, 0, 0);
15983 rtx hi = XVECEXP (vals, 0, 1);
15984 machine_mode narrow_mode = GET_MODE (lo);
15985 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15986 gcc_assert (narrow_mode == GET_MODE (hi));
15988 /* When we want to concatenate a half-width vector with zeroes we can
15989 use the aarch64_combinez[_be] patterns. Just make sure that the
15990 zeroes are in the right half. */
15991 if (BYTES_BIG_ENDIAN
15992 && aarch64_simd_imm_zero (lo, narrow_mode)
15993 && general_operand (hi, narrow_mode))
15994 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15995 else if (!BYTES_BIG_ENDIAN
15996 && aarch64_simd_imm_zero (hi, narrow_mode)
15997 && general_operand (lo, narrow_mode))
15998 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15999 else
16001 /* Else create the two half-width registers and combine them. */
16002 if (!REG_P (lo))
16003 lo = force_reg (GET_MODE (lo), lo);
16004 if (!REG_P (hi))
16005 hi = force_reg (GET_MODE (hi), hi);
16007 if (BYTES_BIG_ENDIAN)
16008 std::swap (lo, hi);
16009 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16011 return;
16014 /* Count the number of variable elements to initialise. */
16015 for (int i = 0; i < n_elts; ++i)
16017 rtx x = XVECEXP (vals, 0, i);
16018 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16019 ++n_var;
16020 else
16021 any_const = x;
16023 all_same &= rtx_equal_p (x, v0);
16026 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16027 how best to handle this. */
16028 if (n_var == 0)
16030 rtx constant = aarch64_simd_make_constant (vals);
16031 if (constant != NULL_RTX)
16033 emit_move_insn (target, constant);
16034 return;
16038 /* Splat a single non-constant element if we can. */
16039 if (all_same)
16041 rtx x = copy_to_mode_reg (inner_mode, v0);
16042 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16043 return;
16046 enum insn_code icode = optab_handler (vec_set_optab, mode);
16047 gcc_assert (icode != CODE_FOR_nothing);
16049 /* If there are only variable elements, try to optimize
16050 the insertion using dup for the most common element
16051 followed by insertions. */
16053 /* The algorithm will fill matches[*][0] with the earliest matching element,
16054 and matches[X][1] with the count of duplicate elements (if X is the
16055 earliest element which has duplicates). */
16057 if (n_var == n_elts && n_elts <= 16)
16059 int matches[16][2] = {0};
16060 for (int i = 0; i < n_elts; i++)
16062 for (int j = 0; j <= i; j++)
16064 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16066 matches[i][0] = j;
16067 matches[j][1]++;
16068 break;
16072 int maxelement = 0;
16073 int maxv = 0;
16074 for (int i = 0; i < n_elts; i++)
16075 if (matches[i][1] > maxv)
16077 maxelement = i;
16078 maxv = matches[i][1];
16081 /* Create a duplicate of the most common element, unless all elements
16082 are equally useless to us, in which case just immediately set the
16083 vector register using the first element. */
16085 if (maxv == 1)
16087 /* For vectors of two 64-bit elements, we can do even better. */
16088 if (n_elts == 2
16089 && (inner_mode == E_DImode
16090 || inner_mode == E_DFmode))
16093 rtx x0 = XVECEXP (vals, 0, 0);
16094 rtx x1 = XVECEXP (vals, 0, 1);
16095 /* Combine can pick up this case, but handling it directly
16096 here leaves clearer RTL.
16098 This is load_pair_lanes<mode>, and also gives us a clean-up
16099 for store_pair_lanes<mode>. */
16100 if (memory_operand (x0, inner_mode)
16101 && memory_operand (x1, inner_mode)
16102 && !STRICT_ALIGNMENT
16103 && rtx_equal_p (XEXP (x1, 0),
16104 plus_constant (Pmode,
16105 XEXP (x0, 0),
16106 GET_MODE_SIZE (inner_mode))))
16108 rtx t;
16109 if (inner_mode == DFmode)
16110 t = gen_load_pair_lanesdf (target, x0, x1);
16111 else
16112 t = gen_load_pair_lanesdi (target, x0, x1);
16113 emit_insn (t);
16114 return;
16117 /* The subreg-move sequence below will move into lane zero of the
16118 vector register. For big-endian we want that position to hold
16119 the last element of VALS. */
16120 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16121 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16122 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16124 else
16126 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16127 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16130 /* Insert the rest. */
16131 for (int i = 0; i < n_elts; i++)
16133 rtx x = XVECEXP (vals, 0, i);
16134 if (matches[i][0] == maxelement)
16135 continue;
16136 x = copy_to_mode_reg (inner_mode, x);
16137 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16139 return;
16142 /* Initialise a vector which is part-variable. We want to first try
16143 to build those lanes which are constant in the most efficient way we
16144 can. */
16145 if (n_var != n_elts)
16147 rtx copy = copy_rtx (vals);
16149 /* Load constant part of vector. We really don't care what goes into the
16150 parts we will overwrite, but we're more likely to be able to load the
16151 constant efficiently if it has fewer, larger, repeating parts
16152 (see aarch64_simd_valid_immediate). */
16153 for (int i = 0; i < n_elts; i++)
16155 rtx x = XVECEXP (vals, 0, i);
16156 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16157 continue;
16158 rtx subst = any_const;
16159 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16161 /* Look in the copied vector, as more elements are const. */
16162 rtx test = XVECEXP (copy, 0, i ^ bit);
16163 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16165 subst = test;
16166 break;
16169 XVECEXP (copy, 0, i) = subst;
16171 aarch64_expand_vector_init (target, copy);
16174 /* Insert the variable lanes directly. */
16175 for (int i = 0; i < n_elts; i++)
16177 rtx x = XVECEXP (vals, 0, i);
16178 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16179 continue;
16180 x = copy_to_mode_reg (inner_mode, x);
16181 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
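/* Illustrative examples (assumed): initialising V4SI from { a, b, a, a },
   where every element is in a register, duplicates the most common element
   with DUP and then inserts b into lane 1; initialising from { x, 1, 2, 3 }
   first loads the constant vector { 2, 1, 2, 3 } (the variable lane is
   filled from a nearby constant lane so the constant has fewer distinct
   parts) and then inserts x into lane 0.  */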
16185 /* Emit RTL corresponding to:
16186 insr TARGET, ELEM. */
16188 static void
16189 emit_insr (rtx target, rtx elem)
16191 machine_mode mode = GET_MODE (target);
16192 scalar_mode elem_mode = GET_MODE_INNER (mode);
16193 elem = force_reg (elem_mode, elem);
16195 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16196 gcc_assert (icode != CODE_FOR_nothing);
16197 emit_insn (GEN_FCN (icode) (target, target, elem));
16200 /* Subroutine of aarch64_sve_expand_vector_init for handling
16201 trailing constants.
16202 This function works as follows:
16203 (a) Create a new vector consisting of trailing constants.
16204 (b) Initialize TARGET with the constant vector using emit_move_insn.
16205 (c) Insert remaining elements in TARGET using insr.
16206 NELTS is the total number of elements in the original vector,
16207 while NELTS_REQD is the number of elements that are actually
16208 significant.
16210 ??? The heuristic used is to do the above only if the number of constants
16211 is at least half the total number of elements. May need fine tuning. */
16213 static bool
16214 aarch64_sve_expand_vector_init_handle_trailing_constants
16215 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16217 machine_mode mode = GET_MODE (target);
16218 scalar_mode elem_mode = GET_MODE_INNER (mode);
16219 int n_trailing_constants = 0;
16221 for (int i = nelts_reqd - 1;
16222 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16223 i--)
16224 n_trailing_constants++;
16226 if (n_trailing_constants >= nelts_reqd / 2)
16228 rtx_vector_builder v (mode, 1, nelts);
16229 for (int i = 0; i < nelts; i++)
16230 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16231 rtx const_vec = v.build ();
16232 emit_move_insn (target, const_vec);
16234 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16235 emit_insr (target, builder.elt (i));
16237 return true;
16240 return false;
16243 /* Subroutine of aarch64_sve_expand_vector_init.
16244 Works as follows:
16245 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16246 (b) Skip trailing elements from BUILDER, which are the same as
16247 element NELTS_REQD - 1.
16248 (c) Insert earlier elements in reverse order in TARGET using insr. */
16250 static void
16251 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16252 const rtx_vector_builder &builder,
16253 int nelts_reqd)
16255 machine_mode mode = GET_MODE (target);
16256 scalar_mode elem_mode = GET_MODE_INNER (mode);
16258 struct expand_operand ops[2];
16259 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16260 gcc_assert (icode != CODE_FOR_nothing);
16262 create_output_operand (&ops[0], target, mode);
16263 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16264 expand_insn (icode, 2, ops);
16266 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16267 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16268 emit_insr (target, builder.elt (i));
16271 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16272 when all trailing elements of builder are same.
16273 This works as follows:
16274 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16275 (b) Insert remaining elements in TARGET using insr.
16277 ??? The heuristic used is to do the above if the number of identical
16278 trailing elements is at least 3/4 of the total number of elements,
16279 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
16281 static bool
16282 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16283 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16285 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16286 if (ndups >= (3 * nelts_reqd) / 4)
16288 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16289 nelts_reqd - ndups + 1);
16290 return true;
16293 return false;
16296 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16297 of elements in BUILDER.
16299 The function tries to initialize TARGET from BUILDER if it fits one
16300 of the special cases outlined below.
16302 Failing that, the function divides BUILDER into two sub-vectors:
16303 v_even = even elements of BUILDER;
16304 v_odd = odd elements of BUILDER;
16306 and recursively calls itself with v_even and v_odd.
16308 if (recursive call succeeded for v_even or v_odd)
16309 TARGET = zip (v_even, v_odd)
16311 The function returns true if it managed to build TARGET from BUILDER
16312 with one of the special cases, false otherwise.
16314 Example: {a, 1, b, 2, c, 3, d, 4}
16316 The vector gets divided into:
16317 v_even = {a, b, c, d}
16318 v_odd = {1, 2, 3, 4}
16320 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16321 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16323 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16324 4 elements, so we construct tmp1 from v_even using insr:
16325 tmp1 = dup(d)
16326 insr tmp1, c
16327 insr tmp1, b
16328 insr tmp1, a
16330 And finally:
16331 TARGET = zip (tmp1, tmp2)
16332 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16334 static bool
16335 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16336 int nelts, int nelts_reqd)
16338 machine_mode mode = GET_MODE (target);
16340 /* Case 1: Vector contains trailing constants. */
16342 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16343 (target, builder, nelts, nelts_reqd))
16344 return true;
16346 /* Case 2: Vector contains leading constants. */
16348 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16349 for (int i = 0; i < nelts_reqd; i++)
16350 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16351 rev_builder.finalize ();
16353 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16354 (target, rev_builder, nelts, nelts_reqd))
16356 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16357 return true;
16360 /* Case 3: Vector contains trailing same element. */
16362 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16363 (target, builder, nelts_reqd))
16364 return true;
16366 /* Case 4: Vector contains leading same element. */
16368 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16369 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16371 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16372 return true;
16375 /* Avoid recursing below 4 elements.
16376 ??? The threshold 4 may need fine-tuning. */
16378 if (nelts_reqd <= 4)
16379 return false;
16381 rtx_vector_builder v_even (mode, 1, nelts);
16382 rtx_vector_builder v_odd (mode, 1, nelts);
16384 for (int i = 0; i < nelts * 2; i += 2)
16386 v_even.quick_push (builder.elt (i));
16387 v_odd.quick_push (builder.elt (i + 1));
16390 v_even.finalize ();
16391 v_odd.finalize ();
16393 rtx tmp1 = gen_reg_rtx (mode);
16394 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16395 nelts, nelts_reqd / 2);
16397 rtx tmp2 = gen_reg_rtx (mode);
16398 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16399 nelts, nelts_reqd / 2);
16401 if (!did_even_p && !did_odd_p)
16402 return false;
16404 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16405 special cases and zip v_even, v_odd. */
16407 if (!did_even_p)
16408 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16410 if (!did_odd_p)
16411 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16413 rtvec v = gen_rtvec (2, tmp1, tmp2);
16414 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16415 return true;
16418 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16420 void
16421 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16423 machine_mode mode = GET_MODE (target);
16424 int nelts = XVECLEN (vals, 0);
16426 rtx_vector_builder v (mode, 1, nelts);
16427 for (int i = 0; i < nelts; i++)
16428 v.quick_push (XVECEXP (vals, 0, i));
16429 v.finalize ();
16431 /* If neither sub-vector of v could be initialized specially,
16432 then use INSR to insert all elements from v into TARGET.
16433 ??? This might not be optimal for vectors with large
16434 initializers like 16-element or above.
16435 For nelts < 4, it probably isn't useful to handle specially. */
16437 if (nelts < 4
16438 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16439 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16442 static unsigned HOST_WIDE_INT
16443 aarch64_shift_truncation_mask (machine_mode mode)
16445 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16446 return 0;
16447 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16450 /* Select a format to encode pointers in exception handling data. */
16452 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16454 int type;
16455 switch (aarch64_cmodel)
16457 case AARCH64_CMODEL_TINY:
16458 case AARCH64_CMODEL_TINY_PIC:
16459 case AARCH64_CMODEL_SMALL:
16460 case AARCH64_CMODEL_SMALL_PIC:
16461 case AARCH64_CMODEL_SMALL_SPIC:
16462 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16463 for everything. */
16464 type = DW_EH_PE_sdata4;
16465 break;
16466 default:
16467 /* No assumptions here. 8-byte relocs required. */
16468 type = DW_EH_PE_sdata8;
16469 break;
16471 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16474 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16476 static void
16477 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16479 if (aarch64_simd_decl_p (decl))
16481 fprintf (stream, "\t.variant_pcs\t");
16482 assemble_name (stream, name);
16483 fprintf (stream, "\n");
16487 /* The last .arch and .tune assembly strings that we printed. */
16488 static std::string aarch64_last_printed_arch_string;
16489 static std::string aarch64_last_printed_tune_string;
16491 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16492 by the function fndecl. */
16494 void
16495 aarch64_declare_function_name (FILE *stream, const char* name,
16496 tree fndecl)
16498 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16500 struct cl_target_option *targ_options;
16501 if (target_parts)
16502 targ_options = TREE_TARGET_OPTION (target_parts);
16503 else
16504 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16505 gcc_assert (targ_options);
16507 const struct processor *this_arch
16508 = aarch64_get_arch (targ_options->x_explicit_arch);
16510 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16511 std::string extension
16512 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16513 this_arch->flags);
16514 /* Only update the assembler .arch string if it is distinct from the last
16515 such string we printed. */
16516 std::string to_print = this_arch->name + extension;
16517 if (to_print != aarch64_last_printed_arch_string)
16519 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16520 aarch64_last_printed_arch_string = to_print;
16523 /* Print the cpu name we're tuning for in the comments, might be
16524 useful to readers of the generated asm. Do it only when it changes
16525 from function to function and verbose assembly is requested. */
16526 const struct processor *this_tune
16527 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16529 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16531 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16532 this_tune->name);
16533 aarch64_last_printed_tune_string = this_tune->name;
16536 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16538 /* Don't forget the type directive for ELF. */
16539 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16540 ASM_OUTPUT_LABEL (stream, name);
16543 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16545 void
16546 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16548 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16549 const char *value = IDENTIFIER_POINTER (target);
16550 aarch64_asm_output_variant_pcs (stream, decl, name);
16551 ASM_OUTPUT_DEF (stream, name, value);
16554 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16555 function symbol references. */
16557 void
16558 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16560 default_elf_asm_output_external (stream, decl, name);
16561 aarch64_asm_output_variant_pcs (stream, decl, name);
16564 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16565 Used to output the .cfi_b_key_frame directive when signing the current
16566 function with the B key. */
16568 void
16569 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16571 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16572 && aarch64_ra_sign_key == AARCH64_KEY_B)
16573 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16576 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16578 static void
16579 aarch64_start_file (void)
16581 struct cl_target_option *default_options
16582 = TREE_TARGET_OPTION (target_option_default_node);
16584 const struct processor *default_arch
16585 = aarch64_get_arch (default_options->x_explicit_arch);
16586 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16587 std::string extension
16588 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16589 default_arch->flags);
16591 aarch64_last_printed_arch_string = default_arch->name + extension;
16592 aarch64_last_printed_tune_string = "";
16593 asm_fprintf (asm_out_file, "\t.arch %s\n",
16594 aarch64_last_printed_arch_string.c_str ());
16596 default_file_start ();
16599 /* Emit load exclusive. */
16601 static void
16602 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16603 rtx mem, rtx model_rtx)
16605 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16608 /* Emit store exclusive. */
16610 static void
16611 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16612 rtx mem, rtx rval, rtx model_rtx)
16614 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16617 /* Mark the previous jump instruction as unlikely. */
16619 static void
16620 aarch64_emit_unlikely_jump (rtx insn)
16622 rtx_insn *jump = emit_jump_insn (insn);
16623 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16626 /* Expand a compare and swap pattern. */
16628 void
16629 aarch64_expand_compare_and_swap (rtx operands[])
16631 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16632 machine_mode mode, r_mode;
16634 bval = operands[0];
16635 rval = operands[1];
16636 mem = operands[2];
16637 oldval = operands[3];
16638 newval = operands[4];
16639 is_weak = operands[5];
16640 mod_s = operands[6];
16641 mod_f = operands[7];
16642 mode = GET_MODE (mem);
16644 /* Normally the succ memory model must be stronger than fail, but in the
16645 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16646 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16647 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16648 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16649 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16651 r_mode = mode;
16652 if (mode == QImode || mode == HImode)
16654 r_mode = SImode;
16655 rval = gen_reg_rtx (r_mode);
16658 if (TARGET_LSE)
16660 /* The CAS insn requires oldval and rval overlap, but we need to
16661 have a copy of oldval saved across the operation to tell if
16662 the operation is successful. */
16663 if (reg_overlap_mentioned_p (rval, oldval))
16664 rval = copy_to_mode_reg (r_mode, oldval);
16665 else
16666 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16668 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16669 newval, mod_s));
16670 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16672 else
16674 /* The oldval predicate varies by mode. Test it and force to reg. */
16675 insn_code code = code_for_aarch64_compare_and_swap (mode);
16676 if (!insn_data[code].operand[2].predicate (oldval, mode))
16677 oldval = force_reg (mode, oldval);
16679 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16680 is_weak, mod_s, mod_f));
16681 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16684 if (r_mode != mode)
16685 rval = gen_lowpart (mode, rval);
16686 emit_move_insn (operands[1], rval);
16688 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16689 emit_insn (gen_rtx_SET (bval, x));
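/* Illustrative note: with TARGET_LSE the expansion above emits a single
   CAS-family instruction via gen_aarch64_compare_and_swap_lse; without LSE
   the generic pattern is later split into a load-exclusive/store-exclusive
   loop by aarch64_split_compare_and_swap below.  */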
16692 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16693 sequence implementing an atomic operation. */
16695 static void
16696 aarch64_emit_post_barrier (enum memmodel model)
16698 const enum memmodel base_model = memmodel_base (model);
16700 if (is_mm_sync (model)
16701 && (base_model == MEMMODEL_ACQUIRE
16702 || base_model == MEMMODEL_ACQ_REL
16703 || base_model == MEMMODEL_SEQ_CST))
16705 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16709 /* Split a compare and swap pattern. */
16711 void
16712 aarch64_split_compare_and_swap (rtx operands[])
16714 rtx rval, mem, oldval, newval, scratch;
16715 machine_mode mode;
16716 bool is_weak;
16717 rtx_code_label *label1, *label2;
16718 rtx x, cond;
16719 enum memmodel model;
16720 rtx model_rtx;
16722 rval = operands[0];
16723 mem = operands[1];
16724 oldval = operands[2];
16725 newval = operands[3];
16726 is_weak = (operands[4] != const0_rtx);
16727 model_rtx = operands[5];
16728 scratch = operands[7];
16729 mode = GET_MODE (mem);
16730 model = memmodel_from_int (INTVAL (model_rtx));
16732 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16733 loop:
16734 .label1:
16735 LD[A]XR rval, [mem]
16736 CBNZ rval, .label2
16737 ST[L]XR scratch, newval, [mem]
16738 CBNZ scratch, .label1
16739 .label2:
16740 CMP rval, 0. */
16741 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16743 label1 = NULL;
16744 if (!is_weak)
16746 label1 = gen_label_rtx ();
16747 emit_label (label1);
16749 label2 = gen_label_rtx ();
16751 /* The initial load can be relaxed for a __sync operation since a final
16752 barrier will be emitted to stop code hoisting. */
16753 if (is_mm_sync (model))
16754 aarch64_emit_load_exclusive (mode, rval, mem,
16755 GEN_INT (MEMMODEL_RELAXED));
16756 else
16757 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16759 if (strong_zero_p)
16761 if (aarch64_track_speculation)
16763 /* Emit an explicit compare instruction, so that we can correctly
16764 track the condition codes. */
16765 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16766 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16768 else
16769 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16771 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16772 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16773 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16775 else
16777 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16778 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16779 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16780 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16781 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16784 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16786 if (!is_weak)
16788 if (aarch64_track_speculation)
16790 /* Emit an explicit compare instruction, so that we can correctly
16791 track the condition codes. */
16792 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16793 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16795 else
16796 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16798 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16799 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16800 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16802 else
16804 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16805 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16806 emit_insn (gen_rtx_SET (cond, x));
16809 emit_label (label2);
16810 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16811 to set the condition flags. If this is not used it will be removed by
16812 later passes. */
16813 if (strong_zero_p)
16815 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16816 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16817 emit_insn (gen_rtx_SET (cond, x));
16819 /* Emit any final barrier needed for a __sync operation. */
16820 if (is_mm_sync (model))
16821 aarch64_emit_post_barrier (model);
16824 /* Split an atomic operation. */
16826 void
16827 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16828 rtx value, rtx model_rtx, rtx cond)
16830 machine_mode mode = GET_MODE (mem);
16831 machine_mode wmode = (mode == DImode ? DImode : SImode);
16832 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16833 const bool is_sync = is_mm_sync (model);
16834 rtx_code_label *label;
16835 rtx x;
16837 /* Split the atomic operation into a sequence. */
16838 label = gen_label_rtx ();
16839 emit_label (label);
16841 if (new_out)
16842 new_out = gen_lowpart (wmode, new_out);
16843 if (old_out)
16844 old_out = gen_lowpart (wmode, old_out);
16845 else
16846 old_out = new_out;
16847 value = simplify_gen_subreg (wmode, value, mode, 0);
16849 /* The initial load can be relaxed for a __sync operation since a final
16850 barrier will be emitted to stop code hoisting. */
16851 if (is_sync)
16852 aarch64_emit_load_exclusive (mode, old_out, mem,
16853 GEN_INT (MEMMODEL_RELAXED));
16854 else
16855 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16857 switch (code)
16859 case SET:
16860 new_out = value;
16861 break;
16863 case NOT:
16864 x = gen_rtx_AND (wmode, old_out, value);
16865 emit_insn (gen_rtx_SET (new_out, x));
16866 x = gen_rtx_NOT (wmode, new_out);
16867 emit_insn (gen_rtx_SET (new_out, x));
16868 break;
16870 case MINUS:
16871 if (CONST_INT_P (value))
16873 value = GEN_INT (-INTVAL (value));
16874 code = PLUS;
16876 /* Fall through. */
16878 default:
16879 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16880 emit_insn (gen_rtx_SET (new_out, x));
16881 break;
16884 aarch64_emit_store_exclusive (mode, cond, mem,
16885 gen_lowpart (mode, new_out), model_rtx);
16887 if (aarch64_track_speculation)
16889 /* Emit an explicit compare instruction, so that we can correctly
16890 track the condition codes. */
16891 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16892 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16894 else
16895 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16897 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16898 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16899 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16901 /* Emit any final barrier needed for a __sync operation. */
16902 if (is_sync)
16903 aarch64_emit_post_barrier (model);
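/* As an illustration of aarch64_split_atomic_op, an atomic add expands to
   roughly the following loop (a sketch only; the exact instructions depend
   on CODE, the mode and MODEL):

       .label:
	 LD[A]XR	old, [mem]
	 ADD		new, old, value
	 ST[L]XR	cond, new, [mem]
	 CBNZ		cond, .label  */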
16906 static void
16907 aarch64_init_libfuncs (void)
16909 /* Half-precision float operations. The compiler handles all operations
16910 with NULL libfuncs by converting to SFmode. */
16912 /* Conversions. */
16913 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16914 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16916 /* Arithmetic. */
16917 set_optab_libfunc (add_optab, HFmode, NULL);
16918 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16919 set_optab_libfunc (smul_optab, HFmode, NULL);
16920 set_optab_libfunc (neg_optab, HFmode, NULL);
16921 set_optab_libfunc (sub_optab, HFmode, NULL);
16923 /* Comparisons. */
16924 set_optab_libfunc (eq_optab, HFmode, NULL);
16925 set_optab_libfunc (ne_optab, HFmode, NULL);
16926 set_optab_libfunc (lt_optab, HFmode, NULL);
16927 set_optab_libfunc (le_optab, HFmode, NULL);
16928 set_optab_libfunc (ge_optab, HFmode, NULL);
16929 set_optab_libfunc (gt_optab, HFmode, NULL);
16930 set_optab_libfunc (unord_optab, HFmode, NULL);
16933 /* Target hook for c_mode_for_suffix. */
16934 static machine_mode
16935 aarch64_c_mode_for_suffix (char suffix)
16937 if (suffix == 'q')
16938 return TFmode;
16940 return VOIDmode;
16943 /* We can only represent floating point constants which will fit in
16944 "quarter-precision" values. These values are characterised by
16945 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
16948 (-1)^s * (n/16) * 2^r
16950 Where:
16951 's' is the sign bit.
16952 'n' is an integer in the range 16 <= n <= 31.
16953 'r' is an integer in the range -3 <= r <= 4. */
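/* For example, 1.0 is (-1)^0 * (16/16) * 2^0 (n = 16, r = 0), -1.5 is
   (-1)^1 * (24/16) * 2^0, the smallest representable positive value is
   (16/16) * 2^-3 = 0.125 and the largest is (31/16) * 2^4 = 31.0.  */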
16955 /* Return true iff X can be represented by a quarter-precision
16956 floating point immediate operand.  Note, we cannot represent 0.0. */
16957 bool
16958 aarch64_float_const_representable_p (rtx x)
16960 /* This represents our current view of how many bits
16961 make up the mantissa. */
16962 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16963 int exponent;
16964 unsigned HOST_WIDE_INT mantissa, mask;
16965 REAL_VALUE_TYPE r, m;
16966 bool fail;
16968 if (!CONST_DOUBLE_P (x))
16969 return false;
16971 if (GET_MODE (x) == VOIDmode
16972 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16973 return false;
16975 r = *CONST_DOUBLE_REAL_VALUE (x);
16977 /* We cannot represent infinities, NaNs or +/-zero. We won't
16978 know if we have +zero until we analyse the mantissa, but we
16979 can reject the other invalid values. */
16980 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16981 || REAL_VALUE_MINUS_ZERO (r))
16982 return false;
16984 /* Extract exponent. */
16985 r = real_value_abs (&r);
16986 exponent = REAL_EXP (&r);
16988 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16989 highest (sign) bit, with a fixed binary point at bit point_pos.
16990 The low HOST_WIDE_INT of W below holds the low part of the mantissa, the high one the high part.
16991 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16992 bits for the mantissa, this can fail (low bits will be lost). */
16993 real_ldexp (&m, &r, point_pos - exponent);
16994 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16996 /* If the low part of the mantissa has bits set we cannot represent
16997 the value. */
16998 if (w.ulow () != 0)
16999 return false;
17000 /* We have rejected the lower HOST_WIDE_INT, so update our
17001 understanding of how many bits lie in the mantissa and
17002 look only at the high HOST_WIDE_INT. */
17003 mantissa = w.elt (1);
17004 point_pos -= HOST_BITS_PER_WIDE_INT;
17006 /* We can only represent values with a mantissa of the form 1.xxxx. */
17007 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17008 if ((mantissa & mask) != 0)
17009 return false;
17011 /* Having filtered unrepresentable values, we may now remove all
17012 but the highest 5 bits. */
17013 mantissa >>= point_pos - 5;
17015 /* We cannot represent the value 0.0, so reject it. This is handled
17016 elsewhere. */
17017 if (mantissa == 0)
17018 return false;
17020 /* Then, as bit 4 is always set, we can mask it off, leaving
17021 the mantissa in the range [0, 15]. */
17022 mantissa &= ~(1 << 4);
17023 gcc_assert (mantissa <= 15);
17025 /* GCC internally does not use IEEE754-like encoding (where normalized
17026 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17027 Our mantissa values are shifted 4 places to the left relative to
17028 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17029 by 5 places to correct for GCC's representation. */
17030 exponent = 5 - exponent;
17032 return (exponent >= 0 && exponent <= 7);
17035 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17036 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17037 output MOVI/MVNI, ORR or BIC immediate. */
17038 char*
17039 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17040 enum simd_immediate_check which)
17042 bool is_valid;
17043 static char templ[40];
17044 const char *mnemonic;
17045 const char *shift_op;
17046 unsigned int lane_count = 0;
17047 char element_char;
17049 struct simd_immediate_info info;
17051 /* This will return true to show const_vector is legal for use as either
17052 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17053 It will also update INFO to show how the immediate should be generated.
17054 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17055 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17056 gcc_assert (is_valid);
17058 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17059 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17061 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17063 gcc_assert (info.insn == simd_immediate_info::MOV
17064 && info.u.mov.shift == 0);
17065 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17066 move immediate path. */
17067 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17068 info.u.mov.value = GEN_INT (0);
17069 else
17071 const unsigned int buf_size = 20;
17072 char float_buf[buf_size] = {'\0'};
17073 real_to_decimal_for_mode (float_buf,
17074 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17075 buf_size, buf_size, 1, info.elt_mode);
17077 if (lane_count == 1)
17078 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17079 else
17080 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17081 lane_count, element_char, float_buf);
17082 return templ;
17086 gcc_assert (CONST_INT_P (info.u.mov.value));
17088 if (which == AARCH64_CHECK_MOV)
17090 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17091 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17092 ? "msl" : "lsl");
17093 if (lane_count == 1)
17094 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17095 mnemonic, UINTVAL (info.u.mov.value));
17096 else if (info.u.mov.shift)
17097 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17098 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17099 element_char, UINTVAL (info.u.mov.value), shift_op,
17100 info.u.mov.shift);
17101 else
17102 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17103 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17104 element_char, UINTVAL (info.u.mov.value));
17106 else
17108 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17109 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17110 if (info.u.mov.shift)
17111 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17112 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17113 element_char, UINTVAL (info.u.mov.value), "lsl",
17114 info.u.mov.shift);
17115 else
17116 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17117 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17118 element_char, UINTVAL (info.u.mov.value));
17120 return templ;
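/* Illustrative outputs of the above (the exact strings depend on the
   immediate and on WHICH): "movi v0.4s, 0x2a", "mvni v0.8h, 0x1, lsl 8",
   "orr v0.4s, #255, lsl #8" and "bic v0.2s, #240".  */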
17123 char*
17124 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17127 /* If a floating point number was passed and we desire to use it in an
17128 integer mode do the conversion to integer. */
17129 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17131 unsigned HOST_WIDE_INT ival;
17132 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17133 gcc_unreachable ();
17134 immediate = gen_int_mode (ival, mode);
17137 machine_mode vmode;
17138 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
17139 use a 128-bit vector mode. */
17140 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17142 vmode = aarch64_simd_container_mode (mode, width);
17143 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17144 return aarch64_output_simd_mov_immediate (v_op, width);
17147 /* Return the output string to use for moving immediate CONST_VECTOR
17148 into an SVE register. */
17150 char *
17151 aarch64_output_sve_mov_immediate (rtx const_vector)
17153 static char templ[40];
17154 struct simd_immediate_info info;
17155 char element_char;
17157 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17158 gcc_assert (is_valid);
17160 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17162 machine_mode vec_mode = GET_MODE (const_vector);
17163 if (aarch64_sve_pred_mode_p (vec_mode))
17165 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17166 if (info.insn == simd_immediate_info::MOV)
17168 gcc_assert (info.u.mov.value == const0_rtx);
17169 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17171 else
17173 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17174 unsigned int total_bytes;
17175 if (info.u.pattern == AARCH64_SV_ALL
17176 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17177 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17178 total_bytes / GET_MODE_SIZE (info.elt_mode));
17179 else
17180 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17181 svpattern_token (info.u.pattern));
17183 return buf;
17186 if (info.insn == simd_immediate_info::INDEX)
17188 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17189 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17190 element_char, INTVAL (info.u.index.base),
17191 INTVAL (info.u.index.step));
17192 return templ;
17195 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17197 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17198 info.u.mov.value = GEN_INT (0);
17199 else
17201 const int buf_size = 20;
17202 char float_buf[buf_size] = {};
17203 real_to_decimal_for_mode (float_buf,
17204 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17205 buf_size, buf_size, 1, info.elt_mode);
17207 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17208 element_char, float_buf);
17209 return templ;
17213 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17214 element_char, INTVAL (info.u.mov.value));
17215 return templ;
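/* Illustrative outputs of the above: "pfalse p0.b" for an all-false
   predicate, "ptrue p0.s, vl4" for an all-true predicate when the vector
   length is a compile-time constant (128-bit SVE), "ptrue p0.h, all"
   otherwise, "index z0.s, #0, #1" for a linear series, "mov z0.b, #1" for
   a duplicated integer, and an FMOV whose constant text is produced by
   real_to_decimal_for_mode for a duplicated float.  */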
17218 /* Split operands into moves from op[1] + op[2] into op[0]. */
17220 void
17221 aarch64_split_combinev16qi (rtx operands[3])
17223 unsigned int dest = REGNO (operands[0]);
17224 unsigned int src1 = REGNO (operands[1]);
17225 unsigned int src2 = REGNO (operands[2]);
17226 machine_mode halfmode = GET_MODE (operands[1]);
17227 unsigned int halfregs = REG_NREGS (operands[1]);
17228 rtx destlo, desthi;
17230 gcc_assert (halfmode == V16QImode);
17232 if (src1 == dest && src2 == dest + halfregs)
17234 /* No-op move. Can't split to nothing; emit something. */
17235 emit_note (NOTE_INSN_DELETED);
17236 return;
17239 /* Preserve register attributes for variable tracking. */
17240 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17241 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17242 GET_MODE_SIZE (halfmode));
17244 /* Special case of reversed high/low parts. */
17245 if (reg_overlap_mentioned_p (operands[2], destlo)
17246 && reg_overlap_mentioned_p (operands[1], desthi))
17248 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17249 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17250 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17252 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17254 /* Try to avoid unnecessary moves if part of the result
17255 is in the right place already. */
17256 if (src1 != dest)
17257 emit_move_insn (destlo, operands[1]);
17258 if (src2 != dest + halfregs)
17259 emit_move_insn (desthi, operands[2]);
17261 else
17263 if (src2 != dest + halfregs)
17264 emit_move_insn (desthi, operands[2]);
17265 if (src1 != dest)
17266 emit_move_insn (destlo, operands[1]);
17270 /* vec_perm support. */
17272 struct expand_vec_perm_d
17274 rtx target, op0, op1;
17275 vec_perm_indices perm;
17276 machine_mode vmode;
17277 unsigned int vec_flags;
17278 bool one_vector_p;
17279 bool testing_p;
17282 /* Generate a variable permutation. */
17284 static void
17285 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17287 machine_mode vmode = GET_MODE (target);
17288 bool one_vector_p = rtx_equal_p (op0, op1);
17290 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17291 gcc_checking_assert (GET_MODE (op0) == vmode);
17292 gcc_checking_assert (GET_MODE (op1) == vmode);
17293 gcc_checking_assert (GET_MODE (sel) == vmode);
17294 gcc_checking_assert (TARGET_SIMD);
17296 if (one_vector_p)
17298 if (vmode == V8QImode)
17300 /* Expand the argument to a V16QI mode by duplicating it. */
17301 rtx pair = gen_reg_rtx (V16QImode);
17302 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17303 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17305 else
17307 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17310 else
17312 rtx pair;
17314 if (vmode == V8QImode)
17316 pair = gen_reg_rtx (V16QImode);
17317 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17318 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17320 else
17322 pair = gen_reg_rtx (OImode);
17323 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17324 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17329 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17330 NELT is the number of elements in the vector. */
17332 void
17333 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17334 unsigned int nelt)
17336 machine_mode vmode = GET_MODE (target);
17337 bool one_vector_p = rtx_equal_p (op0, op1);
17338 rtx mask;
17340 /* The TBL instruction does not use a modulo index, so we must take care
17341 of that ourselves. */
17342 mask = aarch64_simd_gen_const_vector_dup (vmode,
17343 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17344 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17346 /* For big-endian, we also need to reverse the index within the vector
17347 (but not which vector). */
17348 if (BYTES_BIG_ENDIAN)
17350 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17351 if (!one_vector_p)
17352 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17353 sel = expand_simple_binop (vmode, XOR, sel, mask,
17354 NULL, 0, OPTAB_LIB_WIDEN);
17356 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
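/* Worked example of the masking above: for V8QI with a single input vector
   an index of 11 is ANDed with 7 to give 3, while with two input vectors it
   is ANDed with 15.  On big-endian targets the extra XOR with nelt - 1 then
   reverses the position of the element within each input vector, without
   changing which vector is selected.  */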
17359 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17361 static void
17362 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17364 emit_insn (gen_rtx_SET (target,
17365 gen_rtx_UNSPEC (GET_MODE (target),
17366 gen_rtvec (2, op0, op1), code)));
17369 /* Expand an SVE vec_perm with the given operands. */
17371 void
17372 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17374 machine_mode data_mode = GET_MODE (target);
17375 machine_mode sel_mode = GET_MODE (sel);
17376 /* Enforced by the pattern condition. */
17377 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17379 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17380 size of the two value vectors, i.e. the upper bits of the indices
17381 are effectively ignored. SVE TBL instead produces 0 for any
17382 out-of-range indices, so we need to modulo all the vec_perm indices
17383 to ensure they are all in range. */
17384 rtx sel_reg = force_reg (sel_mode, sel);
17386 /* Check if the sel only references the first values vector. */
17387 if (GET_CODE (sel) == CONST_VECTOR
17388 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17390 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17391 return;
17394 /* Check if the two values vectors are the same. */
17395 if (rtx_equal_p (op0, op1))
17397 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17398 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17399 NULL, 0, OPTAB_DIRECT);
17400 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17401 return;
17404 /* Run TBL on each value vector and combine the results. */
17406 rtx res0 = gen_reg_rtx (data_mode);
17407 rtx res1 = gen_reg_rtx (data_mode);
17408 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17409 if (GET_CODE (sel) != CONST_VECTOR
17410 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17412 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17413 2 * nunits - 1);
17414 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17415 NULL, 0, OPTAB_DIRECT);
17417 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17418 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17419 NULL, 0, OPTAB_DIRECT);
17420 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17421 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17422 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17423 else
17424 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
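/* Worked example of the general case above, for 4 elements per vector and
   SEL = { 1, 4, 6, 3 }: the first TBL uses { 1, 4, 6, 3 } on OP0 and gives
   { op0[1], 0, 0, op0[3] }, because 4 and 6 are out of range; adding
   -nunits gives { -3, 0, 2, -1 }, so the second TBL on OP1 gives
   { 0, op1[0], op1[2], 0 }; ORing the two results yields the permutation
   { op0[1], op1[0], op1[2], op0[3] }.  */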
17427 /* Recognize patterns suitable for the TRN instructions. */
17428 static bool
17429 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17431 HOST_WIDE_INT odd;
17432 poly_uint64 nelt = d->perm.length ();
17433 rtx out, in0, in1, x;
17434 machine_mode vmode = d->vmode;
17436 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17437 return false;
17439 /* Note that these are little-endian tests.
17440 We correct for big-endian later. */
17441 if (!d->perm[0].is_constant (&odd)
17442 || (odd != 0 && odd != 1)
17443 || !d->perm.series_p (0, 2, odd, 2)
17444 || !d->perm.series_p (1, 2, nelt + odd, 2))
17445 return false;
17447 /* Success! */
17448 if (d->testing_p)
17449 return true;
17451 in0 = d->op0;
17452 in1 = d->op1;
17453 /* We don't need a big-endian lane correction for SVE; see the comment
17454 at the head of aarch64-sve.md for details. */
17455 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17457 x = in0, in0 = in1, in1 = x;
17458 odd = !odd;
17460 out = d->target;
17462 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17463 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17464 return true;
17467 /* Recognize patterns suitable for the UZP instructions. */
17468 static bool
17469 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17471 HOST_WIDE_INT odd;
17472 rtx out, in0, in1, x;
17473 machine_mode vmode = d->vmode;
17475 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17476 return false;
17478 /* Note that these are little-endian tests.
17479 We correct for big-endian later. */
17480 if (!d->perm[0].is_constant (&odd)
17481 || (odd != 0 && odd != 1)
17482 || !d->perm.series_p (0, 1, odd, 2))
17483 return false;
17485 /* Success! */
17486 if (d->testing_p)
17487 return true;
17489 in0 = d->op0;
17490 in1 = d->op1;
17491 /* We don't need a big-endian lane correction for SVE; see the comment
17492 at the head of aarch64-sve.md for details. */
17493 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17495 x = in0, in0 = in1, in1 = x;
17496 odd = !odd;
17498 out = d->target;
17500 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17501 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17502 return true;
17505 /* Recognize patterns suitable for the ZIP instructions. */
17506 static bool
17507 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17509 unsigned int high;
17510 poly_uint64 nelt = d->perm.length ();
17511 rtx out, in0, in1, x;
17512 machine_mode vmode = d->vmode;
17514 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17515 return false;
17517 /* Note that these are little-endian tests.
17518 We correct for big-endian later. */
17519 poly_uint64 first = d->perm[0];
17520 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17521 || !d->perm.series_p (0, 2, first, 1)
17522 || !d->perm.series_p (1, 2, first + nelt, 1))
17523 return false;
17524 high = maybe_ne (first, 0U);
17526 /* Success! */
17527 if (d->testing_p)
17528 return true;
17530 in0 = d->op0;
17531 in1 = d->op1;
17532 /* We don't need a big-endian lane correction for SVE; see the comment
17533 at the head of aarch64-sve.md for details. */
17534 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17536 x = in0, in0 = in1, in1 = x;
17537 high = !high;
17539 out = d->target;
17541 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17542 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17543 return true;
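/* For reference, with V4SI operands the little-endian index patterns
   recognized by the three functions above are: TRN1 { 0, 4, 2, 6 },
   TRN2 { 1, 5, 3, 7 }, UZP1 { 0, 2, 4, 6 }, UZP2 { 1, 3, 5, 7 },
   ZIP1 { 0, 4, 1, 5 } and ZIP2 { 2, 6, 3, 7 }.  */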
17546 /* Recognize patterns for the EXT insn. */
17548 static bool
17549 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17551 HOST_WIDE_INT location;
17552 rtx offset;
17554 /* The first element always refers to the first vector.
17555 Check if the extracted indices are increasing by one. */
17556 if (d->vec_flags == VEC_SVE_PRED
17557 || !d->perm[0].is_constant (&location)
17558 || !d->perm.series_p (0, 1, location, 1))
17559 return false;
17561 /* Success! */
17562 if (d->testing_p)
17563 return true;
17565 /* The case where (location == 0) is a no-op for both big- and little-endian,
17566 and is removed by the mid-end at optimization levels -O1 and higher.
17568 We don't need a big-endian lane correction for SVE; see the comment
17569 at the head of aarch64-sve.md for details. */
17570 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17572 /* After setup, we want the high elements of the first vector (stored
17573 at the LSB end of the register), and the low elements of the second
17574 vector (stored at the MSB end of the register). So swap. */
17575 std::swap (d->op0, d->op1);
17576 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17577 to_constant () is safe since this is restricted to Advanced SIMD
17578 vectors. */
17579 location = d->perm.length ().to_constant () - location;
17582 offset = GEN_INT (location);
17583 emit_set_insn (d->target,
17584 gen_rtx_UNSPEC (d->vmode,
17585 gen_rtvec (3, d->op0, d->op1, offset),
17586 UNSPEC_EXT));
17587 return true;
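/* For example, the V4SI permutation { 1, 2, 3, 4 } (LOCATION 1) selects the
   last three elements of the first vector followed by the first element of
   the second, which on little-endian corresponds to an EXT with a 4-byte
   offset.  */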
17590 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17591 within each 64-bit, 32-bit or 16-bit granule. */
17593 static bool
17594 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17596 HOST_WIDE_INT diff;
17597 unsigned int i, size, unspec;
17598 machine_mode pred_mode;
17600 if (d->vec_flags == VEC_SVE_PRED
17601 || !d->one_vector_p
17602 || !d->perm[0].is_constant (&diff))
17603 return false;
17605 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17606 if (size == 8)
17608 unspec = UNSPEC_REV64;
17609 pred_mode = VNx2BImode;
17611 else if (size == 4)
17613 unspec = UNSPEC_REV32;
17614 pred_mode = VNx4BImode;
17616 else if (size == 2)
17618 unspec = UNSPEC_REV16;
17619 pred_mode = VNx8BImode;
17621 else
17622 return false;
17624 unsigned int step = diff + 1;
17625 for (i = 0; i < step; ++i)
17626 if (!d->perm.series_p (i, step, diff - i, step))
17627 return false;
17629 /* Success! */
17630 if (d->testing_p)
17631 return true;
17633 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17634 if (d->vec_flags == VEC_SVE_DATA)
17636 rtx pred = aarch64_ptrue_reg (pred_mode);
17637 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17638 UNSPEC_PRED_X);
17640 emit_set_insn (d->target, src);
17641 return true;
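/* For example, the V8HI permutation { 3, 2, 1, 0, 7, 6, 5, 4 } has
   diff == 3, so size == 8 and it is matched as REV64: each group of four
   16-bit elements is reversed within its 64-bit granule.  */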
17644 /* Recognize patterns for the REV insn, which reverses elements within
17645 a full vector. */
17647 static bool
17648 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17650 poly_uint64 nelt = d->perm.length ();
17652 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17653 return false;
17655 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17656 return false;
17658 /* Success! */
17659 if (d->testing_p)
17660 return true;
17662 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17663 emit_set_insn (d->target, src);
17664 return true;
17667 static bool
17668 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17670 rtx out = d->target;
17671 rtx in0;
17672 HOST_WIDE_INT elt;
17673 machine_mode vmode = d->vmode;
17674 rtx lane;
17676 if (d->vec_flags == VEC_SVE_PRED
17677 || d->perm.encoding ().encoded_nelts () != 1
17678 || !d->perm[0].is_constant (&elt))
17679 return false;
17681 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17682 return false;
17684 /* Success! */
17685 if (d->testing_p)
17686 return true;
17688 /* The generic preparation in aarch64_expand_vec_perm_const_1
17689 swaps the operand order and the permute indices if it finds
17690 d->perm[0] to be in the second operand. Thus, we can always
17691 use d->op0 and need not do any extra arithmetic to get the
17692 correct lane number. */
17693 in0 = d->op0;
17694 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17696 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17697 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17698 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17699 return true;
17702 static bool
17703 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17705 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17706 machine_mode vmode = d->vmode;
17708 /* Make sure that the indices are constant. */
17709 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17710 for (unsigned int i = 0; i < encoded_nelts; ++i)
17711 if (!d->perm[i].is_constant ())
17712 return false;
17714 if (d->testing_p)
17715 return true;
17717 /* Generic code will try constant permutation twice: once with the
17718 original mode and again with the elements lowered to QImode.
17719 So wait and don't do the selector expansion ourselves. */
17720 if (vmode != V8QImode && vmode != V16QImode)
17721 return false;
17723 /* to_constant is safe since this routine is specific to Advanced SIMD
17724 vectors. */
17725 unsigned int nelt = d->perm.length ().to_constant ();
17726 for (unsigned int i = 0; i < nelt; ++i)
17727 /* If big-endian and two vectors we end up with a weird mixed-endian
17728 mode on NEON. Reverse the index within each word but not the word
17729 itself. to_constant is safe because we checked is_constant above. */
17730 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17731 ? d->perm[i].to_constant () ^ (nelt - 1)
17732 : d->perm[i].to_constant ());
17734 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17735 sel = force_reg (vmode, sel);
17737 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17738 return true;
17741 /* Try to implement D using an SVE TBL instruction. */
17743 static bool
17744 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17746 unsigned HOST_WIDE_INT nelt;
17748 /* Permuting two variable-length vectors could overflow the
17749 index range. */
17750 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17751 return false;
17753 if (d->testing_p)
17754 return true;
17756 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17757 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17758 if (d->one_vector_p)
17759 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17760 else
17761 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17762 return true;
17765 static bool
17766 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17768 /* The pattern matching functions above are written to look for a small
17769 number to begin the sequence (0, 1, N/2). If we begin with an index
17770 from the second operand, we can swap the operands. */
17771 poly_int64 nelt = d->perm.length ();
17772 if (known_ge (d->perm[0], nelt))
17774 d->perm.rotate_inputs (1);
17775 std::swap (d->op0, d->op1);
17778 if ((d->vec_flags == VEC_ADVSIMD
17779 || d->vec_flags == VEC_SVE_DATA
17780 || d->vec_flags == VEC_SVE_PRED)
17781 && known_gt (nelt, 1))
17783 if (aarch64_evpc_rev_local (d))
17784 return true;
17785 else if (aarch64_evpc_rev_global (d))
17786 return true;
17787 else if (aarch64_evpc_ext (d))
17788 return true;
17789 else if (aarch64_evpc_dup (d))
17790 return true;
17791 else if (aarch64_evpc_zip (d))
17792 return true;
17793 else if (aarch64_evpc_uzp (d))
17794 return true;
17795 else if (aarch64_evpc_trn (d))
17796 return true;
17797 if (d->vec_flags == VEC_SVE_DATA)
17798 return aarch64_evpc_sve_tbl (d);
17799 else if (d->vec_flags == VEC_ADVSIMD)
17800 return aarch64_evpc_tbl (d);
17802 return false;
17805 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17807 static bool
17808 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17809 rtx op1, const vec_perm_indices &sel)
17811 struct expand_vec_perm_d d;
17813 /* Check whether the mask can be applied to a single vector. */
17814 if (sel.ninputs () == 1
17815 || (op0 && rtx_equal_p (op0, op1)))
17816 d.one_vector_p = true;
17817 else if (sel.all_from_input_p (0))
17819 d.one_vector_p = true;
17820 op1 = op0;
17822 else if (sel.all_from_input_p (1))
17824 d.one_vector_p = true;
17825 op0 = op1;
17827 else
17828 d.one_vector_p = false;
17830 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17831 sel.nelts_per_input ());
17832 d.vmode = vmode;
17833 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17834 d.target = target;
17835 d.op0 = op0;
17836 d.op1 = op1;
17837 d.testing_p = !target;
17839 if (!d.testing_p)
17840 return aarch64_expand_vec_perm_const_1 (&d);
17842 rtx_insn *last = get_last_insn ();
17843 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17844 gcc_assert (last == get_last_insn ());
17846 return ret;
17849 /* Generate a byte permute mask for a register of mode MODE,
17850 which has NUNITS units. */
17852 rtx
17853 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17855 /* We have to reverse each vector because we don't have
17856 a permuted load that can reverse-load according to ABI rules. */
17857 rtx mask;
17858 rtvec v = rtvec_alloc (16);
17859 unsigned int i, j;
17860 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17862 gcc_assert (BYTES_BIG_ENDIAN);
17863 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17865 for (i = 0; i < nunits; i++)
17866 for (j = 0; j < usize; j++)
17867 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17868 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17869 return force_reg (V16QImode, mask);
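/* For example, for V8HImode (usize == 2) the mask built above is the byte
   sequence { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 },
   i.e. the two bytes of each 16-bit element are swapped.  */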
17872 /* Expand an SVE integer comparison using the SVE equivalent of:
17874 (set TARGET (CODE OP0 OP1)). */
17876 void
17877 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17879 machine_mode pred_mode = GET_MODE (target);
17880 machine_mode data_mode = GET_MODE (op0);
17881 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
17882 op0, op1);
17883 if (!rtx_equal_p (target, res))
17884 emit_move_insn (target, res);
17887 /* Return the UNSPEC_COND_* code for comparison CODE. */
17889 static unsigned int
17890 aarch64_unspec_cond_code (rtx_code code)
17892 switch (code)
17894 case NE:
17895 return UNSPEC_COND_FCMNE;
17896 case EQ:
17897 return UNSPEC_COND_FCMEQ;
17898 case LT:
17899 return UNSPEC_COND_FCMLT;
17900 case GT:
17901 return UNSPEC_COND_FCMGT;
17902 case LE:
17903 return UNSPEC_COND_FCMLE;
17904 case GE:
17905 return UNSPEC_COND_FCMGE;
17906 case UNORDERED:
17907 return UNSPEC_COND_FCMUO;
17908 default:
17909 gcc_unreachable ();
17913 /* Emit:
17915 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17917 where <X> is the operation associated with comparison CODE.
17918 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17920 static void
17921 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
17922 bool known_ptrue_p, rtx op0, rtx op1)
17924 rtx flag = gen_int_mode (known_ptrue_p, SImode);
17925 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17926 gen_rtvec (4, pred, flag, op0, op1),
17927 aarch64_unspec_cond_code (code));
17928 emit_set_insn (target, unspec);
17931 /* Emit the SVE equivalent of:
17933 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
17934 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
17935 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17937 where <Xi> is the operation associated with comparison CODEi.
17938 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17940 static void
17941 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
17942 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
17944 machine_mode pred_mode = GET_MODE (pred);
17945 rtx tmp1 = gen_reg_rtx (pred_mode);
17946 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
17947 rtx tmp2 = gen_reg_rtx (pred_mode);
17948 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
17949 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17952 /* Emit the SVE equivalent of:
17954 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17955 (set TARGET (not TMP))
17957 where <X> is the operation associated with comparison CODE.
17958 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17960 static void
17961 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
17962 bool known_ptrue_p, rtx op0, rtx op1)
17964 machine_mode pred_mode = GET_MODE (pred);
17965 rtx tmp = gen_reg_rtx (pred_mode);
17966 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
17967 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17970 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17972 (set TARGET (CODE OP0 OP1))
17974 If CAN_INVERT_P is true, the caller can also handle inverted results;
17975 return true if the result is in fact inverted. */
17977 bool
17978 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17979 rtx op0, rtx op1, bool can_invert_p)
17981 machine_mode pred_mode = GET_MODE (target);
17982 machine_mode data_mode = GET_MODE (op0);
17984 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17985 switch (code)
17987 case UNORDERED:
17988 /* UNORDERED has no immediate form. */
17989 op1 = force_reg (data_mode, op1);
17990 /* fall through */
17991 case LT:
17992 case LE:
17993 case GT:
17994 case GE:
17995 case EQ:
17996 case NE:
17998 /* There is native support for the comparison. */
17999 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18000 return false;
18003 case LTGT:
18004 /* This is a trapping operation (LT or GT). */
18005 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18006 return false;
18008 case UNEQ:
18009 if (!flag_trapping_math)
18011 /* This would trap for signaling NaNs. */
18012 op1 = force_reg (data_mode, op1);
18013 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18014 ptrue, true, op0, op1);
18015 return false;
18017 /* fall through */
18018 case UNLT:
18019 case UNLE:
18020 case UNGT:
18021 case UNGE:
18022 if (flag_trapping_math)
18024 /* Work out which elements are ordered. */
18025 rtx ordered = gen_reg_rtx (pred_mode);
18026 op1 = force_reg (data_mode, op1);
18027 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18028 ptrue, true, op0, op1);
18030 /* Test the opposite condition for the ordered elements,
18031 then invert the result. */
18032 if (code == UNEQ)
18033 code = NE;
18034 else
18035 code = reverse_condition_maybe_unordered (code);
18036 if (can_invert_p)
18038 aarch64_emit_sve_fp_cond (target, code,
18039 ordered, false, op0, op1);
18040 return true;
18042 aarch64_emit_sve_invert_fp_cond (target, code,
18043 ordered, false, op0, op1);
18044 return false;
18046 break;
18048 case ORDERED:
18049 /* ORDERED has no immediate form. */
18050 op1 = force_reg (data_mode, op1);
18051 break;
18053 default:
18054 gcc_unreachable ();
18057 /* There is native support for the inverse comparison. */
18058 code = reverse_condition_maybe_unordered (code);
18059 if (can_invert_p)
18061 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18062 return true;
18064 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18065 return false;
18068 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18069 of the data being selected and CMP_MODE is the mode of the values being
18070 compared. */
18072 void
18073 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18074 rtx *ops)
18076 machine_mode pred_mode
18077 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18078 GET_MODE_SIZE (cmp_mode)).require ();
18079 rtx pred = gen_reg_rtx (pred_mode);
18080 if (FLOAT_MODE_P (cmp_mode))
18082 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18083 ops[4], ops[5], true))
18084 std::swap (ops[1], ops[2]);
18086 else
18087 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18089 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18090 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18093 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18094 true. However, due to issues with register allocation it is preferable
18095 to avoid tying integer scalar and FP scalar modes. Executing integer
18096 operations in general registers is better than treating them as scalar
18097 vector operations. This reduces latency and avoids redundant int<->FP
18098 moves. So tie modes if they are either the same class, or vector modes
18099 with other vector modes, vector structs or any scalar mode. */
18101 static bool
18102 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18104 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18105 return true;
18107 /* We specifically want to allow elements of "structure" modes to
18108 be tieable to the structure. This more general condition allows
18109 other rarer situations too. The reason we don't extend this to
18110 predicate modes is that there are no predicate structure modes
18111 nor any specific instructions for extracting part of a predicate
18112 register. */
18113 if (aarch64_vector_data_mode_p (mode1)
18114 && aarch64_vector_data_mode_p (mode2))
18115 return true;
18117 /* Also allow any scalar modes with vectors. */
18118 if (aarch64_vector_mode_supported_p (mode1)
18119 || aarch64_vector_mode_supported_p (mode2))
18120 return true;
18122 return false;
18125 /* Return a new RTX holding the result of moving POINTER forward by
18126 AMOUNT bytes. */
18128 static rtx
18129 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18131 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18133 return adjust_automodify_address (pointer, GET_MODE (pointer),
18134 next, amount);
18137 /* Return a new RTX holding the result of moving POINTER forward by the
18138 size of the mode it points to. */
18140 static rtx
18141 aarch64_progress_pointer (rtx pointer)
18143 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18146 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18147 MODE bytes. */
18149 static void
18150 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18151 machine_mode mode)
18153 rtx reg = gen_reg_rtx (mode);
18155 /* "Cast" the pointers to the correct mode. */
18156 *src = adjust_address (*src, mode, 0);
18157 *dst = adjust_address (*dst, mode, 0);
18158 /* Emit the memcpy. */
18159 emit_move_insn (reg, *src);
18160 emit_move_insn (*dst, reg);
18161 /* Move the pointers forward. */
18162 *src = aarch64_progress_pointer (*src);
18163 *dst = aarch64_progress_pointer (*dst);
18166 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18167 we succeed, otherwise return false. */
18169 bool
18170 aarch64_expand_cpymem (rtx *operands)
18172 int n, mode_bits;
18173 rtx dst = operands[0];
18174 rtx src = operands[1];
18175 rtx base;
18176 machine_mode cur_mode = BLKmode, next_mode;
18177 bool speed_p = !optimize_function_for_size_p (cfun);
18179 /* When optimizing for size, give a better estimate of the length of a
18180 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18181 will always require an even number of instructions, and each
18182 operation requires both a load and a store, so divide the max number by 2. */
18183 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18185 /* We can't do anything smart if the amount to copy is not constant. */
18186 if (!CONST_INT_P (operands[2]))
18187 return false;
18189 n = INTVAL (operands[2]);
18191 /* Try to keep the number of instructions low. For all cases we will do at
18192 most two moves for the residual amount, since we'll always overlap the
18193 remainder. */
18194 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18195 return false;
18197 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18198 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18200 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18201 src = adjust_automodify_address (src, VOIDmode, base, 0);
18203 /* Convert n to bits to make the rest of the code simpler. */
18204 n = n * BITS_PER_UNIT;
18206 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18207 larger than TImode, but we should not use them for loads/stores here. */
18208 const int copy_limit = GET_MODE_BITSIZE (TImode);
18210 while (n > 0)
18212 /* Find the largest mode in which to do the copy without over-reading
18213 or over-writing. */
18214 opt_scalar_int_mode mode_iter;
18215 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18216 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18217 cur_mode = mode_iter.require ();
18219 gcc_assert (cur_mode != BLKmode);
18221 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18222 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18224 n -= mode_bits;
18226 /* Do certain trailing copies as overlapping if it's going to be
18227 cheaper, i.e. fewer instructions. For instance, for a 15-byte
18228 copy it's more efficient to do two overlapping 8-byte copies than
18229 8 + 6 + 1. */
18230 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18232 next_mode = smallest_mode_for_size (n, MODE_INT);
18233 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18234 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18235 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18236 n = n_bits;
18240 return true;
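/* Worked example of the loop above: a 23-byte copy (when not optimizing for
   size) is emitted as a 16-byte TImode copy at offset 0 followed by an
   8-byte DImode copy at offset 15, the trailing copy overlapping the first
   by one byte rather than using separate 4-, 2- and 1-byte copies.  */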
18243 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18244 SImode stores. Handle the case when the constant has identical
18245 bottom and top halves. This is beneficial when the two stores can be
18246 merged into an STP and we avoid synthesising potentially expensive
18247 immediates twice. Return true if such a split is possible. */
18249 bool
18250 aarch64_split_dimode_const_store (rtx dst, rtx src)
18252 rtx lo = gen_lowpart (SImode, src);
18253 rtx hi = gen_highpart_mode (SImode, DImode, src);
18255 bool size_p = optimize_function_for_size_p (cfun);
18257 if (!rtx_equal_p (lo, hi))
18258 return false;
18260 unsigned int orig_cost
18261 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18262 unsigned int lo_cost
18263 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18265 /* We want to transform:
18266 MOV x1, 49370
18267 MOVK x1, 0x140, lsl 16
18268 MOVK x1, 0xc0da, lsl 32
18269 MOVK x1, 0x140, lsl 48
18270 STR x1, [x0]
18271 into:
18272 MOV w1, 49370
18273 MOVK w1, 0x140, lsl 16
18274 STP w1, w1, [x0]
18275 So we want to perform this only when we save two instructions
18276 or more. When optimizing for size, however, accept any code size
18277 savings we can. */
18278 if (size_p && orig_cost <= lo_cost)
18279 return false;
18281 if (!size_p
18282 && (orig_cost <= lo_cost + 1))
18283 return false;
18285 rtx mem_lo = adjust_address (dst, SImode, 0);
18286 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18287 return false;
18289 rtx tmp_reg = gen_reg_rtx (SImode);
18290 aarch64_expand_mov_immediate (tmp_reg, lo);
18291 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18292 /* Don't emit an explicit store pair as this may not always be profitable.
18293 Let the sched-fusion logic decide whether to merge them. */
18294 emit_move_insn (mem_lo, tmp_reg);
18295 emit_move_insn (mem_hi, tmp_reg);
18297 return true;
18300 /* Generate RTL for a conditional branch with rtx comparison CODE in
18301 mode CC_MODE. The destination of the unlikely conditional branch
18302 is LABEL_REF. */
18304 void
18305 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18306 rtx label_ref)
18308 rtx x;
18309 x = gen_rtx_fmt_ee (code, VOIDmode,
18310 gen_rtx_REG (cc_mode, CC_REGNUM),
18311 const0_rtx);
18313 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18314 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18315 pc_rtx);
18316 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18319 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18321 OP1 represents the TImode source operand 1
18322 OP2 represents the TImode source operand 2
18323 LOW_DEST represents the low half (DImode) of TImode operand 0
18324 LOW_IN1 represents the low half (DImode) of TImode operand 1
18325 LOW_IN2 represents the low half (DImode) of TImode operand 2
18326 HIGH_DEST represents the high half (DImode) of TImode operand 0
18327 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18328 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18330 void
18331 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18332 rtx *low_in1, rtx *low_in2,
18333 rtx *high_dest, rtx *high_in1,
18334 rtx *high_in2)
18336 *low_dest = gen_reg_rtx (DImode);
18337 *low_in1 = gen_lowpart (DImode, op1);
18338 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18339 subreg_lowpart_offset (DImode, TImode));
18340 *high_dest = gen_reg_rtx (DImode);
18341 *high_in1 = gen_highpart (DImode, op1);
18342 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18343 subreg_highpart_offset (DImode, TImode));
18346 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18348 This function differs from 'aarch64_addti_scratch_regs' in that
18349 OP1 can be an immediate constant (zero). We must call
18350 subreg_highpart_offset with DImode and TImode arguments, otherwise
18351 VOIDmode would be used for the const_int, which generates an internal
18352 error from subreg_size_highpart_offset, which does not expect a size of zero.
18354 OP1 represents the TImode source operand 1
18355 OP2 represents the TImode source operand 2
18356 LOW_DEST represents the low half (DImode) of TImode operand 0
18357 LOW_IN1 represents the low half (DImode) of TImode operand 1
18358 LOW_IN2 represents the low half (DImode) of TImode operand 2
18359 HIGH_DEST represents the high half (DImode) of TImode operand 0
18360 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18361 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18364 void
18365 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18366 rtx *low_in1, rtx *low_in2,
18367 rtx *high_dest, rtx *high_in1,
18368 rtx *high_in2)
18370 *low_dest = gen_reg_rtx (DImode);
18371 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18372 subreg_lowpart_offset (DImode, TImode));
18374 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18375 subreg_lowpart_offset (DImode, TImode));
18376 *high_dest = gen_reg_rtx (DImode);
18378 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18379 subreg_highpart_offset (DImode, TImode));
18380 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18381 subreg_highpart_offset (DImode, TImode));
18384 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18386 OP0 represents the TImode destination operand 0
18387 LOW_DEST represents the low half (DImode) of TImode operand 0
18388 LOW_IN1 represents the low half (DImode) of TImode operand 1
18389 LOW_IN2 represents the low half (DImode) of TImode operand 2
18390 HIGH_DEST represents the high half (DImode) of TImode operand 0
18391 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18392 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18393 UNSIGNED_P is true if the operation is being performed on unsigned
18394 values. */
18395 void
18396 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18397 rtx low_in2, rtx high_dest, rtx high_in1,
18398 rtx high_in2, bool unsigned_p)
18400 if (low_in2 == const0_rtx)
18402 low_dest = low_in1;
18403 high_in2 = force_reg (DImode, high_in2);
18404 if (unsigned_p)
18405 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18406 else
18407 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18409 else
18411 if (CONST_INT_P (low_in2))
18413 high_in2 = force_reg (DImode, high_in2);
18414 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18415 GEN_INT (-INTVAL (low_in2))));
18417 else
18418 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18420 if (unsigned_p)
18421 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18422 else
18423 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18426 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18427 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18431 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18433 static unsigned HOST_WIDE_INT
18434 aarch64_asan_shadow_offset (void)
18436 if (TARGET_ILP32)
18437 return (HOST_WIDE_INT_1 << 29);
18438 else
18439 return (HOST_WIDE_INT_1 << 36);
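/* With the usual ASan shadow mapping shadow = (addr >> 3) + offset, the
   values above place the shadow region at 1 << 36 for LP64 and 1 << 29 for
   ILP32.  This is a descriptive note only; the shadow scale itself is
   chosen by the sanitizer support code, not here.  */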
18442 static rtx
18443 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18444 int code, tree treeop0, tree treeop1)
18446 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18447 rtx op0, op1;
18448 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18449 insn_code icode;
18450 struct expand_operand ops[4];
18452 start_sequence ();
18453 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18455 op_mode = GET_MODE (op0);
18456 if (op_mode == VOIDmode)
18457 op_mode = GET_MODE (op1);
18459 switch (op_mode)
18461 case E_QImode:
18462 case E_HImode:
18463 case E_SImode:
18464 cmp_mode = SImode;
18465 icode = CODE_FOR_cmpsi;
18466 break;
18468 case E_DImode:
18469 cmp_mode = DImode;
18470 icode = CODE_FOR_cmpdi;
18471 break;
18473 case E_SFmode:
18474 cmp_mode = SFmode;
18475 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18476 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18477 break;
18479 case E_DFmode:
18480 cmp_mode = DFmode;
18481 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18482 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18483 break;
18485 default:
18486 end_sequence ();
18487 return NULL_RTX;
18490 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18491 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18492 if (!op0 || !op1)
18494 end_sequence ();
18495 return NULL_RTX;
18497 *prep_seq = get_insns ();
18498 end_sequence ();
18500 create_fixed_operand (&ops[0], op0);
18501 create_fixed_operand (&ops[1], op1);
18503 start_sequence ();
18504 if (!maybe_expand_insn (icode, 2, ops))
18506 end_sequence ();
18507 return NULL_RTX;
18509 *gen_seq = get_insns ();
18510 end_sequence ();
18512 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18513 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18516 static rtx
18517 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18518 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18520 rtx op0, op1, target;
18521 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18522 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18523 insn_code icode;
18524 struct expand_operand ops[6];
18525 int aarch64_cond;
18527 push_to_sequence (*prep_seq);
18528 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18530 op_mode = GET_MODE (op0);
18531 if (op_mode == VOIDmode)
18532 op_mode = GET_MODE (op1);
18534 switch (op_mode)
18536 case E_QImode:
18537 case E_HImode:
18538 case E_SImode:
18539 cmp_mode = SImode;
18540 icode = CODE_FOR_ccmpsi;
18541 break;
18543 case E_DImode:
18544 cmp_mode = DImode;
18545 icode = CODE_FOR_ccmpdi;
18546 break;
18548 case E_SFmode:
18549 cmp_mode = SFmode;
18550 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18551 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18552 break;
18554 case E_DFmode:
18555 cmp_mode = DFmode;
18556 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18557 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18558 break;
18560 default:
18561 end_sequence ();
18562 return NULL_RTX;
18565 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18566 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18567 if (!op0 || !op1)
18569 end_sequence ();
18570 return NULL_RTX;
18572 *prep_seq = get_insns ();
18573 end_sequence ();
18575 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18576 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18578 if (bit_code != AND)
18580 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18581 GET_MODE (XEXP (prev, 0))),
18582 VOIDmode, XEXP (prev, 0), const0_rtx);
18583 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18586 create_fixed_operand (&ops[0], XEXP (prev, 0));
18587 create_fixed_operand (&ops[1], target);
18588 create_fixed_operand (&ops[2], op0);
18589 create_fixed_operand (&ops[3], op1);
18590 create_fixed_operand (&ops[4], prev);
18591 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18593 push_to_sequence (*gen_seq);
18594 if (!maybe_expand_insn (icode, 6, ops))
18596 end_sequence ();
18597 return NULL_RTX;
18600 *gen_seq = get_insns ();
18601 end_sequence ();
18603 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
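/* Taken together, aarch64_gen_ccmp_first and aarch64_gen_ccmp_next let the
   middle-end turn a condition such as "a == b && c < d" into roughly the
   following shape (an illustrative sketch only):

	 cmp	w0, w1
	 ccmp	w2, w3, #nzcv, eq
	 b.lt	...

   where the #nzcv immediate is chosen so that the final condition is false
   whenever the first comparison fails.  */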
18606 #undef TARGET_GEN_CCMP_FIRST
18607 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18609 #undef TARGET_GEN_CCMP_NEXT
18610 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18612 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18613 instruction fusion of some sort. */
18615 static bool
18616 aarch64_macro_fusion_p (void)
18618 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18622 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18623 should be kept together during scheduling. */
18625 static bool
18626 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18628 rtx set_dest;
18629 rtx prev_set = single_set (prev);
18630 rtx curr_set = single_set (curr);
18631 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18632 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18634 if (!aarch64_macro_fusion_p ())
18635 return false;
18637 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18639 /* We are trying to match:
18640 prev (mov) == (set (reg r0) (const_int imm16))
18641 curr (movk) == (set (zero_extract (reg r0)
18642 (const_int 16)
18643 (const_int 16))
18644 (const_int imm16_1)) */
18646 set_dest = SET_DEST (curr_set);
18648 if (GET_CODE (set_dest) == ZERO_EXTRACT
18649 && CONST_INT_P (SET_SRC (curr_set))
18650 && CONST_INT_P (SET_SRC (prev_set))
18651 && CONST_INT_P (XEXP (set_dest, 2))
18652 && INTVAL (XEXP (set_dest, 2)) == 16
18653 && REG_P (XEXP (set_dest, 0))
18654 && REG_P (SET_DEST (prev_set))
18655 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18657 return true;
18661 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18664 /* We're trying to match:
18665 prev (adrp) == (set (reg r1)
18666 (high (symbol_ref ("SYM"))))
18667 curr (add) == (set (reg r0)
18668 (lo_sum (reg r1)
18669 (symbol_ref ("SYM"))))
18670 Note that r0 need not necessarily be the same as r1, especially
18671 during pre-regalloc scheduling. */
18673 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18674 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18676 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18677 && REG_P (XEXP (SET_SRC (curr_set), 0))
18678 && REGNO (XEXP (SET_SRC (curr_set), 0))
18679 == REGNO (SET_DEST (prev_set))
18680 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18681 XEXP (SET_SRC (curr_set), 1)))
18682 return true;
18686 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18689 /* We're trying to match:
18690 prev (movk) == (set (zero_extract (reg r0)
18691 (const_int 16)
18692 (const_int 32))
18693 (const_int imm16_1))
18694 curr (movk) == (set (zero_extract (reg r0)
18695 (const_int 16)
18696 (const_int 48))
18697 (const_int imm16_2)) */
18699 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18700 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18701 && REG_P (XEXP (SET_DEST (prev_set), 0))
18702 && REG_P (XEXP (SET_DEST (curr_set), 0))
18703 && REGNO (XEXP (SET_DEST (prev_set), 0))
18704 == REGNO (XEXP (SET_DEST (curr_set), 0))
18705 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18706 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18707 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18708 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18709 && CONST_INT_P (SET_SRC (prev_set))
18710 && CONST_INT_P (SET_SRC (curr_set)))
18711 return true;
18714 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18716 /* We're trying to match:
18717 prev (adrp) == (set (reg r0)
18718 (high (symbol_ref ("SYM"))))
18719 curr (ldr) == (set (reg r1)
18720 (mem (lo_sum (reg r0)
18721 (symbol_ref ("SYM")))))
18723 curr (ldr) == (set (reg r1)
18724 (zero_extend (mem
18725 (lo_sum (reg r0)
18726 (symbol_ref ("SYM")))))) */
18727 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18728 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18730 rtx curr_src = SET_SRC (curr_set);
18732 if (GET_CODE (curr_src) == ZERO_EXTEND)
18733 curr_src = XEXP (curr_src, 0);
18735 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18736 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18737 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18738 == REGNO (SET_DEST (prev_set))
18739 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18740 XEXP (SET_SRC (prev_set), 0)))
18741 return true;
18745 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18746 && any_condjump_p (curr))
18748 unsigned int condreg1, condreg2;
18749 rtx cc_reg_1;
18750 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18751 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18753 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18754 && prev
18755 && modified_in_p (cc_reg_1, prev))
18757 enum attr_type prev_type = get_attr_type (prev);
18759 /* FIXME: this misses some instructions that ThunderX considers simple
18760 arithmetic; for example, simple shifts are missed here. */
18761 if (prev_type == TYPE_ALUS_SREG
18762 || prev_type == TYPE_ALUS_IMM
18763 || prev_type == TYPE_LOGICS_REG
18764 || prev_type == TYPE_LOGICS_IMM)
18765 return true;
18769 if (prev_set
18770 && curr_set
18771 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18772 && any_condjump_p (curr))
18774 /* We're trying to match:
18775 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18776 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18777 (const_int 0))
18778 (label_ref ("SYM"))
18779 (pc)) */
18780 if (SET_DEST (curr_set) == (pc_rtx)
18781 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18782 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18783 && REG_P (SET_DEST (prev_set))
18784 && REGNO (SET_DEST (prev_set))
18785 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18787 /* Fuse ALU operations followed by a conditional branch instruction. */
18788 switch (get_attr_type (prev))
18790 case TYPE_ALU_IMM:
18791 case TYPE_ALU_SREG:
18792 case TYPE_ADC_REG:
18793 case TYPE_ADC_IMM:
18794 case TYPE_ADCS_REG:
18795 case TYPE_ADCS_IMM:
18796 case TYPE_LOGIC_REG:
18797 case TYPE_LOGIC_IMM:
18798 case TYPE_CSEL:
18799 case TYPE_ADR:
18800 case TYPE_MOV_IMM:
18801 case TYPE_SHIFT_REG:
18802 case TYPE_SHIFT_IMM:
18803 case TYPE_BFM:
18804 case TYPE_RBIT:
18805 case TYPE_REV:
18806 case TYPE_EXTEND:
18807 return true;
18809 default:;
18814 return false;
18817 /* Return true iff the instruction fusion described by OP is enabled. */
18819 bool
18820 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18822 return (aarch64_tune_params.fusible_ops & op) != 0;
18825 /* If MEM is in the form of [base+offset], extract the two parts of the
18826 address and store them in BASE and OFFSET; otherwise return false
18827 after clearing BASE and OFFSET. */
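/* For illustration only (a simplified sketch with hypothetical registers,
   not an exhaustive list of the address forms seen in practice):

     (mem (reg x1))                        -> *BASE = x1, *OFFSET = 0
     (mem (plus (reg x1) (const_int 16)))  -> *BASE = x1, *OFFSET = 16
     (mem (post_inc (reg x1)))             -> both cleared, returns false.  */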
18829 bool
18830 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18832 rtx addr;
18834 gcc_assert (MEM_P (mem));
18836 addr = XEXP (mem, 0);
18838 if (REG_P (addr))
18840 *base = addr;
18841 *offset = const0_rtx;
18842 return true;
18845 if (GET_CODE (addr) == PLUS
18846 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18848 *base = XEXP (addr, 0);
18849 *offset = XEXP (addr, 1);
18850 return true;
18853 *base = NULL_RTX;
18854 *offset = NULL_RTX;
18856 return false;
18859 /* Types for scheduling fusion. */
18860 enum sched_fusion_type
18862 SCHED_FUSION_NONE = 0,
18863 SCHED_FUSION_LD_SIGN_EXTEND,
18864 SCHED_FUSION_LD_ZERO_EXTEND,
18865 SCHED_FUSION_LD,
18866 SCHED_FUSION_ST,
18867 SCHED_FUSION_NUM
18870 /* If INSN is a load or store whose address is in the form [base+offset],
18871 extract the two parts into BASE and OFFSET. Return the scheduling
18872 fusion type of INSN. */
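/* As an illustrative sketch (simplified RTL, hypothetical registers):

     (set (reg:DI x0) (mem:DI (plus:DI (reg:DI x1) (const_int 8))))
       -> SCHED_FUSION_LD, *BASE = x1, *OFFSET = 8
     (set (mem:DI (reg:DI x1)) (reg:DI x0))
       -> SCHED_FUSION_ST, *BASE = x1, *OFFSET = 0.  */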
18874 static enum sched_fusion_type
18875 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18877 rtx x, dest, src;
18878 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18880 gcc_assert (INSN_P (insn));
18881 x = PATTERN (insn);
18882 if (GET_CODE (x) != SET)
18883 return SCHED_FUSION_NONE;
18885 src = SET_SRC (x);
18886 dest = SET_DEST (x);
18888 machine_mode dest_mode = GET_MODE (dest);
18890 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18891 return SCHED_FUSION_NONE;
18893 if (GET_CODE (src) == SIGN_EXTEND)
18895 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18896 src = XEXP (src, 0);
18897 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18898 return SCHED_FUSION_NONE;
18900 else if (GET_CODE (src) == ZERO_EXTEND)
18902 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18903 src = XEXP (src, 0);
18904 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18905 return SCHED_FUSION_NONE;
18908 if (GET_CODE (src) == MEM && REG_P (dest))
18909 extract_base_offset_in_addr (src, base, offset);
18910 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18912 fusion = SCHED_FUSION_ST;
18913 extract_base_offset_in_addr (dest, base, offset);
18915 else
18916 return SCHED_FUSION_NONE;
18918 if (*base == NULL_RTX || *offset == NULL_RTX)
18919 fusion = SCHED_FUSION_NONE;
18921 return fusion;
18924 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18926 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18927 and PRI are only calculated for these instructions. For other instructions,
18928 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18929 types of instruction fusion can be added by returning different priorities.
18931 It's important that irrelevant instructions get the largest FUSION_PRI. */
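/* For example (a rough sketch of the effect, not a precise calculation):
   two loads from the same base register with offsets 8 and 16 receive the
   same FUSION_PRI (same fusion type and base register), so they are grouped
   together, while the load with offset 8 receives the larger PRI and is
   therefore scheduled first within that group.  */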
18933 static void
18934 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18935 int *fusion_pri, int *pri)
18937 int tmp, off_val;
18938 rtx base, offset;
18939 enum sched_fusion_type fusion;
18941 gcc_assert (INSN_P (insn));
18943 tmp = max_pri - 1;
18944 fusion = fusion_load_store (insn, &base, &offset);
18945 if (fusion == SCHED_FUSION_NONE)
18947 *pri = tmp;
18948 *fusion_pri = tmp;
18949 return;
18952 /* Set FUSION_PRI according to fusion type and base register. */
18953 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18955 /* Calculate PRI. */
18956 tmp /= 2;
18958 /* An INSN with a smaller offset goes first. */
18959 off_val = (int)(INTVAL (offset));
18960 if (off_val >= 0)
18961 tmp -= (off_val & 0xfffff);
18962 else
18963 tmp += ((- off_val) & 0xfffff);
18965 *pri = tmp;
18966 return;
18969 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18970 Adjust priority of sha1h instructions so they are scheduled before
18971 other SHA1 instructions. */
18973 static int
18974 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18976 rtx x = PATTERN (insn);
18978 if (GET_CODE (x) == SET)
18980 x = SET_SRC (x);
18982 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18983 return priority + 10;
18986 return priority;
18989 /* Given OPERANDS of consecutive load/store, check if we can merge
18990 them into ldp/stp. LOAD is true if they are load instructions.
18991 MODE is the mode of memory operands. */
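/* For instance (an illustrative sketch; the exact operand order comes from
   the peephole patterns), the pair

     ldr w0, [x2]
     ldr w1, [x2, 4]

   can be merged into "ldp w0, w1, [x2]" provided the checks below pass.  */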
18993 bool
18994 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18995 machine_mode mode)
18997 HOST_WIDE_INT offval_1, offval_2, msize;
18998 enum reg_class rclass_1, rclass_2;
18999 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19001 if (load)
19003 mem_1 = operands[1];
19004 mem_2 = operands[3];
19005 reg_1 = operands[0];
19006 reg_2 = operands[2];
19007 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19008 if (REGNO (reg_1) == REGNO (reg_2))
19009 return false;
19011 else
19013 mem_1 = operands[0];
19014 mem_2 = operands[2];
19015 reg_1 = operands[1];
19016 reg_2 = operands[3];
19019 /* The mems cannot be volatile. */
19020 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19021 return false;
19023 /* If we have SImode and slow unaligned ldp,
19024 check that the alignment is at least 8 bytes. */
19025 if (mode == SImode
19026 && (aarch64_tune_params.extra_tuning_flags
19027 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19028 && !optimize_size
19029 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19030 return false;
19032 /* Check if the addresses are in the form of [base+offset]. */
19033 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19034 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19035 return false;
19036 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19037 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19038 return false;
19040 /* Check if the bases are the same. */
19041 if (!rtx_equal_p (base_1, base_2))
19042 return false;
19044 /* The operands must be of the same size. */
19045 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19046 GET_MODE_SIZE (GET_MODE (mem_2))));
19048 offval_1 = INTVAL (offset_1);
19049 offval_2 = INTVAL (offset_2);
19050 /* We should only be trying this for fixed-sized modes. There is no
19051 SVE LDP/STP instruction. */
19052 msize = GET_MODE_SIZE (mode).to_constant ();
19053 /* Check if the offsets are consecutive. */
19054 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19055 return false;
19057 /* Check if the addresses are clobbered by load. */
19058 if (load)
19060 if (reg_mentioned_p (reg_1, mem_1))
19061 return false;
19063 /* Only the last load (the one at the higher address) may clobber the address. */
19064 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19065 return false;
19068 /* One of the memory accesses must be a mempair operand.
19069 If it is not the first one, they need to be swapped by the
19070 peephole. */
19071 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19072 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19073 return false;
19075 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19076 rclass_1 = FP_REGS;
19077 else
19078 rclass_1 = GENERAL_REGS;
19080 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19081 rclass_2 = FP_REGS;
19082 else
19083 rclass_2 = GENERAL_REGS;
19085 /* Check if the registers are of the same class. */
19086 if (rclass_1 != rclass_2)
19087 return false;
19089 return true;
19092 /* Given OPERANDS of consecutive load/store that can be merged,
19093 swap them if they are not in ascending order. */
19094 void
19095 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19097 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19098 HOST_WIDE_INT offval_1, offval_2;
19100 if (load)
19102 mem_1 = operands[1];
19103 mem_2 = operands[3];
19105 else
19107 mem_1 = operands[0];
19108 mem_2 = operands[2];
19111 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19112 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19114 offval_1 = INTVAL (offset_1);
19115 offval_2 = INTVAL (offset_2);
19117 if (offval_1 > offval_2)
19119 /* Irrespective of whether this is a load or a store,
19120 we do the same swap. */
19121 std::swap (operands[0], operands[2]);
19122 std::swap (operands[1], operands[3]);
19126 /* Taking X and Y to be pointers to HOST_WIDE_INT, return the result of
19127 comparing the two values they point to. */
19129 aarch64_host_wide_int_compare (const void *x, const void *y)
19131 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19132 * ((const HOST_WIDE_INT *) y));
19135 /* Taking X and Y to be pairs of RTX, one element pointing to a MEM rtx
19136 and the other to a REG rtx, compare the offsets of the two MEM
19137 addresses.
19139 Return:
19141 1 iff offset (X) > offset (Y)
19142 0 iff offset (X) == offset (Y)
19143 -1 iff offset (X) < offset (Y) */
19145 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19147 const rtx * operands_1 = (const rtx *) x;
19148 const rtx * operands_2 = (const rtx *) y;
19149 rtx mem_1, mem_2, base, offset_1, offset_2;
19151 if (MEM_P (operands_1[0]))
19152 mem_1 = operands_1[0];
19153 else
19154 mem_1 = operands_1[1];
19156 if (MEM_P (operands_2[0]))
19157 mem_2 = operands_2[0];
19158 else
19159 mem_2 = operands_2[1];
19161 /* Extract the offsets. */
19162 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19163 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19165 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19167 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19170 /* Given OPERANDS of consecutive load/store, check if we can merge
19171 them into ldp/stp by adjusting the offset. LOAD is true if they
19172 are load instructions. MODE is the mode of memory operands.
19174 Given below consecutive stores:
19176 str w1, [xb, 0x100]
19177 str w1, [xb, 0x104]
19178 str w1, [xb, 0x108]
19179 str w1, [xb, 0x10c]
19181 Though the offsets are out of the range supported by stp, we can
19182 still pair them after adjusting the offset, like:
19184 add scratch, xb, 0x100
19185 stp w1, w1, [scratch]
19186 stp w1, w1, [scratch, 0x8]
19188 The peephole patterns detecting this opportunity should guarantee
19189 that the scratch register is available. */
19191 bool
19192 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19193 scalar_mode mode)
19195 const int num_insns = 4;
19196 enum reg_class rclass;
19197 HOST_WIDE_INT offvals[num_insns], msize;
19198 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19200 if (load)
19202 for (int i = 0; i < num_insns; i++)
19204 reg[i] = operands[2 * i];
19205 mem[i] = operands[2 * i + 1];
19207 gcc_assert (REG_P (reg[i]));
19210 /* Do not attempt to merge the loads if the loads clobber each other. */
19211 for (int i = 0; i < 8; i += 2)
19212 for (int j = i + 2; j < 8; j += 2)
19213 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19214 return false;
19216 else
19217 for (int i = 0; i < num_insns; i++)
19219 mem[i] = operands[2 * i];
19220 reg[i] = operands[2 * i + 1];
19223 /* Skip if the first memory operand is by itself valid for ldp/stp. */
19224 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19225 return false;
19227 for (int i = 0; i < num_insns; i++)
19229 /* The mems cannot be volatile. */
19230 if (MEM_VOLATILE_P (mem[i]))
19231 return false;
19233 /* Check if the addresses are in the form of [base+offset]. */
19234 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19235 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19236 return false;
19239 /* Check if the registers are of the same class. */
19240 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19241 ? FP_REGS : GENERAL_REGS;
19243 for (int i = 1; i < num_insns; i++)
19244 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19246 if (rclass != FP_REGS)
19247 return false;
19249 else
19251 if (rclass != GENERAL_REGS)
19252 return false;
19255 /* Only the last register in the order in which they occur
19256 may be clobbered by the load. */
19257 if (rclass == GENERAL_REGS && load)
19258 for (int i = 0; i < num_insns - 1; i++)
19259 if (reg_mentioned_p (reg[i], mem[i]))
19260 return false;
19262 /* Check if the bases are the same. */
19263 for (int i = 0; i < num_insns - 1; i++)
19264 if (!rtx_equal_p (base[i], base[i + 1]))
19265 return false;
19267 for (int i = 0; i < num_insns; i++)
19268 offvals[i] = INTVAL (offset[i]);
19270 msize = GET_MODE_SIZE (mode);
19272 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19273 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19274 aarch64_host_wide_int_compare);
19276 if (!(offvals[1] == offvals[0] + msize
19277 && offvals[3] == offvals[2] + msize))
19278 return false;
19280 /* Check that the offsets are within range of each other. The ldp/stp
19281 instructions have 7-bit immediate offsets, so use 0x80. */
19282 if (offvals[2] - offvals[0] >= msize * 0x80)
19283 return false;
19285 /* The offsets must be aligned with respect to each other. */
19286 if (offvals[0] % msize != offvals[2] % msize)
19287 return false;
19289 /* If we have SImode and slow unaligned ldp,
19290 check that the alignment is at least 8 bytes. */
19291 if (mode == SImode
19292 && (aarch64_tune_params.extra_tuning_flags
19293 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19294 && !optimize_size
19295 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19296 return false;
19298 return true;
19301 /* Given OPERANDS of consecutive load/store, this function pairs them
19302 into LDP/STP after adjusting the offset. It depends on the fact
19303 that the operands can be sorted so the offsets are correct for STP.
19304 MODE is the mode of the memory operands. CODE is the rtl operator
19305 that should be applied to all memory operands; it is SIGN_EXTEND,
19306 ZERO_EXTEND or UNKNOWN. */
19308 bool
19309 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19310 scalar_mode mode, RTX_CODE code)
19312 rtx base, offset_1, offset_3, t1, t2;
19313 rtx mem_1, mem_2, mem_3, mem_4;
19314 rtx temp_operands[8];
19315 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19316 stp_off_upper_limit, stp_off_lower_limit, msize;
19318 /* We make changes on a copy as we may still bail out. */
19319 for (int i = 0; i < 8; i ++)
19320 temp_operands[i] = operands[i];
19322 /* Sort the operands. */
19323 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19325 /* Copy the memory operands so that if we have to bail for some
19326 reason the original addresses are unchanged. */
19327 if (load)
19329 mem_1 = copy_rtx (temp_operands[1]);
19330 mem_2 = copy_rtx (temp_operands[3]);
19331 mem_3 = copy_rtx (temp_operands[5]);
19332 mem_4 = copy_rtx (temp_operands[7]);
19334 else
19336 mem_1 = copy_rtx (temp_operands[0]);
19337 mem_2 = copy_rtx (temp_operands[2]);
19338 mem_3 = copy_rtx (temp_operands[4]);
19339 mem_4 = copy_rtx (temp_operands[6]);
19340 gcc_assert (code == UNKNOWN);
19343 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19344 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19345 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19346 && offset_3 != NULL_RTX);
19348 /* Adjust offset so it can fit in LDP/STP instruction. */
19349 msize = GET_MODE_SIZE (mode);
19350 stp_off_upper_limit = msize * (0x40 - 1);
19351 stp_off_lower_limit = - msize * 0x40;
19353 off_val_1 = INTVAL (offset_1);
19354 off_val_3 = INTVAL (offset_3);
19356 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19357 if (msize <= 4)
19358 base_off = (off_val_1 + off_val_3) / 2;
19359 else
19360 /* However, due to issues with negative LDP/STP offset generation for
19361 larger modes (DF, DI and vector modes), we must not use negative
19362 addresses smaller than what 9 signed unadjusted bits can store. This
19363 provides the most range in this case. */
19364 base_off = off_val_1;
19366 /* Adjust the base so that it is aligned with the addresses but still
19367 optimal. */
19368 if (base_off % msize != off_val_1 % msize)
19369 /* Fix the offset, bearing in mind we want to make it bigger, not
19370 smaller. */
19371 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19372 else if (msize <= 4)
19373 /* The negative range of LDP/STP is one larger than the positive range. */
19374 base_off += msize;
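/* As a worked example of the heuristics above (illustrative only):
   for four SImode accesses at offsets 0x100, 0x104, 0x108 and 0x10c,
   msize == 4 and base_off starts at (0x100 + 0x108) / 2 == 0x104; it is
   already aligned with off_val_1, so it is bumped by msize to 0x108,
   giving new offsets of -8 and 0 for the two pairs below.  */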
19376 /* Check if base offset is too big or too small. We can attempt to resolve
19377 this issue by setting it to the maximum value and seeing if the offsets
19378 still fit. */
19379 if (base_off >= 0x1000)
19381 base_off = 0x1000 - 1;
19382 /* We must still make sure that the base offset is aligned with respect
19383 to the address. But it may not be made any bigger. */
19384 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19387 /* Likewise for the case where the base is too small. */
19388 if (base_off <= -0x1000)
19390 base_off = -0x1000 + 1;
19391 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19394 /* Offset of the first STP/LDP. */
19395 new_off_1 = off_val_1 - base_off;
19397 /* Offset of the second STP/LDP. */
19398 new_off_3 = off_val_3 - base_off;
19400 /* The offsets must be within the range of the LDP/STP instructions. */
19401 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19402 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19403 return false;
19405 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19406 new_off_1), true);
19407 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19408 new_off_1 + msize), true);
19409 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19410 new_off_3), true);
19411 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19412 new_off_3 + msize), true);
19414 if (!aarch64_mem_pair_operand (mem_1, mode)
19415 || !aarch64_mem_pair_operand (mem_3, mode))
19416 return false;
19418 if (code == ZERO_EXTEND)
19420 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19421 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19422 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19423 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19425 else if (code == SIGN_EXTEND)
19427 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19428 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19429 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19430 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19433 if (load)
19435 operands[0] = temp_operands[0];
19436 operands[1] = mem_1;
19437 operands[2] = temp_operands[2];
19438 operands[3] = mem_2;
19439 operands[4] = temp_operands[4];
19440 operands[5] = mem_3;
19441 operands[6] = temp_operands[6];
19442 operands[7] = mem_4;
19444 else
19446 operands[0] = mem_1;
19447 operands[1] = temp_operands[1];
19448 operands[2] = mem_2;
19449 operands[3] = temp_operands[3];
19450 operands[4] = mem_3;
19451 operands[5] = temp_operands[5];
19452 operands[6] = mem_4;
19453 operands[7] = temp_operands[7];
19456 /* Emit adjusting instruction. */
19457 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19458 /* Emit ldp/stp instructions. */
19459 t1 = gen_rtx_SET (operands[0], operands[1]);
19460 t2 = gen_rtx_SET (operands[2], operands[3]);
19461 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19462 t1 = gen_rtx_SET (operands[4], operands[5]);
19463 t2 = gen_rtx_SET (operands[6], operands[7]);
19464 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19465 return true;
19468 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19469 it isn't worth branching around empty masked ops (including masked
19470 stores). */
19472 static bool
19473 aarch64_empty_mask_is_expensive (unsigned)
19475 return false;
19478 /* Return true if a pseudo register should be created and used to hold
19479 the GOT address for PIC code. */
19481 bool
19482 aarch64_use_pseudo_pic_reg (void)
19484 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19487 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19489 static int
19490 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19492 switch (XINT (x, 1))
19494 case UNSPEC_GOTSMALLPIC:
19495 case UNSPEC_GOTSMALLPIC28K:
19496 case UNSPEC_GOTTINYPIC:
19497 return 0;
19498 default:
19499 break;
19502 return default_unspec_may_trap_p (x, flags);
19506 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19507 return the log2 of that value. Otherwise return -1. */
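/* A few illustrative values (not an exhaustive list):
   1.0 -> 0, 4.0 -> 2, 0.5 -> -1 (not an integer),
   3.0 -> -1 (not a power of 2), -2.0 -> -1 (negative).  */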
19510 aarch64_fpconst_pow_of_2 (rtx x)
19512 const REAL_VALUE_TYPE *r;
19514 if (!CONST_DOUBLE_P (x))
19515 return -1;
19517 r = CONST_DOUBLE_REAL_VALUE (x);
19519 if (REAL_VALUE_NEGATIVE (*r)
19520 || REAL_VALUE_ISNAN (*r)
19521 || REAL_VALUE_ISINF (*r)
19522 || !real_isinteger (r, DFmode))
19523 return -1;
19525 return exact_log2 (real_to_integer (r));
19528 /* If X is a vector of equal CONST_DOUBLE values and that value is
19529 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19532 aarch64_vec_fpconst_pow_of_2 (rtx x)
19534 int nelts;
19535 if (GET_CODE (x) != CONST_VECTOR
19536 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19537 return -1;
19539 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19540 return -1;
19542 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19543 if (firstval <= 0)
19544 return -1;
19546 for (int i = 1; i < nelts; i++)
19547 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19548 return -1;
19550 return firstval;
19553 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19554 to float.
19556 __fp16 always promotes through this hook.
19557 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19558 through the generic excess precision logic rather than here. */
19560 static tree
19561 aarch64_promoted_type (const_tree t)
19563 if (SCALAR_FLOAT_TYPE_P (t)
19564 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19565 return float_type_node;
19567 return NULL_TREE;
19570 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19572 static bool
19573 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19574 optimization_type opt_type)
19576 switch (op)
19578 case rsqrt_optab:
19579 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19581 default:
19582 return true;
19586 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19588 static unsigned int
19589 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19590 int *offset)
19592 /* Polynomial invariant 1 == (VG / 2) - 1. */
19593 gcc_assert (i == 1);
19594 *factor = 2;
19595 *offset = 1;
19596 return AARCH64_DWARF_VG;
19599 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19600 if MODE is HFmode, and punt to the generic implementation otherwise. */
19602 static bool
19603 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19605 return (mode == HFmode
19606 ? true
19607 : default_libgcc_floating_mode_supported_p (mode));
19610 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19611 if MODE is HFmode, and punt to the generic implementation otherwise. */
19613 static bool
19614 aarch64_scalar_mode_supported_p (scalar_mode mode)
19616 return (mode == HFmode
19617 ? true
19618 : default_scalar_mode_supported_p (mode));
19621 /* Set the value of FLT_EVAL_METHOD.
19622 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19624 0: evaluate all operations and constants, whose semantic type has at
19625 most the range and precision of type float, to the range and
19626 precision of float; evaluate all other operations and constants to
19627 the range and precision of the semantic type;
19629 N, where _FloatN is a supported interchange floating type:
19630 evaluate all operations and constants, whose semantic type has at
19631 most the range and precision of _FloatN type, to the range and
19632 precision of the _FloatN type; evaluate all other operations and
19633 constants to the range and precision of the semantic type;
19635 If we have the ARMv8.2-A extensions then we support _Float16 in native
19636 precision, so we should set this to 16. Otherwise, we support the type,
19637 but want to evaluate expressions in float precision, so set this to
19638 0. */
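/* As a user-visible sketch of the difference (hypothetical source code,
   for exposition only):

     _Float16 a, b, c;
     c = a * b + c;

   With TARGET_FP_F16INST the arithmetic is evaluated in _Float16;
   without it, the operands are promoted and the arithmetic is carried
   out in float, converting back to _Float16 only on the assignment.  */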
19640 static enum flt_eval_method
19641 aarch64_excess_precision (enum excess_precision_type type)
19643 switch (type)
19645 case EXCESS_PRECISION_TYPE_FAST:
19646 case EXCESS_PRECISION_TYPE_STANDARD:
19647 /* We can calculate either in 16-bit range and precision or
19648 32-bit range and precision. Make that decision based on whether
19649 we have native support for the ARMv8.2-A 16-bit floating-point
19650 instructions or not. */
19651 return (TARGET_FP_F16INST
19652 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19653 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19654 case EXCESS_PRECISION_TYPE_IMPLICIT:
19655 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19656 default:
19657 gcc_unreachable ();
19659 return FLT_EVAL_METHOD_UNPREDICTABLE;
19662 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19663 scheduled for speculative execution. Reject the long-running division
19664 and square-root instructions. */
19666 static bool
19667 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19669 switch (get_attr_type (insn))
19671 case TYPE_SDIV:
19672 case TYPE_UDIV:
19673 case TYPE_FDIVS:
19674 case TYPE_FDIVD:
19675 case TYPE_FSQRTS:
19676 case TYPE_FSQRTD:
19677 case TYPE_NEON_FP_SQRT_S:
19678 case TYPE_NEON_FP_SQRT_D:
19679 case TYPE_NEON_FP_SQRT_S_Q:
19680 case TYPE_NEON_FP_SQRT_D_Q:
19681 case TYPE_NEON_FP_DIV_S:
19682 case TYPE_NEON_FP_DIV_D:
19683 case TYPE_NEON_FP_DIV_S_Q:
19684 case TYPE_NEON_FP_DIV_D_Q:
19685 return false;
19686 default:
19687 return true;
19691 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19693 static int
19694 aarch64_compute_pressure_classes (reg_class *classes)
19696 int i = 0;
19697 classes[i++] = GENERAL_REGS;
19698 classes[i++] = FP_REGS;
19699 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19700 registers need to go in PR_LO_REGS at some point during their
19701 lifetime. Splitting it into two halves has the effect of making
19702 all predicates count against PR_LO_REGS, so that we try whenever
19703 possible to restrict the number of live predicates to 8. This
19704 greatly reduces the amount of spilling in certain loops. */
19705 classes[i++] = PR_LO_REGS;
19706 classes[i++] = PR_HI_REGS;
19707 return i;
19710 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19712 static bool
19713 aarch64_can_change_mode_class (machine_mode from,
19714 machine_mode to, reg_class_t)
19716 if (BYTES_BIG_ENDIAN)
19718 bool from_sve_p = aarch64_sve_data_mode_p (from);
19719 bool to_sve_p = aarch64_sve_data_mode_p (to);
19721 /* Don't allow changes between SVE data modes and non-SVE modes.
19722 See the comment at the head of aarch64-sve.md for details. */
19723 if (from_sve_p != to_sve_p)
19724 return false;
19726 /* Don't allow changes in element size: lane 0 of the new vector
19727 would not then be lane 0 of the old vector. See the comment
19728 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19729 description.
19731 In the worst case, this forces a register to be spilled in
19732 one mode and reloaded in the other, which handles the
19733 endianness correctly. */
19734 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19735 return false;
19737 return true;
19740 /* Implement TARGET_EARLY_REMAT_MODES. */
19742 static void
19743 aarch64_select_early_remat_modes (sbitmap modes)
19745 /* SVE values are not normally live across a call, so it should be
19746 worth doing early rematerialization even in VL-specific mode. */
19747 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19749 machine_mode mode = (machine_mode) i;
19750 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19751 if (vec_flags & VEC_ANY_SVE)
19752 bitmap_set_bit (modes, i);
19756 /* Override the default target speculation_safe_value. */
19757 static rtx
19758 aarch64_speculation_safe_value (machine_mode mode,
19759 rtx result, rtx val, rtx failval)
19761 /* Maybe we should warn if falling back to hard barriers. They are
19762 likely to be noticeably more expensive than the alternative below. */
19763 if (!aarch64_track_speculation)
19764 return default_speculation_safe_value (mode, result, val, failval);
19766 if (!REG_P (val))
19767 val = copy_to_mode_reg (mode, val);
19769 if (!aarch64_reg_or_zero (failval, mode))
19770 failval = copy_to_mode_reg (mode, failval);
19772 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19773 return result;
19776 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19777 Look into the tuning structure for an estimate.
19778 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19779 Advanced SIMD 128 bits. */
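/* As a worked example (assuming a tuning target with sve_width == 256):
   a VL-dependent value of 16 + 16 * x, i.e. coeffs {16, 16}, is estimated
   as 16 + 16 * (256 - 128) / 128 == 32.  */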
19781 static HOST_WIDE_INT
19782 aarch64_estimated_poly_value (poly_int64 val)
19784 enum aarch64_sve_vector_bits_enum width_source
19785 = aarch64_tune_params.sve_width;
19787 /* If we still don't have an estimate, use the default. */
19788 if (width_source == SVE_SCALABLE)
19789 return default_estimated_poly_value (val);
19791 HOST_WIDE_INT over_128 = width_source - 128;
19792 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19796 /* Return true for types that could be supported as SIMD return or
19797 argument types. */
19799 static bool
19800 supported_simd_type (tree t)
19802 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19804 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19805 return s == 1 || s == 2 || s == 4 || s == 8;
19807 return false;
19810 /* Return true for types that currently are supported as SIMD return
19811 or argument types. */
19813 static bool
19814 currently_supported_simd_type (tree t, tree b)
19816 if (COMPLEX_FLOAT_TYPE_P (t))
19817 return false;
19819 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19820 return false;
19822 return supported_simd_type (t);
19825 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
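/* For example (an illustrative sketch): for a "declare simd" function
   taking and returning float with no explicit simdlen clause, the code
   below creates two Advanced SIMD clones, one with simdlen 2 (64-bit
   vectors) and one with simdlen 4 (128-bit vectors).  */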
19827 static int
19828 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19829 struct cgraph_simd_clone *clonei,
19830 tree base_type, int num)
19832 tree t, ret_type, arg_type;
19833 unsigned int elt_bits, vec_bits, count;
19835 if (!TARGET_SIMD)
19836 return 0;
19838 if (clonei->simdlen
19839 && (clonei->simdlen < 2
19840 || clonei->simdlen > 1024
19841 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19843 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19844 "unsupported simdlen %d", clonei->simdlen);
19845 return 0;
19848 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19849 if (TREE_CODE (ret_type) != VOID_TYPE
19850 && !currently_supported_simd_type (ret_type, base_type))
19852 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19853 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19854 "GCC does not currently support mixed size types "
19855 "for %<simd%> functions");
19856 else if (supported_simd_type (ret_type))
19857 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19858 "GCC does not currently support return type %qT "
19859 "for %<simd%> functions", ret_type);
19860 else
19861 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19862 "unsupported return type %qT for %<simd%> functions",
19863 ret_type);
19864 return 0;
19867 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19869 arg_type = TREE_TYPE (t);
19871 if (!currently_supported_simd_type (arg_type, base_type))
19873 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19874 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19875 "GCC does not currently support mixed size types "
19876 "for %<simd%> functions");
19877 else
19878 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19879 "GCC does not currently support argument type %qT "
19880 "for %<simd%> functions", arg_type);
19881 return 0;
19885 clonei->vecsize_mangle = 'n';
19886 clonei->mask_mode = VOIDmode;
19887 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19888 if (clonei->simdlen == 0)
19890 count = 2;
19891 vec_bits = (num == 0 ? 64 : 128);
19892 clonei->simdlen = vec_bits / elt_bits;
19894 else
19896 count = 1;
19897 vec_bits = clonei->simdlen * elt_bits;
19898 if (vec_bits != 64 && vec_bits != 128)
19900 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19901 "GCC does not currently support simdlen %d for type %qT",
19902 clonei->simdlen, base_type);
19903 return 0;
19906 clonei->vecsize_int = vec_bits;
19907 clonei->vecsize_float = vec_bits;
19908 return count;
19911 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19913 static void
19914 aarch64_simd_clone_adjust (struct cgraph_node *node)
19916 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19917 use the correct ABI. */
19919 tree t = TREE_TYPE (node->decl);
19920 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19921 TYPE_ATTRIBUTES (t));
19924 /* Implement TARGET_SIMD_CLONE_USABLE. */
19926 static int
19927 aarch64_simd_clone_usable (struct cgraph_node *node)
19929 switch (node->simdclone->vecsize_mangle)
19931 case 'n':
19932 if (!TARGET_SIMD)
19933 return -1;
19934 return 0;
19935 default:
19936 gcc_unreachable ();
19940 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19942 static int
19943 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19945 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19946 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19947 return 0;
19948 return 1;
19951 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19953 static const char *
19954 aarch64_get_multilib_abi_name (void)
19956 if (TARGET_BIG_END)
19957 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19958 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19961 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
19962 global variable based guard, use the default; otherwise
19963 return a null tree. */
19964 static tree
19965 aarch64_stack_protect_guard (void)
19967 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19968 return default_stack_protect_guard ();
19970 return NULL_TREE;
19973 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19974 section at the end if needed. */
19975 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19976 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19977 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19978 void
19979 aarch64_file_end_indicate_exec_stack ()
19981 file_end_indicate_exec_stack ();
19983 unsigned feature_1_and = 0;
19984 if (aarch64_bti_enabled ())
19985 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19987 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19988 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19990 if (feature_1_and)
19992 /* Generate .note.gnu.property section. */
19993 switch_to_section (get_section (".note.gnu.property",
19994 SECTION_NOTYPE, NULL));
19996 /* PT_NOTE header: namesz, descsz, type.
19997 namesz = 4 ("GNU\0")
19998 descsz = 16 (Size of the program property array)
19999 [(12 + padding) * Number of array elements]
20000 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20001 assemble_align (POINTER_SIZE);
20002 assemble_integer (GEN_INT (4), 4, 32, 1);
20003 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20004 assemble_integer (GEN_INT (5), 4, 32, 1);
20006 /* PT_NOTE name. */
20007 assemble_string ("GNU", 4);
20009 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20010 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20011 datasz = 4
20012 data = feature_1_and. */
20013 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20014 assemble_integer (GEN_INT (4), 4, 32, 1);
20015 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20017 /* Pad the size of the note to the required alignment. */
20018 assemble_align (POINTER_SIZE);
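/* With both BTI and PAC enabled on an LP64 target, the resulting note
   looks roughly like this (an illustrative sketch; the exact directives
   depend on the assemble_* macros used above):

     namesz    = 4
     descsz    = 16
     type      = 5 (NT_GNU_PROPERTY_TYPE_0)
     name      = "GNU"
     pr_type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
     pr_datasz = 4
     pr_data   = BTI | PAC == 0x3

   followed by padding to the pointer alignment.  */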
20021 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20022 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20023 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20025 /* Target-specific selftests. */
20027 #if CHECKING_P
20029 namespace selftest {
20031 /* Selftest for the RTL loader.
20032 Verify that the RTL loader copes with a dump from
20033 print_rtx_function. This is essentially just a test that class
20034 function_reader can handle a real dump, but it also verifies
20035 that lookup_reg_by_dump_name correctly handles hard regs.
20036 The presence of hard reg names in the dump means that the test is
20037 target-specific, hence it is in this file. */
20039 static void
20040 aarch64_test_loading_full_dump ()
20042 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20044 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20046 rtx_insn *insn_1 = get_insn_by_uid (1);
20047 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20049 rtx_insn *insn_15 = get_insn_by_uid (15);
20050 ASSERT_EQ (INSN, GET_CODE (insn_15));
20051 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20053 /* Verify crtl->return_rtx. */
20054 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20055 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20056 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20059 /* Run all target-specific selftests. */
20061 static void
20062 aarch64_run_selftests (void)
20064 aarch64_test_loading_full_dump ();
20067 } // namespace selftest
20069 #endif /* #if CHECKING_P */
20071 #undef TARGET_STACK_PROTECT_GUARD
20072 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20074 #undef TARGET_ADDRESS_COST
20075 #define TARGET_ADDRESS_COST aarch64_address_cost
20077 /* This hook determines whether unnamed bitfields affect the alignment
20078 of the containing structure. The hook returns true if the structure
20079 should inherit the alignment requirements of an unnamed bitfield's
20080 type. */
20081 #undef TARGET_ALIGN_ANON_BITFIELD
20082 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20084 #undef TARGET_ASM_ALIGNED_DI_OP
20085 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20087 #undef TARGET_ASM_ALIGNED_HI_OP
20088 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20090 #undef TARGET_ASM_ALIGNED_SI_OP
20091 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20093 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20094 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20095 hook_bool_const_tree_hwi_hwi_const_tree_true
20097 #undef TARGET_ASM_FILE_START
20098 #define TARGET_ASM_FILE_START aarch64_start_file
20100 #undef TARGET_ASM_OUTPUT_MI_THUNK
20101 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20103 #undef TARGET_ASM_SELECT_RTX_SECTION
20104 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20106 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20107 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20109 #undef TARGET_BUILD_BUILTIN_VA_LIST
20110 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20112 #undef TARGET_CALLEE_COPIES
20113 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20115 #undef TARGET_CAN_ELIMINATE
20116 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20118 #undef TARGET_CAN_INLINE_P
20119 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20121 #undef TARGET_CANNOT_FORCE_CONST_MEM
20122 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20124 #undef TARGET_CASE_VALUES_THRESHOLD
20125 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20127 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20128 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20130 /* Only the least significant bit is used for initialization guard
20131 variables. */
20132 #undef TARGET_CXX_GUARD_MASK_BIT
20133 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20135 #undef TARGET_C_MODE_FOR_SUFFIX
20136 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20138 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20139 #undef TARGET_DEFAULT_TARGET_FLAGS
20140 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20141 #endif
20143 #undef TARGET_CLASS_MAX_NREGS
20144 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20146 #undef TARGET_BUILTIN_DECL
20147 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20149 #undef TARGET_BUILTIN_RECIPROCAL
20150 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20152 #undef TARGET_C_EXCESS_PRECISION
20153 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20155 #undef TARGET_EXPAND_BUILTIN
20156 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20158 #undef TARGET_EXPAND_BUILTIN_VA_START
20159 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20161 #undef TARGET_FOLD_BUILTIN
20162 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20164 #undef TARGET_FUNCTION_ARG
20165 #define TARGET_FUNCTION_ARG aarch64_function_arg
20167 #undef TARGET_FUNCTION_ARG_ADVANCE
20168 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20170 #undef TARGET_FUNCTION_ARG_BOUNDARY
20171 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20173 #undef TARGET_FUNCTION_ARG_PADDING
20174 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20176 #undef TARGET_GET_RAW_RESULT_MODE
20177 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20178 #undef TARGET_GET_RAW_ARG_MODE
20179 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20181 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20182 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20184 #undef TARGET_FUNCTION_VALUE
20185 #define TARGET_FUNCTION_VALUE aarch64_function_value
20187 #undef TARGET_FUNCTION_VALUE_REGNO_P
20188 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20190 #undef TARGET_GIMPLE_FOLD_BUILTIN
20191 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20193 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20194 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20196 #undef TARGET_INIT_BUILTINS
20197 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20199 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20200 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20201 aarch64_ira_change_pseudo_allocno_class
20203 #undef TARGET_LEGITIMATE_ADDRESS_P
20204 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20206 #undef TARGET_LEGITIMATE_CONSTANT_P
20207 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20209 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20210 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20211 aarch64_legitimize_address_displacement
20213 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20214 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20216 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20217 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20218 aarch64_libgcc_floating_mode_supported_p
20220 #undef TARGET_MANGLE_TYPE
20221 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20223 #undef TARGET_MEMORY_MOVE_COST
20224 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20226 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20227 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20229 #undef TARGET_MUST_PASS_IN_STACK
20230 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20232 /* This target hook should return true if accesses to volatile bitfields
20233 should use the narrowest mode possible. It should return false if these
20234 accesses should use the bitfield container type. */
20235 #undef TARGET_NARROW_VOLATILE_BITFIELD
20236 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20238 #undef TARGET_OPTION_OVERRIDE
20239 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20241 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20242 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20243 aarch64_override_options_after_change
20245 #undef TARGET_OPTION_SAVE
20246 #define TARGET_OPTION_SAVE aarch64_option_save
20248 #undef TARGET_OPTION_RESTORE
20249 #define TARGET_OPTION_RESTORE aarch64_option_restore
20251 #undef TARGET_OPTION_PRINT
20252 #define TARGET_OPTION_PRINT aarch64_option_print
20254 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20255 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20257 #undef TARGET_SET_CURRENT_FUNCTION
20258 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20260 #undef TARGET_PASS_BY_REFERENCE
20261 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20263 #undef TARGET_PREFERRED_RELOAD_CLASS
20264 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20266 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20267 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20269 #undef TARGET_PROMOTED_TYPE
20270 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20272 #undef TARGET_SECONDARY_RELOAD
20273 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20275 #undef TARGET_SHIFT_TRUNCATION_MASK
20276 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20278 #undef TARGET_SETUP_INCOMING_VARARGS
20279 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20281 #undef TARGET_STRUCT_VALUE_RTX
20282 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20284 #undef TARGET_REGISTER_MOVE_COST
20285 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20287 #undef TARGET_RETURN_IN_MEMORY
20288 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20290 #undef TARGET_RETURN_IN_MSB
20291 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20293 #undef TARGET_RTX_COSTS
20294 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20296 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20297 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20299 #undef TARGET_SCHED_ISSUE_RATE
20300 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20302 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20303 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20304 aarch64_sched_first_cycle_multipass_dfa_lookahead
20306 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20307 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20308 aarch64_first_cycle_multipass_dfa_lookahead_guard
20310 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20311 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20312 aarch64_get_separate_components
20314 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20315 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20316 aarch64_components_for_bb
20318 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20319 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20320 aarch64_disqualify_components
20322 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20323 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20324 aarch64_emit_prologue_components
20326 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20327 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20328 aarch64_emit_epilogue_components
20330 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20331 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20332 aarch64_set_handled_components
20334 #undef TARGET_TRAMPOLINE_INIT
20335 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20337 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20338 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20340 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20341 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20343 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20344 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20345 aarch64_builtin_support_vector_misalignment
20347 #undef TARGET_ARRAY_MODE
20348 #define TARGET_ARRAY_MODE aarch64_array_mode
20350 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20351 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20353 #undef TARGET_VECTORIZE_ADD_STMT_COST
20354 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20356 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20357 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20358 aarch64_builtin_vectorization_cost
20360 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20361 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20363 #undef TARGET_VECTORIZE_BUILTINS
20364 #define TARGET_VECTORIZE_BUILTINS
20366 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20367 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20368 aarch64_builtin_vectorized_function
20370 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20371 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20372 aarch64_autovectorize_vector_sizes
20374 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20375 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20376 aarch64_atomic_assign_expand_fenv
20378 /* Section anchor support. */
20380 #undef TARGET_MIN_ANCHOR_OFFSET
20381 #define TARGET_MIN_ANCHOR_OFFSET -256
20383 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20384 byte offset; we can do much more for larger data types, but have no way
20385 to determine the size of the access. We assume accesses are aligned. */
20386 #undef TARGET_MAX_ANCHOR_OFFSET
20387 #define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
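
/* For illustration only, a sketch rather than the port's own code: a
   nonzero value here lets the middle end use data descriptors instead of
   executable trampolines for nested functions whose address is taken, when
   the front end or command-line options request that.  The value is the
   bit used to tag a pointer that really addresses a descriptor, so an
   indirect call that might receive such a pointer is conceptually expanded
   along these lines (hypothetical pseudo-C; the field names and descriptor
   layout are illustrative, the real layout is defined by the middle end):

     if (fn & 4)                  // bit 2 set: fn addresses a descriptor
       {
         desc  = fn - 4;          // strip the tag
         chain = desc->chain;     // recover the static chain
         fn    = desc->code;      // recover the real code address
       }
     (*fn) (args);

   Genuine code addresses must never have the tag bit set, which is why the
   port picks the lowest bit not reserved by the architecture (bit 2,
   value 4) in the definition above.  */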

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
  aarch64_remove_extra_call_preserved_regs

#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
  aarch64_return_call_with_max_clobbers

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

/* The global target hook vector.  TARGET_INITIALIZER is assembled from the
   TARGET_* macros defined above, with the defaults from target-def.h
   filling in every hook this file does not override.  */
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"